/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sys/wait.h>

#include "sd-daemon.h"
#include "sd-event.h"
#include "sd-id128.h"
#include "sd-messages.h"

#include "alloc-util.h"
#include "env-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "glyph-util.h"
#include "hashmap.h"
#include "hexdecoct.h"
#include "list.h"
#include "logarithm.h"
#include "mallinfo-util.h"
#include "memory-util.h"
#include "missing_magic.h"
#include "missing_syscall.h"
#include "missing_threads.h"
#include "origin-id.h"
#include "path-util.h"
#include "prioq.h"
#include "process-util.h"
#include "set.h"
#include "signal-util.h"
#include "socket-util.h"
#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "time-util.h"

#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)

static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
        /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
        return s->type == SOURCE_CHILD &&
               s->child.pidfd >= 0 &&
               s->child.options == WEXITED;
}

static bool event_source_is_online(sd_event_source *s) {
        assert(s);
        return s->enabled != SD_EVENT_OFF && !s->ratelimited;
}

static bool event_source_is_offline(sd_event_source *s) {
        assert(s);
        return s->enabled == SD_EVENT_OFF || s->ratelimited;
}

static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
        [SOURCE_IO] = "io",
        [SOURCE_TIME_REALTIME] = "realtime",
        [SOURCE_TIME_BOOTTIME] = "boottime",
        [SOURCE_TIME_MONOTONIC] = "monotonic",
        [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
        [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
        [SOURCE_SIGNAL] = "signal",
        [SOURCE_CHILD] = "child",
        [SOURCE_DEFER] = "defer",
        [SOURCE_POST] = "post",
        [SOURCE_EXIT] = "exit",
        [SOURCE_WATCHDOG] = "watchdog",
        [SOURCE_INOTIFY] = "inotify",
        [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);

#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)

#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_IO,                       \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_SIGNAL,                   \
               SOURCE_DEFER,                    \
               SOURCE_INOTIFY,                  \
               SOURCE_MEMORY_PRESSURE)

/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)

struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;

        Prioq *pending;
        Prioq *prepare;

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources;

        Set *post_sources;

        Prioq *exit;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close_list);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, buffered_inotify_data_list);

        /* A list of memory pressure event sources that still need their subscription string written */
        LIST_HEAD(sd_event_source, memory_pressure_write_list);

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool profile_delays:1;

        int exit_code;

        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        sd_event_source *sigint_event_source, *sigterm_event_source;

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8];

        uint64_t origin_id;
};

DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);

static thread_local sd_event *default_event = NULL;

static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);

static sd_event *event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}

static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->pending);
        assert(y->pending);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Lower priority values first */
        r = CMP(x->priority, y->priority);
        if (r != 0)
                return r;

        /* Older entries first */
        return CMP(x->pending_iteration, y->pending_iteration);
}

static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->prepare);
        assert(y->prepare);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Move most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        r = CMP(x->prepare_iteration, y->prepare_iteration);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static usec_t time_event_source_next(const sd_event_source *s) {
        assert(s);

        /* We have two kinds of event sources that have elapsation times associated with them: the actual
         * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
         * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
         * looking at here. */

        if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Otherwise this must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return s->time.next;

        return USEC_INFINITY;
}

static usec_t time_event_source_latest(const sd_event_source *s) {
        assert(s);

        if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
                               * same, as we should avoid adding additional inaccuracy on an inaccuracy time
                               * window */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return usec_add(s->time.next, s->time.accuracy);

        return USEC_INFINITY;
}

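/* To make the "earliest"/"latest" split above concrete (illustrative note, not original source
 * text): a timer armed with s->time.next = T and s->time.accuracy = A may be dispatched anywhere
 * in the window [T, T+A]; the loop uses the "latest" ordering to see how long it may
 * procrastinate, so that several timers with overlapping windows can be coalesced into one
 * wakeup. */
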
static bool event_source_timer_candidate(const sd_event_source *s) {
        assert(s);

        /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
         * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
        return !s->pending || s->ratelimited;
}

static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
        r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
        if (r != 0)
                return r;

        /* Order by time */
        return CMP(time_func(x), time_func(y));
}

static int earliest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_next);
}

static int latest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_latest);
}

static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}

static sd_event *event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->inotify_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);

        free(e->event_queue);

        return mfree(e);
}

_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new(sd_event, 1);
        if (!e)
                return -ENOMEM;

        *e = (sd_event) {
                .n_ref = 1,
                .epoll_fd = -EBADF,
                .watchdog_fd = -EBADF,
                .realtime.wakeup = WAKEUP_CLOCK_DATA,
                .realtime.fd = -EBADF,
                .realtime.next = USEC_INFINITY,
                .boottime.wakeup = WAKEUP_CLOCK_DATA,
                .boottime.fd = -EBADF,
                .boottime.next = USEC_INFINITY,
                .monotonic.wakeup = WAKEUP_CLOCK_DATA,
                .monotonic.fd = -EBADF,
                .monotonic.next = USEC_INFINITY,
                .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .realtime_alarm.fd = -EBADF,
                .realtime_alarm.next = USEC_INFINITY,
                .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .boottime_alarm.fd = -EBADF,
                .boottime_alarm.next = USEC_INFINITY,
                .perturb = USEC_INFINITY,
                .origin_id = origin_id_query(),
        };

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
                          special_glyph(SPECIAL_GLYPH_ELLIPSIS));
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}

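/* Usage sketch (illustrative, not part of the original file): allocating and releasing a loop.
 *
 *     sd_event *e = NULL;
 *     int r;
 *
 *     r = sd_event_new(&e);
 *     if (r < 0)
 *             return r;
 *     ...
 *     e = sd_event_unref(e);
 */
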
/* Define manually so we can add the origin check */
_public_ sd_event *sd_event_ref(sd_event *e) {
        if (!e)
                return NULL;
        if (event_origin_changed(e))
                return NULL;

        e->n_ref++;
        return e;
}

_public_ sd_event* sd_event_unref(sd_event *e) {
        if (!e)
                return NULL;
        if (event_origin_changed(e))
                return NULL;

        assert(e->n_ref > 0);
        if (--e->n_ref > 0)
                return NULL;

        return event_free(e);
}

#define PROTECT_EVENT(e)                                                \
        _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);

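/* Illustrative note (an assumption about usage, not original text): PROTECT_EVENT() is dropped
 * at the top of a scope that runs user callbacks, so that a callback releasing the last user
 * reference cannot free the loop while it is still being dispatched:
 *
 *     int some_dispatch_path(sd_event *e) {
 *             PROTECT_EVENT(e);
 *             ... invoke callbacks ...
 *             return 0;
 *     }
 */
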
_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
        if (s)
                (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
        return sd_event_source_unref(s);
}

static void source_io_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_origin_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}

static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->io.fd, &ev) < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}

static void source_child_pidfd_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (event_origin_changed(s->event))
                return;

        if (!s->child.registered)
                return;

        if (EVENT_SOURCE_WATCH_PIDFD(s))
                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

        s->child.registered = false;
}

static int source_child_pidfd_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_CHILD);
        assert(enabled != SD_EVENT_OFF);

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                struct epoll_event ev = {
                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                        .data.ptr = s,
                };

                if (epoll_ctl(s->event->epoll_fd,
                              s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                              s->child.pidfd, &ev) < 0)
                        return -errno;
        }

        s->child.registered = true;
        return 0;
}

static void source_memory_pressure_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (event_origin_changed(s->event))
                return;

        if (!s->memory_pressure.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->memory_pressure.registered = false;
}

static int source_memory_pressure_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
                          (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->memory_pressure.fd, &ev) < 0)
                return -errno;

        s->memory_pressure.registered = true;
        return 0;
}

static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (s->memory_pressure.in_write_list)
                return;

        LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = true;
}

static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (!s->memory_pressure.in_write_list)
                return;

        LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = false;
}

static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}

static EventSourceType clock_to_event_source_type(clockid_t clock) {

        switch (clock) {

        case CLOCK_REALTIME:
                return SOURCE_TIME_REALTIME;

        case CLOCK_BOOTTIME:
                return SOURCE_TIME_BOOTTIME;

        case CLOCK_MONOTONIC:
                return SOURCE_TIME_MONOTONIC;

        case CLOCK_REALTIME_ALARM:
                return SOURCE_TIME_REALTIME_ALARM;

        case CLOCK_BOOTTIME_ALARM:
                return SOURCE_TIME_BOOTTIME_ALARM;

        default:
                return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        }
}

static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        assert(e);

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}

static void event_free_signal_data(sd_event *e, struct signal_data *d) {
        assert(e);

        if (!d)
                return;

        hashmap_remove(e->signal_data, &d->priority);
        safe_close(d->fd);
        free(d);
}

static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_origin_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                d = new(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                *d = (struct signal_data) {
                        .wakeup = WAKEUP_SIGNAL_DATA,
                        .fd = -EBADF,
                        .priority = priority,
                };

                r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
                     &ss_copy,
                     SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        if (added)
                event_free_signal_data(e, d);

        return r;
}

static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way removes it. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {
                /* If all the mask is all-zero we can get rid of the structure */
                event_free_signal_data(e, d);
                return;
        }

        if (event_origin_changed(e))
                return;

        assert(d->fd >= 0);

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}

static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
         * and possibly drop the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_online_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            event_source_is_online(e->signal_sources[sig]))
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}

static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
        assert(s);

        /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
         * they are enabled/disabled or marked pending and such. */

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
}

static void event_source_time_prioq_reshuffle(sd_event_source *s) {
        struct clock_data *d;

        assert(s);

        /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
         * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
         * properly again. */

        if (s->ratelimited)
                d = &s->event->monotonic;
        else if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(d = event_get_clock_data(s->event, s->type));
        else
                return; /* no-op for an event source which is neither a timer nor ratelimited. */

        prioq_reshuffle(d->earliest, s, &s->earliest_index);
        prioq_reshuffle(d->latest, s, &s->latest_index);
        d->needs_rearm = true;
}

static void event_source_time_prioq_remove(
                sd_event_source *s,
                struct clock_data *d) {

        assert(s);
        assert(d);

        prioq_remove(d->earliest, s, &s->earliest_index);
        prioq_remove(d->latest, s, &s->latest_index);
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        d->needs_rearm = true;
}

static void source_disconnect(sd_event_source *s) {
        sd_event *event;
        int r;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                source_io_unregister(s);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from the time event source here if it is not ratelimited. If
                 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                        if (s->signal.unblock) {
                                sigset_t new_ss;

                                if (sigemptyset(&new_ss) < 0)
                                        log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
                                else if (sigaddset(&new_ss, s->signal.sig) < 0)
                                        log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
                                else {
                                        r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
                                        if (r != 0)
                                                log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
                                }
                        }
                }

                break;

        case SOURCE_CHILD:
                if (event_origin_changed(s->event))
                        s->child.process_owned = false;

                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is
                         * continued to being watched. That's because inotify doesn't really have an API for that: we
                         * can only change watch masks with access to the original inode either by fd or by path. But
                         * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
                         * continuously and keeping the mount busy which we can't really do. We could reconstruct the
                         * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
                         * there), but given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way, leave the
                         * mask set, even if it is not minimized now, and ignore all events we aren't interested in
                         * anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_remove_from_write_list(s);
                source_memory_pressure_unregister(s);
                break;

        default:
                assert_not_reached();
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        if (!s->floating)
                sd_event_unref(event);
}

static sd_event_source* source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                s->io.fd = safe_close(s->io.fd);

        if (s->type == SOURCE_CHILD) {
                /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */

                if (s->child.process_owned) {

                        if (!s->child.exited) {
                                bool sent = false;

                                if (s->child.pidfd >= 0) {
                                        if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
                                                if (errno == ESRCH) /* Already dead */
                                                        sent = true;
                                                else if (!ERRNO_IS_NOT_SUPPORTED(errno))
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
                                                                        s->child.pid);
                                        } else
                                                sent = true;
                                }

                                if (!sent)
                                        if (kill(s->child.pid, SIGKILL) < 0)
                                                if (errno != ESRCH) /* Already dead */
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
                                                                        s->child.pid);
                        }

                        if (!s->child.waited) {
                                siginfo_t si = {};

                                /* Reap the child if we can */
                                (void) waitid(P_PID, s->child.pid, &si, WEXITED);
                        }
                }

                if (s->child.pidfd_owned)
                        s->child.pidfd = safe_close(s->child.pidfd);
        }

        if (s->type == SOURCE_MEMORY_PRESSURE) {
                s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
                s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
        }

        if (s->destroy_callback)
                s->destroy_callback(s->userdata);

        free(s->description);
        return mfree(s);
}

DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);

static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_reshuffle(s);

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        if (s->type == SOURCE_INOTIFY) {

                assert(s->inotify.inode_data);
                assert(s->inotify.inode_data->inotify_data);

                if (b)
                        s->inotify.inode_data->inotify_data->n_pending++;
                else {
                        assert(s->inotify.inode_data->inotify_data->n_pending > 0);
                        s->inotify.inode_data->inotify_data->n_pending--;
                }
        }

        return 0;
}

static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {

        /* Let's allocate exactly what we need. Note that the difference of the smallest event source
         * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
         * lines. */
        static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
                [SOURCE_IO]                  = endoffsetof_field(sd_event_source, io),
                [SOURCE_TIME_REALTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_MONOTONIC]      = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_SIGNAL]              = endoffsetof_field(sd_event_source, signal),
                [SOURCE_CHILD]               = endoffsetof_field(sd_event_source, child),
                [SOURCE_DEFER]               = endoffsetof_field(sd_event_source, defer),
                [SOURCE_POST]                = endoffsetof_field(sd_event_source, post),
                [SOURCE_EXIT]                = endoffsetof_field(sd_event_source, exit),
                [SOURCE_INOTIFY]             = endoffsetof_field(sd_event_source, inotify),
                [SOURCE_MEMORY_PRESSURE]     = endoffsetof_field(sd_event_source, memory_pressure),
        };

        sd_event_source *s;

        assert(e);
        assert(type >= 0);
        assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
        assert(size_table[type] > 0);

        s = malloc0(size_table[type]);
        if (!s)
                return NULL;
        /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
         * size, even if we only allocate the initial part we need. */
        s = expand_to_usable(s, sizeof(sd_event_source));

        /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
         * than what we allocated here. */
        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = PRIOQ_IDX_NULL;
        s->prepare_index = PRIOQ_IDX_NULL;

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}

static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = io_exit_callback;

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

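/* Usage sketch (illustrative, not part of the original file), assuming a caller-defined
 * handler on_io() and an open file descriptor fd:
 *
 *     static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             if (revents & EPOLLIN) {
 *                     ... read from fd ...
 *             }
 *             return 0;
 *     }
 *
 *     sd_event_source *src = NULL;
 *     r = sd_event_add_io(e, &src, fd, EPOLLIN, on_io, NULL);
 *     if (r < 0)
 *             return r;
 */
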
static void initialize_perturb(sd_event *e) {
        sd_id128_t id = {};

        /* When we sleep for longer, we try to realign the wakeup to the same time within each
         * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
         * wakeup. However, let's take some system-specific randomness for this value, so that in a network
         * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
         * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
                e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
        else
                e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
}

static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        _cleanup_close_ int fd = -EBADF;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
                return -errno;

        d->fd = TAKE_FD(fd);
        return 0;
}

*s
, uint64_t usec
, void *userdata
) {
1335 return sd_event_exit(sd_event_source_get_event(s
), PTR_TO_INT(userdata
));
static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
        int r;

        assert(d);

        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        return 0;
}

static int event_source_time_prioq_put(
                sd_event_source *s,
                struct clock_data *d) {

        int r;

        assert(s);
        assert(d);
        assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

        r = prioq_put(d->earliest, s, &s->earliest_index);
        if (r < 0)
                return r;

        r = prioq_put(d->latest, s, &s->latest_index);
        if (r < 0) {
                assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
                s->earliest_index = PRIOQ_IDX_NULL;
                return r;
        }

        d->needs_rearm = true;
        return 0;
}

_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != UINT64_MAX, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        assert_se(d = event_get_clock_data(e, type));

        r = setup_clock_data(e, d, clock);
        if (r < 0)
                return r;

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = event_source_time_prioq_put(s, d);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

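/* Usage sketch (illustrative, not part of the original file): a one-shot timer five seconds from
 * now on CLOCK_MONOTONIC; accuracy 0 maps to DEFAULT_ACCURACY_USEC:
 *
 *     static int on_time(sd_event_source *s, uint64_t usec, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     usec_t now_usec;
 *     r = sd_event_now(e, CLOCK_MONOTONIC, &now_usec);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_add_time(e, NULL, CLOCK_MONOTONIC,
 *                           usec_add(now_usec, 5 * USEC_PER_SEC), 0, on_time, NULL);
 */
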
_public_ int sd_event_add_time_relative(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        usec_t t;
        int r;

        /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
         * checks for overflow. */

        r = sd_event_now(e, clock, &t);
        if (r < 0)
                return r;

        if (usec >= USEC_INFINITY - t)
                return -EOVERFLOW;

        return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
}

static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        sigset_t new_ss;
        bool block_it;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        /* Let's make sure our special flag stays outside of the valid signal range */
        assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);

        if (sig & SD_EVENT_SIGNAL_PROCMASK) {
                sig &= ~SD_EVENT_SIGNAL_PROCMASK;
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                block_it = true;
        } else {
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                r = signal_is_blocked(sig);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;

                block_it = false;
        }

        if (!callback)
                callback = signal_exit_callback;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        if (block_it) {
                sigset_t old_ss;

                if (sigemptyset(&new_ss) < 0)
                        return -errno;

                if (sigaddset(&new_ss, sig) < 0)
                        return -errno;

                r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
                if (r != 0)
                        return -r;

                r = sigismember(&old_ss, sig);
                if (r < 0)
                        return -errno;

                s->signal.unblock = !r;
        } else
                s->signal.unblock = false;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                if (s->signal.unblock)
                        (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);

                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

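
        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

/* Usage sketch (illustrative, not part of the original file): with the SD_EVENT_SIGNAL_PROCMASK
 * flag the loop blocks the signal itself, so the caller needs no manual sigprocmask() call; a
 * NULL callback falls back to signal_exit_callback(), i.e. the loop exits on the signal:
 *
 *     r = sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, INT_TO_PTR(0));
 *     if (r < 0)
 *             return r;
 */
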
static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static bool shall_use_pidfd(void) {
        /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
        return secure_getenv_bool("SYSTEMD_PIDFD") != 0;
}

_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd and because we don't want the reap the child processes
                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
                 * take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
         * pin the PID, and make regular waitid() handling race-free. */

        if (shall_use_pidfd()) {
                s->child.pidfd = pidfd_open(pid, 0);
                if (s->child.pidfd < 0) {
                        /* Propagate errors unless the syscall is not supported or blocked */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
        } else
                s->child.pidfd = -EBADF;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We have a pidfd and we only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We have no pidfd or we shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. */
        s->child.pid = pid;
        e->n_online_child_sources++;

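
        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

/* Usage sketch (illustrative, not part of the original file): SIGCHLD must be blocked by the
 * caller before the first child source is added, as checked above:
 *
 *     static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     sigset_t ss;
 *     assert_se(sigemptyset(&ss) >= 0);
 *     assert_se(sigaddset(&ss, SIGCHLD) >= 0);
 *     assert_se(sigprocmask(SIG_BLOCK, &ss, NULL) >= 0);
 *
 *     r = sd_event_add_child(e, NULL, pid, WEXITED, on_child, NULL);
 */
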
_public_ int sd_event_add_child_pidfd(
                sd_event *e,
                sd_event_source **ret,
                int pidfd,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        pid_t pid;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pidfd >= 0, -EBADF);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        r = pidfd_get_pid(pidfd, &pid);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.pidfd = pidfd;
        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for WEXITED */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;
        } else {
                /* We shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

static int generic_exit_callback(sd_event_source *s, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

int sd_event_add_defer(
1769 sd_event_source
**ret
,
1770 sd_event_handler_t callback
,
1773 _cleanup_(source_freep
) sd_event_source
*s
= NULL
;
1776 assert_return(e
, -EINVAL
);
1777 assert_return(e
= event_resolve(e
), -ENOPKG
);
1778 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
1779 assert_return(!event_origin_changed(e
), -ECHILD
);
1782 callback
= generic_exit_callback
;
1784 s
= source_new(e
, !ret
, SOURCE_DEFER
);
1788 s
->defer
.callback
= callback
;
1789 s
->userdata
= userdata
;
1790 s
->enabled
= SD_EVENT_ONESHOT
;
1792 r
= source_set_pending(s
, true);
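        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

/* Usage sketch (illustrative, not part of the original file): a defer source fires on the next
 * iteration, since it is created as SD_EVENT_ONESHOT and immediately marked pending:
 *
 *     static int on_defer(sd_event_source *s, void *userdata) {
 *             ... run deferred work ...
 *             return 0;
 *     }
 *
 *     r = sd_event_add_defer(e, NULL, on_defer, NULL);
 */
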
_public_ int sd_event_add_post(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_POST);
        if (!s)
                return -ENOMEM;

        s->post.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = set_ensure_put(&e->post_sources, NULL, s);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_exit(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
        if (r < 0)
                return r;

        s = source_new(e, !ret, SOURCE_EXIT);
        if (!s)
                return -ENOMEM;

        s->exit.callback = callback;
        s->userdata = userdata;
        s->exit.prioq_index = PRIOQ_IDX_NULL;
        s->enabled = SD_EVENT_ONESHOT;

        r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

int sd_event_trim_memory(void) {
1882 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1883 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1884 * NULL callback parameter. */
1886 log_debug("Memory pressure event, trimming malloc() memory.");
1888 #if HAVE_GENERIC_MALLINFO
1889 generic_mallinfo before_mallinfo
= generic_mallinfo_get();
1892 usec_t before_timestamp
= now(CLOCK_MONOTONIC
);
1893 hashmap_trim_pools();
1895 usec_t after_timestamp
= now(CLOCK_MONOTONIC
);
1898 log_debug("Successfully trimmed some memory.");
1900 log_debug("Couldn't trim any memory.");
1902 usec_t period
= after_timestamp
- before_timestamp
;
1904 #if HAVE_GENERIC_MALLINFO
1905 generic_mallinfo after_mallinfo
= generic_mallinfo_get();
1906 size_t l
= LESS_BY((size_t) before_mallinfo
.hblkhd
, (size_t) after_mallinfo
.hblkhd
) +
1907 LESS_BY((size_t) before_mallinfo
.arena
, (size_t) after_mallinfo
.arena
);
1908 log_struct(LOG_DEBUG
,
1909 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1910 FORMAT_TIMESPAN(period
, 0),
1912 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR
,
1913 "TRIMMED_BYTES=%zu", l
,
1914 "TRIMMED_USEC=" USEC_FMT
, period
);
1916 log_struct(LOG_DEBUG
,
1917 LOG_MESSAGE("Memory trimming took %s.",
1918 FORMAT_TIMESPAN(period
, 0)),
1919 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR
,
1920 "TRIMMED_USEC=" USEC_FMT
, period
);
static int memory_pressure_callback(sd_event_source *s, void *userdata) {
        assert(s);

        sd_event_trim_memory();
        return 0;
}

int sd_event_add_memory_pressure(
1935 sd_event_source
**ret
,
1936 sd_event_handler_t callback
,
1939 _cleanup_free_
char *w
= NULL
;
1940 _cleanup_(source_freep
) sd_event_source
*s
= NULL
;
1941 _cleanup_close_
int path_fd
= -EBADF
, fd
= -EBADF
;
1942 _cleanup_free_
void *write_buffer
= NULL
;
1943 const char *watch
, *watch_fallback
= NULL
, *env
;
1944 size_t write_buffer_size
= 0;
1950 assert_return(e
, -EINVAL
);
1951 assert_return(e
= event_resolve(e
), -ENOPKG
);
1952 assert_return(e
->state
!= SD_EVENT_FINISHED
, -ESTALE
);
1953 assert_return(!event_origin_changed(e
), -ECHILD
);
1956 callback
= memory_pressure_callback
;
1958 s
= source_new(e
, !ret
, SOURCE_MEMORY_PRESSURE
);
1962 s
->wakeup
= WAKEUP_EVENT_SOURCE
;
1963 s
->memory_pressure
.callback
= callback
;
1964 s
->userdata
= userdata
;
1965 s
->enabled
= SD_EVENT_ON
;
1966 s
->memory_pressure
.fd
= -EBADF
;
1968 env
= secure_getenv("MEMORY_PRESSURE_WATCH");
1970 if (isempty(env
) || path_equal(env
, "/dev/null"))
1971 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN
),
1972 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1974 if (!path_is_absolute(env
) || !path_is_normalized(env
))
1975 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
1976 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env
);
1980 env
= secure_getenv("MEMORY_PRESSURE_WRITE");
1982 r
= unbase64mem(env
, &write_buffer
, &write_buffer_size
);
1990 r
= is_pressure_supported();
1996 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1997 * the system wide pressure if for some reason we cannot (which could be: memory controller
1998 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1999 * only use the system-wide logic. */
2000 r
= cg_all_unified();
2004 watch
= "/proc/pressure/memory";
2006 _cleanup_free_
char *cg
= NULL
;
2008 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &cg
);
2012 w
= path_join("/sys/fs/cgroup", cg
, "memory.pressure");
2017 watch_fallback
= "/proc/pressure/memory";
2020 /* Android uses three levels in its userspace low memory killer logic:
2021 * some 70000 1000000
2022 * some 100000 1000000
2023 * full 70000 1000000
2025 * GNOME's low memory monitor uses:
2026 * some 70000 1000000
2027 * some 100000 1000000
2028 * full 100000 1000000
2030 * We'll default to the middle level that both agree on. Except we do it on a 2s window
2031 * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration the
2032 * kernel will allow us to do unprivileged, also in the future. */
2033 if (asprintf((char**) &write_buffer
,
2034 "%s " USEC_FMT
" " USEC_FMT
,
2035 MEMORY_PRESSURE_DEFAULT_TYPE
,
2036 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC
,
2037 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC
) < 0)
2040 write_buffer_size
= strlen(write_buffer
) + 1;
2044 path_fd
= open(watch
, O_PATH
|O_CLOEXEC
);
2046 if (errno
!= ENOENT
)
2049 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2050 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2051 * the PSI service apparently is not supported) */
2052 if (!watch_fallback
)
2053 return locked
? -ENOENT
: -EOPNOTSUPP
;
2055 path_fd
= open(watch_fallback
, O_PATH
|O_CLOEXEC
);
2057 if (errno
== ENOENT
) /* PSI is not available in the kernel even under the fallback path? */
2063 if (fstat(path_fd
, &st
) < 0)
2066 if (S_ISSOCK(st
.st_mode
)) {
2067 fd
= socket(AF_UNIX
, SOCK_STREAM
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, 0);
2071 r
= connect_unix_path(fd
, path_fd
, NULL
);
2077 } else if (S_ISREG(st
.st_mode
) || S_ISFIFO(st
.st_mode
) || S_ISCHR(st
.st_mode
)) {
2078 fd
= fd_reopen(path_fd
, (write_buffer_size
> 0 ? O_RDWR
: O_RDONLY
) |O_CLOEXEC
|O_NONBLOCK
|O_NOCTTY
);
2082 if (S_ISREG(st
.st_mode
)) {
2085 /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
2087 if (fstatfs(fd
, &sfs
) < 0)
2090 if (!is_fs_type(&sfs
, PROC_SUPER_MAGIC
) &&
2091 !is_fs_type(&sfs
, CGROUP2_SUPER_MAGIC
))
2096 /* For fifos and char devices just watch for EPOLLIN */
2099 } else if (S_ISDIR(st
.st_mode
))
2104 s
->memory_pressure
.fd
= TAKE_FD(fd
);
2105 s
->memory_pressure
.write_buffer
= TAKE_PTR(write_buffer
);
2106 s
->memory_pressure
.write_buffer_size
= write_buffer_size
;
2107 s
->memory_pressure
.events
= events
;
2108 s
->memory_pressure
.locked
= locked
;
2110 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2111 * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2112 * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
2113 * event sources on which writes must be executed before the first event loop iteration is
2114 * executed. (We could also write the data here, right away, but we want to give the caller the
2115 * freedom to call sd_event_source_set_memory_pressure_type() and
2116 * sd_event_source_set_memory_pressure_rate() before we write it. */
2118 if (s
->memory_pressure
.write_buffer_size
> 0)
2119 source_memory_pressure_add_to_write_list(s
);
2121 r
= source_memory_pressure_register(s
, s
->enabled
);
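                if (r < 0)
                        return r;
        }

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

/* Usage sketch (illustrative, not part of the original file): a NULL callback installs
 * sd_event_trim_memory() as the default reaction to PSI notifications; callers typically treat
 * failure as non-fatal:
 *
 *     r = sd_event_add_memory_pressure(e, NULL, NULL, NULL);
 *     if (r < 0)
 *             log_debug_errno(r, "Failed to install memory pressure event source, ignoring: %m");
 */
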
static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
        assert(e);

        if (!d)
                return;

        assert(hashmap_isempty(d->inodes));
        assert(hashmap_isempty(d->wd));

        if (d->buffer_filled > 0)
                LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);

        hashmap_free(d->inodes);
        hashmap_free(d->wd);

        assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);

        if (d->fd >= 0) {
                if (!event_origin_changed(e) &&
                    epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");

                safe_close(d->fd);
        }
        free(d);
}

static int event_make_inotify_data(
                sd_event *e,
                int64_t priority,
                struct inotify_data **ret) {

        _cleanup_close_ int fd = -EBADF;
        struct inotify_data *d;
        int r;

        assert(e);

        d = hashmap_get(e->inotify_data, &priority);
        if (d) {
                if (ret)
                        *ret = d;
                return 0;
        }

        fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        d = new(struct inotify_data, 1);
        if (!d)
                return -ENOMEM;

        *d = (struct inotify_data) {
                .wakeup = WAKEUP_INOTIFY_DATA,
                .fd = TAKE_FD(fd),
                .priority = priority,
        };

        r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
        if (r < 0) {
                d->fd = safe_close(d->fd);
                free(d);
                return r;
        }

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
                                            * remove the fd from the epoll first, which we don't want as we couldn't
                                            * add it in the first place. */
                event_free_inotify_data(e, d);
                return r;
        }

        if (ret)
                *ret = d;

        return 0;
}

static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
        int r;

        assert(x);
        assert(y);

        r = CMP(x->dev, y->dev);
        if (r != 0)
                return r;

        return CMP(x->ino, y->ino);
}

static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
        assert(d);

        siphash24_compress_typesafe(d->dev, state);
        siphash24_compress_typesafe(d->ino, state);
}

DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);

static void event_free_inode_data(
                sd_event *e,
                struct inode_data *d) {

        assert(e);

        if (!d)
                return;

        assert(!d->event_sources);

        if (d->fd >= 0) {
                LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
                safe_close(d->fd);
        }

        if (d->inotify_data) {

                if (d->wd >= 0) {
                        if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
                                /* So here's a problem. At the time this runs the watch descriptor might already be
                                 * invalidated, because an IN_IGNORED event might be queued right the moment we enter
                                 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
                                 * likely case to happen. */

                                if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
                                        log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
                        }

                        assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
                }

                assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
        }

        free(d->path);
        free(d);
}

static void event_gc_inotify_data(
                sd_event *e,
                struct inotify_data *d) {

        assert(e);

        /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
         * any inode with it anymore, which in turn happens if no event source of this priority is interested
         * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
         * (under the expectation that the GC is called again once the counter is decremented). */

        if (!d)
                return;

        if (!hashmap_isempty(d->inodes))
                return;

        if (d->n_busy > 0)
                return;

        event_free_inotify_data(e, d);
}

static void event_gc_inode_data(
                sd_event *e,
                struct inode_data *d) {

        struct inotify_data *inotify_data;

        assert(e);

        if (!d)
                return;

        if (d->event_sources)
                return;

        inotify_data = d->inotify_data;
        event_free_inode_data(e, d);

        event_gc_inotify_data(e, inotify_data);
}

static int event_make_inode_data(
                sd_event *e,
                struct inotify_data *inotify_data,
                dev_t dev,
                ino_t ino,
                struct inode_data **ret) {

        struct inode_data *d, key;
        int r;

        assert(e);
        assert(inotify_data);

        key = (struct inode_data) {
                .ino = ino,
                .dev = dev,
        };

        d = hashmap_get(inotify_data->inodes, &key);
        if (d) {
                if (ret)
                        *ret = d;
                return 0;
        }

        r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
        if (r < 0)
                return r;

        d = new(struct inode_data, 1);
        if (!d)
                return -ENOMEM;

        *d = (struct inode_data) {
                .dev = dev,
                .ino = ino,
                .wd = -1,
                .fd = -EBADF,
                .inotify_data = inotify_data,
        };

        r = hashmap_put(inotify_data->inodes, d, d);
        if (r < 0) {
                free(d);
                return r;
        }

        if (ret)
                *ret = d;

        return 0;
}

static uint32_t inode_data_determine_mask(struct inode_data *d) {
        bool excl_unlink = true;
        uint32_t combined = 0;

        assert(d);

        /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
         * the IN_EXCL_UNLINK flag is ANDed instead.
         *
         * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
         * because we cannot change the mask anymore after the event source was created once, since the kernel has no
         * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
         * events we don't care for client-side. */

        LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {

                if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
                        excl_unlink = false;

                combined |= s->inotify.mask;
        }

        return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
}

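/* Worked example (illustrative, not original text): two sources watching the same inode with
 * masks (IN_MODIFY|IN_EXCL_UNLINK) and (IN_ATTRIB) yield a combined mask of IN_MODIFY|IN_ATTRIB,
 * i.e. the OR of both, with IN_EXCL_UNLINK dropped because not every source requested it. */
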
static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
        uint32_t combined_mask;
        int wd, r;

        assert(d);
        assert(d->fd >= 0);

        combined_mask = inode_data_determine_mask(d);

        if (d->wd >= 0 && combined_mask == d->combined_mask)
                return 0;

        r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
        if (r < 0)
                return r;

        wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
        if (wd < 0)
                return wd;

        if (d->wd < 0) {
                r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
                if (r < 0) {
                        (void) inotify_rm_watch(d->inotify_data->fd, wd);
                        return r;
                }

                d->wd = wd;

        } else if (d->wd != wd) {

                log_debug("Weird, the watch descriptor we already knew for this inode changed?");
                (void) inotify_rm_watch(d->inotify_data->fd, wd);
                return -EINVAL;
        }

        d->combined_mask = combined_mask;
        return 0;
}

static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int event_add_inotify_fd_internal(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                bool donate,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct inotify_data *inotify_data = NULL;
        struct inode_data *inode_data = NULL;
        struct stat st;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = inotify_exit_callback;

        /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
         * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
         * the user can't use them for us. */
        if (mask & IN_MASK_ADD)
                return -EINVAL;

        if (fstat(fd, &st) < 0)
                return -errno;

        s = source_new(e, !ret, SOURCE_INOTIFY);
        if (!s)
                return -ENOMEM;

        s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
        s->inotify.mask = mask;
        s->inotify.callback = callback;
        s->userdata = userdata;

        /* Allocate an inotify object for this priority, and an inode object within it */
        r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
        if (r < 0)
                return r;

        r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
        if (r < 0) {
                event_gc_inotify_data(e, inotify_data);
                return r;
        }

        /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
         * the event source, until then, for which we need the original inode. */
        if (inode_data->fd < 0) {
                if (donated_fd >= 0)
                        inode_data->fd = TAKE_FD(donated_fd);
                else {
                        inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
                        if (inode_data->fd < 0) {
                                r = -errno;
                                event_gc_inode_data(e, inode_data);
                                return r;
                        }
                }

                LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);

                _cleanup_free_ char *path = NULL;
                r = fd_get_path(inode_data->fd, &path);
                if (r < 0 && r != -ENOSYS) { /* The path is optional, hence ignore -ENOSYS. */
                        event_gc_inode_data(e, inode_data);
                        return r;
                }

                free_and_replace(inode_data->path, path);
        }

        /* Link our event source to the inode data object */
        LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
        s->inotify.inode_data = inode_data;

        /* Actually realize the watch now */
        r = inode_data_realize_watch(e, inode_data);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_inotify_fd(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
}
_public_ int sd_event_add_inotify(
                sd_event *e,
                sd_event_source **ret,
                const char *path,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
        int fd, r;

        assert_return(path, -EINVAL);

        fd = open(path, O_PATH | O_CLOEXEC |
                  (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
                  (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
        if (fd < 0)
                return -errno;

        r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
        if (r < 0)
                return r;

        (void) sd_event_source_set_description(s, path);

        if (ret)
                *ret = s;

        return 0;
}
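
/* A minimal usage sketch for the public inotify API above (illustrative only, hence kept
 * compiled out; the callback and the watched path are invented for the example): */
#if 0
#include <stdio.h>
#include <sys/inotify.h>
#include <systemd/sd-event.h>

static int on_dir_change(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
        /* ev->len > 0 means the event carries the name of a directory entry */
        printf("mask=%#x name=%s\n", ev->mask, ev->len > 0 ? ev->name : "(inode itself)");
        return 0;
}

static int watch_tmp(sd_event *e) {
        /* IN_ONLYDIR is translated into O_DIRECTORY on the open() above, so this fails
         * cleanly if /tmp is not a directory. */
        return sd_event_add_inotify(e, NULL, "/tmp", IN_CREATE|IN_DELETE|IN_ONLYDIR,
                                    on_dir_change, NULL);
}
#endif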
static sd_event_source* event_source_free(sd_event_source *s) {
        if (!s)
                return NULL;

        /* Here's a special hack: when we are called from a
         * dispatch handler we won't free the event source
         * immediately, but we will detach the fd from the
         * epoll. This way it is safe for the caller to unref
         * the event source and immediately close the fd, but
         * we still retain a valid event source object after
         * the callback. */

        if (s->dispatching)
                source_disconnect(s);
        else
                source_free(s);

        return NULL;
}

DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}
_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);

        if (!s->description)
                return -ENXIO;

        *description = s->description;
        return 0;
}
_public_ sd_event* sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);
        assert_return(!event_origin_changed(s->event), NULL);

        return s->event;
}
_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->pending;
}
_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->io.fd;
}
_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
        int saved_fd, r;

        assert_return(s, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->io.fd == fd)
                return 0;

        saved_fd = s->io.fd;
        s->io.fd = fd;

        assert(event_source_is_offline(s) == !s->io.registered);

        if (s->io.registered) {
                s->io.registered = false;

                r = source_io_register(s, s->enabled, s->io.events);
                if (r < 0) {
                        s->io.fd = saved_fd;
                        s->io.registered = true;
                        return r;
                }

                (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
        }

        if (s->io.owned)
                safe_close(saved_fd);

        return 0;
}
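
/* A sketch of the intended use of the setter above: swap the fd behind a live IO source
 * after a reconnect, without tearing down the event source. reconnect_socket() is a
 * hypothetical application helper, not a libsystemd call: */
#if 0
static int swap_connection_fd(sd_event_source *s) {
        int fd = reconnect_socket(); /* hypothetical: returns a fresh connected socket */
        if (fd < 0)
                return fd;

        /* With fd ownership enabled, sd_event_source_set_io_fd() closes the old fd for us. */
        (void) sd_event_source_set_io_fd_own(s, true);
        return sd_event_source_set_io_fd(s, fd);
}
#endif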
_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->io.owned;
}
_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->io.owned = own;
        return 0;
}
_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}
_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* edge-triggered updates are never skipped, so we can reset edges */
        if (s->io.events == events && !(events & EPOLLET))
                return 0;

        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        if (event_source_is_online(s)) {
                r = source_io_register(s, s->enabled, events);
                if (r < 0)
                        return r;
        }

        s->io.events = events;

        return 0;
}
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}
_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->signal.sig;
}
_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *priority = s->priority;
        return 0;
}
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        bool rm_inotify = false, rm_inode = false;
        struct inotify_data *new_inotify_data = NULL;
        struct inode_data *new_inode_data = NULL;
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_INOTIFY) {
                struct inode_data *old_inode_data;

                assert(s->inotify.inode_data);
                old_inode_data = s->inotify.inode_data;

                /* We need the original fd to change the priority. If we don't have it we can't change the priority,
                 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
                 * events we allow priority changes only until the first following iteration. */
                if (old_inode_data->fd < 0)
                        return -EOPNOTSUPP;

                r = event_make_inotify_data(s->event, priority, &new_inotify_data);
                if (r < 0)
                        return r;
                rm_inotify = r > 0;

                r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
                if (r < 0)
                        goto fail;
                rm_inode = r > 0;

                if (new_inode_data->fd < 0) {
                        /* Duplicate the fd for the new inode object if we don't have any yet */
                        new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
                        if (new_inode_data->fd < 0) {
                                r = -errno;
                                goto fail;
                        }

                        LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);

                        _cleanup_free_ char *path = NULL;
                        r = fd_get_path(new_inode_data->fd, &path);
                        if (r < 0 && r != -ENOSYS)
                                goto fail;

                        free_and_replace(new_inode_data->path, path);
                }

                /* Move the event source to the new inode data structure */
                LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
                LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
                s->inotify.inode_data = new_inode_data;

                /* Now create the new watch */
                r = inode_data_realize_watch(s->event, new_inode_data);
                if (r < 0) {
                        /* Move it back */
                        LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
                        LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
                        s->inotify.inode_data = old_inode_data;
                        goto fail;
                }

                s->priority = priority;

                event_gc_inode_data(s->event, old_inode_data);

        } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        s->priority = old->priority;
                        return r;
                }

                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        event_source_pp_prioq_reshuffle(s);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;

fail:
        if (rm_inode)
                event_free_inode_data(s->event, new_inode_data);

        if (rm_inotify)
                event_free_inotify_data(s->event, new_inotify_data);

        return r;
}
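
/* A sketch of the caller-side view of priorities: relative dispatch order is controlled
 * entirely by the priority value, and smaller values run first. The two sources here are
 * assumed to exist already: */
#if 0
static void order_sources(sd_event_source *urgent, sd_event_source *bulk) {
        (void) sd_event_source_set_priority(urgent, SD_EVENT_PRIORITY_IMPORTANT); /* -100 */
        (void) sd_event_source_set_priority(bulk, SD_EVENT_PRIORITY_IDLE);        /* +100 */
}
#endif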
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
        /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
        if (!s && !ret)
                return false;

        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (ret)
                *ret = s->enabled;

        return s->enabled != SD_EVENT_OFF;
}
static int event_source_offline(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        bool was_offline;
        int r;

        assert(s);
        assert(enabled == SD_EVENT_OFF || ratelimited);

        /* Unset the pending flag when this event source is disabled */
        if (s->enabled != SD_EVENT_OFF &&
            enabled == SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        was_offline = event_source_is_offline(s);
        s->enabled = enabled;
        s->ratelimited = ratelimited;

        switch (s->type) {

        case SOURCE_IO:
                source_io_unregister(s);
                break;

        case SOURCE_SIGNAL:
                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                break;

        case SOURCE_CHILD:
                if (!was_offline) {
                        assert(s->event->n_online_child_sources > 0);
                        s->event->n_online_child_sources--;
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                break;

        case SOURCE_EXIT:
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_unregister(s);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
        case SOURCE_DEFER:
        case SOURCE_POST:
        case SOURCE_INOTIFY:
                break;

        default:
                assert_not_reached();
        }

        /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
        event_source_time_prioq_reshuffle(s);

        return 1;
}
static int event_source_online(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        bool was_online;
        int r;

        assert(s);
        assert(enabled != SD_EVENT_OFF || !ratelimited);

        /* Unset the pending flag when this event source is enabled */
        if (s->enabled == SD_EVENT_OFF &&
            enabled != SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        /* Are we really ready for onlining? */
        if (enabled == SD_EVENT_OFF || ratelimited) {
                /* Nope, we are not ready for onlining, then just update the precise state and exit */
                s->enabled = enabled;
                s->ratelimited = ratelimited;
                return 0;
        }

        was_online = event_source_is_online(s);

        switch (s->type) {

        case SOURCE_IO:
                r = source_io_register(s, enabled, s->io.events);
                if (r < 0)
                        return r;
                break;

        case SOURCE_SIGNAL:
                r = event_make_signal_data(s->event, s->signal.sig, NULL);
                if (r < 0) {
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        return r;
                }
                break;

        case SOURCE_CHILD:
                if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                        /* yes, we have pidfd */

                        r = source_child_pidfd_register(s, enabled);
                        if (r < 0)
                                return r;
                } else {
                        /* no pidfd, or something other to watch for than WEXITED */

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }
                }

                if (!was_online)
                        s->event->n_online_child_sources++;
                break;

        case SOURCE_MEMORY_PRESSURE:
                r = source_memory_pressure_register(s, enabled);
                if (r < 0)
                        return r;
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
        case SOURCE_EXIT:
        case SOURCE_DEFER:
        case SOURCE_POST:
        case SOURCE_INOTIFY:
                break;

        default:
                assert_not_reached();
        }

        s->enabled = enabled;
        s->ratelimited = ratelimited;

        /* Non-failing operations below */
        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
        event_source_time_prioq_reshuffle(s);

        return 1;
}
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
        int r;

        assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);

        /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
        if (m == SD_EVENT_OFF && !s)
                return 0;

        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
        if (s->event->state == SD_EVENT_FINISHED)
                return m == SD_EVENT_OFF ? 0 : -ESTALE;

        if (s->enabled == m) /* No change? */
                return 0;

        if (m == SD_EVENT_OFF)
                r = event_source_offline(s, m, s->ratelimited);
        else {
                if (s->enabled != SD_EVENT_OFF) {
                        /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
                         * event source is already enabled after all. */
                        s->enabled = m;
                        return 0;
                }

                r = event_source_online(s, m, s->ratelimited);
        }
        if (r < 0)
                return r;

        event_source_pp_prioq_reshuffle(s);
        return 0;
}
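
/* A sketch: pausing and resuming an IO source via the enablement state, e.g. for flow
 * control. SD_EVENT_OFF detaches the fd from the epoll set, SD_EVENT_ON re-registers it: */
#if 0
static void set_io_paused(sd_event_source *s, bool paused) {
        (void) sd_event_source_set_enabled(s, paused ? SD_EVENT_OFF : SD_EVENT_ON);
}
#endif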
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}
_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        s->time.next = usec;

        event_source_time_prioq_reshuffle(s);
        return 0;
}
_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
        usec_t t;
        int r;

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (usec == USEC_INFINITY)
                return sd_event_source_set_time(s, USEC_INFINITY);

        r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
        if (r < 0)
                return r;

        usec = usec_add(t, usec);
        if (usec == USEC_INFINITY)
                return -EOVERFLOW;

        return sd_event_source_set_time(s, usec);
}
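
/* A sketch of a self-rearming 1s periodic timer built from the relative setter above plus
 * the oneshot enablement mode (illustrative, compiled out): */
#if 0
static int on_tick(sd_event_source *s, uint64_t usec, void *userdata) {
        /* ... periodic work ... */

        /* Re-arm one second from now, then re-enable the (by now disabled) oneshot source. */
        (void) sd_event_source_set_time_relative(s, USEC_PER_SEC);
        return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
}

static int add_tick_timer(sd_event *e) {
        return sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC,
                                          USEC_PER_SEC, 0, on_tick, NULL);
}
#endif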
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}
_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(usec != UINT64_MAX, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        if (usec == 0)
                usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        event_source_time_prioq_reshuffle(s);
        return 0;
}
_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *clock = event_source_type_to_clock(s->type);
        return 0;
}
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}
_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->child.pidfd < 0)
                return -EOPNOTSUPP;

        return s->child.pidfd;
}
_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);
        assert_return(SIGNAL_VALID(sig), -EINVAL);

        /* If we already have seen indication the process exited refuse sending a signal early. This way we
         * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
         * available. */
        if (s->child.exited)
                return -ESRCH;

        if (s->child.pidfd >= 0) {
                siginfo_t copy;

                /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
                 * structure here. */
                if (si)
                        copy = *si;

                if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
                        /* Let's propagate the error only if the system call is not implemented or prohibited */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        return 0;
        }

        /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
         * this here. */
        if (flags != 0)
                return -EOPNOTSUPP;

        if (si) {
                /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
                siginfo_t copy = *si;

                if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
                        return -errno;
        } else if (kill(s->child.pid, sig) < 0)
                return -errno;

        return 0;
}
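
/* A sketch tying the child API together: watch a child for exit and ask it to terminate.
 * Note that sd_event_add_child() requires SIGCHLD to be blocked in the calling thread
 * (illustrative, compiled out): */
#if 0
static int on_child_exit(sd_event_source *s, const siginfo_t *si, void *userdata) {
        /* With WEXITED the zombie is reaped right after this callback returns. */
        return 0;
}

static int terminate_child(sd_event *e, pid_t pid) {
        sd_event_source *s;
        int r;

        r = sd_event_add_child(e, &s, pid, WEXITED, on_child_exit, NULL);
        if (r < 0)
                return r;

        /* Delivered via pidfd_send_signal() when a pidfd is available, hence PID-reuse safe. */
        return sd_event_source_send_child_signal(s, SIGTERM, NULL, 0);
}
#endif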
_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->child.pidfd < 0)
                return -EOPNOTSUPP;

        return s->child.pidfd_owned;
}
_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->child.pidfd < 0)
                return -EOPNOTSUPP;

        s->child.pidfd_owned = own;
        return 0;
}
_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->child.process_owned;
}
_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->child.process_owned = own;
        return 0;
}
_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(s->type == SOURCE_INOTIFY, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *ret = s->inotify.mask;
        return 0;
}
_public_ int sd_event_source_get_inotify_path(sd_event_source *s, const char **ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(s->type == SOURCE_INOTIFY, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (!s->inotify.inode_data)
                return -ESTALE; /* already disconnected. */

        if (!s->inotify.inode_data->path)
                return -ENOSYS; /* /proc was not mounted? */

        *ret = s->inotify.inode_data->path;
        return 0;
}
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
        if (r < 0)
                return r;

        s->prepare = callback;

        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        return 0;
}
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);
        assert_return(!event_origin_changed(s->event), NULL);

        return s->userdata;
}
_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
        void *ret;

        assert_return(s, NULL);
        assert_return(!event_origin_changed(s->event), NULL);

        ret = s->userdata;
        s->userdata = userdata;

        return ret;
}
static int event_source_enter_ratelimited(sd_event_source *s) {
        int r;

        assert(s);

        /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
         * the end of the rate limit time window, much as if it was a timer event source. */

        if (s->ratelimited)
                return 0; /* Already ratelimited, this is a NOP hence */

        /* Make sure we can install a CLOCK_MONOTONIC event further down. */
        r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
        if (r < 0)
                return r;

        /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
         * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
         * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));

        /* Now, let's add the event source to the monotonic clock instead */
        r = event_source_time_prioq_put(s, &s->event->monotonic);
        if (r < 0)
                goto fail;

        /* And let's take the event source officially offline */
        r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
        if (r < 0) {
                event_source_time_prioq_remove(s, &s->event->monotonic);
                goto fail;
        }

        event_source_pp_prioq_reshuffle(s);

        log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
        return 0;

fail:
        /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
         * space for it should already be allocated. */
        if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);

        return r;
}
static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
        int r;

        assert(s);

        if (!s->ratelimited)
                return 0;

        /* Let's take the event source out of the monotonic prioq first. */
        event_source_time_prioq_remove(s, &s->event->monotonic);

        /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
        if (EVENT_SOURCE_IS_TIME(s->type)) {
                r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
                if (r < 0)
                        goto fail;
        }

        /* Let's try to take it online again. */
        r = event_source_online(s, s->enabled, /* ratelimited= */ false);
        if (r < 0) {
                /* Do something roughly sensible when this failed: undo the two prioq ops above */
                if (EVENT_SOURCE_IS_TIME(s->type))
                        event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));

                goto fail;
        }

        event_source_pp_prioq_reshuffle(s);
        ratelimit_reset(&s->rate_limit);

        log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));

        if (run_callback && s->ratelimit_expire_callback) {
                s->dispatching = true;
                r = s->ratelimit_expire_callback(s, s->userdata);
                s->dispatching = false;

                if (r < 0) {
                        log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
                                        strna(s->description),
                                        event_source_type_to_string(s->type),
                                        s->exit_on_failure ? "exiting" : "disabling");

                        if (s->exit_on_failure)
                                (void) sd_event_exit(s->event, r);

                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
                }

                return 1;
        }

        return 0;

fail:
        /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
         * simply put it back in it, maybe we can then process it more successfully next iteration. */
        assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);

        return r;
}
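
/* A sketch of the caller-side configuration that drives the ratelimit machinery above.
 * The expiry-callback setter is only available in newer libsystemd (v254+), hence this is
 * hedged and kept compiled out: */
#if 0
static int on_ratelimit_expired(sd_event_source *s, void *userdata) {
        /* Invoked (from process_timer()) once the rate limit window ends and the source
         * has been taken online again. */
        return 0;
}

static int limit_source(sd_event_source *s) {
        int r;

        /* Allow at most 10 dispatches per 5 second interval; excess dispatches take the
         * source offline until the window ends. */
        r = sd_event_source_set_ratelimit(s, 5 * USEC_PER_SEC, 10);
        if (r < 0)
                return r;

        return sd_event_source_set_ratelimit_expire_callback(s, on_ratelimit_expired);
}
#endif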
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;

        assert(e);
        assert(a <= b);

        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        return b;
}
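
/* A worked example of the coalescing above, assuming e->perturb == 17 s: with a window of
 * a = 12:00:05 and b = 12:01:30, the minute step yields c = 12:01:00 + 17 s = 12:01:17,
 * which lies within [a, b], so every loop on this boot wakes at second 17 of the minute.
 * With a narrower window, say b = 12:00:10, the minute step falls below a and the 10 s
 * step is tried instead: c = 12:00:10 + (17 mod 10) s = 12:00:17 >= b, minus 10 s gives
 * 12:00:07, which is within [a, b] and is returned. */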
static int event_arm_timer(
                sd_event *e,
                struct clock_data *d) {

        struct itimerspec its = {};
        sd_event_source *a, *b;
        usec_t t;

        assert(e);
        assert(d);

        if (!d->needs_rearm)
                return 0;

        d->needs_rearm = false;

        a = prioq_peek(d->earliest);
        assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
        if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {

                if (d->fd < 0)
                        return 0;

                if (d->next == USEC_INFINITY)
                        return 0;

                /* disarm */
                if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
                        return -errno;

                d->next = USEC_INFINITY;
                return 0;
        }

        b = prioq_peek(d->latest);
        assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
        assert(b && b->enabled != SD_EVENT_OFF);

        t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
        if (d->next == t)
                return 0;

        assert_se(d->fd >= 0);

        if (t == 0) {
                /* We don't want to disarm here, just mean some time looooong ago. */
                its.it_value.tv_sec = 0;
                its.it_value.tv_nsec = 1;
        } else
                timespec_store(&its.it_value, t);

        if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
                return -errno;

        d->next = t;
        return 0;
}
static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
        assert(e);
        assert(s);
        assert(s->type == SOURCE_IO);

        /* If the event source was already pending, we just OR in the
         * new revents, otherwise we reset the value. The ORing is
         * necessary to handle EPOLLONESHOT events properly where
         * readability might happen independently of writability, and
         * we need to keep track of both */

        if (s->pending)
                s->io.revents |= revents;
        else
                s->io.revents = revents;

        return source_set_pending(s, true);
}
static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
        uint64_t x;
        ssize_t ss;

        assert(e);
        assert(fd >= 0);

        assert_return(events == EPOLLIN, -EIO);

        ss = read(fd, &x, sizeof(x));
        if (ss < 0) {
                if (ERRNO_IS_TRANSIENT(errno))
                        return 0;

                return -errno;
        }

        if (_unlikely_(ss != sizeof(x)))
                return -EIO;

        if (next)
                *next = USEC_INFINITY;

        return 0;
}
static int process_timer(
                sd_event *e,
                usec_t n,
                struct clock_data *d) {

        sd_event_source *s;
        bool callback_invoked = false;
        int r;

        assert(e);
        assert(d);

        for (;;) {
                s = prioq_peek(d->earliest);
                assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

                if (!s || time_event_source_next(s) > n)
                        break;

                if (s->ratelimited) {
                        /* This is an event source whose ratelimit window has ended. Let's turn it on
                         * again. */
                        assert(s->ratelimited);

                        r = event_source_leave_ratelimit(s, /* run_callback */ true);
                        if (r < 0)
                                return r;
                        else if (r == 1)
                                callback_invoked = true;

                        continue;
                }

                if (s->enabled == SD_EVENT_OFF || s->pending)
                        break;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                event_source_time_prioq_reshuffle(s);
        }

        return callback_invoked;
}
static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
        int64_t min_priority = threshold;
        bool something_new = false;
        sd_event_source *s;
        int r;

        assert(e);
        assert(ret_min_priority);

        if (!e->need_process_child) {
                *ret_min_priority = min_priority;
                return 0;
        }

        e->need_process_child = false;

        /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
         * for, instead of using P_ALL. This is because we only want to get child information of very
         * specific child processes, and not all of them. We might not have processed the SIGCHLD event
         * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
         * hence we really don't want anything flushed out of the kernel's queue that we don't care
         * about. Since this is O(n) this means that if you have a lot of processes you probably want
         * to handle SIGCHLD yourself.
         *
         * We do not reap the children here (by using WNOWAIT), this is only done after the event
         * source is dispatched so that the callback still sees the process as a zombie. */

        HASHMAP_FOREACH(s, e->child_sources) {
                assert(s->type == SOURCE_CHILD);

                if (s->priority > threshold)
                        continue;

                if (s->pending)
                        continue;

                if (event_source_is_offline(s))
                        continue;

                if (s->child.exited)
                        continue;

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        /* There's a usable pidfd known for this event source? Then don't waitid() for
                         * it here */
                        continue;

                zero(s->child.siginfo);
                if (waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
                        return negative_errno();

                if (s->child.siginfo.si_pid != 0) {
                        bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                        if (zombie)
                                s->child.exited = true;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's immediately remove the state
                                 * change from the queue, since there's no benefit in leaving it
                                 * queued. */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                        if (r > 0) {
                                something_new = true;
                                min_priority = MIN(min_priority, s->priority);
                        }
                }
        }

        *ret_min_priority = min_priority;
        return something_new;
}
static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
        assert(e);
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (s->pending)
                return 0;

        if (event_source_is_offline(s))
                return 0;

        if (!EVENT_SOURCE_WATCH_PIDFD(s))
                return 0;

        zero(s->child.siginfo);
        if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
                return -errno;

        if (s->child.siginfo.si_pid == 0)
                return 0;

        if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
                s->child.exited = true;

        return source_set_pending(s, true);
}
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
        int r;

        assert(e);
        assert(d);
        assert_return(events == EPOLLIN, -EIO);
        assert(min_priority);

        /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
         * sure to recheck the children we watch. This is because we only ever dequeue the first signal
         * per priority, and if we dequeue one, and SIGCHLD might be enqueued later we wouldn't know,
         * but we might have higher priority children we care about hence we need to check that
         * explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        if (ERRNO_IS_TRANSIENT(errno))
                                return 0;

                        return -errno;
                }

                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;
                if (r > 0 && *min_priority >= s->priority) {
                        *min_priority = s->priority;
                        return 1; /* an event source with smaller priority is queued. */
                }

                return 0;
        }
}
static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
        ssize_t n;

        assert(e);
        assert(d);

        assert_return(revents == EPOLLIN, -EIO);

        /* If there's already an event source pending for this priority, don't read another */
        if (d->n_pending > 0)
                return 0;

        /* Is the read buffer non-empty? If so, let's not read more */
        if (d->buffer_filled > 0)
                return 0;

        if (d->priority > threshold)
                return 0;

        n = read(d->fd, &d->buffer, sizeof(d->buffer));
        if (n < 0) {
                if (ERRNO_IS_TRANSIENT(errno))
                        return 0;

                return -errno;
        }

        assert(n > 0);
        d->buffer_filled = (size_t) n;
        LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);

        return 1;
}
static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
        assert(e);
        assert(d);
        assert(sz <= d->buffer_filled);

        if (sz == 0)
                return;

        /* Move the rest of the buffer to the front, in order to get things properly aligned again */
        memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
        d->buffer_filled -= sz;

        if (d->buffer_filled == 0)
                LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
}
static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
        int r;

        assert(e);
        assert(d);

        /* If there's already an event source pending for this priority, don't read another */
        if (d->n_pending > 0)
                return 0;

        while (d->buffer_filled > 0) {
                size_t sz;

                /* Let's validate that the event structures are complete */
                if (d->buffer_filled < offsetof(struct inotify_event, name))
                        return -EIO;

                sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
                if (d->buffer_filled < sz)
                        return -EIO;

                if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
                        struct inode_data *inode_data;

                        /* The queue overran, let's pass this event to all event sources connected to this inotify
                         * object */

                        HASHMAP_FOREACH(inode_data, d->inodes)
                                LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {

                                        if (event_source_is_offline(s))
                                                continue;

                                        r = source_set_pending(s, true);
                                        if (r < 0)
                                                return r;
                                }
                } else {
                        struct inode_data *inode_data;

                        /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
                         * our watch descriptor table. */
                        if (d->buffer.ev.mask & IN_IGNORED) {

                                inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
                                if (!inode_data) {
                                        event_inotify_data_drop(e, d, sz);
                                        continue;
                                }

                                /* The watch descriptor was removed by the kernel, let's drop it here too */
                                inode_data->wd = -1;
                        } else {
                                inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
                                if (!inode_data) {
                                        event_inotify_data_drop(e, d, sz);
                                        continue;
                                }
                        }

                        /* Trigger all event sources that are interested in these events. Also trigger all event
                         * sources if IN_IGNORED or IN_UNMOUNT is set. */
                        LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {

                                if (event_source_is_offline(s))
                                        continue;

                                if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
                                    (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
                                        continue;

                                r = source_set_pending(s, true);
                                if (r < 0)
                                        return r;
                        }
                }

                /* Something pending now? If so, let's finish, otherwise let's read more. */
                if (d->n_pending > 0)
                        return 1;

                event_inotify_data_drop(e, d, sz);
        }

        return 0;
}
static int process_inotify(sd_event *e) {
        int r, done = 0;

        assert(e);

        LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
                r = event_inotify_data_process(e, d);
                if (r < 0)
                        return r;
                if (r > 0)
                        done++;
        }

        return done;
}
static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (s->pending)
                s->memory_pressure.revents |= revents;
        else
                s->memory_pressure.revents = revents;

        return source_set_pending(s, true);
}
static int source_memory_pressure_write(sd_event_source *s) {
        ssize_t n;
        int r;

        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        /* once we start writing, the buffer is locked, we allow no further changes. */
        s->memory_pressure.locked = true;

        if (s->memory_pressure.write_buffer_size > 0) {
                n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
                if (n < 0) {
                        if (!ERRNO_IS_TRANSIENT(errno)) {
                                /* If the kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
                                 * files, but then generates EOPNOSUPP on read() and write() (instead of on
                                 * open()!). This sucks hard, since we can only detect this kind of failure
                                 * so late. Let's make the best of it, and turn off the event source like we
                                 * do for failed event source handlers. */

                                log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
                                assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
                                return 0;
                        }

                        n = 0;
                }
        } else
                n = 0;

        assert(n >= 0);

        if ((size_t) n == s->memory_pressure.write_buffer_size) {
                s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);

                if (n > 0) {
                        s->memory_pressure.write_buffer_size = 0;

                        /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
                        r = source_memory_pressure_register(s, s->enabled);
                        if (r < 0)
                                return r;
                }
        } else if (n > 0) {
                _cleanup_free_ void *c = NULL;

                assert((size_t) n < s->memory_pressure.write_buffer_size);

                c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
                if (!c)
                        return -ENOMEM;

                free_and_replace(s->memory_pressure.write_buffer, c);
                s->memory_pressure.write_buffer_size -= n;
                return 1;
        }

        return 0;
}
static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
        int r;

        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        r = source_memory_pressure_write(s);
        if (r < 0)
                return r;
        if (r > 0)
                return 1; /* if we wrote something, then don't continue with dispatching the user dispatch
                           * function. Instead, shortcut it so that we wait for next EPOLLOUT immediately. */

        /* No pending incoming IO? Then let's not continue further */
        if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {

                /* Treat IO errors on the notifier the same way as errors returned from a callback */
                if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
                        return -EIO;

                return 1; /* leave dispatch, we already processed everything */
        }

        if (s->memory_pressure.revents & EPOLLIN) {
                uint8_t pipe_buf[PIPE_BUF];
                ssize_t n;

                /* If the fd is readable, then flush out anything that might be queued */

                n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
                if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
                        return -errno;
        }

        return 0; /* go on, dispatch to user callback */
}
static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        sd_event *saved_event;
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type, here, so that we still know it after the event callback which might
         * invalidate the event. */
        saved_type = s->type;

        /* Similarly, store a reference to the event loop object, so that we can still access it after the
         * callback might have invalidated/disconnected the event source. */
        saved_event = s->event;
        PROTECT_EVENT(saved_event);

        /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
        assert(!s->ratelimited);
        if (!ratelimit_below(&s->rate_limit)) {
                r = event_source_enter_ratelimited(s);
                if (r < 0)
                        return r;

                return 1;
        }

        if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;

                /* If we execute a non-post source, let's mark all post sources as pending. */

                SET_FOREACH(z, s->event->post_sources) {
                        if (event_source_is_offline(z))
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        if (s->type == SOURCE_MEMORY_PRESSURE) {
                r = source_memory_pressure_initiate_dispatch(s);
                if (r == -EIO) /* handle EIO errors similar to callback errors */
                        goto finish;
                if (r < 0)
                        return r;
                if (r > 0) /* already handled */
                        return 1;
        }

        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie) {
                        (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
                        s->child.waited = true;
                }

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_INOTIFY: {
                struct sd_event *e = s->event;
                struct inotify_data *d;
                size_t sz;

                assert(s->inotify.inode_data);
                assert_se(d = s->inotify.inode_data->inotify_data);

                assert(d->buffer_filled >= offsetof(struct inotify_event, name));
                sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
                assert(d->buffer_filled >= sz);

                /* If the inotify callback destroys the event source then this likely means we don't need to
                 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
                 * free it immediately, then we couldn't drop the event from the inotify event queue without
                 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
                 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
                 * explicitly GC it after we are done dropping the inotify event from the buffer. */
                d->n_busy++;
                r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
                d->n_busy--;

                /* When no event is pending anymore on this inotify object, then let's drop the event from
                 * the inotify event queue buffer. */
                if (d->n_pending == 0)
                        event_inotify_data_drop(e, d, sz);

                /* Now we don't want to access 'd' anymore, it's OK to GC now. */
                event_gc_inotify_data(e, d);
                break;
        }

        case SOURCE_MEMORY_PRESSURE:
                r = s->memory_pressure.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached();
        }

        s->dispatching = false;

finish:
        if (r < 0) {
                log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
                                strna(s->description),
                                event_source_type_to_string(saved_type),
                                s->exit_on_failure ? "exiting" : "disabling");

                if (s->exit_on_failure)
                        (void) sd_event_exit(saved_event, r);
        }

        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);

        return 1;
}
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
                        break;

                s->prepare_iteration = e->iteration;
                prioq_reshuffle(e->prepare, s, &s->prepare_index);

                assert(s->prepare);
                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0) {
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
                                        strna(s->description),
                                        event_source_type_to_string(s->type),
                                        s->exit_on_failure ? "exiting" : "disabling");

                        if (s->exit_on_failure)
                                (void) sd_event_exit(e, r);
                }

                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
        }

        return 0;
}
static int dispatch_exit(sd_event *e) {
        sd_event_source *p;
        int r;

        assert(e);

        p = prioq_peek(e->exit);
        assert(!p || p->type == SOURCE_EXIT);

        if (!p || event_source_is_offline(p)) {
                e->state = SD_EVENT_FINISHED;
                return 0;
        }

        PROTECT_EVENT(e);
        e->iteration++;
        e->state = SD_EVENT_EXITING;
        r = source_dispatch(p);
        e->state = SD_EVENT_INITIAL;
        return r;
}
static sd_event_source* event_next_pending(sd_event *e) {
        sd_event_source *p;

        assert(e);

        p = prioq_peek(e->pending);
        if (!p)
                return NULL;

        if (event_source_is_offline(p))
                return NULL;

        return p;
}
static int arm_watchdog(sd_event *e) {
        struct itimerspec its = {};
        usec_t t;

        assert(e);
        assert(e->watchdog_fd >= 0);

        t = sleep_between(e,
                          usec_add(e->watchdog_last, (e->watchdog_period / 2)),
                          usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));

        timespec_store(&its.it_value, t);

        /* Make sure we never set the watchdog to 0, which tells the
         * kernel to disable it. */
        if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
                its.it_value.tv_nsec = 1;

        return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
}
static int process_watchdog(sd_event *e) {
        assert(e);

        if (!e->watchdog)
                return 0;

        /* Don't notify watchdog too often */
        if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
                return 0;

        sd_notify(false, "WATCHDOG=1");
        e->watchdog_last = e->timestamp.monotonic;

        return arm_watchdog(e);
}
static void event_close_inode_data_fds(sd_event *e) {
        struct inode_data *d;

        assert(e);

        /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
         * filesystems. But we can't close them right-away as we need them as long as the user still wants to make
         * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
         * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
         * compromise. */

        while ((d = e->inode_data_to_close_list)) {
                assert(d->fd >= 0);
                d->fd = safe_close(d->fd);

                LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
        }
}
static int event_memory_pressure_write_list(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
                if (!s)
                        break;

                assert(s->type == SOURCE_MEMORY_PRESSURE);
                assert(s->memory_pressure.write_buffer_size > 0);
                s->memory_pressure.in_write_list = false;

                r = source_memory_pressure_write(s);
                if (r < 0)
                        return r;
        }

        return 0;
}
_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
         * this check here once, since gettid() is typically not cached, and thus want to minimize
         * syscalls */
        assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        r = event_memory_pressure_write_list(e);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        event_close_inode_data_fds(e);

        if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
                goto pending;

        e->state = SD_EVENT_ARMED;
        return 0;

pending:
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}
static int epoll_wait_usec(
                int fd,
                struct epoll_event *events,
                int maxevents,
                usec_t timeout) {

        int msec;

        /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */

#if HAVE_EPOLL_PWAIT2
        static bool epoll_pwait2_absent = false;
        int r;

        /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
         * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
         * is not that obvious to implement given the libc and kernel definitions differ in the last
         * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
         * biggie), let's hence rely on glibc's definitions, and fallback to epoll_pwait() when that's
         * missing. */

        if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
                r = epoll_pwait2(fd,
                                 events,
                                 maxevents,
                                 TIMESPEC_STORE(timeout),
                                 NULL);
                if (r >= 0)
                        return r;
                if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                        return -errno; /* Only fallback to old epoll_wait() if the syscall is masked or not
                                        * supported. */

                epoll_pwait2_absent = true;
        }
#endif

        if (timeout == USEC_INFINITY)
                msec = -1;
        else {
                usec_t k;

                k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
                if (k >= INT_MAX)
                        msec = INT_MAX; /* Saturate */
                else
                        msec = (int) k;
        }

        return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
}
static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
        size_t n_event_queue, m, n_event_max;
        int64_t min_priority = threshold;
        bool something_new = false;
        int r;

        assert(e);
        assert(ret_min_priority);

        n_event_queue = MAX(e->n_sources, 1u);
        if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
                return -ENOMEM;

        n_event_max = MALLOC_ELEMENTSOF(e->event_queue);

        /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
        if (e->buffered_inotify_data_list)
                timeout = 0;

        for (;;) {
                r = epoll_wait_usec(
                                e->epoll_fd,
                                e->event_queue,
                                n_event_max,
                                timeout);
                if (r < 0)
                        return r;

                m = (size_t) r;

                if (m < n_event_max)
                        break;

                if (n_event_max >= n_event_queue * 10)
                        break;

                if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
                        return -ENOMEM;

                n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
                timeout = 0;
        }

        /* Set timestamp only when this is called first time. */
        if (threshold == INT64_MAX)
                triple_timestamp_now(&e->timestamp);

        for (size_t i = 0; i < m; i++) {

                if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
                else {
                        WakeupType *t = e->event_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE: {
                                sd_event_source *s = e->event_queue[i].data.ptr;

                                assert(s);

                                if (s->priority > threshold)
                                        continue;

                                min_priority = MIN(min_priority, s->priority);

                                switch (s->type) {

                                case SOURCE_IO:
                                        r = process_io(e, s, e->event_queue[i].events);
                                        break;

                                case SOURCE_CHILD:
                                        r = process_pidfd(e, s, e->event_queue[i].events);
                                        break;

                                case SOURCE_MEMORY_PRESSURE:
                                        r = process_memory_pressure(s, e->event_queue[i].events);
                                        break;

                                default:
                                        assert_not_reached();
                                }

                                break;
                        }

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = e->event_queue[i].data.ptr;

                                assert(d);

                                r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
                                break;

                        case WAKEUP_INOTIFY_DATA:
                                r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
                                break;

                        default:
                                assert_not_reached();
                        }
                }
                if (r < 0)
                        return r;
                if (r > 0)
                        something_new = true;
        }

        *ret_min_priority = min_priority;
        return something_new;
}
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        for (int64_t threshold = INT64_MAX; ; threshold--) {
                int64_t epoll_min_priority, child_min_priority;

                /* There may be a possibility that new epoll (especially IO) and child events are
                 * triggered just after process_epoll() call but before process_child(), and the new IO
                 * events may have higher priority than the child events. To salvage these events,
                 * let's call epoll_wait() again, but accept only events with higher priority than the
                 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
                 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
                 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */

                r = process_epoll(e, timeout, threshold, &epoll_min_priority);
                if (r == -EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }
                if (r < 0)
                        goto finish;
                if (r == 0 && threshold < INT64_MAX)
                        /* No new epoll event. */
                        break;

                r = process_child(e, threshold, &child_min_priority);
                if (r < 0)
                        goto finish;
                if (r == 0)
                        /* No new child event. */
                        break;

                threshold = MIN(epoll_min_priority, child_min_priority);
                if (threshold == INT64_MIN)
                        break;

                timeout = 0;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        r = process_inotify(e);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;
        else if (r == 1) {
                /* Ratelimit expiry callback was called. Let's postpone processing pending sources and
                 * put the loop in the initial state in order to evaluate (in the next iteration) also sources
                 * that were potentially re-enabled by the callback.
                 *
                 * Wondering why we treat only this invocation of process_timer() differently? Once an event
                 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
                 * ratelimit expiry callback is never called for any other timer type. */
                r = 0;
                goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
_public_ int sd_event_dispatch(sd_event *e) {
        sd_event_source *p;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_PENDING, -EBUSY);

        if (e->exit_requested)
                return dispatch_exit(e);

        p = event_next_pending(e);
        if (p) {
                PROTECT_EVENT(e);

                e->state = SD_EVENT_RUNNING;
                r = source_dispatch(p);
                e->state = SD_EVENT_INITIAL;
                return r;
        }

        e->state = SD_EVENT_INITIAL;

        return 1;
}
static void event_log_delays(sd_event *e) {
        char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
        size_t l, i;

        p = b;
        l = sizeof(b);
        for (i = 0; i < ELEMENTSOF(e->delays); i++) {
                l = strpcpyf(&p, l, "%u ", e->delays[i]);
                e->delays[i] = 0;
        }
        log_debug("Event loop iterations: %s", b);
}
_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        if (e->profile_delays && e->last_run_usec != 0) {
                usec_t this_run;
                unsigned l;

                this_run = now(CLOCK_MONOTONIC);

                l = log2u64(this_run - e->last_run_usec);
                assert(l < ELEMENTSOF(e->delays));
                e->delays[l]++;

                if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
                        event_log_delays(e);
                        e->last_log_usec = this_run;
                }
        }

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        r = sd_event_prepare(e);
        if (r == 0)
                /* There was nothing? Then wait... */
                r = sd_event_wait(e, timeout);

        if (e->profile_delays)
                e->last_run_usec = now(CLOCK_MONOTONIC);

        if (r > 0) {
                /* There's something now, then let's dispatch it */
                r = sd_event_dispatch(e);
                if (r < 0)
                        return r;

                return 1;
        }

        return r;
}
_public_ int sd_event_loop(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        PROTECT_EVENT(e);

        while (e->state != SD_EVENT_FINISHED) {
                r = sd_event_run(e, UINT64_MAX);
                if (r < 0)
                        return r;
        }

        return e->exit_code;
}
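
/* A minimal complete consumer of the loop above (illustrative, compiled out). The
 * SD_EVENT_SIGNAL_PROCMASK flag needs a reasonably recent libsystemd (v254+): */
#if 0
#include <systemd/sd-event.h>

static int on_sigint(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

int main(void) {
        sd_event *e = NULL;
        int r;

        r = sd_event_default(&e);
        if (r < 0)
                return 1;

        /* OR'ing in SD_EVENT_SIGNAL_PROCMASK makes the library block SIGINT for us. */
        r = sd_event_add_signal(e, NULL, SIGINT | SD_EVENT_SIGNAL_PROCMASK, on_sigint, NULL);
        if (r < 0)
                return 1;

        r = sd_event_loop(e); /* runs until sd_event_exit(), returns the exit code */
        sd_event_unref(e);
        return r < 0;
}
#endif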
_public_ int sd_event_get_fd(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        return e->epoll_fd;
}
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        return e->state;
}
_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!e->exit_requested)
                return -ENODATA;

        *code = e->exit_code;
        return 0;
}
_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;

        return 0;
}

_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(usec, -EINVAL);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
                return -EOPNOTSUPP;

        if (!triple_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
                *usec = now(clock);
                return 1;
        }

        *usec = triple_timestamp_by_clock(&e->timestamp, clock);
        return 0;
}
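
/* Illustrative sketch (not part of the upstream source): the cached timestamp is the natural base
 * for relative deadlines, since it stays stable for the whole iteration and avoids an extra
 * clock_gettime() per source. E.g. arming a timer 10s from "now" as the loop saw it, with a
 * hypothetical handler on_timer():
 *
 *     uint64_t t;
 *     assert_se(sd_event_now(e, CLOCK_MONOTONIC, &t) >= 0);
 *     assert_se(sd_event_add_time(e, NULL, CLOCK_MONOTONIC,
 *                                 t + 10 * USEC_PER_SEC, 0, on_timer, NULL) >= 0);
 */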

_public_ int sd_event_default(sd_event **ret) {
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}

_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(tid, -EINVAL);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (e->tid != 0) {
                *tid = e->tid;
                return 0;
        }

        return -ENXIO;
}

_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                struct epoll_event ev = {
                        .events = EPOLLIN,
                        .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
                };

                if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
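
/* Illustrative sketch (not part of the upstream source): under a service unit with WatchdogSec= set,
 * enabling this ties the sd_notify() keep-alive pings to loop liveness, so a wedged loop eventually
 * gets the service killed by the service manager:
 *
 *     assert_se(sd_event_set_watchdog(e, true) >= 0);   // no-op if $WATCHDOG_USEC is unset
 *     assert_se(sd_event_loop(e) >= 0);
 */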

_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        return e->watchdog;
}

_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        *ret = e->iteration;
        return 0;
}

_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
        assert_return(s, -EINVAL);
        assert_return(s->event, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->destroy_callback = callback;
        return 0;
}
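
/* Illustrative sketch (not part of the upstream source): the destroy callback is the usual hook for
 * releasing userdata together with the source, e.g. for a hypothetical heap-allocated Context with
 * destructor context_free():
 *
 *     static void context_destroy(void *userdata) {
 *             context_free(userdata);
 *     }
 *     ...
 *     assert_se(sd_event_source_set_destroy_callback(s, context_destroy) >= 0);
 */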

_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (ret)
                *ret = s->destroy_callback;

        return !!s->destroy_callback;
}

_public_ int sd_event_source_get_floating(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->floating;
}

_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->floating == !!b)
                return 0;

        if (!s->event) /* Already disconnected */
                return -ESTALE;

        s->floating = b;

        if (b) {
                sd_event_source_ref(s);
                sd_event_unref(s->event);
        } else {
                sd_event_ref(s->event);
                sd_event_source_unref(s);
        }

        return 1;
}

_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->exit_on_failure;
}

_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->exit_on_failure == !!b)
                return 0;

        s->exit_on_failure = b;
        return 1;
}

_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* Turning on ratelimiting for event source types that don't support it is a loggable offense:
         * doing so is a programming error. */
        assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);

        /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
         * non-ratelimited. */
        r = event_source_leave_ratelimit(s, /* run_callback */ false);
        if (r < 0)
                return r;

        s->rate_limit = (RateLimit) { interval, burst };
        return 0;
}
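
/* Illustrative sketch (not part of the upstream source): rate limits are handy for IO sources that
 * may flood, e.g. allowing at most 100 dispatches per 10s interval; once the burst is exhausted the
 * source is taken offline until the interval passes:
 *
 *     assert_se(sd_event_source_set_ratelimit(s, 10 * USEC_PER_SEC, 100) >= 0);
 */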

_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->ratelimit_expire_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
         * don't use assert_return(). Unlike turning on ratelimiting, it's not really a programming error. */
        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return -EDOM;

        if (!ratelimit_configured(&s->rate_limit))
                return -ENOEXEC;

        if (ret_interval)
                *ret_interval = s->rate_limit.interval;
        if (ret_burst)
                *ret_burst = s->rate_limit.burst;

        return 0;
}

_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return false;

        if (!ratelimit_configured(&s->rate_limit))
                return false;

        return s->ratelimited;
}

_public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
        int r;

        assert_return(s, -EINVAL);

        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return 0;

        if (!ratelimit_configured(&s->rate_limit))
                return 0;

        if (!s->ratelimited)
                return 0;

        r = event_source_leave_ratelimit(s, /* run_callback */ false);
        if (r < 0)
                return r;

        return 1; /* tell caller that we indeed just left the ratelimit state */
}

_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
        bool change = false;
        int r;

        assert_return(e, -EINVAL);

        if (b) {
                /* We want to maintain pointers to these event sources, so that we can destroy them when
                 * told to. But we also don't want them to pin the event loop itself. Hence we mark them
                 * as floating after creation (and undo this before deleting them again). */

                if (!e->sigint_event_source) {
                        r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0)
                                return r;

                        assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
                        change = true;
                }

                if (!e->sigterm_event_source) {
                        r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0) {
                                if (change) {
                                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                                }

                                return r;
                        }

                        assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
                        change = true;
                }

        } else {
                if (e->sigint_event_source) {
                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                        change = true;
                }

                if (e->sigterm_event_source) {
                        assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
                        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
                        change = true;
                }
        }

        return change;
}
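
/* Illustrative sketch (not part of the upstream source): for daemons that just want the conventional
 * "exit cleanly on SIGINT/SIGTERM" behaviour, this single call replaces manual signal source setup:
 *
 *     assert_se(sd_event_set_signal_exit(e, true) > 0);
 *     assert_se(sd_event_loop(e) >= 0);   // returns once SIGINT or SIGTERM arrives
 */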

_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
        _cleanup_free_ char *b = NULL;
        _cleanup_free_ void *w = NULL;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
        assert_return(ty, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (!STR_IN_SET(ty, "some", "full"))
                return -EINVAL;

        if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
                return -EBUSY;

        char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
        if (!space)
                return -EINVAL;

        size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
        b = memdup_suffix0(s->memory_pressure.write_buffer, l);
        if (!b)
                return -ENOMEM;
        if (!STR_IN_SET(b, "some", "full"))
                return -EINVAL;

        if (streq(b, ty))
                return 0;

        size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
        w = new(char, nl);
        if (!w)
                return -ENOMEM;

        memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));

        free_and_replace(s->memory_pressure.write_buffer, w);
        s->memory_pressure.write_buffer_size = nl;
        s->memory_pressure.locked = false;

        return 1;
}
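
/* Illustrative sketch (not part of the upstream source): adjusting a pressure source before its
 * first dispatch, switching to "full" stalls over a 2s window with a 200ms threshold (via
 * sd_event_source_set_memory_pressure_period() just below), with a hypothetical handler
 * on_pressure():
 *
 *     sd_event_source *mp = NULL;
 *     assert_se(sd_event_add_memory_pressure(e, &mp, on_pressure, NULL) >= 0);
 *     assert_se(sd_event_source_set_memory_pressure_type(mp, "full") >= 0);
 *     assert_se(sd_event_source_set_memory_pressure_period(mp, 200 * USEC_PER_MSEC,
 *                                                          2 * USEC_PER_SEC) >= 0);
 */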

_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
        _cleanup_free_ char *b = NULL;
        _cleanup_free_ void *w = NULL;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
                return -ERANGE;
        if (window_usec <= 0 || window_usec >= UINT64_MAX)
                return -ERANGE;
        if (threshold_usec > window_usec)
                return -EINVAL;

        if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
                return -EBUSY;

        char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
        if (!space)
                return -EINVAL;

        size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
        b = memdup_suffix0(s->memory_pressure.write_buffer, l);
        if (!b)
                return -ENOMEM;
        if (!STR_IN_SET(b, "some", "full"))
                return -EINVAL;

        if (asprintf((char**) &w,
                     "%s " USEC_FMT " " USEC_FMT "",
                     b,
                     threshold_usec,
                     window_usec) < 0)
                return -ENOMEM;

        l = strlen(w) + 1;
        if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
                return 0;

        free_and_replace(s->memory_pressure.write_buffer, w);
        s->memory_pressure.write_buffer_size = l;
        s->memory_pressure.locked = false;