/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <fcntl.h>
#include <malloc.h>
#include <pthread.h>
#include <sys/epoll.h>
#include <sys/inotify.h>
#include <sys/timerfd.h>
#include <sys/wait.h>

#include "sd-event.h"
#include "sd-id128.h"
#include "sd-messages.h"

#include "alloc-util.h"
#include "cgroup-util.h"
#include "env-util.h"
#include "errno-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "format-util.h"
#include "fs-util.h"
#include "glyph-util.h"
#include "hashmap.h"
#include "hexdecoct.h"
#include "inotify-util.h"
#include "list.h"
#include "log.h"
#include "logarithm.h"
#include "macro.h"
#include "mallinfo-util.h"
#include "memory-util.h"
#include "missing_magic.h"
#include "missing_syscall.h"
#include "missing_threads.h"
#include "origin-id.h"
#include "path-util.h"
#include "prioq.h"
#include "process-util.h"
#include "psi-util.h"
#include "set.h"
#include "signal-util.h"
#include "socket-util.h"
#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "time-util.h"

#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)

static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
        /* Returns true if this is a pid event source and can be implemented by watching EPOLLIN */
        return s &&
                s->type == SOURCE_CHILD &&
                s->child.pidfd >= 0 &&
                s->child.options == WEXITED;
}

static bool event_source_is_online(sd_event_source *s) {
        assert(s);
        return s->enabled != SD_EVENT_OFF && !s->ratelimited;
}

static bool event_source_is_offline(sd_event_source *s) {
        assert(s);
        return s->enabled == SD_EVENT_OFF || s->ratelimited;
}

static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
        [SOURCE_IO]                  = "io",
        [SOURCE_TIME_REALTIME]       = "realtime",
        [SOURCE_TIME_BOOTTIME]       = "boottime",
        [SOURCE_TIME_MONOTONIC]      = "monotonic",
        [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
        [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
        [SOURCE_SIGNAL]              = "signal",
        [SOURCE_CHILD]               = "child",
        [SOURCE_DEFER]               = "defer",
        [SOURCE_POST]                = "post",
        [SOURCE_EXIT]                = "exit",
        [SOURCE_WATCHDOG]            = "watchdog",
        [SOURCE_INOTIFY]             = "inotify",
        [SOURCE_MEMORY_PRESSURE]     = "memory-pressure",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);

#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)

#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_IO,                       \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_SIGNAL,                   \
               SOURCE_DEFER,                    \
               SOURCE_INOTIFY,                  \
               SOURCE_MEMORY_PRESSURE)

/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)

struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;

        Prioq *pending;
        Prioq *prepare;

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources;

        Set *post_sources;

        Prioq *exit;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close_list);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, buffered_inotify_data_list);

        /* A list of memory pressure event sources that still need their subscription string written */
        LIST_HEAD(sd_event_source, memory_pressure_write_list);

        uint64_t origin_id;

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        sd_event_source *sigint_event_source, *sigterm_event_source;

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8];
};

DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);

static thread_local sd_event *default_event = NULL;

static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);

static sd_event *event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}

static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Lower priority values first */
        r = CMP(x->priority, y->priority);
        if (r != 0)
                return r;

        /* Older entries first */
        return CMP(x->pending_iteration, y->pending_iteration);
}

static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Move most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        r = CMP(x->prepare_iteration, y->prepare_iteration);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static usec_t time_event_source_next(const sd_event_source *s) {
        assert(s);

        /* We have two kinds of event sources that have elapsation times associated with them: the actual
         * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
         * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
         * looking at here. */

        if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Otherwise this must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return s->time.next;

        return USEC_INFINITY;
}

static usec_t time_event_source_latest(const sd_event_source *s) {
        assert(s);

        if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
                               * same, as we should avoid adding additional inaccuracy on top of an already
                               * inaccurate time window */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return usec_add(s->time.next, s->time.accuracy);

        return USEC_INFINITY;
}

static bool event_source_timer_candidate(const sd_event_source *s) {
        assert(s);

        /* Returns true for event sources that either are not pending yet (i.e. where it's worth marking
         * them pending) or which are currently ratelimited (i.e. where it's worth leaving the ratelimited
         * state) */
        return !s->pending || s->ratelimited;
}

static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
        r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
        if (r != 0)
                return r;

        /* Order by time */
        return CMP(time_func(x), time_func(y));
}

static int earliest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_next);
}

static int latest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_latest);
}

static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}

static sd_event *event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->inotify_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);

        free(e->event_queue);

        return mfree(e);
}

_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new(sd_event, 1);
        if (!e)
                return -ENOMEM;

        *e = (sd_event) {
                .n_ref = 1,
                .epoll_fd = -EBADF,
                .watchdog_fd = -EBADF,
                .realtime.wakeup = WAKEUP_CLOCK_DATA,
                .realtime.fd = -EBADF,
                .realtime.next = USEC_INFINITY,
                .boottime.wakeup = WAKEUP_CLOCK_DATA,
                .boottime.fd = -EBADF,
                .boottime.next = USEC_INFINITY,
                .monotonic.wakeup = WAKEUP_CLOCK_DATA,
                .monotonic.fd = -EBADF,
                .monotonic.next = USEC_INFINITY,
                .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .realtime_alarm.fd = -EBADF,
                .realtime_alarm.next = USEC_INFINITY,
                .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .boottime_alarm.fd = -EBADF,
                .boottime_alarm.next = USEC_INFINITY,
                .perturb = USEC_INFINITY,
                .origin_id = origin_id_query(),
        };

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
                          special_glyph(SPECIAL_GLYPH_ELLIPSIS));
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}

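/* Illustrative usage sketch (not part of the original file): a minimal caller allocates a loop,
 * attaches sources, runs it, and lets the cleanup attribute drop the reference again:
 *
 *         _cleanup_(sd_event_unrefp) sd_event *e = NULL;
 *
 *         r = sd_event_new(&e);
 *         if (r < 0)
 *                 return r;
 *
 *         r = sd_event_loop(e);    // dispatches events until sd_event_exit() is called
 */
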
/* Define manually so we can add the origin check */
_public_ sd_event *sd_event_ref(sd_event *e) {
        if (!e)
                return NULL;
        if (event_origin_changed(e))
                return NULL;

        e->n_ref++;

        return e;
}

_public_ sd_event* sd_event_unref(sd_event *e) {
        if (!e)
                return NULL;
        if (event_origin_changed(e))
                return NULL;

        assert(e->n_ref > 0);
        if (--e->n_ref > 0)
                return NULL;

        return event_free(e);
}

#define PROTECT_EVENT(e)                                                \
        _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);

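/* Illustrative sketch of the intended use (an assumption, based on how the macro reads): take a
 * scope-long reference on entry to a dispatch function, so that a callback dropping the last user
 * reference cannot free the loop while we are still touching it:
 *
 *         static int dispatch_something(sd_event *e) {
 *                 PROTECT_EVENT(e);      // extra ref, auto-unref when the scope is left
 *                 ...
 *                 return 0;
 *         }
 */
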
_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
        if (s)
                (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
        return sd_event_source_unref(s);
}

static void source_io_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_origin_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}

static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->io.fd, &ev) < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}

static void source_child_pidfd_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (event_origin_changed(s->event))
                return;

        if (!s->child.registered)
                return;

        if (EVENT_SOURCE_WATCH_PIDFD(s))
                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

        s->child.registered = false;
}

static int source_child_pidfd_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_CHILD);
        assert(enabled != SD_EVENT_OFF);

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                struct epoll_event ev = {
                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                        .data.ptr = s,
                };

                if (epoll_ctl(s->event->epoll_fd,
                              s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                              s->child.pidfd, &ev) < 0)
                        return -errno;
        }

        s->child.registered = true;
        return 0;
}

static void source_memory_pressure_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (event_origin_changed(s->event))
                return;

        if (!s->memory_pressure.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->memory_pressure.registered = false;
}

static int source_memory_pressure_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
                          (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->memory_pressure.fd, &ev) < 0)
                return -errno;

        s->memory_pressure.registered = true;
        return 0;
}

static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (s->memory_pressure.in_write_list)
                return;

        LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = true;
}

static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (!s->memory_pressure.in_write_list)
                return;

        LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = false;
}

static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}

static EventSourceType clock_to_event_source_type(clockid_t clock) {

        switch (clock) {

        case CLOCK_REALTIME:
                return SOURCE_TIME_REALTIME;

        case CLOCK_BOOTTIME:
                return SOURCE_TIME_BOOTTIME;

        case CLOCK_MONOTONIC:
                return SOURCE_TIME_MONOTONIC;

        case CLOCK_REALTIME_ALARM:
                return SOURCE_TIME_REALTIME_ALARM;

        case CLOCK_BOOTTIME_ALARM:
                return SOURCE_TIME_BOOTTIME_ALARM;

        default:
                return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        }
}

static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        assert(e);

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}

static void event_free_signal_data(sd_event *e, struct signal_data *d) {
        assert(e);

        if (!d)
                return;

        hashmap_remove(e->signal_data, &d->priority);
        safe_close(d->fd);
        free(d);
}

static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_origin_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                d = new(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                *d = (struct signal_data) {
                        .wakeup = WAKEUP_SIGNAL_DATA,
                        .fd = -EBADF,
                        .priority = priority,
                };

                r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
                     &ss_copy,
                     SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        if (added)
                event_free_signal_data(e, d);

        return r;
}

static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty
         * that way, the object is removed entirely. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {
                /* If the mask is all-zero we can get rid of the structure */
                event_free_signal_data(e, d);
                return;
        }

        if (event_origin_changed(e))
                return;

        assert(d->fd >= 0);

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}

static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
         * and possibly drop the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_online_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            event_source_is_online(e->signal_sources[sig]))
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}

static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
        assert(s);

        /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
         * they are enabled/disabled or marked pending and such. */

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
}

static void event_source_time_prioq_reshuffle(sd_event_source *s) {
        struct clock_data *d;

        assert(s);

        /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
         * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
         * properly again. */

        if (s->ratelimited)
                d = &s->event->monotonic;
        else if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(d = event_get_clock_data(s->event, s->type));
        else
                return; /* no-op for an event source which is neither a timer nor ratelimited. */

        prioq_reshuffle(d->earliest, s, &s->earliest_index);
        prioq_reshuffle(d->latest, s, &s->latest_index);
        d->needs_rearm = true;
}

static void event_source_time_prioq_remove(
                sd_event_source *s,
                struct clock_data *d) {

        assert(s);
        assert(d);

        prioq_remove(d->earliest, s, &s->earliest_index);
        prioq_remove(d->latest, s, &s->latest_index);
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        d->needs_rearm = true;
}

static void source_disconnect(sd_event_source *s) {
        sd_event *event;
        int r;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from the time event source here if it is not ratelimited. If
                 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                        if (s->signal.unblock) {
                                sigset_t new_ss;

                                if (sigemptyset(&new_ss) < 0)
                                        log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
                                else if (sigaddset(&new_ss, s->signal.sig) < 0)
                                        log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
                                else {
                                        r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
                                        if (r != 0)
                                                log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
                                }
                        }
                }

                break;

        case SOURCE_CHILD:
                if (event_origin_changed(s->event))
                        s->child.process_owned = false;

                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
                         * continues to be watched. That's because inotify doesn't really have an API for that: we
                         * can only change watch masks with access to the original inode either by fd or by path. But
                         * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
                         * continuously and keeping the mount busy which we can't really do. We could reconstruct the
                         * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
                         * there), but given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way, leave the
                         * mask set, even if it is not minimized now, and ignore all events we aren't interested in
                         * anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_remove_from_write_list(s);
                source_memory_pressure_unregister(s);
                break;

        default:
                assert_not_reached();
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        if (!s->floating)
                sd_event_unref(event);
}

static sd_event_source* source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                s->io.fd = safe_close(s->io.fd);

        if (s->type == SOURCE_CHILD) {
                /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */

                if (s->child.process_owned) {

                        if (!s->child.exited) {
                                bool sent = false;

                                if (s->child.pidfd >= 0) {
                                        if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
                                                if (errno == ESRCH) /* Already dead */
                                                        sent = true;
                                                else if (!ERRNO_IS_NOT_SUPPORTED(errno))
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
                                                                        s->child.pid);
                                        } else
                                                sent = true;
                                }

                                if (!sent)
                                        if (kill(s->child.pid, SIGKILL) < 0)
                                                if (errno != ESRCH) /* Already dead */
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
                                                                        s->child.pid);
                        }

                        if (!s->child.waited) {
                                siginfo_t si = {};

                                /* Reap the child if we can */
                                (void) waitid(P_PID, s->child.pid, &si, WEXITED);
                        }
                }

                if (s->child.pidfd_owned)
                        s->child.pidfd = safe_close(s->child.pidfd);
        }

        if (s->type == SOURCE_MEMORY_PRESSURE) {
                s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
                s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
        }

        if (s->destroy_callback)
                s->destroy_callback(s->userdata);

        free(s->description);
        return mfree(s);
}

DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);

static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_reshuffle(s);

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        if (s->type == SOURCE_INOTIFY) {

                assert(s->inotify.inode_data);
                assert(s->inotify.inode_data->inotify_data);

                if (b)
                        s->inotify.inode_data->inotify_data->n_pending++;
                else {
                        assert(s->inotify.inode_data->inotify_data->n_pending > 0);
                        s->inotify.inode_data->inotify_data->n_pending--;
                }
        }

        return 0;
}

static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {

        /* Let's allocate exactly what we need. Note that the difference of the smallest event source
         * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
         * lines. */
        static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
                [SOURCE_IO]                  = endoffsetof_field(sd_event_source, io),
                [SOURCE_TIME_REALTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_MONOTONIC]      = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_SIGNAL]              = endoffsetof_field(sd_event_source, signal),
                [SOURCE_CHILD]               = endoffsetof_field(sd_event_source, child),
                [SOURCE_DEFER]               = endoffsetof_field(sd_event_source, defer),
                [SOURCE_POST]                = endoffsetof_field(sd_event_source, post),
                [SOURCE_EXIT]                = endoffsetof_field(sd_event_source, exit),
                [SOURCE_INOTIFY]             = endoffsetof_field(sd_event_source, inotify),
                [SOURCE_MEMORY_PRESSURE]     = endoffsetof_field(sd_event_source, memory_pressure),
        };

        sd_event_source *s;

        assert(e);
        assert(type >= 0);
        assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
        assert(size_table[type] > 0);

        s = malloc0(size_table[type]);
        if (!s)
                return NULL;
        /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
         * size, even if we only allocate the initial part we need. */
        s = expand_to_usable(s, sizeof(sd_event_source));

        /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
         * than what we allocated here. */
        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = PRIOQ_IDX_NULL;
        s->prepare_index = PRIOQ_IDX_NULL;

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}

static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = io_exit_callback;

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

static void initialize_perturb(sd_event *e) {
        sd_id128_t id = {};

        /* When we sleep for longer, we try to realign the wakeup to the same time within each
         * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
         * wakeup. However, let's take some system-specific randomness for this value, so that in a network
         * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
         * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
                e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
        else
                e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
}

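/* Illustrative sketch of how the perturbation is meant to be consumed when arming timers (an
 * assumption about the rough shape only, not a copy of the arming code elsewhere): a wakeup that
 * is far away gets rounded down to a coarse boundary and the per-system offset added, so all loops
 * on one machine wake together while different machines stay spread apart:
 *
 *         usec_t c = (next / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
 *         if (c > next)
 *                 c -= USEC_PER_MINUTE;
 */
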
static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        _cleanup_close_ int fd = -EBADF;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
                return -errno;

        d->fd = TAKE_FD(fd);
        return 0;
}

static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
        int r;

        assert(d);

        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        return 0;
}

static int event_source_time_prioq_put(
                sd_event_source *s,
                struct clock_data *d) {

        int r;

        assert(s);
        assert(d);
        assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

        r = prioq_put(d->earliest, s, &s->earliest_index);
        if (r < 0)
                return r;

        r = prioq_put(d->latest, s, &s->latest_index);
        if (r < 0) {
                assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
                s->earliest_index = PRIOQ_IDX_NULL;
                return r;
        }

        d->needs_rearm = true;
        return 0;
}

_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != UINT64_MAX, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        assert_se(d = event_get_clock_data(e, type));

        r = setup_clock_data(e, d, clock);
        if (r < 0)
                return r;

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = event_source_time_prioq_put(s, d);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

int sd_event_add_time_relative(
1442 sd_event_source
**ret
,
1446 sd_event_time_handler_t callback
,
1452 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1453 * checks for overflow. */
1455 r
= sd_event_now(e
, clock
, &t
);
1459 if (usec
>= USEC_INFINITY
- t
)
1462 return sd_event_add_time(e
, ret
, clock
, t
+ usec
, accuracy
, callback
, userdata
);
static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        sigset_t new_ss;
        bool block_it;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        /* Let's make sure our special flag stays outside of the valid signal range */
        assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);

        if (sig & SD_EVENT_SIGNAL_PROCMASK) {
                sig &= ~SD_EVENT_SIGNAL_PROCMASK;
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                block_it = true;
        } else {
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                r = signal_is_blocked(sig);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;

                block_it = false;
        }

        if (!callback)
                callback = signal_exit_callback;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        if (block_it) {
                sigset_t old_ss;

                if (sigemptyset(&new_ss) < 0)
                        return -errno;

                if (sigaddset(&new_ss, sig) < 0)
                        return -errno;

                r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
                if (r != 0)
                        return -r;

                r = sigismember(&old_ss, sig);
                if (r < 0)
                        return -errno;

                s->signal.unblock = !r;
        } else
                s->signal.unblock = false;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                if (s->signal.unblock)
                        (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);

                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

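/* Illustrative usage sketch: with SD_EVENT_SIGNAL_PROCMASK ORed in, the loop blocks the signal on
 * our behalf; without it, the caller must have blocked the signal already. A NULL callback makes
 * the loop exit on delivery, via signal_exit_callback() above:
 *
 *         r = sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
 */
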
static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static bool shall_use_pidfd(void) {
        /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
        return secure_getenv_bool("SYSTEMD_PIDFD") != 0;
}

_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd and because we don't want to reap the child processes
                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
                 * take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* We always take a pidfd here if we can, even if we wait for anything other than WEXITED, so that we
         * pin the PID, and make regular waitid() handling race-free. */

        if (shall_use_pidfd()) {
                s->child.pidfd = pidfd_open(pid, 0);
                if (s->child.pidfd < 0) {
                        /* Propagate errors unless the syscall is not supported or blocked */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
        } else
                s->child.pidfd = -EBADF;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We have a pidfd and we only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We have no pidfd or we shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. */
        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

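/* Illustrative usage sketch (assumes SIGCHLD was blocked before the child was forked off, as
 * required above):
 *
 *         static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *                 log_debug("child " PID_FMT " changed state", si->si_pid);
 *                 return 0;
 *         }
 *
 *         r = sd_event_add_child(e, &source, pid, WEXITED, on_child, NULL);
 */
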
_public_ int sd_event_add_child_pidfd(
                sd_event *e,
                sd_event_source **ret,
                int pidfd,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        pid_t pid;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pidfd >= 0, -EBADF);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        r = pidfd_get_pid(pidfd, &pid);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.pidfd = pidfd;
        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for WEXITED */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;
        } else {
                /* We shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

static int generic_exit_callback(sd_event_source *s, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_defer(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_DEFER);
        if (!s)
                return -ENOMEM;

        s->defer.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = source_set_pending(s, true);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

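/* Illustrative usage sketch: a defer source is marked pending right away and, being created as
 * SD_EVENT_ONESHOT, fires exactly once on the next iteration; re-arm it from the handler with
 * sd_event_source_set_enabled(s, SD_EVENT_ONESHOT) if it should run again:
 *
 *         r = sd_event_add_defer(e, &source, on_deferred, NULL);
 */
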
_public_ int sd_event_add_post(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_POST);
        if (!s)
                return -ENOMEM;

        s->post.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = set_ensure_put(&e->post_sources, NULL, s);
        if (r < 0)
                return r;
        assert(r > 0);

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_exit(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
        if (r < 0)
                return r;

        s = source_new(e, !ret, SOURCE_EXIT);
        if (!s)
                return -ENOMEM;

        s->exit.callback = callback;
        s->userdata = userdata;
        s->exit.prioq_index = PRIOQ_IDX_NULL;
        s->enabled = SD_EVENT_ONESHOT;

        r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_trim_memory(void) {
        int r;

        /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
         * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
         * NULL callback parameter. */

        log_debug("Memory pressure event, trimming malloc() memory.");

#if HAVE_GENERIC_MALLINFO
        generic_mallinfo before_mallinfo = generic_mallinfo_get();
#endif

        usec_t before_timestamp = now(CLOCK_MONOTONIC);
        hashmap_trim_pools();
        r = malloc_trim(0);
        usec_t after_timestamp = now(CLOCK_MONOTONIC);

        if (r > 0)
                log_debug("Successfully trimmed some memory.");
        else
                log_debug("Couldn't trim any memory.");

        usec_t period = after_timestamp - before_timestamp;

#if HAVE_GENERIC_MALLINFO
        generic_mallinfo after_mallinfo = generic_mallinfo_get();
        size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
                LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
        log_struct(LOG_DEBUG,
                   LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
                               FORMAT_TIMESPAN(period, 0),
                               FORMAT_BYTES(l)),
                   "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
                   "TRIMMED_BYTES=%zu", l,
                   "TRIMMED_USEC=" USEC_FMT, period);
#else
        log_struct(LOG_DEBUG,
                   LOG_MESSAGE("Memory trimming took %s.",
                               FORMAT_TIMESPAN(period, 0)),
                   "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
                   "TRIMMED_USEC=" USEC_FMT, period);
#endif

        return 0;
}

static int memory_pressure_callback(sd_event_source *s, void *userdata) {
        assert(s);

        sd_event_trim_memory();
        return 0;
}

_public_ int sd_event_add_memory_pressure(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_free_ char *w = NULL;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
        _cleanup_free_ void *write_buffer = NULL;
        const char *watch, *watch_fallback = NULL, *env;
        size_t write_buffer_size = 0;
        struct stat st;
        uint32_t events;
        bool locked;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = memory_pressure_callback;

        s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->memory_pressure.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;
        s->memory_pressure.fd = -EBADF;

        env = secure_getenv("MEMORY_PRESSURE_WATCH");
        if (env) {
                if (isempty(env) || path_equal(env, "/dev/null"))
                        return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
                                               "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");

                if (!path_is_absolute(env) || !path_is_normalized(env))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);

                watch = env;

                env = secure_getenv("MEMORY_PRESSURE_WRITE");
                if (env) {
                        r = unbase64mem(env, &write_buffer, &write_buffer_size);
                        if (r < 0)
                                return r;
                }

                locked = true;
        } else {

                r = is_pressure_supported();
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EOPNOTSUPP;

                /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
                 * the system wide pressure if for some reason we cannot (which could be: memory controller
                 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
                 * only use the system-wide logic. */
                r = cg_all_unified();
                if (r < 0)
                        return r;
                if (r == 0)
                        watch = "/proc/pressure/memory";
                else {
                        _cleanup_free_ char *cg = NULL;

                        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
                        if (r < 0)
                                return r;

                        w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
                        if (!w)
                                return -ENOMEM;

                        watch = w;
                        watch_fallback = "/proc/pressure/memory";
                }

                /* Android uses three levels in its userspace low memory killer logic:
                 *     some  70000 1000000
                 *     some 100000 1000000
                 *     full  70000 1000000
                 *
                 * GNOME's low memory monitor uses:
                 *     some  70000 1000000
                 *     some 100000 1000000
                 *     full 100000 1000000
                 *
                 * We'll default to the middle level that both agree on. Except we do it on a 2s window
                 * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration the
                 * kernel will allow us to do unprivileged, also in the future. */
                if (asprintf((char**) &write_buffer,
                             "%s " USEC_FMT " " USEC_FMT,
                             MEMORY_PRESSURE_DEFAULT_TYPE,
                             MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
                             MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
                        return -ENOMEM;

                write_buffer_size = strlen(write_buffer) + 1;
                locked = false;
        }

        path_fd = open(watch, O_PATH|O_CLOEXEC);
        if (path_fd < 0) {
                if (errno != ENOENT)
                        return -errno;

                /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
                 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
                 * the PSI service apparently is not supported) */
                if (!watch_fallback)
                        return locked ? -ENOENT : -EOPNOTSUPP;

                path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
                if (path_fd < 0) {
                        if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
                                return -EOPNOTSUPP;
                        return -errno;
                }
        }

        if (fstat(path_fd, &st) < 0)
                return -errno;

        if (S_ISSOCK(st.st_mode)) {
                fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
                if (fd < 0)
                        return -errno;

                r = connect_unix_path(fd, path_fd, NULL);
                if (r < 0)
                        return r;

                events = EPOLLIN;

        } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
                fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
                if (fd < 0)
                        return fd;

                if (S_ISREG(st.st_mode)) {
                        struct statfs sfs;

                        /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */

                        if (fstatfs(fd, &sfs) < 0)
                                return -errno;

                        if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
                            !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
                                return -ENOTTY;

                        events = EPOLLPRI;
                } else
                        /* For fifos and char devices just watch for EPOLLIN */
                        events = EPOLLIN;

        } else if (S_ISDIR(st.st_mode))
                return -EISDIR;
        else
                return -EBADF;

        s->memory_pressure.fd = TAKE_FD(fd);
        s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
        s->memory_pressure.write_buffer_size = write_buffer_size;
        s->memory_pressure.events = events;
        s->memory_pressure.locked = locked;

        /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
         * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
         * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
         * event sources on which writes must be executed before the first event loop iteration is
         * executed. (We could also write the data here, right away, but we want to give the caller the
         * freedom to call sd_event_source_set_memory_pressure_type() and
         * sd_event_source_set_memory_pressure_rate() before we write it.) */

        if (s->memory_pressure.write_buffer_size > 0)
                source_memory_pressure_add_to_write_list(s);

        r = source_memory_pressure_register(s, s->enabled);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

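/* Illustrative usage sketch: passing a NULL callback falls back to sd_event_trim_memory() above;
 * treating lack of PSI support as non-fatal is the caller's decision:
 *
 *         r = sd_event_add_memory_pressure(e, NULL, NULL, NULL);
 *         if (r < 0 && !ERRNO_IS_NOT_SUPPORTED(r) && r != -EHOSTDOWN)
 *                 return r;    // hard failure; otherwise continue without pressure handling
 */
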
static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
        assert(e);

        if (!d)
                return;

        assert(hashmap_isempty(d->inodes));
        assert(hashmap_isempty(d->wd));

        if (d->buffer_filled > 0)
                LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);

        hashmap_free(d->inodes);
        hashmap_free(d->wd);

        assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);

        if (d->fd >= 0) {
                if (!event_origin_changed(e) &&
                    epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");

                safe_close(d->fd);
        }
        free(d);
}

static int event_make_inotify_data(
                sd_event *e,
                int64_t priority,
                struct inotify_data **ret) {

        _cleanup_close_ int fd = -EBADF;
        struct inotify_data *d;
        int r;

        assert(e);

        d = hashmap_get(e->inotify_data, &priority);
        if (d) {
                if (ret)
                        *ret = d;
                return 0;
        }

        fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        d = new(struct inotify_data, 1);
        if (!d)
                return -ENOMEM;

        *d = (struct inotify_data) {
                .wakeup = WAKEUP_INOTIFY_DATA,
                .fd = TAKE_FD(fd),
                .priority = priority,
        };

        r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
        if (r < 0) {
                d->fd = safe_close(d->fd);
                free(d);
                return r;
        }

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
                                            * remove the fd from the epoll first, which we don't want as we couldn't
                                            * add it in the first place. */
                event_free_inotify_data(e, d);
                return r;
        }

        if (ret)
                *ret = d;

        return 0;
}

static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
        int r;

        assert(x);
        assert(y);

        r = CMP(x->dev, y->dev);
        if (r != 0)
                return r;

        return CMP(x->ino, y->ino);
}

static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
        assert(d);

        siphash24_compress_typesafe(d->dev, state);
        siphash24_compress_typesafe(d->ino, state);
}

DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);

static void event_free_inode_data(
                sd_event *e,
                struct inode_data *d) {

        assert(e);

        if (!d)
                return;

        assert(!d->event_sources);

        if (d->fd >= 0) {
                LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
                safe_close(d->fd);
        }

        if (d->inotify_data) {

                if (d->wd >= 0) {
                        if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
                                /* So here's a problem. At the time this runs the watch descriptor might already be
                                 * invalidated, because an IN_IGNORED event might be queued right the moment we enter
                                 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
                                 * likely case to happen. */

                                if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
                                        log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
                        }

                        assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
                }

                assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
        }

        free(d->path);
        free(d);
}

static void event_gc_inotify_data(
                sd_event *e,
                struct inotify_data *d) {

        assert(e);

        /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
         * any inode with it anymore, which in turn happens if no event source of this priority is interested
         * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
         * (under the expectation that the GC is called again once the counter is decremented). */

        if (!d)
                return;

        if (!hashmap_isempty(d->inodes))
                return;

        if (d->n_busy > 0)
                return;

        event_free_inotify_data(e, d);
}

static void event_gc_inode_data(
                sd_event *e,
                struct inode_data *d) {

        struct inotify_data *inotify_data;

        assert(e);

        if (!d)
                return;

        if (d->event_sources)
                return;

        inotify_data = d->inotify_data;
        event_free_inode_data(e, d);

        event_gc_inotify_data(e, inotify_data);
}

static int event_make_inode_data(
                sd_event *e,
                struct inotify_data *inotify_data,
                dev_t dev,
                ino_t ino,
                struct inode_data **ret) {

        struct inode_data *d, key;
        int r;

        assert(e);
        assert(inotify_data);

        key = (struct inode_data) {
                .ino = ino,
                .dev = dev,
        };

        d = hashmap_get(inotify_data->inodes, &key);
        if (d) {
                if (ret)
                        *ret = d;

                return 0;
        }

        r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
        if (r < 0)
                return r;

        d = new(struct inode_data, 1);
        if (!d)
                return -ENOMEM;

        *d = (struct inode_data) {
                .dev = dev,
                .ino = ino,
                .wd = -1,
                .fd = -EBADF,
                .inotify_data = inotify_data,
        };

        r = hashmap_put(inotify_data->inodes, d, d);
        if (r < 0) {
                free(d);
                return r;
        }

        if (ret)
                *ret = d;

        return 0;
}

static uint32_t inode_data_determine_mask(struct inode_data *d) {
        bool excl_unlink = true;
        uint32_t combined = 0;

        assert(d);

        /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
         * the IN_EXCL_UNLINK flag is ANDed instead.
         *
         * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
         * because we cannot change the mask anymore after the event source was created once, since the kernel has no
         * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
         * events we don't care for client-side. */

        LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {

                if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
                        excl_unlink = false;

                combined |= s->inotify.mask;
        }

        return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
}

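/* Worked example of the combination rule above: one source watching this inode with
 * IN_CLOSE_WRITE|IN_EXCL_UNLINK and another with IN_MOVED_TO yield a realized mask of
 * IN_CLOSE_WRITE|IN_MOVED_TO, without IN_EXCL_UNLINK, since only one of the two sources
 * requested it. */
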
static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
        uint32_t combined_mask;
        int wd, r;

        assert(d);
        assert(d->fd >= 0);

        combined_mask = inode_data_determine_mask(d);

        if (d->wd >= 0 && combined_mask == d->combined_mask)
                return 0;

        r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
        if (r < 0)
                return r;

        wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
        if (wd < 0)
                return wd;

        if (d->wd < 0) {
                r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
                if (r < 0) {
                        (void) inotify_rm_watch(d->inotify_data->fd, wd);
                        return r;
                }

                d->wd = wd;

        } else if (d->wd != wd) {

                log_debug("Weird, the watch descriptor we already knew for this inode changed?");
                (void) inotify_rm_watch(d->inotify_data->fd, wd); /* the watch must be removed from the inotify instance fd */
                return -EINVAL;
        }

        d->combined_mask = combined_mask;
        return 0;
}

static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int event_add_inotify_fd_internal(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                bool donate,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct inotify_data *inotify_data = NULL;
        struct inode_data *inode_data = NULL;
        struct stat st;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = inotify_exit_callback;

        /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
         * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
         * the user can't use them for us. */
        if (mask & IN_MASK_ADD)
                return -EINVAL;

        if (fstat(fd, &st) < 0)
                return -errno;

        s = source_new(e, !ret, SOURCE_INOTIFY);
        if (!s)
                return -ENOMEM;

        s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
        s->inotify.mask = mask;
        s->inotify.callback = callback;
        s->userdata = userdata;

        /* Allocate an inotify object for this priority, and an inode object within it */
        r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
        if (r < 0)
                return r;

        r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
        if (r < 0) {
                event_gc_inotify_data(e, inotify_data);
                return r;
        }

        /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
         * the event source, until then, for which we need the original inode. */
        if (inode_data->fd < 0) {
                if (donated_fd >= 0)
                        inode_data->fd = TAKE_FD(donated_fd);
                else {
                        inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
                        if (inode_data->fd < 0) {
                                r = -errno;
                                event_gc_inode_data(e, inode_data);
                                return r;
                        }
                }

                LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);

                _cleanup_free_ char *path = NULL;
                r = fd_get_path(inode_data->fd, &path);
                if (r < 0 && r != -ENOSYS) { /* The path is optional, hence ignore -ENOSYS. */
                        event_gc_inode_data(e, inode_data);
                        return r;
                }

                free_and_replace(inode_data->path, path);
        }

        /* Link our event source to the inode data object */
        LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
        s->inotify.inode_data = inode_data;

        /* Actually realize the watch now */
        r = inode_data_realize_watch(e, inode_data);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_inotify_fd(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
}

_public_ int sd_event_add_inotify(
                sd_event *e,
                sd_event_source **ret,
                const char *path,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
        int fd, r;

        assert_return(path, -EINVAL);

        fd = open(path, O_PATH | O_CLOEXEC |
                  (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
                  (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
        if (fd < 0)
                return -errno;

        r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
        if (r < 0)
                return r;

        (void) sd_event_source_set_description(s, path);

        if (ret)
                *ret = s;

        return 0;
}
static sd_event_source* event_source_free(sd_event_source *s) {
        if (!s)
                return NULL;

        /* Here's a special hack: when we are called from a
         * dispatch handler we won't free the event source
         * immediately, but we will detach the fd from the
         * epoll. This way it is safe for the caller to unref
         * the event source and immediately close the fd, but
         * we still retain a valid event source object after
         * the callback. */

        if (s->dispatching)
                source_disconnect(s);
        else
                source_free(s);

        return NULL;
}

DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);

_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}
_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);

        if (!s->description)
                return -ENXIO;

        *description = s->description;
        return 0;
}

_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);
        assert_return(!event_origin_changed(s->event), NULL);

        return s->event;
}

_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->pending;
}

_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->io.fd;
}
_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
        int saved_fd, r;

        assert_return(s, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->io.fd == fd)
                return 0;

        saved_fd = s->io.fd;
        s->io.fd = fd;

        assert(event_source_is_offline(s) == !s->io.registered);

        if (s->io.registered) {
                s->io.registered = false;

                r = source_io_register(s, s->enabled, s->io.events);
                if (r < 0) {
                        s->io.fd = saved_fd;
                        s->io.registered = true;
                        return r;
                }

                (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
        }

        if (s->io.owned)
                safe_close(saved_fd);

        return 0;
}
_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->io.owned;
}

_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->io.owned = own;
        return 0;
}

_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}
_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* edge-triggered updates are never skipped, so we can reset edges */
        if (s->io.events == events && !(events & EPOLLET))
                return 0;

        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        if (event_source_is_online(s)) {
                r = source_io_register(s, s->enabled, events);
                if (r < 0)
                        return r;
        }

        s->io.events = events;

        return 0;
}
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}

_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->signal.sig;
}

_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *priority = s->priority;
        return 0;
}
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        bool rm_inotify = false, rm_inode = false;
        struct inotify_data *new_inotify_data = NULL;
        struct inode_data *new_inode_data = NULL;
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_INOTIFY) {
                struct inode_data *old_inode_data;

                assert(s->inotify.inode_data);
                old_inode_data = s->inotify.inode_data;

                /* We need the original fd to change the priority. If we don't have it we can't change the priority,
                 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
                 * events we allow priority changes only until the first following iteration. */
                if (old_inode_data->fd < 0)
                        return -EOPNOTSUPP;

                r = event_make_inotify_data(s->event, priority, &new_inotify_data);
                if (r < 0)
                        return r;
                rm_inotify = r > 0;

                r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
                if (r < 0)
                        goto fail;
                rm_inode = r > 0;

                if (new_inode_data->fd < 0) {
                        /* Duplicate the fd for the new inode object if we don't have any yet */
                        new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
                        if (new_inode_data->fd < 0) {
                                r = -errno;
                                goto fail;
                        }

                        LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);

                        _cleanup_free_ char *path = NULL;
                        r = fd_get_path(new_inode_data->fd, &path);
                        if (r < 0 && r != -ENOSYS)
                                goto fail;

                        free_and_replace(new_inode_data->path, path);
                }

                /* Move the event source to the new inode data structure */
                LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
                LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
                s->inotify.inode_data = new_inode_data;

                /* Now create the new watch */
                r = inode_data_realize_watch(s->event, new_inode_data);
                if (r < 0) {
                        /* Move it back */
                        LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
                        LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
                        s->inotify.inode_data = old_inode_data;
                        goto fail;
                }

                s->priority = priority;

                event_gc_inode_data(s->event, old_inode_data);

        } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        s->priority = old->priority;
                        return r;
                }

                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        event_source_pp_prioq_reshuffle(s);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;

fail:
        if (rm_inode)
                event_free_inode_data(s->event, new_inode_data);

        if (rm_inotify)
                event_free_inotify_data(s->event, new_inotify_data);

        return r;
}
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
        /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
        if (!s && !ret)
                return false;

        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (ret)
                *ret = s->enabled;

        return s->enabled != SD_EVENT_OFF;
}
static int event_source_offline(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        bool was_offline;
        int r;

        assert(s);
        assert(enabled == SD_EVENT_OFF || ratelimited);

        /* Unset the pending flag when this event source is disabled */
        if (s->enabled != SD_EVENT_OFF &&
            enabled == SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        was_offline = event_source_is_offline(s);
        s->enabled = enabled;
        s->ratelimited = ratelimited;

        switch (s->type) {

        case SOURCE_IO:
                source_io_unregister(s);
                break;

        case SOURCE_SIGNAL:
                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                break;

        case SOURCE_CHILD:
                if (!was_offline) {
                        assert(s->event->n_online_child_sources > 0);
                        s->event->n_online_child_sources--;
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                break;

        case SOURCE_EXIT:
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_unregister(s);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
        case SOURCE_DEFER:
        case SOURCE_POST:
        case SOURCE_INOTIFY:
                break;

        default:
                assert_not_reached();
        }

        /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
        event_source_time_prioq_reshuffle(s);

        return 1;
}
static int event_source_online(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        bool was_online;
        int r;

        assert(s);
        assert(enabled != SD_EVENT_OFF || !ratelimited);

        /* Unset the pending flag when this event source is enabled */
        if (s->enabled == SD_EVENT_OFF &&
            enabled != SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        /* Are we really ready for onlining? */
        if (enabled == SD_EVENT_OFF || ratelimited) {
                /* Nope, we are not ready for onlining, then just update the precise state and exit */
                s->enabled = enabled;
                s->ratelimited = ratelimited;
                return 0;
        }

        was_online = event_source_is_online(s);

        switch (s->type) {

        case SOURCE_IO:
                r = source_io_register(s, enabled, s->io.events);
                if (r < 0)
                        return r;
                break;

        case SOURCE_SIGNAL:
                r = event_make_signal_data(s->event, s->signal.sig, NULL);
                if (r < 0) {
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        return r;
                }
                break;

        case SOURCE_CHILD:
                if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                        /* yes, we have pidfd */

                        r = source_child_pidfd_register(s, enabled);
                        if (r < 0)
                                return r;
                } else {
                        /* no pidfd, or something other to watch for than WEXITED */

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }
                }

                if (!was_online)
                        s->event->n_online_child_sources++;
                break;

        case SOURCE_MEMORY_PRESSURE:
                r = source_memory_pressure_register(s, enabled);
                if (r < 0)
                        return r;
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
        case SOURCE_EXIT:
        case SOURCE_DEFER:
        case SOURCE_POST:
        case SOURCE_INOTIFY:
                break;

        default:
                assert_not_reached();
        }

        s->enabled = enabled;
        s->ratelimited = ratelimited;

        /* Non-failing operations below */
        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
        event_source_time_prioq_reshuffle(s);

        return 1;
}
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
        int r;

        assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);

        /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
        if (m == SD_EVENT_OFF && !s)
                return 0;

        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
        if (s->event->state == SD_EVENT_FINISHED)
                return m == SD_EVENT_OFF ? 0 : -ESTALE;

        if (s->enabled == m) /* No change? */
                return 0;

        if (m == SD_EVENT_OFF)
                r = event_source_offline(s, m, s->ratelimited);
        else {
                if (s->enabled != SD_EVENT_OFF) {
                        /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
                         * event source is already enabled after all. */
                        s->enabled = m;
                        return 0;
                }

                r = event_source_online(s, m, s->ratelimited);
        }
        if (r < 0)
                return r;

        event_source_pp_prioq_reshuffle(s);
        return 0;
}
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}

_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        s->time.next = usec;

        event_source_time_prioq_reshuffle(s);
        return 0;
}
_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
        usec_t t;
        int r;

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (usec == USEC_INFINITY)
                return sd_event_source_set_time(s, USEC_INFINITY);

        r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
        if (r < 0)
                return r;

        usec = usec_add(t, usec);
        if (usec == USEC_INFINITY)
                return -EOVERFLOW;

        return sd_event_source_set_time(s, usec);
}
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}

_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(usec != UINT64_MAX, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        if (usec == 0)
                usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        event_source_time_prioq_reshuffle(s);
        return 0;
}

_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *clock = event_source_type_to_clock(s->type);
        return 0;
}
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}

_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->child.pidfd < 0)
                return -EOPNOTSUPP;

        return s->child.pidfd;
}
_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);
        assert_return(SIGNAL_VALID(sig), -EINVAL);

        /* If we already have seen indication the process exited refuse sending a signal early. This way we
         * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
         * available. */
        if (s->child.exited)
                return -ESRCH;

        if (s->child.pidfd >= 0) {
                siginfo_t copy;

                /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
                 * structure here */
                if (si)
                        copy = *si;

                if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
                        /* Let's propagate the error only if the system call is not implemented or prohibited */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        return 0;
        }

        /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
         * this here. */
        if (flags != 0)
                return -EOPNOTSUPP;

        if (si) {
                /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
                siginfo_t copy = *si;

                if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
                        return -errno;
        } else if (kill(s->child.pid, sig) < 0)
                return -errno;

        return 0;
}
_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->child.pidfd < 0)
                return -EOPNOTSUPP;

        return s->child.pidfd_owned;
}

_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->child.pidfd < 0)
                return -EOPNOTSUPP;

        s->child.pidfd_owned = own;
        return 0;
}

_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->child.process_owned;
}

_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->child.process_owned = own;
        return 0;
}
_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(s->type == SOURCE_INOTIFY, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *ret = s->inotify.mask;
        return 0;
}

_public_ int sd_event_source_get_inotify_path(sd_event_source *s, const char **ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(s->type == SOURCE_INOTIFY, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (!s->inotify.inode_data)
                return -ESTALE; /* already disconnected. */

        if (!s->inotify.inode_data->path)
                return -ENOSYS; /* /proc was not mounted? */

        *ret = s->inotify.inode_data->path;
        return 0;
}
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
        if (r < 0)
                return r;

        s->prepare = callback;

        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        return 0;
}
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);
        assert_return(!event_origin_changed(s->event), NULL);

        return s->userdata;
}

_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
        void *ret;

        assert_return(s, NULL);
        assert_return(!event_origin_changed(s->event), NULL);

        ret = s->userdata;
        s->userdata = userdata;

        return ret;
}
static int event_source_enter_ratelimited(sd_event_source *s) {
        int r;

        assert(s);

        /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
         * the end of the rate limit time window, much as if it was a timer event source. */

        if (s->ratelimited)
                return 0; /* Already ratelimited, this is a NOP hence */

        /* Make sure we can install a CLOCK_MONOTONIC event further down. */
        r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
        if (r < 0)
                return r;

        /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
         * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
         * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));

        /* Now, let's add the event source to the monotonic clock instead */
        r = event_source_time_prioq_put(s, &s->event->monotonic);
        if (r < 0)
                goto fail;

        /* And let's take the event source officially offline */
        r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
        if (r < 0) {
                event_source_time_prioq_remove(s, &s->event->monotonic);
                goto fail;
        }

        event_source_pp_prioq_reshuffle(s);

        log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
        return 0;

fail:
        /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
         * space for it should already be allocated. */
        if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);

        return r;
}
static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
        int r;

        assert(s);

        if (!s->ratelimited)
                return 0;

        /* Let's take the event source out of the monotonic prioq first. */
        event_source_time_prioq_remove(s, &s->event->monotonic);

        /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
        if (EVENT_SOURCE_IS_TIME(s->type)) {
                r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
                if (r < 0)
                        goto fail;
        }

        /* Let's try to take it online again. */
        r = event_source_online(s, s->enabled, /* ratelimited= */ false);
        if (r < 0) {
                /* Do something roughly sensible when this failed: undo the two prioq ops above */
                if (EVENT_SOURCE_IS_TIME(s->type))
                        event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));

                goto fail;
        }

        event_source_pp_prioq_reshuffle(s);
        ratelimit_reset(&s->rate_limit);

        log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));

        if (run_callback && s->ratelimit_expire_callback) {
                s->dispatching = true;
                r = s->ratelimit_expire_callback(s, s->userdata);
                s->dispatching = false;

                if (r < 0) {
                        log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
                                        strna(s->description),
                                        event_source_type_to_string(s->type),
                                        s->exit_on_failure ? "exiting" : "disabling");

                        if (s->exit_on_failure)
                                (void) sd_event_exit(s->event, r);
                }

                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);

                return 1;
        }

        return 0;

fail:
        /* Do something somewhat reasonable when we cannot move an event sources out of ratelimited mode:
         * simply put it back in it, maybe we can then process it more successfully next iteration. */
        assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);

        return r;
}
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;

        assert(e);
        assert(a <= b);

        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        return b;
}
static int event_arm_timer(
                sd_event *e,
                struct clock_data *d) {

        struct itimerspec its = {};
        sd_event_source *a, *b;
        usec_t t;

        assert(e);
        assert(d);

        if (!d->needs_rearm)
                return 0;

        d->needs_rearm = false;

        a = prioq_peek(d->earliest);
        assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
        if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {

                if (d->fd < 0)
                        return 0;

                if (d->next == USEC_INFINITY)
                        return 0;

                /* disarm */
                if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
                        return -errno;

                d->next = USEC_INFINITY;
                return 0;
        }

        b = prioq_peek(d->latest);
        assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
        assert(b && b->enabled != SD_EVENT_OFF);

        t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
        if (d->next == t)
                return 0;

        assert_se(d->fd >= 0);

        if (t == 0) {
                /* We don't want to disarm here, just mean some time looooong ago. */
                its.it_value.tv_sec = 0;
                its.it_value.tv_nsec = 1;
        } else
                timespec_store(&its.it_value, t);

        if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
                return -errno;

        d->next = t;
        return 0;
}
static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
        assert(e);
        assert(s);
        assert(s->type == SOURCE_IO);

        /* If the event source was already pending, we just OR in the
         * new revents, otherwise we reset the value. The ORing is
         * necessary to handle EPOLLONESHOT events properly where
         * readability might happen independently of writability, and
         * we need to keep track of both */

        if (s->pending)
                s->io.revents |= revents;
        else
                s->io.revents = revents;

        return source_set_pending(s, true);
}

static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
        uint64_t x;
        ssize_t ss;

        assert(e);
        assert(fd >= 0);

        assert_return(events == EPOLLIN, -EIO);

        ss = read(fd, &x, sizeof(x));
        if (ss < 0) {
                if (ERRNO_IS_TRANSIENT(errno))
                        return 0;

                return -errno;
        }

        if (_unlikely_(ss != sizeof(x)))
                return -EIO;

        if (next)
                *next = USEC_INFINITY;

        return 0;
}
static int process_timer(
                sd_event *e,
                usec_t n,
                struct clock_data *d) {

        sd_event_source *s;
        bool callback_invoked = false;
        int r;

        assert(e);
        assert(d);

        for (;;) {
                s = prioq_peek(d->earliest);
                assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

                if (!s || time_event_source_next(s) > n)
                        break;

                if (s->ratelimited) {
                        /* This is an event source whose ratelimit window has ended. Let's turn it on
                         * again. */
                        assert(s->ratelimited);

                        r = event_source_leave_ratelimit(s, /* run_callback */ true);
                        if (r < 0)
                                return r;
                        else if (r == 1)
                                callback_invoked = true;

                        continue;
                }

                if (s->enabled == SD_EVENT_OFF || s->pending)
                        break;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                event_source_time_prioq_reshuffle(s);
        }

        return callback_invoked;
}
static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
        int64_t min_priority = threshold;
        bool something_new = false;
        sd_event_source *s;
        int r;

        assert(e);
        assert(ret_min_priority);

        if (!e->need_process_child) {
                *ret_min_priority = min_priority;
                return 0;
        }

        e->need_process_child = false;

        /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
         * for, instead of using P_ALL. This is because we only want to get child information of very
         * specific child processes, and not all of them. We might not have processed the SIGCHLD event
         * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
         * hence we really don't want anything flushed out of the kernel's queue that we don't care
         * about. Since this is O(n) this means that if you have a lot of processes you probably want
         * to handle SIGCHLD yourself.
         *
         * We do not reap the children here (by using WNOWAIT), this is only done after the event
         * source is dispatched so that the callback still sees the process as a zombie. */

        HASHMAP_FOREACH(s, e->child_sources) {
                assert(s->type == SOURCE_CHILD);

                if (s->priority > threshold)
                        continue;

                if (s->pending)
                        continue;

                if (event_source_is_offline(s))
                        continue;

                if (s->child.exited)
                        continue;

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        /* There's a usable pidfd known for this event source? Then don't waitid() for
                         * it here */
                        continue;

                zero(s->child.siginfo);
                if (waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
                        return negative_errno();

                if (s->child.siginfo.si_pid != 0) {
                        bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                        if (zombie)
                                s->child.exited = true;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's immediately remove the state
                                 * change from the queue, since there's no benefit in leaving it
                                 * queued. */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                        if (r > 0) {
                                something_new = true;
                                min_priority = MIN(min_priority, s->priority);
                        }
                }
        }

        *ret_min_priority = min_priority;
        return something_new;
}
static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
        assert(e);
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (s->pending)
                return 0;

        if (event_source_is_offline(s))
                return 0;

        if (!EVENT_SOURCE_WATCH_PIDFD(s))
                return 0;

        zero(s->child.siginfo);
        if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
                return -errno;

        if (s->child.siginfo.si_pid == 0)
                return 0;

        if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
                s->child.exited = true;

        return source_set_pending(s, true);
}
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
        int r;

        assert(e);
        assert(d);
        assert_return(events == EPOLLIN, -EIO);
        assert(min_priority);

        /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
         * sure to recheck the children we watch. This is because we only ever dequeue the first signal
         * per priority, and if we dequeue one, and SIGCHLD might be enqueued later we wouldn't know,
         * but we might have higher priority children we care about hence we need to check that
         * explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        if (ERRNO_IS_TRANSIENT(errno))
                                return 0;

                        return -errno;
                }

                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;
                if (r > 0 && *min_priority >= s->priority) {
                        *min_priority = s->priority;
                        return 1; /* an event source with smaller priority is queued. */
                }

                return 0;
        }
}
static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
        ssize_t n;

        assert(e);
        assert(d);

        assert_return(revents == EPOLLIN, -EIO);

        /* If there's already an event source pending for this priority, don't read another */
        if (d->n_pending > 0)
                return 0;

        /* Is the read buffer non-empty? If so, let's not read more */
        if (d->buffer_filled > 0)
                return 0;

        if (d->priority > threshold)
                return 0;

        n = read(d->fd, &d->buffer, sizeof(d->buffer));
        if (n < 0) {
                if (ERRNO_IS_TRANSIENT(errno))
                        return 0;

                return -errno;
        }

        assert(n > 0);
        d->buffer_filled = (size_t) n;
        LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);

        return 1;
}

static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
        assert(e);
        assert(d);
        assert(sz <= d->buffer_filled);

        if (sz == 0)
                return;

        /* Move the rest of the buffer to the front, in order to get things properly aligned again */
        memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
        d->buffer_filled -= sz;

        if (d->buffer_filled == 0)
                LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
}
static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
        int r;

        assert(e);
        assert(d);

        /* If there's already an event source pending for this priority, don't read another */
        if (d->n_pending > 0)
                return 0;

        while (d->buffer_filled > 0) {
                size_t sz;

                /* Let's validate that the event structures are complete */
                if (d->buffer_filled < offsetof(struct inotify_event, name))
                        return -EIO;

                sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
                if (d->buffer_filled < sz)
                        return -EIO;

                if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
                        struct inode_data *inode_data;

                        /* The queue overran, let's pass this event to all event sources connected to this inotify
                         * object */

                        HASHMAP_FOREACH(inode_data, d->inodes)
                                LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {

                                        if (event_source_is_offline(s))
                                                continue;

                                        r = source_set_pending(s, true);
                                        if (r < 0)
                                                return r;
                                }
                } else {
                        struct inode_data *inode_data;

                        /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
                         * our watch descriptor table. */
                        if (d->buffer.ev.mask & IN_IGNORED) {

                                inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
                                if (!inode_data) {
                                        event_inotify_data_drop(e, d, sz);
                                        continue;
                                }

                                /* The watch descriptor was removed by the kernel, let's drop it here too */
                                inode_data->wd = -1;
                        } else {
                                inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
                                if (!inode_data) {
                                        event_inotify_data_drop(e, d, sz);
                                        continue;
                                }
                        }

                        /* Trigger all event sources that are interested in these events. Also trigger all event
                         * sources if IN_IGNORED or IN_UNMOUNT is set. */
                        LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {

                                if (event_source_is_offline(s))
                                        continue;

                                if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
                                    (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
                                        continue;

                                r = source_set_pending(s, true);
                                if (r < 0)
                                        return r;
                        }
                }

                /* Something pending now? If so, let's finish, otherwise let's read more. */
                if (d->n_pending > 0)
                        return 1;
        }

        return 0;
}

static int process_inotify(sd_event *e) {
        int r, done = 0;

        assert(e);

        LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
                r = event_inotify_data_process(e, d);
                if (r < 0)
                        return r;
                if (r > 0)
                        done++;
        }

        return done;
}
static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (s->pending)
                s->memory_pressure.revents |= revents;
        else
                s->memory_pressure.revents = revents;

        return source_set_pending(s, true);
}
static int source_memory_pressure_write(sd_event_source *s) {
        ssize_t n;
        int r;

        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        /* once we start writing, the buffer is locked, we allow no further changes. */
        s->memory_pressure.locked = true;

        if (s->memory_pressure.write_buffer_size > 0) {
                n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
                if (n < 0) {
                        if (!ERRNO_IS_TRANSIENT(errno)) {
                                /* If kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
                                 * files, but then generates EOPNOTSUPP on read() and write() (instead of on
                                 * open()!). This sucks hard, since we can only detect this kind of failure
                                 * so late. Let's make the best of it, and turn off the event source like we
                                 * do for failed event source handlers. */

                                log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
                                assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
                                return 0;
                        }

                        n = 0;
                }
        } else
                n = 0;

        assert(n >= 0);

        if ((size_t) n == s->memory_pressure.write_buffer_size) {
                s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);

                if (n > 0) {
                        s->memory_pressure.write_buffer_size = 0;

                        /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
                        r = source_memory_pressure_register(s, s->enabled);
                        if (r < 0)
                                return r;
                }
        } else if (n > 0) {
                _cleanup_free_ void *c = NULL;

                assert((size_t) n < s->memory_pressure.write_buffer_size);

                c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
                if (!c)
                        return -ENOMEM;

                free_and_replace(s->memory_pressure.write_buffer, c);
                s->memory_pressure.write_buffer_size -= n;
        }

        return 1;
}
static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
        int r;

        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        r = source_memory_pressure_write(s);
        if (r < 0)
                return r;
        if (r > 0)
                return 1; /* if we wrote something, then don't continue with dispatching user dispatch
                           * function. Instead, shortcut it so that we wait for next EPOLLOUT immediately. */

        /* No pending incoming IO? Then let's not continue further */
        if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {

                /* Treat IO errors on the notifier the same ways errors returned from a callback */
                if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
                        return -EIO;

                return 1; /* leave dispatch, we already processed everything */
        }

        if (s->memory_pressure.revents & EPOLLIN) {
                uint8_t pipe_buf[PIPE_BUF];
                ssize_t n;

                /* If the fd is readable, then flush out anything that might be queued */

                n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
                if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
                        return -errno;
        }

        return 0; /* go on, dispatch to user callback */
}
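/* Usage sketch (illustrative): subscribing to PSI memory pressure. To the best of my knowledge
 * sd_event_add_memory_pressure() picks the watch path and subscription string up from
 * $MEMORY_PRESSURE_WATCH/$MEMORY_PRESSURE_WRITE, falling back to built-in defaults; the write of
 * that subscription string is what source_memory_pressure_write() above implements. */
#if 0
static int on_pressure(sd_event_source *s, void *userdata) {
        /* ... trim caches, compact arenas, release memory ... */
        return 0;
}

static int add_pressure_source(sd_event *e) {
        return sd_event_add_memory_pressure(e, NULL, on_pressure, NULL);
}
#endif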
static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        sd_event *saved_event;
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type, here, so that we still know it after the event callback which might
         * invalidate the event. */
        saved_type = s->type;

        /* Similarly, store a reference to the event loop object, so that we can still access it after the
         * callback might have invalidated/disconnected the event source. */
        saved_event = s->event;
        PROTECT_EVENT(saved_event);

        /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
        assert(!s->ratelimited);
        if (!ratelimit_below(&s->rate_limit)) {
                r = event_source_enter_ratelimited(s);
                if (r < 0)
                        return r;

                return 1;
        }

        if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;

                /* If we execute a non-post source, let's mark all post sources as pending. */

                SET_FOREACH(z, s->event->post_sources) {
                        if (event_source_is_offline(z))
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        if (s->type == SOURCE_MEMORY_PRESSURE) {
                r = source_memory_pressure_initiate_dispatch(s);
                if (r == -EIO) /* handle EIO errors similar to callback errors */
                        goto finish;
                if (r < 0)
                        return r;
                if (r > 0) /* already handled */
                        return 1;
        }

        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie) {
                        (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
                        s->child.waited = true;
                }

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_INOTIFY: {
                struct sd_event *e = s->event;
                struct inotify_data *d;
                size_t sz;

                assert(s->inotify.inode_data);
                assert_se(d = s->inotify.inode_data->inotify_data);

                assert(d->buffer_filled >= offsetof(struct inotify_event, name));
                sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
                assert(d->buffer_filled >= sz);

                /* If the inotify callback destroys the event source then this likely means we don't need to
                 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
                 * free it immediately, then we couldn't drop the event from the inotify event queue without
                 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
                 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
                 * explicitly GC it after we are done dropping the inotify event from the buffer. */
                d->n_busy++;
                r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
                d->n_busy--;

                /* When no event is pending anymore on this inotify object, then let's drop the event from
                 * the inotify event queue buffer. */
                if (d->n_pending == 0)
                        event_inotify_data_drop(e, d, sz);

                /* Now we don't want to access 'd' anymore, it's OK to GC now. */
                event_gc_inotify_data(e, d);
                break;
        }

        case SOURCE_MEMORY_PRESSURE:
                r = s->memory_pressure.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached();
        }

        s->dispatching = false;

finish:
        if (r < 0) {
                log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
                                strna(s->description),
                                event_source_type_to_string(saved_type),
                                s->exit_on_failure ? "exiting" : "disabling");

                if (s->exit_on_failure)
                        (void) sd_event_exit(saved_event, r);
        }

        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);

        return 1;
}
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
                        break;

                s->prepare_iteration = e->iteration;
                prioq_reshuffle(e->prepare, s, &s->prepare_index);

                assert(s->prepare);
                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0) {
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
                                        strna(s->description),
                                        event_source_type_to_string(s->type),
                                        s->exit_on_failure ? "exiting" : "disabling");

                        if (s->exit_on_failure)
                                (void) sd_event_exit(e, r);
                }

                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
        }

        return 0;
}
static int dispatch_exit(sd_event *e) {
        sd_event_source *p;
        int r;

        assert(e);

        p = prioq_peek(e->exit);
        assert(!p || p->type == SOURCE_EXIT);

        if (!p || event_source_is_offline(p)) {
                e->state = SD_EVENT_FINISHED;
                return 0;
        }

        PROTECT_EVENT(e);
        e->iteration++;
        e->state = SD_EVENT_EXITING;
        r = source_dispatch(p);
        e->state = SD_EVENT_INITIAL;
        return r;
}

static sd_event_source* event_next_pending(sd_event *e) {
        sd_event_source *p;

        assert(e);

        p = prioq_peek(e->pending);
        if (!p)
                return NULL;

        if (event_source_is_offline(p))
                return NULL;

        return p;
}
static int arm_watchdog(sd_event *e) {
        struct itimerspec its = {};
        usec_t t;

        assert(e);
        assert(e->watchdog_fd >= 0);

        t = sleep_between(e,
                          usec_add(e->watchdog_last, (e->watchdog_period / 2)),
                          usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));

        timespec_store(&its.it_value, t);

        /* Make sure we never set the watchdog to 0, which tells the
         * kernel to disable it. */
        if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
                its.it_value.tv_nsec = 1;

        return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
}

static int process_watchdog(sd_event *e) {
        assert(e);

        if (!e->watchdog)
                return 0;

        /* Don't notify watchdog too often */
        if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
                return 0;

        sd_notify(false, "WATCHDOG=1");
        e->watchdog_last = e->timestamp.monotonic;

        return arm_watchdog(e);
}
static void event_close_inode_data_fds(sd_event *e) {
        struct inode_data *d;

        assert(e);

        /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
         * filesystems. But we can't close them right-away as we need them as long as the user still wants to make
         * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
         * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
         * compromise. */

        while ((d = e->inode_data_to_close_list)) {
                assert(d->fd >= 0);
                d->fd = safe_close(d->fd);

                LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
        }
}

static int event_memory_pressure_write_list(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
                if (!s)
                        break;

                assert(s->type == SOURCE_MEMORY_PRESSURE);
                assert(s->memory_pressure.write_buffer_size > 0);
                s->memory_pressure.in_write_list = false;

                r = source_memory_pressure_write(s);
                if (r < 0)
                        return r;
        }

        return 0;
}
_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
         * this check here once, since gettid() is typically not cached, and thus want to minimize
         * syscalls */
        assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        r = event_memory_pressure_write_list(e);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        event_close_inode_data_fds(e);

        if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
                goto pending;

        e->state = SD_EVENT_ARMED;
        return 0;

pending:
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}
static int epoll_wait_usec(
                int fd,
                struct epoll_event *events,
                int maxevents,
                usec_t timeout) {

        int msec;

        /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */

#if HAVE_EPOLL_PWAIT2
        static bool epoll_pwait2_absent = false;
        int r;

        /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
         * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
         * is not that obvious to implement given the libc and kernel definitions differ in the last
         * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
         * biggie), let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
         * missing. */

        if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
                r = epoll_pwait2(fd,
                                 events,
                                 maxevents,
                                 TIMESPEC_STORE(timeout),
                                 NULL);
                if (r >= 0)
                        return r;
                if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                        return -errno; /* Only fallback to old epoll_wait() if the syscall is masked or not
                                        * supported. */

                epoll_pwait2_absent = true;
        }
#endif

        if (timeout == USEC_INFINITY)
                msec = -1;
        else {
                usec_t k;

                k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
                if (k > INT_MAX)
                        msec = INT_MAX; /* Saturate */
                else
                        msec = (int) k;
        }

        return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
}
static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
        size_t n_event_queue, m, n_event_max;
        int64_t min_priority = threshold;
        bool something_new = false;
        int r;

        assert(e);
        assert(ret_min_priority);

        n_event_queue = MAX(e->n_sources, 1u);
        if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
                return -ENOMEM;

        n_event_max = MALLOC_ELEMENTSOF(e->event_queue);

        /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
        if (e->buffered_inotify_data_list)
                timeout = 0;

        for (;;) {
                r = epoll_wait_usec(
                                e->epoll_fd,
                                e->event_queue,
                                n_event_max,
                                timeout);
                if (r < 0)
                        return r;

                m = (size_t) r;

                if (m < n_event_max)
                        break;

                if (n_event_max >= n_event_queue * 10)
                        break;

                if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
                        return -ENOMEM;

                n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
                timeout = 0;
        }

        /* Set timestamp only when this is called first time. */
        if (threshold == INT64_MAX)
                triple_timestamp_now(&e->timestamp);

        for (size_t i = 0; i < m; i++) {

                if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
                else {
                        WakeupType *t = e->event_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE: {
                                sd_event_source *s = e->event_queue[i].data.ptr;

                                assert(s);

                                if (s->priority > threshold)
                                        continue;

                                min_priority = MIN(min_priority, s->priority);

                                switch (s->type) {

                                case SOURCE_IO:
                                        r = process_io(e, s, e->event_queue[i].events);
                                        break;

                                case SOURCE_CHILD:
                                        r = process_pidfd(e, s, e->event_queue[i].events);
                                        break;

                                case SOURCE_MEMORY_PRESSURE:
                                        r = process_memory_pressure(s, e->event_queue[i].events);
                                        break;

                                default:
                                        assert_not_reached();
                                }

                                break;
                        }

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = e->event_queue[i].data.ptr;

                                assert(d);

                                r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
                                break;

                        case WAKEUP_INOTIFY_DATA:
                                r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
                                break;

                        default:
                                assert_not_reached();
                        }
                }
                if (r < 0)
                        return r;
                if (r > 0)
                        something_new = true;
        }

        *ret_min_priority = min_priority;
        return something_new;
}
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        for (int64_t threshold = INT64_MAX; ; threshold--) {
                int64_t epoll_min_priority, child_min_priority;

                /* There may be a possibility that new epoll (especially IO) and child events are
                 * triggered just after process_epoll() call but before process_child(), and the new IO
                 * events may have higher priority than the child events. To salvage these events,
                 * let's call epoll_wait() again, but accept only events with higher priority than the
                 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
                 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
                 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */

                r = process_epoll(e, timeout, threshold, &epoll_min_priority);
                if (r == -EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }
                if (r < 0)
                        goto finish;
                if (r == 0 && threshold < INT64_MAX)
                        /* No new epoll event. */
                        break;

                r = process_child(e, threshold, &child_min_priority);
                if (r < 0)
                        goto finish;
                if (r == 0)
                        /* No new child event. */
                        break;

                threshold = MIN(epoll_min_priority, child_min_priority);
                if (threshold == INT64_MIN)
                        break;

                timeout = 0;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        r = process_inotify(e);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;
        else if (r == 1) {
                /* Ratelimit expiry callback was called. Let's postpone processing pending sources and
                 * put loop in the initial state in order to evaluate (in the next iteration) also sources
                 * that were potentially re-enabled by the callback.
                 *
                 * Wondering why we treat only this invocation of process_timer() differently? Once event
                 * source is ratelimited we essentially transform it into CLOCK_MONOTONIC timer hence
                 * ratelimit expiry callback is never called for any other timer type. */
                r = 0;
                goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
_public_ int sd_event_dispatch(sd_event *e) {
        sd_event_source *p;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_PENDING, -EBUSY);

        if (e->exit_requested)
                return dispatch_exit(e);

        p = event_next_pending(e);
        if (p) {
                PROTECT_EVENT(e);

                e->state = SD_EVENT_RUNNING;
                r = source_dispatch(p);
                e->state = SD_EVENT_INITIAL;
                return r;
        }

        e->state = SD_EVENT_INITIAL;

        return 1;
}

static void event_log_delays(sd_event *e) {
        char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
        size_t l, i;

        p = b;
        l = sizeof(b);
        for (i = 0; i < ELEMENTSOF(e->delays); i++) {
                l = strpcpyf(&p, l, "%u ", e->delays[i]);
                e->delays[i] = 0;
        }
        log_debug("Event loop iterations: %s", b);
}
_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        if (e->profile_delays && e->last_run_usec != 0) {
                usec_t this_run;
                unsigned l;

                this_run = now(CLOCK_MONOTONIC);

                l = log2u64(this_run - e->last_run_usec);
                assert(l < ELEMENTSOF(e->delays));
                e->delays[l]++;

                if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
                        event_log_delays(e);
                        e->last_log_usec = this_run;
                }
        }

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        r = sd_event_prepare(e);
        if (r == 0)
                /* There was nothing? Then wait... */
                r = sd_event_wait(e, timeout);

        if (e->profile_delays)
                e->last_run_usec = now(CLOCK_MONOTONIC);

        if (r > 0) {
                /* There's something now, then let's dispatch it */
                r = sd_event_dispatch(e);
                if (r < 0)
                        return r;

                return 1;
        }

        return r;
}
_public_ int sd_event_loop(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        PROTECT_EVENT(e);

        while (e->state != SD_EVENT_FINISHED) {
                r = sd_event_run(e, UINT64_MAX);
                if (r < 0)
                        return r;
        }

        return e->exit_code;
}
_public_ int sd_event_get_fd(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        return e->epoll_fd;
}
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        return e->state;
}

_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!e->exit_requested)
                return -ENODATA;

        *code = e->exit_code;
        return 0;
}
_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;

        return 0;
}
_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(usec, -EINVAL);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
                return -EOPNOTSUPP;

        if (!triple_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
                *usec = now(clock);
                return 1;
        }

        *usec = triple_timestamp_by_clock(&e->timestamp, clock);
        return 0;
}
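
/* Usage sketch (illustrative, not part of this file): using the cached iteration timestamp as the
 * base for a relative timer, with a hypothetical handler on_timer():
 *
 *         uint64_t usec;
 *
 *         r = sd_event_now(e, CLOCK_MONOTONIC, &usec);
 *         if (r < 0)
 *                 return r;
 *
 *         r = sd_event_add_time(e, NULL, CLOCK_MONOTONIC, usec + 5 * USEC_PER_SEC, 0, on_timer, NULL);
 */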
_public_ int sd_event_default(sd_event **ret) {
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}
_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(tid, -EINVAL);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (e->tid != 0) {
                *tid = e->tid;
                return 0;
        }

        return -ENXIO;
}
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                struct epoll_event ev = {
                        .events = EPOLLIN,
                        .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
                };

                if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
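
/* Usage sketch (illustrative, not part of this file): a service opting into watchdog support; with
 * WatchdogSec= set in its unit file, the loop pings the service manager from then on. A return of 0
 * means no watchdog was requested by the manager, so nothing was armed:
 *
 *         r = sd_event_set_watchdog(e, true);
 *         if (r < 0)
 *                 return r;
 */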
_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        return e->watchdog;
}
_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        *ret = e->iteration;
        return 0;
}
_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
        assert_return(s, -EINVAL);
        assert_return(s->event, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->destroy_callback = callback;
        return 0;
}
_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (ret)
                *ret = s->destroy_callback;

        return !!s->destroy_callback;
}
_public_ int sd_event_source_get_floating(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->floating;
}
_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->floating == !!b)
                return 0;

        if (!s->event) /* Already disconnected */
                return -ESTALE;

        s->floating = b;

        if (b) {
                sd_event_source_ref(s);
                sd_event_unref(s->event);
        } else {
                sd_event_ref(s->event);
                sd_event_source_unref(s);
        }

        return 1;
}
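
/* Usage sketch (illustrative, not part of this file): a fire-and-forget source that shall be pinned
 * by the loop rather than pin it; on_defer() is hypothetical. (Passing NULL instead of &s to the
 * constructor achieves the same, as such sources are created floating.)
 *
 *         sd_event_source *s;
 *
 *         r = sd_event_add_defer(e, &s, on_defer, NULL);
 *         if (r < 0)
 *                 return r;
 *
 *         r = sd_event_source_set_floating(s, true);
 */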
_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->exit_on_failure;
}
_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->exit_on_failure == !!b)
                return 0;

        s->exit_on_failure = b;
        return 1;
}
_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* Turning on ratelimiting for event source types that don't support it is a loggable offense,
         * as doing so is a programming error. */
        assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);

        /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
         * non-ratelimited. */
        r = event_source_leave_ratelimit(s, /* run_callback */ false);
        if (r < 0)
                return r;

        s->rate_limit = (RateLimit) { interval, burst };
        return 0;
}
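
/* Usage sketch (illustrative, not part of this file): capping a busy IO source at 100 dispatches
 * per second, with a hypothetical handler on_io():
 *
 *         r = sd_event_add_io(e, &s, fd, EPOLLIN, on_io, NULL);
 *         if (r < 0)
 *                 return r;
 *
 *         r = sd_event_source_set_ratelimit(s, USEC_PER_SEC, 100);
 */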
_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->ratelimit_expire_callback = callback;
        return 0;
}
_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
         * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return -EDOM;

        if (!ratelimit_configured(&s->rate_limit))
                return -ENOEXEC;

        if (ret_interval)
                *ret_interval = s->rate_limit.interval;
        if (ret_burst)
                *ret_burst = s->rate_limit.burst;

        return 0;
}
_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return false;

        if (!ratelimit_configured(&s->rate_limit))
                return false;

        return s->ratelimited;
}
_public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
        int r;

        assert_return(s, -EINVAL);

        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return 0;

        if (!ratelimit_configured(&s->rate_limit))
                return 0;

        if (!s->ratelimited)
                return 0;

        r = event_source_leave_ratelimit(s, /* run_callback */ false);
        if (r < 0)
                return r;

        return 1; /* tell caller that we indeed just left the ratelimit state */
}
_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
        bool change = false;
        int r;

        assert_return(e, -EINVAL);

        if (b) {
                /* We want to maintain pointers to these event sources, so that we can destroy them when told
                 * so. But we also don't want them to pin the event loop itself. Hence we mark them as
                 * floating after creation (and undo this before deleting them again). */

                if (!e->sigint_event_source) {
                        r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0)
                                return r;

                        assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
                        change = true;
                }

                if (!e->sigterm_event_source) {
                        r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0) {
                                if (change) {
                                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                                }

                                return r;
                        }

                        assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
                        change = true;
                }

        } else {
                if (e->sigint_event_source) {
                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                        change = true;
                }

                if (e->sigterm_event_source) {
                        assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
                        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
                        change = true;
                }
        }

        return change;
}
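
/* Usage sketch (illustrative, not part of this file): the common pattern of letting SIGINT and
 * SIGTERM terminate the loop cleanly without writing any signal handlers:
 *
 *         r = sd_event_set_signal_exit(e, true);
 *         if (r < 0)
 *                 return r;
 *
 *         return sd_event_loop(e);
 */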
_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
        _cleanup_free_ char *b = NULL;
        _cleanup_free_ void *w = NULL;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
        assert_return(ty, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (!STR_IN_SET(ty, "some", "full"))
                return -EINVAL;

        if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
                return -EBUSY;

        char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
        if (!space)
                return -EINVAL;

        size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
        b = memdup_suffix0(s->memory_pressure.write_buffer, l);
        if (!b)
                return -ENOMEM;
        if (!STR_IN_SET(b, "some", "full"))
                return -EINVAL;

        if (streq(b, ty))
                return 0;

        size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
        w = new(char, nl);
        if (!w)
                return -ENOMEM;

        memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));

        free_and_replace(s->memory_pressure.write_buffer, w);
        s->memory_pressure.write_buffer_size = nl;
        s->memory_pressure.locked = false;

        return 1;
}
_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
        _cleanup_free_ char *b = NULL;
        _cleanup_free_ void *w = NULL;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
                return -ERANGE;
        if (window_usec <= 0 || window_usec >= UINT64_MAX)
                return -ERANGE;
        if (threshold_usec > window_usec)
                return -EINVAL;

        if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
                return -EBUSY;

        char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
        if (!space)
                return -EINVAL;

        size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
        b = memdup_suffix0(s->memory_pressure.write_buffer, l);
        if (!b)
                return -ENOMEM;
        if (!STR_IN_SET(b, "some", "full"))
                return -EINVAL;

        if (asprintf((char**) &w,
                     "%s " USEC_FMT " " USEC_FMT "",
                     b,
                     threshold_usec,
                     window_usec) < 0)
                return -ENOMEM;

        l = strlen(w) + 1;
        if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
                return 0;

        free_and_replace(s->memory_pressure.write_buffer, w);
        s->memory_pressure.write_buffer_size = l;
        s->memory_pressure.locked = false;

        return 1;
}
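
/* Usage sketch (illustrative, not part of this file): watching for "full" memory pressure with a
 * 100ms stall threshold within a 1s window, using a hypothetical handler on_pressure():
 *
 *         r = sd_event_add_memory_pressure(e, &s, on_pressure, NULL);
 *         if (r < 0)
 *                 return r;
 *
 *         r = sd_event_source_set_memory_pressure_type(s, "full");
 *         if (r < 0)
 *                 return r;
 *
 *         r = sd_event_source_set_memory_pressure_period(s, 100 * USEC_PER_MSEC, USEC_PER_SEC);
 */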