/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sys/wait.h>

#include "sd-daemon.h"
#include "sd-event.h"
#include "sd-id128.h"

#include "alloc-util.h"
#include "env-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "fs-util.h"
#include "glyph-util.h"
#include "hashmap.h"
#include "list.h"
#include "logarithm.h"
#include "macro.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "prioq.h"
#include "process-util.h"
#include "set.h"
#include "signal-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strxcpyx.h"
#include "time-util.h"

#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)

static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
        /* Returns true if this is a PID event source that can be implemented by watching EPOLLIN */
        return s &&
                s->type == SOURCE_CHILD &&
                s->child.pidfd >= 0 &&
                s->child.options == WEXITED;
}

static bool event_source_is_online(sd_event_source *s) {
        assert(s);
        return s->enabled != SD_EVENT_OFF && !s->ratelimited;
}

static bool event_source_is_offline(sd_event_source *s) {
        assert(s);
        return s->enabled == SD_EVENT_OFF || s->ratelimited;
}

static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
        [SOURCE_IO] = "io",
        [SOURCE_TIME_REALTIME] = "realtime",
        [SOURCE_TIME_BOOTTIME] = "boottime",
        [SOURCE_TIME_MONOTONIC] = "monotonic",
        [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
        [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
        [SOURCE_SIGNAL] = "signal",
        [SOURCE_CHILD] = "child",
        [SOURCE_DEFER] = "defer",
        [SOURCE_POST] = "post",
        [SOURCE_EXIT] = "exit",
        [SOURCE_WATCHDOG] = "watchdog",
        [SOURCE_INOTIFY] = "inotify",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);

#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)

#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_IO,                       \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_SIGNAL,                   \
               SOURCE_DEFER,                    \
               SOURCE_INOTIFY)

/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)

struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;

        Prioq *pending;
        Prioq *prepare;

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources;

        Set *post_sources;

        Prioq *exit;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close_list);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, buffered_inotify_data_list);

        pid_t original_pid;

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        sd_event_source *sigint_event_source, *sigterm_event_source;

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8];
};

static thread_local sd_event *default_event = NULL;

static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);

static sd_event *event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}

static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->pending);
        assert(y->pending);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Lower priority values first */
        r = CMP(x->priority, y->priority);
        if (r != 0)
                return r;

        /* Older entries first */
        return CMP(x->pending_iteration, y->pending_iteration);
}

static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->prepare);
        assert(y->prepare);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Move the most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        r = CMP(x->prepare_iteration, y->prepare_iteration);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static usec_t time_event_source_next(const sd_event_source *s) {
        assert(s);

        /* We have two kinds of event sources that have elapsation times associated with them: the actual
         * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
         * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
         * looking at here. */

        if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Otherwise this must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return s->time.next;

        return USEC_INFINITY;
}

static usec_t time_event_source_latest(const sd_event_source *s) {
        assert(s);

        if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
                               * same, as we should avoid adding additional inaccuracy on top of an already
                               * inaccurate time window */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return usec_add(s->time.next, s->time.accuracy);

        return USEC_INFINITY;
}

static bool event_source_timer_candidate(const sd_event_source *s) {
        assert(s);

        /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
         * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
        return !s->pending || s->ratelimited;
}

static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
        r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
        if (r != 0)
                return r;

        /* Order by time */
        return CMP(time_func(x), time_func(y));
}

static int earliest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_next);
}

static int latest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_latest);
}

static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}

static sd_event *event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->inotify_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);

        free(e->event_queue);

        return mfree(e);
}

_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new(sd_event, 1);
        if (!e)
                return -ENOMEM;

        *e = (sd_event) {
                .n_ref = 1,
                .epoll_fd = -1,
                .watchdog_fd = -1,
                .realtime.wakeup = WAKEUP_CLOCK_DATA,
                .realtime.fd = -1,
                .realtime.next = USEC_INFINITY,
                .boottime.wakeup = WAKEUP_CLOCK_DATA,
                .boottime.fd = -1,
                .boottime.next = USEC_INFINITY,
                .monotonic.wakeup = WAKEUP_CLOCK_DATA,
                .monotonic.fd = -1,
                .monotonic.next = USEC_INFINITY,
                .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .realtime_alarm.fd = -1,
                .realtime_alarm.next = USEC_INFINITY,
                .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .boottime_alarm.fd = -1,
                .boottime_alarm.next = USEC_INFINITY,
                .perturb = USEC_INFINITY,
                .original_pid = getpid_cached(),
        };

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
                          special_glyph(SPECIAL_GLYPH_ELLIPSIS));
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
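
/* Illustrative sketch (not part of the original file): the minimal client-side
 * pattern for sd_event_new() above, using the documented public counterparts
 * sd_event_loop() and sd_event_unref(). Wrapped in "#if 0" so it is not
 * compiled here. */
#if 0
static int example_run_loop(void) {
        sd_event *e = NULL;
        int r;

        r = sd_event_new(&e);   /* reference count is 1 on success */
        if (r < 0)
                return r;

        /* ... attach event sources to "e" here ... */

        r = sd_event_loop(e);   /* dispatches events until sd_event_exit() is called */
        sd_event_unref(e);      /* the loop is freed once the last reference is dropped */
        return r;
}
#endif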

DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event, sd_event, event_free);
#define PROTECT_EVENT(e)                                        \
        _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
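
/* Illustrative sketch (not part of the original file): what PROTECT_EVENT()
 * above is for. Taking a scoped reference pins the loop object so a callback
 * that drops the last user reference cannot free it mid-dispatch; the
 * "_cleanup_" attribute releases the reference on scope exit. Wrapped in
 * "#if 0" so it is not compiled here. */
#if 0
static int example_dispatch_path(sd_event *e) {
        PROTECT_EVENT(e); /* holds a reference until this function returns */

        /* ... run callbacks that might call sd_event_unref(e) ... */

        return 0;
}
#endif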

_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
        if (s)
                (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
        return sd_event_source_unref(s);
}

static bool event_pid_changed(sd_event *e) {
        assert(e);

        /* We don't support people creating an event loop and keeping
         * it around over a fork(). Let's complain. */

        return e->original_pid != getpid_cached();
}

static void source_io_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}

static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->io.fd, &ev) < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}

static void source_child_pidfd_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (event_pid_changed(s->event))
                return;

        if (!s->child.registered)
                return;

        if (EVENT_SOURCE_WATCH_PIDFD(s))
                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

        s->child.registered = false;
}

static int source_child_pidfd_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_CHILD);
        assert(enabled != SD_EVENT_OFF);

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                struct epoll_event ev = {
                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                        .data.ptr = s,
                };

                if (epoll_ctl(s->event->epoll_fd,
                              s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                              s->child.pidfd, &ev) < 0)
                        return -errno;
        }

        s->child.registered = true;
        return 0;
}

static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}

static EventSourceType clock_to_event_source_type(clockid_t clock) {

        switch (clock) {

        case CLOCK_REALTIME:
                return SOURCE_TIME_REALTIME;

        case CLOCK_BOOTTIME:
                return SOURCE_TIME_BOOTTIME;

        case CLOCK_MONOTONIC:
                return SOURCE_TIME_MONOTONIC;

        case CLOCK_REALTIME_ALARM:
                return SOURCE_TIME_REALTIME_ALARM;

        case CLOCK_BOOTTIME_ALARM:
                return SOURCE_TIME_BOOTTIME_ALARM;

        default:
                return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        }
}

static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        assert(e);

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}

static void event_free_signal_data(sd_event *e, struct signal_data *d) {
        assert(e);

        if (!d)
                return;

        hashmap_remove(e->signal_data, &d->priority);
        safe_close(d->fd);
        free(d);
}

static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                d = new(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                *d = (struct signal_data) {
                        .wakeup = WAKEUP_SIGNAL_DATA,
                        .fd = -1,
                        .priority = priority,
                };

                r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        if (added)
                event_free_signal_data(e, d);

        return r;
}

static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty
         * that way, removes the object. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {
                /* If the mask is all-zero we can get rid of the structure */
                event_free_signal_data(e, d);
                return;
        }

        if (event_pid_changed(e))
                return;

        assert(d->fd >= 0);

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}

static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
         * and possibly drop the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_online_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            event_source_is_online(e->signal_sources[sig]))
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}

static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
        assert(s);

        /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
         * they are enabled/disabled or marked pending and such. */

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
}

static void event_source_time_prioq_reshuffle(sd_event_source *s) {
        struct clock_data *d;

        assert(s);

        /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
         * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
         * properly again. */

        if (s->ratelimited)
                d = &s->event->monotonic;
        else if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(d = event_get_clock_data(s->event, s->type));
        else
                return; /* no-op for an event source which is neither a timer nor ratelimited. */

        prioq_reshuffle(d->earliest, s, &s->earliest_index);
        prioq_reshuffle(d->latest, s, &s->latest_index);
        d->needs_rearm = true;
}

static void event_source_time_prioq_remove(
                sd_event_source *s,
                struct clock_data *d) {

        assert(s);
        assert(d);

        prioq_remove(d->earliest, s, &s->earliest_index);
        prioq_remove(d->latest, s, &s->latest_index);
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        d->needs_rearm = true;
}

static void source_disconnect(sd_event_source *s) {
        sd_event *event;
        int r;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from the time event source here if it is not ratelimited. If
                 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                        if (s->signal.unblock) {
                                sigset_t new_ss;

                                if (sigemptyset(&new_ss) < 0)
                                        log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
                                else if (sigaddset(&new_ss, s->signal.sig) < 0)
                                        log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
                                else {
                                        r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
                                        if (r != 0)
                                                log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
                                }
                        }
                }

                break;

        case SOURCE_CHILD:
                if (event_pid_changed(s->event))
                        s->child.process_owned = false;

                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
                         * continues to be watched. That's because inotify doesn't really have an API for that: we
                         * can only change watch masks with access to the original inode either by fd or by path. But
                         * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
                         * continuously and keeping the mount busy which we can't really do. We could reconstruct the
                         * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
                         * there), but given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way, leave the
                         * mask set, even if it is not minimized now, and ignore all events we aren't interested in
                         * anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        default:
                assert_not_reached();
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        if (!s->floating)
                sd_event_unref(event);
}

static sd_event_source* source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                s->io.fd = safe_close(s->io.fd);

        if (s->type == SOURCE_CHILD) {
                /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */

                if (s->child.process_owned) {

                        if (!s->child.exited) {
                                bool sent = false;

                                if (s->child.pidfd >= 0) {
                                        if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
                                                if (errno == ESRCH) /* Already dead */
                                                        sent = true;
                                                else if (!ERRNO_IS_NOT_SUPPORTED(errno))
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
                                                                        s->child.pid);
                                        } else
                                                sent = true;
                                }

                                if (!sent)
                                        if (kill(s->child.pid, SIGKILL) < 0)
                                                if (errno != ESRCH) /* Already dead */
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
                                                                        s->child.pid);
                        }

                        if (!s->child.waited) {
                                siginfo_t si = {};

                                /* Reap the child if we can */
                                (void) waitid(P_PID, s->child.pid, &si, WEXITED);
                        }
                }

                if (s->child.pidfd_owned)
                        s->child.pidfd = safe_close(s->child.pidfd);
        }

        if (s->destroy_callback)
                s->destroy_callback(s->userdata);

        free(s->description);
        return mfree(s);
}
DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);

static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_reshuffle(s);

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        if (s->type == SOURCE_INOTIFY) {

                assert(s->inotify.inode_data);
                assert(s->inotify.inode_data->inotify_data);

                if (b)
                        s->inotify.inode_data->inotify_data->n_pending++;
                else {
                        assert(s->inotify.inode_data->inotify_data->n_pending > 0);
                        s->inotify.inode_data->inotify_data->n_pending--;
                }
        }

        return 1;
}

static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
        sd_event_source *s;

        assert(e);

        s = new(sd_event_source, 1);
        if (!s)
                return NULL;

        *s = (struct sd_event_source) {
                .n_ref = 1,
                .event = e,
                .floating = floating,
                .type = type,
                .pending_index = PRIOQ_IDX_NULL,
                .prepare_index = PRIOQ_IDX_NULL,
        };

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}

static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = io_exit_callback;

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
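
/* Illustrative sketch (not part of the original file): registering an I/O
 * event source with sd_event_add_io() above. "sock_fd" is a hypothetical
 * non-blocking socket; the handler signature is sd_event_io_handler_t.
 * Wrapped in "#if 0" so it is not compiled here. */
#if 0
static int on_readable(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        char buf[256];
        ssize_t n;

        /* Called when EPOLLIN is seen on fd; drain the socket and stay enabled. */
        n = read(fd, buf, sizeof(buf));
        if (n < 0 && errno != EAGAIN)
                return -errno; /* a negative return marks the source as failed */

        return 0;
}

/* Usage: r = sd_event_add_io(e, &source, sock_fd, EPOLLIN, on_readable, NULL); */
#endif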

static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}

static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        _cleanup_close_ int fd = -1;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
                return -errno;

        d->fd = TAKE_FD(fd);
        return 0;
}

static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
        int r;

        assert(d);

        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        return 0;
}

static int event_source_time_prioq_put(
                sd_event_source *s,
                struct clock_data *d) {

        int r;

        assert(s);
        assert(d);
        assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

        r = prioq_put(d->earliest, s, &s->earliest_index);
        if (r < 0)
                return r;

        r = prioq_put(d->latest, s, &s->latest_index);
        if (r < 0) {
                assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
                s->earliest_index = PRIOQ_IDX_NULL;
                return r;
        }

        d->needs_rearm = true;
        return 0;
}

_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != UINT64_MAX, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        assert_se(d = event_get_clock_data(e, type));

        r = setup_clock_data(e, d, clock);
        if (r < 0)
                return r;

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = event_source_time_prioq_put(s, d);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
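
/* Illustrative sketch (not part of the original file): arming a one-shot timer
 * with sd_event_add_time() above, 5s from now on CLOCK_MONOTONIC, with the
 * default accuracy (passing 0). sd_event_now() and USEC_PER_SEC are part of
 * the public API/time utilities. Wrapped in "#if 0" so it is not compiled
 * here. */
#if 0
static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
        /* "usec" is the CLOCK_MONOTONIC time the timer elapsed for. */
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

static int example_arm_timer(sd_event *e) {
        uint64_t now_usec;
        int r;

        r = sd_event_now(e, CLOCK_MONOTONIC, &now_usec);
        if (r < 0)
                return r;

        /* sd_event_add_time_relative() below wraps exactly this pattern,
         * plus the overflow check. */
        return sd_event_add_time(e, NULL, CLOCK_MONOTONIC,
                                 now_usec + 5 * USEC_PER_SEC, 0, on_timer, NULL);
}
#endif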

_public_ int sd_event_add_time_relative(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        usec_t t;
        int r;

        /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
         * checks for overflow. */

        r = sd_event_now(e, clock, &t);
        if (r < 0)
                return r;

        if (usec >= USEC_INFINITY - t)
                return -EOVERFLOW;

        return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
}

static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        sigset_t new_ss;
        bool block_it;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Let's make sure our special flag stays outside of the valid signal range */
        assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);

        if (sig & SD_EVENT_SIGNAL_PROCMASK) {
                sig &= ~SD_EVENT_SIGNAL_PROCMASK;
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                block_it = true;
        } else {
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                r = signal_is_blocked(sig);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;

                block_it = false;
        }

        if (!callback)
                callback = signal_exit_callback;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        if (block_it) {
                sigset_t old_ss;

                if (sigemptyset(&new_ss) < 0)
                        return -errno;

                if (sigaddset(&new_ss, sig) < 0)
                        return -errno;

                r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
                if (r != 0)
                        return -r;

                r = sigismember(&old_ss, sig);
                if (r < 0)
                        return -errno;

                s->signal.unblock = !r;
        } else
                s->signal.unblock = false;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                if (s->signal.unblock)
                        (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);

                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
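
/* Illustrative sketch (not part of the original file): handling SIGTERM via
 * sd_event_add_signal() above. ORing SD_EVENT_SIGNAL_PROCMASK into the signal
 * number asks the function to block the signal in the calling thread itself
 * (the "block_it" path above), instead of requiring the caller to have blocked
 * it already. Wrapped in "#if 0" so it is not compiled here. */
#if 0
static int on_sigterm(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        /* "si" carries the signalfd payload, e.g. si->ssi_pid of the sender. */
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

/* Usage: r = sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, on_sigterm, NULL); */
#endif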

static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static bool shall_use_pidfd(void) {
        /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
        return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
}

_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd and because we don't want to reap the child processes
                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
                 * take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
         * pin the PID, and make regular waitid() handling race-free. */

        if (shall_use_pidfd()) {
                s->child.pidfd = pidfd_open(pid, 0);
                if (s->child.pidfd < 0) {
                        /* Propagate errors unless the syscall is not supported or blocked */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
        } else
                s->child.pidfd = -1;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We have a pidfd and we only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We have no pidfd or we shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. */
        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}
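
/* Illustrative sketch (not part of the original file): watching a forked child
 * with sd_event_add_child() above. As the code documents, SIGCHLD must already
 * be blocked in all threads when the first child source is added, otherwise
 * -EBUSY is returned. Wrapped in "#if 0" so it is not compiled here. */
#if 0
static int on_child_exit(sd_event_source *s, const siginfo_t *si, void *userdata) {
        /* si->si_code is CLD_EXITED/CLD_KILLED/…, si->si_status the exit code or signal. */
        return sd_event_exit(sd_event_source_get_event(s), si->si_status);
}

static int example_watch_child(sd_event *e, pid_t child) {
        /* Assumes the caller did something like:
         *     sigset_t ss;
         *     sigemptyset(&ss);
         *     sigaddset(&ss, SIGCHLD);
         *     pthread_sigmask(SIG_BLOCK, &ss, NULL);
         * before fork()ing "child". */
        return sd_event_add_child(e, NULL, child, WEXITED, on_child_exit, NULL);
}
#endif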

_public_ int sd_event_add_child_pidfd(
                sd_event *e,
                sd_event_source **ret,
                int pidfd,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        pid_t pid;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pidfd >= 0, -EBADF);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        r = pidfd_get_pid(pidfd, &pid);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.pidfd = pidfd;
        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for WEXITED */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;
        } else {
                /* We shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

static int generic_exit_callback(sd_event_source *s, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_defer(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_DEFER);
        if (!s)
                return -ENOMEM;

        s->defer.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = source_set_pending(s, true);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
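
/* Illustrative sketch (not part of the original file): a defer source created
 * with sd_event_add_defer() above runs once on the next loop iteration, since
 * it is enabled SD_EVENT_ONESHOT and marked pending immediately. Wrapped in
 * "#if 0" so it is not compiled here. */
#if 0
static int on_deferred_work(sd_event_source *s, void *userdata) {
        /* Runs once, ordered against other pending sources by priority. */
        return 0;
}

/* Usage: r = sd_event_add_defer(e, NULL, on_deferred_work, NULL); */
#endif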

_public_ int sd_event_add_post(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_POST);
        if (!s)
                return -ENOMEM;

        s->post.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = set_ensure_put(&e->post_sources, NULL, s);
        if (r < 0)
                return r;
        assert(r > 0);

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_exit(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
        if (r < 0)
                return r;

        s = source_new(e, !ret, SOURCE_EXIT);
        if (!s)
                return -ENOMEM;

        s->exit.callback = callback;
        s->userdata = userdata;
        s->exit.prioq_index = PRIOQ_IDX_NULL;
        s->enabled = SD_EVENT_ONESHOT;

        r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
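
/* Illustrative sketch (not part of the original file): exit sources registered
 * with sd_event_add_exit() above are dispatched when the loop is asked to
 * terminate via sd_event_exit(), which makes them a natural place for cleanup
 * work. Wrapped in "#if 0" so it is not compiled here. */
#if 0
static int on_exit_cleanup(sd_event_source *s, void *userdata) {
        /* Runs during loop termination, after sd_event_exit() was called. */
        return 0;
}

/* Usage: r = sd_event_add_exit(e, NULL, on_exit_cleanup, NULL); */
#endif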

static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
        assert(e);

        if (!d)
                return;

        assert(hashmap_isempty(d->inodes));
        assert(hashmap_isempty(d->wd));

        if (d->buffer_filled > 0)
                LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);

        hashmap_free(d->inodes);
        hashmap_free(d->wd);

        assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);

        if (d->fd >= 0) {
                if (!event_pid_changed(e) &&
                    epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");

                safe_close(d->fd);
        }
        free(d);
}

static int event_make_inotify_data(
                sd_event *e,
                int64_t priority,
                struct inotify_data **ret) {

        _cleanup_close_ int fd = -1;
        struct inotify_data *d;
        int r;

        assert(e);

        d = hashmap_get(e->inotify_data, &priority);
        if (d) {
                if (ret)
                        *ret = d;
                return 0;
        }

        fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        d = new(struct inotify_data, 1);
        if (!d)
                return -ENOMEM;

        *d = (struct inotify_data) {
                .wakeup = WAKEUP_INOTIFY_DATA,
                .fd = TAKE_FD(fd),
                .priority = priority,
        };

        r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
        if (r < 0) {
                d->fd = safe_close(d->fd);
                free(d);
                return r;
        }

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
                                            * remove the fd from the epoll first, which we don't want as we couldn't
                                            * add it in the first place. */
                event_free_inotify_data(e, d);
                return r;
        }

        if (ret)
                *ret = d;

        return 1;
}

static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
        int r;

        assert(x);
        assert(y);

        r = CMP(x->dev, y->dev);
        if (r != 0)
                return r;

        return CMP(x->ino, y->ino);
}

static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
        assert(d);

        siphash24_compress(&d->dev, sizeof(d->dev), state);
        siphash24_compress(&d->ino, sizeof(d->ino), state);
}

DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
1858static void event_free_inode_data(
1859 sd_event *e,
1860 struct inode_data *d) {
1861
1862 assert(e);
1863
1864 if (!d)
1865 return;
1866
64903d18 1867 assert(!d->event_sources);
1868
1869 if (d->fd >= 0) {
ed828563 1870 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
1871 safe_close(d->fd);
1872 }
1873
1874 if (d->inotify_data) {
1875
1876 if (d->wd >= 0) {
fbae5090 1877 if (d->inotify_data->fd >= 0 && !event_pid_changed(e)) {
1878 /* So here's a problem: at the time this runs, the watch descriptor might already be
1879 * invalidated, because an IN_IGNORED event might have been queued right at the moment we
1880 * enter the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a
1881 * likely case to happen. */
1882
1883 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
1884 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
1885 }
1886
1887 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
1888 }
1889
1890 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
1891 }
1892
1893 free(d);
1894}
1895
1896static void event_gc_inotify_data(
1897 sd_event *e,
1898 struct inotify_data *d) {
1899
1900 assert(e);
1901
1902 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
1903 * any inode with it anymore, which in turn happens if no event source of this priority is interested
1904 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
1905 * (under the expectation that the GC is called again once the counter is decremented). */
1906
1907 if (!d)
1908 return;
1909
1910 if (!hashmap_isempty(d->inodes))
1911 return;
1912
1913 if (d->n_busy > 0)
1914 return;
1915
1916 event_free_inotify_data(e, d);
1917}
1918
1919static void event_gc_inode_data(
1920 sd_event *e,
1921 struct inode_data *d) {
1922
1923 struct inotify_data *inotify_data;
1924
1925 assert(e);
1926
1927 if (!d)
1928 return;
1929
64903d18 1930 if (d->event_sources)
1931 return;
1932
1933 inotify_data = d->inotify_data;
1934 event_free_inode_data(e, d);
1935
53baf2ef 1936 event_gc_inotify_data(e, inotify_data);
1937}
1938
1939static int event_make_inode_data(
1940 sd_event *e,
1941 struct inotify_data *inotify_data,
1942 dev_t dev,
1943 ino_t ino,
1944 struct inode_data **ret) {
1945
1946 struct inode_data *d, key;
1947 int r;
1948
1949 assert(e);
1950 assert(inotify_data);
1951
1952 key = (struct inode_data) {
1953 .ino = ino,
1954 .dev = dev,
1955 };
1956
1957 d = hashmap_get(inotify_data->inodes, &key);
1958 if (d) {
1959 if (ret)
1960 *ret = d;
1961
1962 return 0;
1963 }
1964
1965 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
1966 if (r < 0)
1967 return r;
1968
1969 d = new(struct inode_data, 1);
1970 if (!d)
1971 return -ENOMEM;
1972
1973 *d = (struct inode_data) {
1974 .dev = dev,
1975 .ino = ino,
1976 .wd = -1,
1977 .fd = -1,
1978 .inotify_data = inotify_data,
1979 };
1980
1981 r = hashmap_put(inotify_data->inodes, d, d);
1982 if (r < 0) {
1983 free(d);
1984 return r;
1985 }
1986
1987 if (ret)
1988 *ret = d;
1989
1990 return 1;
1991}
1992
1993static uint32_t inode_data_determine_mask(struct inode_data *d) {
1994 bool excl_unlink = true;
1995 uint32_t combined = 0;
1996
1997 assert(d);
1998
1999 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2000 * the IN_EXCL_UNLINK flag is ANDed instead.
2001 *
2002 * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled
2003 * or oneshot. That's because we cannot change the mask anymore once the watch has been created,
2004 * since the kernel has no API for that. Hence we need to subscribe to the maximum mask we might
2005 * ever be interested in, and suppress events we don't care for client-side. */
2006
2007 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2008
2009 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2010 excl_unlink = false;
2011
2012 combined |= s->inotify.mask;
2013 }
2014
2015 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2016}
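/* Worked example (illustrative): two hypothetical sources watching the same inode,
 * only one of them with IN_EXCL_UNLINK. Since that flag is ANDed rather than ORed,
 * the combined mask drops it, so the less restrictive source still sees events for
 * unlinked-but-open files:
 *
 *     source A: IN_MODIFY | IN_EXCL_UNLINK
 *     source B: IN_ATTRIB
 *     combined: IN_MODIFY | IN_ATTRIB
 */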
2017
2018static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2019 uint32_t combined_mask;
2020 int wd, r;
2021
2022 assert(d);
2023 assert(d->fd >= 0);
2024
2025 combined_mask = inode_data_determine_mask(d);
2026
2027 if (d->wd >= 0 && combined_mask == d->combined_mask)
2028 return 0;
2029
2030 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2031 if (r < 0)
2032 return r;
2033
2034 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2035 if (wd < 0)
2036 return -errno;
2037
2038 if (d->wd < 0) {
2039 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2040 if (r < 0) {
2041 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2042 return r;
2043 }
2044
2045 d->wd = wd;
2046
2047 } else if (d->wd != wd) {
2048
2049 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2050 (void) inotify_rm_watch(d->inotify_data->fd, wd); /* the wd lives on the inotify fd, not the inode fd */
2051 return -EINVAL;
2052 }
2053
2054 d->combined_mask = combined_mask;
2055 return 1;
2056}
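/* Note (a sketch of the kernel semantics relied on above): adding the same inode twice
 * on one inotify fd returns the same watch descriptor and merely replaces the mask,
 * which is why updating d->combined_mask in place works:
 *
 *     int wd1 = inotify_add_watch(ifd, "/tmp/x", IN_MODIFY);
 *     int wd2 = inotify_add_watch(ifd, "/tmp/x", IN_MODIFY|IN_ATTRIB);
 *     assert(wd1 == wd2);   // same inode, same wd, new mask
 */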
2057
2058static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2059 assert(s);
2060
2061 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2062}
2063
e67d738a 2064static int event_add_inotify_fd_internal(
2065 sd_event *e,
2066 sd_event_source **ret,
2067 int fd,
2068 bool donate,
2069 uint32_t mask,
2070 sd_event_inotify_handler_t callback,
2071 void *userdata) {
2072
2073 _cleanup_close_ int donated_fd = donate ? fd : -1;
2074 _cleanup_(source_freep) sd_event_source *s = NULL;
2075 struct inotify_data *inotify_data = NULL;
2076 struct inode_data *inode_data = NULL;
2077 struct stat st;
2078 int r;
2079
2080 assert_return(e, -EINVAL);
2081 assert_return(e = event_resolve(e), -ENOPKG);
e67d738a 2082 assert_return(fd >= 0, -EBADF);
2083 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2084 assert_return(!event_pid_changed(e), -ECHILD);
2085
2086 if (!callback)
2087 callback = inotify_exit_callback;
2088
2089 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2090 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD-type operations
2091 * for the caller, hence the caller cannot use the flag directly. */
2092 if (mask & IN_MASK_ADD)
2093 return -EINVAL;
2094
2095 if (fstat(fd, &st) < 0)
2096 return -errno;
2097
2098 s = source_new(e, !ret, SOURCE_INOTIFY);
2099 if (!s)
2100 return -ENOMEM;
2101
2102 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2103 s->inotify.mask = mask;
2104 s->inotify.callback = callback;
2105 s->userdata = userdata;
2106
2107 /* Allocate an inotify object for this priority, and an inode object within it */
2108 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2109 if (r < 0)
8c75fe17 2110 return r;
2111
2112 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
8c75fe17 2113 if (r < 0) {
e67d738a 2114 event_gc_inotify_data(e, inotify_data);
2115 return r;
2116 }
2117
2118 /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2119 * the event source, until then, for which we need the original inode. */
2120 if (inode_data->fd < 0) {
2121 if (donated_fd >= 0)
2122 inode_data->fd = TAKE_FD(donated_fd);
2123 else {
2124 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2125 if (inode_data->fd < 0) {
2126 r = -errno;
2127 event_gc_inode_data(e, inode_data);
2128 return r;
2129 }
2130 }
2131
ed828563 2132 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
2133 }
2134
2135 /* Link our event source to the inode data object */
2136 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2137 s->inotify.inode_data = inode_data;
2138
2139 /* Actually realize the watch now */
2140 r = inode_data_realize_watch(e, inode_data);
2141 if (r < 0)
8c75fe17 2142 return r;
97ef5391 2143
2144 if (ret)
2145 *ret = s;
8c75fe17 2146 TAKE_PTR(s);
2147
2148 return 0;
2149}
2150
2151_public_ int sd_event_add_inotify_fd(
2152 sd_event *e,
2153 sd_event_source **ret,
2154 int fd,
2155 uint32_t mask,
2156 sd_event_inotify_handler_t callback,
2157 void *userdata) {
2158
2159 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2160}
2161
2162_public_ int sd_event_add_inotify(
2163 sd_event *e,
2164 sd_event_source **ret,
2165 const char *path,
2166 uint32_t mask,
2167 sd_event_inotify_handler_t callback,
2168 void *userdata) {
2169
2091c779 2170 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2171 int fd, r;
2172
2173 assert_return(path, -EINVAL);
2174
2175 fd = open(path, O_PATH | O_CLOEXEC |
2176 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2177 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2178 if (fd < 0)
2179 return -errno;
2180
2181 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2182 if (r < 0)
2183 return r;
2184
2185 (void) sd_event_source_set_description(s, path);
2186
2187 if (ret)
2188 *ret = s;
2189
2190 return r;
2191}
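/* Usage sketch (illustrative; the handler name is hypothetical):
 *
 *     static int on_change(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
 *             if (ev->mask & IN_CLOSE_WRITE)
 *                     log_debug("File written and closed.");
 *             return 0;
 *     }
 *
 *     r = sd_event_add_inotify(e, NULL, "/etc/hosts", IN_CLOSE_WRITE, on_change, NULL);
 *     if (r < 0)
 *             return r;
 */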
2192
8301aa0b 2193static sd_event_source* event_source_free(sd_event_source *s) {
2194 if (!s)
2195 return NULL;
da7e457c 2196
2197 /* Here's a special hack: when we are called from a
2198 * dispatch handler we won't free the event source
2199 * immediately, but we will detach the fd from the
2200 * epoll. This way it is safe for the caller to unref
2201 * the event source and immediately close the fd, but
2202 * we still retain a valid event source object after
2203 * the callback. */
fd38203a 2204
76d04c3a 2205 if (s->dispatching)
8301aa0b 2206 source_disconnect(s);
76d04c3a 2207 else
8301aa0b 2208 source_free(s);
2209
2210 return NULL;
2211}
2212
2213DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2214
356779df 2215_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
f7f53e9e 2216 assert_return(s, -EINVAL);
f4b2933e 2217 assert_return(!event_pid_changed(s->event), -ECHILD);
f7f53e9e 2218
356779df 2219 return free_and_strdup(&s->description, description);
2220}
2221
356779df 2222_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
f7f53e9e 2223 assert_return(s, -EINVAL);
356779df 2224 assert_return(description, -EINVAL);
f4b2933e 2225 assert_return(!event_pid_changed(s->event), -ECHILD);
f7f53e9e 2226
2227 if (!s->description)
2228 return -ENXIO;
2229
356779df 2230 *description = s->description;
2231 return 0;
2232}
2233
adcc4ca3 2234_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
305f78bf 2235 assert_return(s, NULL);
2236
2237 return s->event;
2238}
2239
f7262a9f 2240_public_ int sd_event_source_get_pending(sd_event_source *s) {
305f78bf 2241 assert_return(s, -EINVAL);
6203e07a 2242 assert_return(s->type != SOURCE_EXIT, -EDOM);
da7e457c 2243 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2244 assert_return(!event_pid_changed(s->event), -ECHILD);
2245
2246 return s->pending;
2247}
2248
f7262a9f 2249_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2250 assert_return(s, -EINVAL);
2251 assert_return(s->type == SOURCE_IO, -EDOM);
2252 assert_return(!event_pid_changed(s->event), -ECHILD);
2253
2254 return s->io.fd;
2255}
2256
2257_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2258 int r;
2259
2260 assert_return(s, -EINVAL);
8ac43fee 2261 assert_return(fd >= 0, -EBADF);
2262 assert_return(s->type == SOURCE_IO, -EDOM);
2263 assert_return(!event_pid_changed(s->event), -ECHILD);
2264
2265 if (s->io.fd == fd)
2266 return 0;
2267
b6d5481b 2268 if (event_source_is_offline(s)) {
2269 s->io.fd = fd;
2270 s->io.registered = false;
2271 } else {
2272 int saved_fd;
2273
2274 saved_fd = s->io.fd;
2275 assert(s->io.registered);
2276
2277 s->io.fd = fd;
2278 s->io.registered = false;
2279
2280 r = source_io_register(s, s->enabled, s->io.events);
2281 if (r < 0) {
2282 s->io.fd = saved_fd;
2283 s->io.registered = true;
2284 return r;
2285 }
2286
5a795bff 2287 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2288 }
2289
2290 return 0;
2291}
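/* Usage sketch (illustrative; reconnect_socket() is a hypothetical helper): swapping
 * the fd of an I/O source after a reconnect, without recreating the source. If the
 * source is online this re-registers the new fd in epoll and drops the old one, as
 * implemented above:
 *
 *     int new_fd = reconnect_socket();
 *     if (new_fd >= 0) {
 *             r = sd_event_source_set_io_fd(s, new_fd);
 *             if (r < 0)
 *                     return r;
 *     }
 */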
2292
2293_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2294 assert_return(s, -EINVAL);
2295 assert_return(s->type == SOURCE_IO, -EDOM);
2296
2297 return s->io.owned;
2298}
2299
2300_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2301 assert_return(s, -EINVAL);
2302 assert_return(s->type == SOURCE_IO, -EDOM);
2303
2304 s->io.owned = own;
2305 return 0;
2306}
2307
f7262a9f 2308_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2309 assert_return(s, -EINVAL);
2310 assert_return(events, -EINVAL);
2311 assert_return(s->type == SOURCE_IO, -EDOM);
2312 assert_return(!event_pid_changed(s->event), -ECHILD);
2313
2314 *events = s->io.events;
2315 return 0;
2316}
2317
f7262a9f 2318_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2319 int r;
2320
2321 assert_return(s, -EINVAL);
2322 assert_return(s->type == SOURCE_IO, -EDOM);
2a16a986 2323 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
da7e457c 2324 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2325 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2326
2327 /* edge-triggered updates are never skipped, so we can reset edges */
2328 if (s->io.events == events && !(events & EPOLLET))
2329 return 0;
2330
2331 r = source_set_pending(s, false);
2332 if (r < 0)
2333 return r;
2334
b6d5481b 2335 if (event_source_is_online(s)) {
e4715127 2336 r = source_io_register(s, s->enabled, events);
2337 if (r < 0)
2338 return r;
2339 }
2340
2341 s->io.events = events;
2342
2343 return 0;
2344}
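/* Usage sketch (illustrative): additionally waiting for writability once an output
 * buffer is non-empty, the usual pattern with non-blocking sockets:
 *
 *     r = sd_event_source_set_io_events(s, EPOLLIN|EPOLLOUT);
 *     if (r < 0)
 *             return r;
 *
 * Note the EPOLLET handling above: updates that include EPOLLET are never
 * short-circuited, since edge-triggered registrations must be refreshed.
 */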
2345
f7262a9f 2346_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2347 assert_return(s, -EINVAL);
2348 assert_return(revents, -EINVAL);
2349 assert_return(s->type == SOURCE_IO, -EDOM);
2350 assert_return(s->pending, -ENODATA);
2351 assert_return(!event_pid_changed(s->event), -ECHILD);
2352
2353 *revents = s->io.revents;
2354 return 0;
2355}
2356
f7262a9f 2357_public_ int sd_event_source_get_signal(sd_event_source *s) {
2358 assert_return(s, -EINVAL);
2359 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2360 assert_return(!event_pid_changed(s->event), -ECHILD);
2361
2362 return s->signal.sig;
2363}
2364
31927c16 2365_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2366 assert_return(s, -EINVAL);
2367 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2368
2369 *priority = s->priority;
2370 return 0;
2371}
2372
31927c16 2373_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2374 bool rm_inotify = false, rm_inode = false;
2375 struct inotify_data *new_inotify_data = NULL;
2376 struct inode_data *new_inode_data = NULL;
2377 int r;
2378
305f78bf 2379 assert_return(s, -EINVAL);
da7e457c 2380 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2381 assert_return(!event_pid_changed(s->event), -ECHILD);
2382
2383 if (s->priority == priority)
2384 return 0;
2385
2386 if (s->type == SOURCE_INOTIFY) {
2387 struct inode_data *old_inode_data;
2388
2389 assert(s->inotify.inode_data);
2390 old_inode_data = s->inotify.inode_data;
2391
2392 /* We need the original fd to change the priority. If we don't have it, we can't change the
2393 * priority anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2394 * events we allow priority changes only until the first following iteration. */
2395 if (old_inode_data->fd < 0)
2396 return -EOPNOTSUPP;
2397
2398 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2399 if (r < 0)
2400 return r;
2401 rm_inotify = r > 0;
2402
2403 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2404 if (r < 0)
2405 goto fail;
2406 rm_inode = r > 0;
2407
2408 if (new_inode_data->fd < 0) {
2409 /* Duplicate the fd for the new inode object if we don't have any yet */
2410 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2411 if (new_inode_data->fd < 0) {
2412 r = -errno;
2413 goto fail;
2414 }
2415
ed828563 2416 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
2417 }
2418
2419 /* Move the event source to the new inode data structure */
2420 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2421 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2422 s->inotify.inode_data = new_inode_data;
2423
2424 /* Now create the new watch */
2425 r = inode_data_realize_watch(s->event, new_inode_data);
2426 if (r < 0) {
2427 /* Move it back */
2428 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2429 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2430 s->inotify.inode_data = old_inode_data;
2431 goto fail;
2432 }
2433
2434 s->priority = priority;
2435
2436 event_gc_inode_data(s->event, old_inode_data);
2437
b6d5481b 2438 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2439 struct signal_data *old, *d;
2440
2441 /* Move us from the signalfd belonging to the old
2442 * priority to the signalfd of the new priority */
2443
2444 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2445
2446 s->priority = priority;
2447
2448 r = event_make_signal_data(s->event, s->signal.sig, &d);
2449 if (r < 0) {
2450 s->priority = old->priority;
2451 return r;
2452 }
2453
2454 event_unmask_signal_data(s->event, old, s->signal.sig);
2455 } else
2456 s->priority = priority;
fd38203a 2457
e1951c16 2458 event_source_pp_prioq_reshuffle(s);
fd38203a 2459
2460 if (s->type == SOURCE_EXIT)
2461 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
305f78bf 2462
fd38203a 2463 return 0;
2464
2465fail:
2466 if (rm_inode)
2467 event_free_inode_data(s->event, new_inode_data);
2468
2469 if (rm_inotify)
2470 event_free_inotify_data(s->event, new_inotify_data);
2471
2472 return r;
2473}
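/* Usage sketch (illustrative): smaller priority values dispatch first, so an "idle"
 * source can be pushed behind all default-priority work:
 *
 *     r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE);
 *     if (r < 0)
 *             return r;
 *
 * For inotify sources this may fail with -EOPNOTSUPP after the first event loop
 * iteration, as explained above.
 */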
2474
cad143a8 2475_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2476 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2477 if (!s && !ret)
2478 return false;
2479
305f78bf 2480 assert_return(s, -EINVAL);
305f78bf 2481 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2482
2483 if (ret)
2484 *ret = s->enabled;
2485
08c1eb0e 2486 return s->enabled != SD_EVENT_OFF;
2487}
2488
2489static int event_source_offline(
2490 sd_event_source *s,
2491 int enabled,
2492 bool ratelimited) {
2493
2494 bool was_offline;
2495 int r;
2496
ddfde737 2497 assert(s);
b6d5481b 2498 assert(enabled == SD_EVENT_OFF || ratelimited);
fd38203a 2499
ddfde737 2500 /* Unset the pending flag when this event source is disabled */
2501 if (s->enabled != SD_EVENT_OFF &&
2502 enabled == SD_EVENT_OFF &&
2503 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2504 r = source_set_pending(s, false);
2505 if (r < 0)
2506 return r;
2507 }
cc567911 2508
2509 was_offline = event_source_is_offline(s);
2510 s->enabled = enabled;
2511 s->ratelimited = ratelimited;
fd38203a 2512
ddfde737 2513 switch (s->type) {
fd38203a 2514
2515 case SOURCE_IO:
2516 source_io_unregister(s);
2517 break;
ac989a78 2518
2519 case SOURCE_SIGNAL:
2520 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2521 break;
fd38203a 2522
ddfde737 2523 case SOURCE_CHILD:
2524 if (!was_offline) {
2525 assert(s->event->n_online_child_sources > 0);
2526 s->event->n_online_child_sources--;
2527 }
fd38203a 2528
2529 if (EVENT_SOURCE_WATCH_PIDFD(s))
2530 source_child_pidfd_unregister(s);
2531 else
2532 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2533 break;
4807d2d0 2534
2535 case SOURCE_EXIT:
2536 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2537 break;
fd38203a 2538
2539 case SOURCE_TIME_REALTIME:
2540 case SOURCE_TIME_BOOTTIME:
2541 case SOURCE_TIME_MONOTONIC:
2542 case SOURCE_TIME_REALTIME_ALARM:
2543 case SOURCE_TIME_BOOTTIME_ALARM:
2544 case SOURCE_DEFER:
2545 case SOURCE_POST:
2546 case SOURCE_INOTIFY:
2547 break;
fd38203a 2548
ddfde737 2549 default:
04499a70 2550 assert_not_reached();
ddfde737 2551 }
fd38203a 2552
2553 /* Always reshuffle the time prioq, as the ratelimited flag may have changed. */
2554 event_source_time_prioq_reshuffle(s);
2555
b6d5481b 2556 return 1;
ddfde737 2557}
f8f3f926 2558
2559static int event_source_online(
2560 sd_event_source *s,
2561 int enabled,
2562 bool ratelimited) {
2563
2564 bool was_online;
ddfde737 2565 int r;
fd38203a 2566
ddfde737 2567 assert(s);
b6d5481b 2568 assert(enabled != SD_EVENT_OFF || !ratelimited);
305f78bf 2569
ddfde737 2570 /* Unset the pending flag when this event source is enabled */
2571 if (s->enabled == SD_EVENT_OFF &&
2572 enabled != SD_EVENT_OFF &&
2573 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2574 r = source_set_pending(s, false);
2575 if (r < 0)
2576 return r;
2577 }
9d3e3aa5 2578
2579 /* Are we really ready for onlining? */
2580 if (enabled == SD_EVENT_OFF || ratelimited) {
2581 /* Nope, we are not ready for onlining, then just update the precise state and exit */
2582 s->enabled = enabled;
2583 s->ratelimited = ratelimited;
2584 return 0;
2585 }
2586
2587 was_online = event_source_is_online(s);
2588
ddfde737 2589 switch (s->type) {
ddfde737 2590 case SOURCE_IO:
b6d5481b 2591 r = source_io_register(s, enabled, s->io.events);
d2eafe61 2592 if (r < 0)
ddfde737 2593 return r;
ddfde737 2594 break;
fd38203a 2595
2596 case SOURCE_SIGNAL:
2597 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2598 if (r < 0) {
2599 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2600 return r;
2601 }
fd38203a 2602
ddfde737 2603 break;
fd38203a 2604
LP
2606 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2607 /* yes, we have pidfd */
9da4cb2b 2608
b6d5481b 2609 r = source_child_pidfd_register(s, enabled);
ac9f2640 2610 if (r < 0)
9da4cb2b 2611 return r;
2612 } else {
2613 /* no pidfd, or something other to watch for than WEXITED */
9da4cb2b 2614
2615 r = event_make_signal_data(s->event, SIGCHLD, NULL);
2616 if (r < 0) {
2617 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2618 return r;
2619 }
2620 }
fd38203a 2621
2622 if (!was_online)
2623 s->event->n_online_child_sources++;
ddfde737 2624 break;
4807d2d0 2625
2626 case SOURCE_TIME_REALTIME:
2627 case SOURCE_TIME_BOOTTIME:
2628 case SOURCE_TIME_MONOTONIC:
2629 case SOURCE_TIME_REALTIME_ALARM:
2630 case SOURCE_TIME_BOOTTIME_ALARM:
ddfde737 2631 case SOURCE_EXIT:
2632 case SOURCE_DEFER:
2633 case SOURCE_POST:
2634 case SOURCE_INOTIFY:
2635 break;
9da4cb2b 2636
ddfde737 2637 default:
04499a70 2638 assert_not_reached();
ddfde737 2639 }
f8f3f926 2640
2641 s->enabled = enabled;
2642 s->ratelimited = ratelimited;
2643
2644 /* Non-failing operations below */
2115b9b6 2645 if (s->type == SOURCE_EXIT)
d2eafe61 2646 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
d2eafe61 2647
2648 /* Always reshuffle the time prioq, as the ratelimited flag may have changed. */
2649 event_source_time_prioq_reshuffle(s);
d2eafe61 2650
b6d5481b 2651 return 1;
2652}
2653
2654_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
2655 int r;
9da4cb2b 2656
ddfde737 2657 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
2658
2659 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
2660 if (m == SD_EVENT_OFF && !s)
2661 return 0;
2662
2663 assert_return(s, -EINVAL);
ddfde737 2664 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2665
2666 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
2667 if (s->event->state == SD_EVENT_FINISHED)
2668 return m == SD_EVENT_OFF ? 0 : -ESTALE;
305f78bf 2669
2670 if (s->enabled == m) /* No change? */
2671 return 0;
9d3e3aa5 2672
ddfde737 2673 if (m == SD_EVENT_OFF)
b6d5481b 2674 r = event_source_offline(s, m, s->ratelimited);
2675 else {
2676 if (s->enabled != SD_EVENT_OFF) {
2677 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
2678 * event source is already enabled after all. */
2679 s->enabled = m;
2680 return 0;
fd38203a 2681 }
ddfde737 2682
b6d5481b 2683 r = event_source_online(s, m, s->ratelimited);
fd38203a 2684 }
2685 if (r < 0)
2686 return r;
fd38203a 2687
e1951c16 2688 event_source_pp_prioq_reshuffle(s);
2689 return 0;
2690}
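/* Usage sketch (illustrative; on_io() is hypothetical): a ONESHOT source goes offline
 * after dispatching once; the callback may re-arm it explicitly:
 *
 *     static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             // ... consume the event ...
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
 *     }
 */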
2691
f7262a9f 2692_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
2693 assert_return(s, -EINVAL);
2694 assert_return(usec, -EINVAL);
6a0f1f6d 2695 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
305f78bf 2696 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2697
2698 *usec = s->time.next;
2699 return 0;
2700}
2701
f7262a9f 2702_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
2a0dc6cd 2703 int r;
6a0f1f6d 2704
305f78bf 2705 assert_return(s, -EINVAL);
6a0f1f6d 2706 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 2707 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2708 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2709
2710 r = source_set_pending(s, false);
2711 if (r < 0)
2712 return r;
2576a19e 2713
2a0dc6cd 2714 s->time.next = usec;
fd38203a 2715
e1951c16 2716 event_source_time_prioq_reshuffle(s);
2717 return 0;
2718}
2719
2720_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
2721 usec_t t;
2722 int r;
2723
2724 assert_return(s, -EINVAL);
2725 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2726
2727 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
2728 if (r < 0)
2729 return r;
2730
2731 usec = usec_add(t, usec);
2732 if (usec == USEC_INFINITY)
2733 return -EOVERFLOW;
2734
496db330 2735 return sd_event_source_set_time(s, usec);
2736}
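/* Usage sketch (illustrative): re-arming a timer source 5s from now on its own clock,
 * then enabling it for a single expiry:
 *
 *     r = sd_event_source_set_time_relative(s, 5 * USEC_PER_SEC);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
 *     if (r < 0)
 *             return r;
 */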
2737
f7262a9f 2738_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
2739 assert_return(s, -EINVAL);
2740 assert_return(usec, -EINVAL);
6a0f1f6d 2741 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2742 assert_return(!event_pid_changed(s->event), -ECHILD);
2743
2744 *usec = s->time.accuracy;
2745 return 0;
2746}
2747
f7262a9f 2748_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
2a0dc6cd 2749 int r;
6a0f1f6d 2750
305f78bf 2751 assert_return(s, -EINVAL);
f5fbe71d 2752 assert_return(usec != UINT64_MAX, -EINVAL);
6a0f1f6d 2753 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 2754 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2755 assert_return(!event_pid_changed(s->event), -ECHILD);
eaa3cbef 2756
2757 r = source_set_pending(s, false);
2758 if (r < 0)
2759 return r;
2760
2761 if (usec == 0)
2762 usec = DEFAULT_ACCURACY_USEC;
2763
2764 s->time.accuracy = usec;
2765
e1951c16 2766 event_source_time_prioq_reshuffle(s);
2767 return 0;
2768}
2769
2770_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
2771 assert_return(s, -EINVAL);
2772 assert_return(clock, -EINVAL);
2773 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2774 assert_return(!event_pid_changed(s->event), -ECHILD);
eaa3cbef 2775
6a0f1f6d 2776 *clock = event_source_type_to_clock(s->type);
2777 return 0;
2778}
2779
f7262a9f 2780_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
2781 assert_return(s, -EINVAL);
2782 assert_return(pid, -EINVAL);
2783 assert_return(s->type == SOURCE_CHILD, -EDOM);
2784 assert_return(!event_pid_changed(s->event), -ECHILD);
2785
2786 *pid = s->child.pid;
2787 return 0;
2788}
2789
2790_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
2791 assert_return(s, -EINVAL);
2792 assert_return(s->type == SOURCE_CHILD, -EDOM);
2793 assert_return(!event_pid_changed(s->event), -ECHILD);
2794
2795 if (s->child.pidfd < 0)
2796 return -EOPNOTSUPP;
2797
2798 return s->child.pidfd;
2799}
2800
2801_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
2802 assert_return(s, -EINVAL);
2803 assert_return(s->type == SOURCE_CHILD, -EDOM);
2804 assert_return(!event_pid_changed(s->event), -ECHILD);
2805 assert_return(SIGNAL_VALID(sig), -EINVAL);
2806
2807 /* If we already have seen indication the process exited refuse sending a signal early. This way we
2808 * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
2809 * available. */
2810 if (s->child.exited)
2811 return -ESRCH;
2812
2813 if (s->child.pidfd >= 0) {
2814 siginfo_t copy;
2815
2816 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
2817 * structure here */
2818 if (si)
2819 copy = *si;
2820
2821 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
2822 /* Let's propagate the error only if the system call is not implemented or prohibited */
2823 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
2824 return -errno;
2825 } else
2826 return 0;
2827 }
2828
2829 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
2830 * this here. */
2831 if (flags != 0)
2832 return -EOPNOTSUPP;
2833
2834 if (si) {
2835 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
2836 siginfo_t copy = *si;
2837
2838 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
2839 return -errno;
2840 } else if (kill(s->child.pid, sig) < 0)
2841 return -errno;
2842
2843 return 0;
2844}
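/* Usage sketch (illustrative): asking a watched child to terminate. With a pidfd this
 * is race-free against PID reuse; without one, the exited check above at least refuses
 * to signal once the exit has been observed:
 *
 *     r = sd_event_source_send_child_signal(s, SIGTERM, NULL, 0);
 *     if (r == -ESRCH)
 *             log_debug("Child is already gone.");
 *     else if (r < 0)
 *             return r;
 */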
2845
2846_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
2847 assert_return(s, -EINVAL);
2848 assert_return(s->type == SOURCE_CHILD, -EDOM);
2849
2850 if (s->child.pidfd < 0)
2851 return -EOPNOTSUPP;
2852
2853 return s->child.pidfd_owned;
2854}
2855
2856_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
2857 assert_return(s, -EINVAL);
2858 assert_return(s->type == SOURCE_CHILD, -EDOM);
2859
2860 if (s->child.pidfd < 0)
2861 return -EOPNOTSUPP;
2862
2863 s->child.pidfd_owned = own;
2864 return 0;
2865}
2866
2867_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
2868 assert_return(s, -EINVAL);
2869 assert_return(s->type == SOURCE_CHILD, -EDOM);
2870
2871 return s->child.process_owned;
2872}
2873
2874_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
2875 assert_return(s, -EINVAL);
2876 assert_return(s->type == SOURCE_CHILD, -EDOM);
2877
2878 s->child.process_owned = own;
2879 return 0;
2880}
2881
2882_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
2883 assert_return(s, -EINVAL);
2884 assert_return(mask, -EINVAL);
2885 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
2886 assert_return(!event_pid_changed(s->event), -ECHILD);
2887
2888 *mask = s->inotify.mask;
2889 return 0;
2890}
2891
718db961 2892_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
2893 int r;
2894
da7e457c 2895 assert_return(s, -EINVAL);
6203e07a 2896 assert_return(s->type != SOURCE_EXIT, -EDOM);
2897 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2898 assert_return(!event_pid_changed(s->event), -ECHILD);
2899
2900 if (s->prepare == callback)
2901 return 0;
2902
2903 if (callback && s->prepare) {
2904 s->prepare = callback;
2905 return 0;
2906 }
2907
2908 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
2909 if (r < 0)
2910 return r;
2911
2912 s->prepare = callback;
2913
2914 if (callback) {
2915 r = prioq_put(s->event->prepare, s, &s->prepare_index);
2916 if (r < 0)
2917 return r;
2918 } else
2919 prioq_remove(s->event->prepare, s, &s->prepare_index);
2920
2921 return 0;
2922}
2923
f7262a9f 2924_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
da7e457c 2925 assert_return(s, NULL);
2926
2927 return s->userdata;
2928}
2929
2930_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
2931 void *ret;
2932
2933 assert_return(s, NULL);
2934
2935 ret = s->userdata;
2936 s->userdata = userdata;
2937
2938 return ret;
2939}
2940
2941static int event_source_enter_ratelimited(sd_event_source *s) {
2942 int r;
2943
2944 assert(s);
2945
2946 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
2947 * the end of the rate limit time window, much as if it was a timer event source. */
2948
2949 if (s->ratelimited)
2950 return 0; /* Already ratelimited, this is a NOP hence */
2951
2952 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
2953 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
2954 if (r < 0)
2955 return r;
2956
2957 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
2958 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
2959 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
2960 if (EVENT_SOURCE_IS_TIME(s->type))
2961 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
2962
2963 /* Now, let's add the event source to the monotonic clock instead */
2964 r = event_source_time_prioq_put(s, &s->event->monotonic);
2965 if (r < 0)
2966 goto fail;
2967
2968 /* And let's take the event source officially offline */
2969 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
2970 if (r < 0) {
2971 event_source_time_prioq_remove(s, &s->event->monotonic);
2972 goto fail;
2973 }
2974
2975 event_source_pp_prioq_reshuffle(s);
2976
2977 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
2978 return 0;
2979
2980fail:
2981 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
2982 * space for it should already be allocated. */
2983 if (EVENT_SOURCE_IS_TIME(s->type))
2984 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
2985
2986 return r;
2987}
2988
fd69f224 2989static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
2990 int r;
2991
2992 assert(s);
2993
2994 if (!s->ratelimited)
2995 return 0;
2996
2997 /* Let's take the event source out of the monotonic prioq first. */
2998 event_source_time_prioq_remove(s, &s->event->monotonic);
2999
3000 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3001 if (EVENT_SOURCE_IS_TIME(s->type)) {
3002 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3003 if (r < 0)
3004 goto fail;
3005 }
3006
3007 /* Let's try to take it online again. */
3008 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3009 if (r < 0) {
3010 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3011 if (EVENT_SOURCE_IS_TIME(s->type))
3012 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3013
3014 goto fail;
3015 }
3016
3017 event_source_pp_prioq_reshuffle(s);
3018 ratelimit_reset(&s->rate_limit);
3019
3020 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3021
3022 if (run_callback && s->ratelimit_expire_callback) {
3023 s->dispatching = true;
3024 r = s->ratelimit_expire_callback(s, s->userdata);
3025 s->dispatching = false;
3026
3027 if (r < 0) {
3028 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3029 strna(s->description),
3030 event_source_type_to_string(s->type),
3031 s->exit_on_failure ? "exiting" : "disabling");
3032
3033 if (s->exit_on_failure)
3034 (void) sd_event_exit(s->event, r);
3035 }
3036
3037 if (s->n_ref == 0)
3038 source_free(s);
3039 else if (r < 0)
0a040e64 3040 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3041
3042 return 1;
3043 }
3044
3045 return 0;
3046
3047fail:
3048 /* Do something somewhat reasonable when we cannot move an event sources out of ratelimited mode:
3049 * simply put it back in it, maybe we can then process it more successfully next iteration. */
3050 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3051
3052 return r;
3053}
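/* Usage sketch (illustrative): capping a chatty I/O source at 10 dispatches per second
 * via the public rate limit API; once the limit trips, the source is taken offline and
 * brought back by the monotonic timer logic above:
 *
 *     r = sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10);
 *     if (r < 0)
 *             return r;
 */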
3054
3055static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3056 usec_t c;
3057 assert(e);
3058 assert(a <= b);
3059
3060 if (a <= 0)
3061 return 0;
3062 if (a >= USEC_INFINITY)
3063 return USEC_INFINITY;
3064
3065 if (b <= a + 1)
3066 return a;
3067
3068 initialize_perturb(e);
3069
3070 /*
3071 Find a good time to wake up again between times a and b. We
3072 have two goals here:
3073
3074 a) We want to wake up as seldom as possible, hence prefer
3075 later times over earlier times.
3076
3077 b) But if we have to wake up, then let's make sure to
3078 dispatch as much as possible on the entire system.
3079
3080 We implement this by waking up everywhere at the same time
850516e0 3081 within any given minute if we can, synchronised via the
c2ba3ad6 3082 perturbation value determined from the boot ID. If we can't,
3083 then we try to find the same spot in every 10s, then 1s and
3084 then 250ms step. Otherwise, we pick the last possible time
3085 to wake up.
3086 */
3087
3088 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3089 if (c >= b) {
3090 if (_unlikely_(c < USEC_PER_MINUTE))
3091 return b;
3092
3093 c -= USEC_PER_MINUTE;
3094 }
3095
3096 if (c >= a)
3097 return c;
3098
3099 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3100 if (c >= b) {
3101 if (_unlikely_(c < USEC_PER_SEC*10))
3102 return b;
3103
3104 c -= USEC_PER_SEC*10;
3105 }
3106
3107 if (c >= a)
3108 return c;
3109
3110 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3111 if (c >= b) {
3112 if (_unlikely_(c < USEC_PER_SEC))
3113 return b;
3114
3115 c -= USEC_PER_SEC;
3116 }
3117
3118 if (c >= a)
3119 return c;
3120
3121 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3122 if (c >= b) {
3123 if (_unlikely_(c < USEC_PER_MSEC*250))
3124 return b;
3125
3126 c -= USEC_PER_MSEC*250;
3127 }
3128
3129 if (c >= a)
3130 return c;
3131
3132 return b;
3133}
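/* Worked example (illustrative numbers): with a = 12.3s, b = 70.0s and perturb = 0.2s,
 * the minute-granularity candidate is
 *
 *     c = (b / 60s) * 60s + perturb = 60.0s + 0.2s = 60.2s
 *
 * which falls inside [a, b), so 60.2s is returned and every loop sharing this boot ID
 * wakes at the same spot; only if c fell outside the window would the 10s, 1s and
 * 250ms fallbacks be tried. */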
3134
3135static int event_arm_timer(
3136 sd_event *e,
6a0f1f6d 3137 struct clock_data *d) {
3138
3139 struct itimerspec its = {};
3140 sd_event_source *a, *b;
3141 usec_t t;
fd38203a 3142
cde93897 3143 assert(e);
6a0f1f6d 3144 assert(d);
fd38203a 3145
d06441da 3146 if (!d->needs_rearm)
212bbb17 3147 return 0;
3148
3149 d->needs_rearm = false;
212bbb17 3150
6a0f1f6d 3151 a = prioq_peek(d->earliest);
19947509 3152 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
b6d5481b 3153 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
72aedc1e 3154
6a0f1f6d 3155 if (d->fd < 0)
3156 return 0;
3157
3a43da28 3158 if (d->next == USEC_INFINITY)
3159 return 0;
3160
3161 /* disarm */
3162 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3163 return -errno;
72aedc1e 3164
3a43da28 3165 d->next = USEC_INFINITY;
fd38203a 3166 return 0;
72aedc1e 3167 }
fd38203a 3168
6a0f1f6d 3169 b = prioq_peek(d->latest);
3170 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3171 assert(b && b->enabled != SD_EVENT_OFF);
c2ba3ad6 3172
b6d5481b 3173 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
6a0f1f6d 3174 if (d->next == t)
3175 return 0;
3176
6a0f1f6d 3177 assert_se(d->fd >= 0);
fd38203a 3178
c2ba3ad6 3179 if (t == 0) {
3180 /* We don't want to disarm here, we just want to indicate some time looooong ago. */
3181 its.it_value.tv_sec = 0;
3182 its.it_value.tv_nsec = 1;
3183 } else
c2ba3ad6 3184 timespec_store(&its.it_value, t);
fd38203a 3185
15c689d7 3186 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
cde93897 3187 return -errno;
fd38203a 3188
6a0f1f6d 3189 d->next = t;
3190 return 0;
3191}
3192
9a800b56 3193static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3194 assert(e);
3195 assert(s);
3196 assert(s->type == SOURCE_IO);
3197
3198 /* If the event source was already pending, we just OR in the
3199 * new revents, otherwise we reset the value. The ORing is
3200 * necessary to handle EPOLLONESHOT events properly where
3201 * readability might happen independently of writability, and
3202 * we need to keep track of both */
3203
3204 if (s->pending)
3205 s->io.revents |= revents;
3206 else
3207 s->io.revents = revents;
fd38203a 3208
3209 return source_set_pending(s, true);
3210}
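/* Illustration (not part of this file): with EPOLLONESHOT, readability and writability
 * may be reported in separate wakeups before the source gets dispatched; ORing
 * preserves both:
 *
 *     s->io.revents = EPOLLIN;      // first wakeup, source becomes pending
 *     s->io.revents |= EPOLLOUT;    // second wakeup, still pending
 *     // the dispatch callback then sees EPOLLIN|EPOLLOUT
 */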
3211
72aedc1e 3212static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3213 uint64_t x;
3214 ssize_t ss;
3215
3216 assert(e);
da7e457c 3217 assert(fd >= 0);
72aedc1e 3218
305f78bf 3219 assert_return(events == EPOLLIN, -EIO);
3220
3221 ss = read(fd, &x, sizeof(x));
3222 if (ss < 0) {
8add30a0 3223 if (ERRNO_IS_TRANSIENT(errno))
3224 return 0;
3225
3226 return -errno;
3227 }
3228
8d35dae7 3229 if (_unlikely_(ss != sizeof(x)))
3230 return -EIO;
3231
cde93897 3232 if (next)
3a43da28 3233 *next = USEC_INFINITY;
72aedc1e 3234
3235 return 0;
3236}
3237
3238static int process_timer(
3239 sd_event *e,
3240 usec_t n,
6a0f1f6d 3241 struct clock_data *d) {
305f78bf 3242
fd38203a 3243 sd_event_source *s;
fd69f224 3244 bool callback_invoked = false;
3245 int r;
3246
3247 assert(e);
6a0f1f6d 3248 assert(d);
3249
3250 for (;;) {
6a0f1f6d 3251 s = prioq_peek(d->earliest);
3252 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3253
3254 if (!s || time_event_source_next(s) > n)
3255 break;
3256
3257 if (s->ratelimited) {
3258 /* This is an event source whose ratelimit window has ended. Let's turn it on
3259 * again. */
3260 assert(s->ratelimited);
3261
fd69f224 3262 r = event_source_leave_ratelimit(s, /* run_callback */ true);
3263 if (r < 0)
3264 return r;
3265 else if (r == 1)
3266 callback_invoked = true;
3267
3268 continue;
3269 }
3270
3271 if (s->enabled == SD_EVENT_OFF || s->pending)
3272 break;
3273
3274 r = source_set_pending(s, true);
3275 if (r < 0)
3276 return r;
3277
e1951c16 3278 event_source_time_prioq_reshuffle(s);
3279 }
3280
fd69f224 3281 return callback_invoked;
3282}
3283
3284static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3285 int64_t min_priority = threshold;
3286 bool something_new = false;
fd38203a 3287 sd_event_source *s;
3288 int r;
3289
3290 assert(e);
3291 assert(ret_min_priority);
3292
3293 if (!e->need_process_child) {
3294 *ret_min_priority = min_priority;
3295 return 0;
3296 }
fd38203a 3297
3298 e->need_process_child = false;
3299
3300 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3301 * for, instead of using P_ALL. This is because we only want to get child information of very
3302 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3303 * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3304 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3305 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3306 * to handle SIGCHLD yourself.
3307 *
3308 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3309 * source is dispatched so that the callback still sees the process as a zombie. */
fd38203a 3310
90e74a66 3311 HASHMAP_FOREACH(s, e->child_sources) {
3312 assert(s->type == SOURCE_CHILD);
3313
3314 if (s->priority > threshold)
3315 continue;
3316
3317 if (s->pending)
3318 continue;
3319
b6d5481b 3320 if (event_source_is_offline(s))
3321 continue;
3322
3323 if (s->child.exited)
3324 continue;
3325
3326 if (EVENT_SOURCE_WATCH_PIDFD(s))
3327 /* There's a usable pidfd known for this event source? Then don't waitid() for
3328 * it here */
3329 continue;
3330
fd38203a 3331 zero(s->child.siginfo);
3332 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3333 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
bfd9bfcc 3334 return negative_errno();
3335
3336 if (s->child.siginfo.si_pid != 0) {
945c2931 3337 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
08cd1552 3338
3339 if (zombie)
3340 s->child.exited = true;
3341
08cd1552 3342 if (!zombie && (s->child.options & WEXITED)) {
3343 /* If the child isn't dead then let's immediately remove the state
3344 * change from the queue, since there's no benefit in leaving it
3345 * queued. */
3346
3347 assert(s->child.options & (WSTOPPED|WCONTINUED));
a5d27871 3348 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3349 }
3350
3351 r = source_set_pending(s, true);
3352 if (r < 0)
3353 return r;
3354 if (r > 0) {
3355 something_new = true;
3356 min_priority = MIN(min_priority, s->priority);
3357 }
3358 }
3359 }
3360
3361 *ret_min_priority = min_priority;
3362 return something_new;
3363}
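/* Illustration (not part of this file): WNOWAIT peeks at the child state without
 * reaping, so the process stays a zombie until the callback has seen the siginfo and
 * the dispatcher reaps it for good:
 *
 *     siginfo_t si = {};
 *     waitid(P_PID, pid, &si, WEXITED|WNOHANG|WNOWAIT);   // child still a zombie
 *     // ... dispatch callback inspects si ...
 *     waitid(P_PID, pid, &si, WEXITED|WNOHANG);           // now actually reaped
 */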
3364
3365static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3366 assert(e);
3367 assert(s);
3368 assert(s->type == SOURCE_CHILD);
3369
3370 if (s->pending)
3371 return 0;
3372
b6d5481b 3373 if (event_source_is_offline(s))
3374 return 0;
3375
3376 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3377 return 0;
3378
3379 zero(s->child.siginfo);
3380 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3381 return -errno;
3382
3383 if (s->child.siginfo.si_pid == 0)
3384 return 0;
3385
3386 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3387 s->child.exited = true;
3388
3389 return source_set_pending(s, true);
3390}
3391
efd3be9d 3392static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3393 int r;
3394
da7e457c 3395 assert(e);
97ef5391 3396 assert(d);
305f78bf 3397 assert_return(events == EPOLLIN, -EIO);
efd3be9d 3398 assert(min_priority);
fd38203a 3399
3400 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3401 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3402 * per priority, and if we dequeue one, and SIGCHLD might be enqueued later we wouldn't know,
3403 * but we might have higher priority children we care about hence we need to check that
3404 * explicitly. */
3405
3406 if (sigismember(&d->sigset, SIGCHLD))
3407 e->need_process_child = true;
3408
91c70071 3409 /* If there's already an event source pending for this priority we don't read another */
3410 if (d->current)
3411 return 0;
3412
fd38203a 3413 for (;;) {
0eb2e0e3 3414 struct signalfd_siginfo si;
7057bd99 3415 ssize_t n;
92daebc0 3416 sd_event_source *s = NULL;
fd38203a 3417
9da4cb2b 3418 n = read(d->fd, &si, sizeof(si));
7057bd99 3419 if (n < 0) {
8add30a0 3420 if (ERRNO_IS_TRANSIENT(errno))
efd3be9d 3421 return 0;
3422
3423 return -errno;
3424 }
3425
7057bd99 3426 if (_unlikely_(n != sizeof(si)))
3427 return -EIO;
3428
6eb7c172 3429 assert(SIGNAL_VALID(si.ssi_signo));
7057bd99 3430
3431 if (e->signal_sources)
3432 s = e->signal_sources[si.ssi_signo];
3433 if (!s)
3434 continue;
3435 if (s->pending)
3436 continue;
3437
3438 s->signal.siginfo = si;
3439 d->current = s;
3440
3441 r = source_set_pending(s, true);
3442 if (r < 0)
3443 return r;
3444 if (r > 0 && *min_priority >= s->priority) {
3445 *min_priority = s->priority;
3446 return 1; /* an event source with smaller priority is queued. */
3447 }
9da4cb2b 3448
efd3be9d 3449 return 0;
LP
3451}
3452
efd3be9d 3453static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3454 ssize_t n;
3455
3456 assert(e);
3457 assert(d);
3458
3459 assert_return(revents == EPOLLIN, -EIO);
3460
3461 /* If there's already an event source pending for this priority, don't read another */
3462 if (d->n_pending > 0)
3463 return 0;
3464
3465 /* Is the read buffer non-empty? If so, let's not read more */
3466 if (d->buffer_filled > 0)
3467 return 0;
3468
3469 if (d->priority > threshold)
3470 return 0;
3471
3472 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3473 if (n < 0) {
8add30a0 3474 if (ERRNO_IS_TRANSIENT(errno))
3475 return 0;
3476
3477 return -errno;
3478 }
3479
3480 assert(n > 0);
3481 d->buffer_filled = (size_t) n;
0601b958 3482 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
3483
3484 return 1;
3485}
3486
3487static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3488 assert(e);
3489 assert(d);
3490 assert(sz <= d->buffer_filled);
3491
3492 if (sz == 0)
3493 return;
3494
3495 /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3496 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3497 d->buffer_filled -= sz;
3498
3499 if (d->buffer_filled == 0)
0601b958 3500 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
3501}
3502
3503static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3504 int r;
3505
3506 assert(e);
3507 assert(d);
3508
3509 /* If there's already an event source pending for this priority, don't read another */
3510 if (d->n_pending > 0)
3511 return 0;
3512
3513 while (d->buffer_filled > 0) {
3514 size_t sz;
3515
3516 /* Let's validate that the event structures are complete */
3517 if (d->buffer_filled < offsetof(struct inotify_event, name))
3518 return -EIO;
3519
3520 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3521 if (d->buffer_filled < sz)
3522 return -EIO;
3523
3524 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3525 struct inode_data *inode_data;
3526
3527 /* The queue overran, let's pass this event to all event sources connected to this inotify
3528 * object */
3529
03677889 3530 HASHMAP_FOREACH(inode_data, d->inodes)
3531 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3532
b6d5481b 3533 if (event_source_is_offline(s))
3534 continue;
3535
3536 r = source_set_pending(s, true);
3537 if (r < 0)
3538 return r;
3539 }
3540 } else {
3541 struct inode_data *inode_data;
3542
3543 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3544 * our watch descriptor table. */
3545 if (d->buffer.ev.mask & IN_IGNORED) {
3546
3547 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3548 if (!inode_data) {
3549 event_inotify_data_drop(e, d, sz);
3550 continue;
3551 }
3552
3553 /* The watch descriptor was removed by the kernel, let's drop it here too */
3554 inode_data->wd = -1;
3555 } else {
3556 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3557 if (!inode_data) {
3558 event_inotify_data_drop(e, d, sz);
3559 continue;
3560 }
3561 }
3562
3563 /* Trigger all event sources that are interested in these events. Also trigger all event
3564 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3565 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3566
b6d5481b 3567 if (event_source_is_offline(s))
3568 continue;
3569
3570 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3571 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3572 continue;
3573
3574 r = source_set_pending(s, true);
3575 if (r < 0)
3576 return r;
3577 }
3578 }
3579
3580 /* Something pending now? If so, let's finish, otherwise let's read more. */
3581 if (d->n_pending > 0)
3582 return 1;
3583 }
3584
3585 return 0;
3586}
3587
3588static int process_inotify(sd_event *e) {
3589 int r, done = 0;
3590
3591 assert(e);
3592
0601b958 3593 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
3594 r = event_inotify_data_process(e, d);
3595 if (r < 0)
3596 return r;
3597 if (r > 0)
3598 done++;
3599 }
3600
3601 return done;
3602}
3603
fd38203a 3604static int source_dispatch(sd_event_source *s) {
8f5c235d 3605 EventSourceType saved_type;
c8e9d15c 3606 sd_event *saved_event;
fe8245eb 3607 int r = 0;
3608
3609 assert(s);
6203e07a 3610 assert(s->pending || s->type == SOURCE_EXIT);
fd38203a 3611
3612 /* Save the event source type, here, so that we still know it after the event callback which might
3613 * invalidate the event. */
3614 saved_type = s->type;
3615
de02634c 3616 /* Similarly, store a reference to the event loop object, so that we can still access it after the
b778cba4 3617 * callback might have invalidated/disconnected the event source. */
3618 saved_event = s->event;
3619 PROTECT_EVENT(saved_event);
b778cba4 3620
de02634c 3621 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
3622 assert(!s->ratelimited);
3623 if (!ratelimit_below(&s->rate_limit)) {
3624 r = event_source_enter_ratelimited(s);
3625 if (r < 0)
3626 return r;
3627
3628 return 1;
3629 }
3630
945c2931 3631 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
3632 r = source_set_pending(s, false);
3633 if (r < 0)
3634 return r;
3635 }
fd38203a 3636
3637 if (s->type != SOURCE_POST) {
3638 sd_event_source *z;
6e9feda3 3639
de02634c 3640 /* If we execute a non-post source, let's mark all post sources as pending. */
6e9feda3 3641
90e74a66 3642 SET_FOREACH(z, s->event->post_sources) {
b6d5481b 3643 if (event_source_is_offline(z))
3644 continue;
3645
3646 r = source_set_pending(z, true);
3647 if (r < 0)
3648 return r;
3649 }
3650 }
3651
3652 if (s->enabled == SD_EVENT_ONESHOT) {
3653 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
3654 if (r < 0)
3655 return r;
3656 }
3657
12179984 3658 s->dispatching = true;
b7484e2a 3659
3660 switch (s->type) {
3661
3662 case SOURCE_IO:
3663 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
3664 break;
3665
6a0f1f6d 3666 case SOURCE_TIME_REALTIME:
a8548816 3667 case SOURCE_TIME_BOOTTIME:
3668 case SOURCE_TIME_MONOTONIC:
3669 case SOURCE_TIME_REALTIME_ALARM:
3670 case SOURCE_TIME_BOOTTIME_ALARM:
fd38203a
LP
3671 r = s->time.callback(s, s->time.next, s->userdata);
3672 break;
3673
3674 case SOURCE_SIGNAL:
3675 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
3676 break;
3677
08cd1552
LP
3678 case SOURCE_CHILD: {
3679 bool zombie;
3680
945c2931 3681 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
08cd1552 3682
fd38203a 3683 r = s->child.callback(s, &s->child.siginfo, s->userdata);
08cd1552
LP
3684
3685 /* Now, reap the PID for good. */
f8f3f926 3686 if (zombie) {
cc59d290 3687 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
f8f3f926
LP
3688 s->child.waited = true;
3689 }
08cd1552 3690
fd38203a 3691 break;
08cd1552 3692 }
fd38203a
LP
3693
3694 case SOURCE_DEFER:
3695 r = s->defer.callback(s, s->userdata);
3696 break;
da7e457c 3697
6e9feda3
LP
3698 case SOURCE_POST:
3699 r = s->post.callback(s, s->userdata);
3700 break;
3701
6203e07a
LP
3702 case SOURCE_EXIT:
3703 r = s->exit.callback(s, s->userdata);
da7e457c 3704 break;
9d3e3aa5 3705
97ef5391
LP
3706 case SOURCE_INOTIFY: {
3707 struct sd_event *e = s->event;
3708 struct inotify_data *d;
3709 size_t sz;
3710
3711 assert(s->inotify.inode_data);
3712 assert_se(d = s->inotify.inode_data->inotify_data);
3713
3714 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
3715 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3716 assert(d->buffer_filled >= sz);
3717
53baf2ef
LP
3718 /* If the inotify callback destroys the event source then this likely means we don't need to
3719 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
3720 * free it immediately, then we couldn't drop the event from the inotify event queue without
3721 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
3722 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
3723 * explicitly GC it after we are done dropping the inotify event from the buffer. */
3724 d->n_busy++;
97ef5391 3725 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
53baf2ef 3726 d->n_busy--;
97ef5391 3727
53baf2ef
LP
3728 /* When no event is pending anymore on this inotify object, then let's drop the event from
3729 * the inotify event queue buffer. */
97ef5391
LP
3730 if (d->n_pending == 0)
3731 event_inotify_data_drop(e, d, sz);
3732
53baf2ef
LP
3733 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
3734 event_gc_inotify_data(e, d);
97ef5391
LP
3735 break;
3736 }
3737
9d3e3aa5 3738 case SOURCE_WATCHDOG:
a71fe8b8 3739 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
9f2a50a3 3740 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
04499a70 3741 assert_not_reached();
fd38203a
LP
3742 }
3743
12179984
LP
3744 s->dispatching = false;
3745
b778cba4
LP
3746 if (r < 0) {
3747 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
3748 strna(s->description),
3749 event_source_type_to_string(saved_type),
3750 s->exit_on_failure ? "exiting" : "disabling");
3751
3752 if (s->exit_on_failure)
3753 (void) sd_event_exit(saved_event, r);
3754 }
12179984
LP
3755
3756 if (s->n_ref == 0)
3757 source_free(s);
3758 else if (r < 0)
c3c50474 3759 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
b7484e2a 3760
6203e07a 3761 return 1;
fd38203a
LP
3762}
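
/* Illustrative sketch, not part of this file: the dispatch logic above means a callback that returns a
 * negative errno-style value gets its source disabled (or, with exit_on_failure set, ends the whole
 * loop), and SD_EVENT_ONESHOT sources are switched off before their callback runs. A hypothetical
 * oneshot defer source therefore fires at most once. my_* identifiers are hypothetical. */

static int my_defer_handler(sd_event_source *s, void *userdata) {
        /* runs exactly once: the source was set to SD_EVENT_OFF before dispatch (see above) */
        return 0; /* >= 0: success; < 0 would disable the source, or exit the loop if exit_on_failure is set */
}

static int my_add_oneshot_defer(sd_event *e) {
        sd_event_source *s;
        int r;

        r = sd_event_add_defer(e, &s, my_defer_handler, NULL);
        if (r < 0)
                return r;

        /* per sd_event_add_defer(3), defer sources start out SD_EVENT_ONESHOT anyway; being explicit here */
        return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
}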

static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
                        break;

                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;

                assert(s->prepare);

                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0) {
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
                                        strna(s->description),
                                        event_source_type_to_string(s->type),
                                        s->exit_on_failure ? "exiting" : "disabling");

                        if (s->exit_on_failure)
                                (void) sd_event_exit(e, r);
                }

                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
        }

        return 0;
}
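
/* Illustrative sketch, not part of this file: event_prepare() above drives callbacks installed with the
 * public sd_event_source_set_prepare(). A prepare callback runs at most once per loop iteration, right
 * before the loop goes to sleep, which makes it a good hook for lazily refreshing state. my_* names are
 * hypothetical. */

static int my_prepare(sd_event_source *s, void *userdata) {
        /* e.g. recompute the poll mask of an I/O source before the iteration blocks */
        return 0;
}

static int my_install_prepare(sd_event_source *s) {
        return sd_event_source_set_prepare(s, my_prepare);
}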

static int dispatch_exit(sd_event *e) {
        sd_event_source *p;
        int r;

        assert(e);

        p = prioq_peek(e->exit);
        assert(!p || p->type == SOURCE_EXIT);

        if (!p || event_source_is_offline(p)) {
                e->state = SD_EVENT_FINISHED;
                return 0;
        }

        PROTECT_EVENT(e);
        e->iteration++;
        e->state = SD_EVENT_EXITING;
        r = source_dispatch(p);
        e->state = SD_EVENT_INITIAL;
        return r;
}
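
/* Illustrative sketch, not part of this file: dispatch_exit() above drains SOURCE_EXIT sources one per
 * iteration, highest-priority first, once sd_event_exit() has been called. That makes exit sources the
 * natural place for cleanup that must run before sd_event_loop() returns. my_* names are hypothetical. */

static int my_cleanup(sd_event_source *s, void *userdata) {
        /* runs during SD_EVENT_EXITING, after sd_event_exit() was called */
        return 0;
}

static int my_register_cleanup(sd_event *e) {
        sd_event_source *s;
        return sd_event_add_exit(e, &s, my_cleanup, NULL);
}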

static sd_event_source* event_next_pending(sd_event *e) {
        sd_event_source *p;

        assert(e);

        p = prioq_peek(e->pending);
        if (!p)
                return NULL;

        if (event_source_is_offline(p))
                return NULL;

        return p;
}

static int arm_watchdog(sd_event *e) {
        struct itimerspec its = {};
        usec_t t;

        assert(e);
        assert(e->watchdog_fd >= 0);

        t = sleep_between(e,
                          usec_add(e->watchdog_last, (e->watchdog_period / 2)),
                          usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));

        timespec_store(&its.it_value, t);

        /* Make sure we never set the watchdog to 0, which tells the
         * kernel to disable it. */
        if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
                its.it_value.tv_nsec = 1;

        return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
}

static int process_watchdog(sd_event *e) {
        assert(e);

        if (!e->watchdog)
                return 0;

        /* Don't notify watchdog too often */
        if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
                return 0;

        sd_notify(false, "WATCHDOG=1");
        e->watchdog_last = e->timestamp.monotonic;

        return arm_watchdog(e);
}
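
/* Worked example for the two functions above, assuming a watchdog period of 30s: process_watchdog()
 * suppresses pings until at least period/4 = 7.5s have elapsed since the previous one, and
 * arm_watchdog() schedules the next timer wakeup between period/2 = 15s and 3/4 * period = 22.5s after
 * the last ping, so even an otherwise idle loop pings comfortably before the 30s deadline. */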

static void event_close_inode_data_fds(sd_event *e) {
        struct inode_data *d;

        assert(e);

        /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
         * filesystems. But we can't close them right away as we need them as long as the user still wants to make
         * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
         * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
         * compromise. */

        while ((d = e->inode_data_to_close_list)) {
                assert(d->fd >= 0);
                d->fd = safe_close(d->fd);

                LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
        }
}

_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
         * this check here once, since gettid() is typically not cached, and we thus want to minimize
         * syscalls. */
        assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        event_close_inode_data_fds(e);

        if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}

static int epoll_wait_usec(
                int fd,
                struct epoll_event *events,
                int maxevents,
                usec_t timeout) {

        int msec;
#if 0
        static bool epoll_pwait2_absent = false;
        int r;

        /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not.
         *
         * FIXME: this is temporarily disabled until epoll_pwait2() becomes more widely available.
         * See https://github.com/systemd/systemd/pull/18973 and
         * https://github.com/systemd/systemd/issues/19052. */

        if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
                r = epoll_pwait2(fd,
                                 events,
                                 maxevents,
                                 TIMESPEC_STORE(timeout),
                                 NULL);
                if (r >= 0)
                        return r;
                if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                        return -errno; /* Only fall back to old epoll_wait() if the syscall is masked or not
                                        * supported. */

                epoll_pwait2_absent = true;
        }
#endif

        if (timeout == USEC_INFINITY)
                msec = -1;
        else {
                usec_t k;

                k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
                if (k >= INT_MAX)
                        msec = INT_MAX; /* Saturate */
                else
                        msec = (int) k;
        }

        return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
}
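
/* Note on the fallback path above: the timeout is rounded *up* to whole milliseconds (DIV_ROUND_UP),
 * e.g. a 1500 µs timeout becomes msec = 2. Rounding down could make epoll_wait() return before the
 * deadline and busy-spin on the remainder; epoll_pwait2() avoids the issue entirely with its nanosecond
 * granularity. */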

static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
        size_t n_event_queue, m, n_event_max;
        int64_t min_priority = threshold;
        bool something_new = false;
        int r;

        assert(e);
        assert(ret_min_priority);

        n_event_queue = MAX(e->n_sources, 1u);
        if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
                return -ENOMEM;

        n_event_max = MALLOC_ELEMENTSOF(e->event_queue);

        /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
        if (e->buffered_inotify_data_list)
                timeout = 0;

        for (;;) {
                r = epoll_wait_usec(
                                e->epoll_fd,
                                e->event_queue,
                                n_event_max,
                                timeout);
                if (r < 0)
                        return r;

                m = (size_t) r;

                if (m < n_event_max)
                        break;

                if (n_event_max >= n_event_queue * 10)
                        break;

                if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
                        return -ENOMEM;

                n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
                timeout = 0;
        }

        /* Set the timestamp only when this is called for the first time. */
        if (threshold == INT64_MAX)
                triple_timestamp_get(&e->timestamp);

        for (size_t i = 0; i < m; i++) {

                if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
                else {
                        WakeupType *t = e->event_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE: {
                                sd_event_source *s = e->event_queue[i].data.ptr;

                                assert(s);

                                if (s->priority > threshold)
                                        continue;

                                min_priority = MIN(min_priority, s->priority);

                                switch (s->type) {

                                case SOURCE_IO:
                                        r = process_io(e, s, e->event_queue[i].events);
                                        break;

                                case SOURCE_CHILD:
                                        r = process_pidfd(e, s, e->event_queue[i].events);
                                        break;

                                default:
                                        assert_not_reached();
                                }

                                break;
                        }

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = e->event_queue[i].data.ptr;

                                assert(d);

                                r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
                                break;

                        case WAKEUP_INOTIFY_DATA:
                                r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
                                break;

                        default:
                                assert_not_reached();
                        }
                }
                if (r < 0)
                        return r;
                if (r > 0)
                        something_new = true;
        }

        *ret_min_priority = min_priority;
        return something_new;
}

_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        for (int64_t threshold = INT64_MAX; ; threshold--) {
                int64_t epoll_min_priority, child_min_priority;

                /* New epoll (especially IO) and child events may be triggered just after the
                 * process_epoll() call but before process_child(), and the new IO events may have higher
                 * priority than the child events. To salvage these events, let's call epoll_wait() again,
                 * but accept only events with higher priority than the previous ones. See issue
                 * https://github.com/systemd/systemd/issues/18190 and comments
                 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
                 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */

                r = process_epoll(e, timeout, threshold, &epoll_min_priority);
                if (r == -EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }
                if (r < 0)
                        goto finish;
                if (r == 0 && threshold < INT64_MAX)
                        /* No new epoll event. */
                        break;

                r = process_child(e, threshold, &child_min_priority);
                if (r < 0)
                        goto finish;
                if (r == 0)
                        /* No new child event. */
                        break;

                threshold = MIN(epoll_min_priority, child_min_priority);
                if (threshold == INT64_MIN)
                        break;

                timeout = 0;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        r = process_inotify(e);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;
        else if (r == 1) {
                /* A ratelimit expiry callback was called. Let's postpone processing pending sources and
                 * put the loop back into the initial state, so that the next iteration also evaluates
                 * sources that were potentially re-enabled by the callback.
                 *
                 * Wondering why we treat only this invocation of process_timer() differently? Once an
                 * event source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer,
                 * hence the ratelimit expiry callback is never called for any other timer type. */
                r = 0;
                goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}

_public_ int sd_event_dispatch(sd_event *e) {
        sd_event_source *p;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_PENDING, -EBUSY);

        if (e->exit_requested)
                return dispatch_exit(e);

        p = event_next_pending(e);
        if (p) {
                PROTECT_EVENT(e);

                e->state = SD_EVENT_RUNNING;
                r = source_dispatch(p);
                e->state = SD_EVENT_INITIAL;
                return r;
        }

        e->state = SD_EVENT_INITIAL;

        return 1;
}
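
/* Illustrative sketch, not part of this file: the split into sd_event_prepare(), sd_event_wait() and
 * sd_event_dispatch() exists so the loop can be embedded into a foreign poll loop, e.g. by watching
 * sd_event_get_fd() for POLLIN externally and then calling the driver below with a timeout of 0. A
 * minimal driver, along the lines of the sd_event_wait(3) man page; my_drive_once is hypothetical. */

static int my_drive_once(sd_event *e, uint64_t timeout_usec) {
        int r;

        r = sd_event_prepare(e);              /* > 0 if something is already pending */
        if (r == 0)
                r = sd_event_wait(e, timeout_usec);
        if (r > 0)
                r = sd_event_dispatch(e);

        return r;
}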

static void event_log_delays(sd_event *e) {
        char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
        size_t l, i;

        p = b;
        l = sizeof(b);
        for (i = 0; i < ELEMENTSOF(e->delays); i++) {
                l = strpcpyf(&p, l, "%u ", e->delays[i]);
                e->delays[i] = 0;
        }
        log_debug("Event loop iterations: %s", b);
}

_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        if (e->profile_delays && e->last_run_usec != 0) {
                usec_t this_run;
                unsigned l;

                this_run = now(CLOCK_MONOTONIC);

                l = log2u64(this_run - e->last_run_usec);
                assert(l < ELEMENTSOF(e->delays));
                e->delays[l]++;

                if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
                        event_log_delays(e);
                        e->last_log_usec = this_run;
                }
        }

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        r = sd_event_prepare(e);
        if (r == 0)
                /* There was nothing? Then wait... */
                r = sd_event_wait(e, timeout);

        if (e->profile_delays)
                e->last_run_usec = now(CLOCK_MONOTONIC);

        if (r > 0) {
                /* There's something now, so let's dispatch it */
                r = sd_event_dispatch(e);
                if (r < 0)
                        return r;

                return 1;
        }

        return r;
}
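
/* The profile_delays instrumentation above is off by default; it is switched on at allocation time (in
 * sd_event_new(), not shown in this excerpt) via the SD_EVENT_PROFILE_DELAYS environment variable, and
 * logs a log2 histogram of iteration-to-iteration latencies roughly every 5 seconds. */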

_public_ int sd_event_loop(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        PROTECT_EVENT(e);

        while (e->state != SD_EVENT_FINISHED) {
                r = sd_event_run(e, UINT64_MAX);
                if (r < 0)
                        return r;
        }

        return e->exit_code;
}
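
/* Illustrative sketch, not part of this file: minimal end-to-end use of the public API implemented
 * above. Assumes <systemd/sd-event.h>; my_* identifiers are hypothetical. */

static int my_on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
        return sd_event_exit(sd_event_source_get_event(s), 0); /* stop the loop with exit code 0 */
}

int my_main(void) {
        sd_event *e = NULL;
        int r;

        r = sd_event_default(&e);
        if (r < 0)
                return r;

        /* Fire once, one second from now, with default accuracy. Passing NULL as the source pointer
         * makes the source floating, i.e. owned by the loop. */
        r = sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC, USEC_PER_SEC, 0, my_on_timer, NULL);
        if (r < 0)
                return r;

        r = sd_event_loop(e); /* returns the code passed to sd_event_exit() */
        sd_event_unref(e);
        return r;
}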

_public_ int sd_event_get_fd(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->epoll_fd;
}

_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->state;
}

_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(code, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!e->exit_requested)
                return -ENODATA;

        *code = e->exit_code;
        return 0;
}

_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;

        return 0;
}

_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(usec, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
                return -EOPNOTSUPP;

        if (!triple_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
                *usec = now(clock);
                return 1;
        }

        *usec = triple_timestamp_by_clock(&e->timestamp, clock);
        return 0;
}
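
/* Illustrative sketch, not part of this file: the cached timestamp returned by sd_event_now() is the
 * canonical way to re-arm a repeating timer without accumulating drift from callback latency. my_tick
 * is hypothetical. */

static int my_tick(sd_event_source *s, uint64_t usec, void *userdata) {
        uint64_t now_usec;
        int r;

        /* the timestamp of the current loop iteration (or now() before the first iteration, see above) */
        r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &now_usec);
        if (r < 0)
                return r;

        r = sd_event_source_set_time(s, now_usec + USEC_PER_SEC); /* next tick in 1s */
        if (r < 0)
                return r;

        /* timer sources are disabled after firing, so re-enable for the next round */
        return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
}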

_public_ int sd_event_default(sd_event **ret) {
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}

_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(tid, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (e->tid != 0) {
                *tid = e->tid;
                return 0;
        }

        return -ENXIO;
}

_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                struct epoll_event ev = {
                        .events = EPOLLIN,
                        .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
                };

                if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
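
/* Illustrative sketch, not part of this file: when running under a service unit with WatchdogSec= set, a
 * single call is enough; the loop then reads WATCHDOG_USEC= via sd_watchdog_enabled() and pings on its
 * own per the arm_watchdog()/process_watchdog() logic further up. my_enable_watchdog is hypothetical. */

static int my_enable_watchdog(sd_event *e) {
        int r;

        r = sd_event_set_watchdog(e, true);
        if (r < 0)
                return r;
        if (r == 0)
                return 0; /* no WATCHDOG_USEC= in the environment, i.e. no WatchdogSec= configured */

        return 1; /* watchdog pings are now handled by the event loop */
}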

_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->watchdog;
}

_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        *ret = e->iteration;
        return 0;
}

_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
        assert_return(s, -EINVAL);

        s->destroy_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
        assert_return(s, -EINVAL);

        if (ret)
                *ret = s->destroy_callback;

        return !!s->destroy_callback;
}

_public_ int sd_event_source_get_floating(sd_event_source *s) {
        assert_return(s, -EINVAL);

        return s->floating;
}

_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);

        if (s->floating == !!b)
                return 0;

        if (!s->event) /* Already disconnected */
                return -ESTALE;

        s->floating = b;

        if (b) {
                sd_event_source_ref(s);
                sd_event_unref(s->event);
        } else {
                sd_event_ref(s->event);
                sd_event_source_unref(s);
        }

        return 1;
}
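
/* Illustrative sketch, not part of this file: flipping a source to floating inverts ownership, as the
 * reference juggling above shows: the loop then keeps the source alive, so the caller may drop its own
 * reference and let the source live for the lifetime of the loop. my_* names are hypothetical. */

static int my_add_background_source(sd_event *e, sd_event_handler_t cb) {
        sd_event_source *s;
        int r;

        r = sd_event_add_defer(e, &s, cb, NULL);
        if (r < 0)
                return r;

        r = sd_event_source_set_floating(s, true);
        if (r < 0)
                return r;

        sd_event_source_unref(s); /* the event loop holds the remaining reference */
        return 0;
}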

_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);

        return s->exit_on_failure;
}

_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);

        if (s->exit_on_failure == !!b)
                return 0;

        s->exit_on_failure = b;
        return 1;
}

_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
        int r;

        assert_return(s, -EINVAL);

        /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
         * so is a programming error. */
        assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);

        /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
         * non-ratelimited. */
        r = event_source_leave_ratelimit(s, /* run_callback */ false);
        if (r < 0)
                return r;

        s->rate_limit = (RateLimit) { interval, burst };
        return 0;
}

_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
        assert_return(s, -EINVAL);

        s->ratelimit_expire_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
        assert_return(s, -EINVAL);

        /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
         * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return -EDOM;

        if (!ratelimit_configured(&s->rate_limit))
                return -ENOEXEC;

        if (ret_interval)
                *ret_interval = s->rate_limit.interval;
        if (ret_burst)
                *ret_burst = s->rate_limit.burst;

        return 0;
}

_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
        assert_return(s, -EINVAL);

        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return false;

        if (!ratelimit_configured(&s->rate_limit))
                return false;

        return s->ratelimited;
}
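
/* Illustrative sketch, not part of this file: rate limiting as implemented above, applied to a chatty
 * source: at most 10 dispatches per 1s interval, with an optional callback invoked when the source comes
 * back online after the ratelimit interval expires. my_* names are hypothetical. */

static int my_on_ratelimit_expire(sd_event_source *s, void *userdata) {
        /* invoked when the ratelimit window ends and the source is put back online */
        return 0;
}

static int my_limit(sd_event_source *s) {
        int r;

        /* after 10 dispatches within 1s the source is taken offline and re-enabled via a
         * CLOCK_MONOTONIC timer, as the comments in sd_event_wait() above describe */
        r = sd_event_source_set_ratelimit(s, USEC_PER_SEC, 10);
        if (r < 0)
                return r;

        return sd_event_source_set_ratelimit_expire_callback(s, my_on_ratelimit_expire);
}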

_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
        bool change = false;
        int r;

        assert_return(e, -EINVAL);

        if (b) {
                /* We want to maintain pointers to these event sources, so that we can destroy them when told
                 * so. But we also don't want them to pin the event loop itself. Hence we mark them as
                 * floating after creation (and undo this before deleting them again). */

                if (!e->sigint_event_source) {
                        r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0)
                                return r;

                        assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
                        change = true;
                }

                if (!e->sigterm_event_source) {
                        r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0) {
                                if (change) {
                                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                                }

                                return r;
                        }

                        assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
                        change = true;
                }

        } else {
                if (e->sigint_event_source) {
                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                        change = true;
                }

                if (e->sigterm_event_source) {
                        assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
                        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
                        change = true;
                }
        }

        return change;
}
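
/* Illustrative sketch, not part of this file: the typical use of the function above is a single call
 * during startup, making SIGINT/SIGTERM end sd_event_loop() cleanly (a NULL signal handler makes the
 * default handler terminate the loop):
 *
 *         (void) sd_event_set_signal_exit(e, true);
 *
 * Note that the sources are created with SD_EVENT_SIGNAL_PROCMASK, so the caller need not block the
 * signals beforehand. */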