/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sys/wait.h>

#include "sd-daemon.h"
#include "sd-event.h"
#include "sd-id128.h"

#include "alloc-util.h"
#include "env-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "fs-util.h"
#include "glyph-util.h"
#include "hashmap.h"
#include "list.h"
#include "logarithm.h"
#include "macro.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "prioq.h"
#include "process-util.h"
#include "set.h"
#include "signal-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strxcpyx.h"
#include "time-util.h"

#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
        /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
        return s &&
                s->type == SOURCE_CHILD &&
                s->child.pidfd >= 0 &&
                s->child.options == WEXITED;
}

static bool event_source_is_online(sd_event_source *s) {
        assert(s);
        return s->enabled != SD_EVENT_OFF && !s->ratelimited;
}

static bool event_source_is_offline(sd_event_source *s) {
        assert(s);
        return s->enabled == SD_EVENT_OFF || s->ratelimited;
}
static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
        [SOURCE_IO] = "io",
        [SOURCE_TIME_REALTIME] = "realtime",
        [SOURCE_TIME_BOOTTIME] = "boottime",
        [SOURCE_TIME_MONOTONIC] = "monotonic",
        [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
        [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
        [SOURCE_SIGNAL] = "signal",
        [SOURCE_CHILD] = "child",
        [SOURCE_DEFER] = "defer",
        [SOURCE_POST] = "post",
        [SOURCE_EXIT] = "exit",
        [SOURCE_WATCHDOG] = "watchdog",
        [SOURCE_INOTIFY] = "inotify",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)

#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_IO,                       \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_SIGNAL,                   \
               SOURCE_DEFER,                    \
               SOURCE_INOTIFY)

/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
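/* Illustrative note, not part of the upstream source: rate limits are configured per source via the
 * public sd_event_source_set_ratelimit() call (defined later in this file, beyond this excerpt), e.g.
 *
 *         r = sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10);
 *
 * which permits at most 10 dispatches per second. While a source is throttled it is parked on the
 * monotonic clock's time prioq until the interval window ends, which is why the two macros above cover
 * the same set of source types. */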
struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;

        Prioq *pending;
        Prioq *prepare;

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources;

        Set *post_sources;

        Prioq *exit;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close_list);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, buffered_inotify_data_list);

        pid_t original_pid;

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        sd_event_source *sigint_event_source, *sigterm_event_source;

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8];
};

static thread_local sd_event *default_event = NULL;

static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);

static sd_event *event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}
static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->pending);
        assert(y->pending);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Lower priority values first */
        r = CMP(x->priority, y->priority);
        if (r != 0)
                return r;

        /* Older entries first */
        return CMP(x->pending_iteration, y->pending_iteration);
}

static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->prepare);
        assert(y->prepare);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Move most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        r = CMP(x->prepare_iteration, y->prepare_iteration);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}
static usec_t time_event_source_next(const sd_event_source *s) {
        assert(s);

        /* We have two kinds of event sources that have elapsation times associated with them: the actual
         * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
         * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
         * looking at here. */

        if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Otherwise this must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return s->time.next;

        return USEC_INFINITY;
}

static usec_t time_event_source_latest(const sd_event_source *s) {
        assert(s);

        if (s->ratelimited) { /* For ratelimited sources the earliest and the latest time shall actually be
                               * the same, as we should avoid adding additional inaccuracy on top of an
                               * inaccuracy time window */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return usec_add(s->time.next, s->time.accuracy);

        return USEC_INFINITY;
}

static bool event_source_timer_candidate(const sd_event_source *s) {
        assert(s);

        /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
         * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
        return !s->pending || s->ratelimited;
}

static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
        r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
        if (r != 0)
                return r;

        /* Order by time */
        return CMP(time_func(x), time_func(y));
}

static int earliest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_next);
}

static int latest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_latest);
}
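/* Worked example (added for illustration): a timer due at t = 100ms with 250ms accuracy may fire
 * anywhere in [100ms, 350ms]. The "earliest" prioq orders sources by t (via time_event_source_next()),
 * the "latest" prioq by t + accuracy (via time_event_source_latest()); the dispatcher can then pick a
 * single wakeup instant that lies within the windows of as many sources as possible, coalescing CPU
 * wakeups. */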
static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}
static sd_event *event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->inotify_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);

        free(e->event_queue);

        return mfree(e);
}

_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new(sd_event, 1);
        if (!e)
                return -ENOMEM;

        *e = (sd_event) {
                .n_ref = 1,
                .epoll_fd = -1,
                .watchdog_fd = -1,
                .realtime.wakeup = WAKEUP_CLOCK_DATA,
                .realtime.fd = -1,
                .realtime.next = USEC_INFINITY,
                .boottime.wakeup = WAKEUP_CLOCK_DATA,
                .boottime.fd = -1,
                .boottime.next = USEC_INFINITY,
                .monotonic.wakeup = WAKEUP_CLOCK_DATA,
                .monotonic.fd = -1,
                .monotonic.next = USEC_INFINITY,
                .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .realtime_alarm.fd = -1,
                .realtime_alarm.next = USEC_INFINITY,
                .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .boottime_alarm.fd = -1,
                .boottime_alarm.next = USEC_INFINITY,
                .perturb = USEC_INFINITY,
                .original_pid = getpid_cached(),
        };

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
                          special_glyph(SPECIAL_GLYPH_ELLIPSIS));
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event, sd_event, event_free);

#define PROTECT_EVENT(e)                                                \
        _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
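/* Usage sketch (hypothetical caller, added for illustration): PROTECT_EVENT() pins the loop for the
 * current scope, so a callback that drops the last user reference cannot free the loop while we are
 * still dispatching it:
 *
 *         int do_one_iteration(sd_event *e) {
 *                 PROTECT_EVENT(e);                   // _ref dropped automatically on scope exit
 *                 return sd_event_run(e, UINT64_MAX);
 *         }
 */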
_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
        if (s)
                (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
        return sd_event_source_unref(s);
}

static bool event_pid_changed(sd_event *e) {
        assert(e);

        /* We don't support people creating an event loop and keeping
         * it around over a fork(). Let's complain. */

        return e->original_pid != getpid_cached();
}
static void source_io_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}

static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->io.fd, &ev) < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}
static void source_child_pidfd_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (event_pid_changed(s->event))
                return;

        if (!s->child.registered)
                return;

        if (EVENT_SOURCE_WATCH_PIDFD(s))
                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

        s->child.registered = false;
}

static int source_child_pidfd_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_CHILD);
        assert(enabled != SD_EVENT_OFF);

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                struct epoll_event ev = {
                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                        .data.ptr = s,
                };

                if (epoll_ctl(s->event->epoll_fd,
                              s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                              s->child.pidfd, &ev) < 0)
                        return -errno;
        }

        s->child.registered = true;
        return 0;
}

static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}
static EventSourceType clock_to_event_source_type(clockid_t clock) {

        switch (clock) {

        case CLOCK_REALTIME:
                return SOURCE_TIME_REALTIME;

        case CLOCK_BOOTTIME:
                return SOURCE_TIME_BOOTTIME;

        case CLOCK_MONOTONIC:
                return SOURCE_TIME_MONOTONIC;

        case CLOCK_REALTIME_ALARM:
                return SOURCE_TIME_REALTIME_ALARM;

        case CLOCK_BOOTTIME_ALARM:
                return SOURCE_TIME_BOOTTIME_ALARM;

        default:
                return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        }
}

static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        assert(e);

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}

static void event_free_signal_data(sd_event *e, struct signal_data *d) {
        assert(e);

        if (!d)
                return;

        hashmap_remove(e->signal_data, &d->priority);
        safe_close(d->fd);
        free(d);
}
static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                d = new(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                *d = (struct signal_data) {
                        .wakeup = WAKEUP_SIGNAL_DATA,
                        .fd = -1,
                        .priority = priority,
                };

                r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
                     &ss_copy,
                     SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        if (added)
                event_free_signal_data(e, d);

        return r;
}
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty
         * that way, the object is removed entirely. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {
                /* If the mask is now all-zero we can get rid of the structure */
                event_free_signal_data(e, d);
                return;
        }

        if (event_pid_changed(e))
                return;

        assert(d->fd >= 0);

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}

static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
         * and possibly drop the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_online_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            event_source_is_online(e->signal_sources[sig]))
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}
static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
        assert(s);

        /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
         * they are enabled/disabled or marked pending and such. */

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
}

static void event_source_time_prioq_reshuffle(sd_event_source *s) {
        struct clock_data *d;

        assert(s);

        /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
         * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
         * properly again. */

        if (s->ratelimited)
                d = &s->event->monotonic;
        else if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(d = event_get_clock_data(s->event, s->type));
        else
                return; /* no-op for an event source which is neither a timer nor ratelimited. */

        prioq_reshuffle(d->earliest, s, &s->earliest_index);
        prioq_reshuffle(d->latest, s, &s->latest_index);
        d->needs_rearm = true;
}

static void event_source_time_prioq_remove(
                sd_event_source *s,
                struct clock_data *d) {

        assert(s);
        assert(d);

        prioq_remove(d->earliest, s, &s->earliest_index);
        prioq_remove(d->latest, s, &s->latest_index);
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        d->needs_rearm = true;
}
static void source_disconnect(sd_event_source *s) {
        sd_event *event;
        int r;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from the time event source here if it is not ratelimited. If
                 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                        if (s->signal.unblock) {
                                sigset_t new_ss;

                                if (sigemptyset(&new_ss) < 0)
                                        log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
                                else if (sigaddset(&new_ss, s->signal.sig) < 0)
                                        log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
                                else {
                                        r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
                                        if (r != 0)
                                                log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
                                }
                        }
                }

                break;

        case SOURCE_CHILD:
                if (event_pid_changed(s->event))
                        s->child.process_owned = false;

                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
                         * continues to be watched. That's because inotify doesn't really have an API for that: we
                         * can only change watch masks with access to the original inode either by fd or by path. But
                         * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
                         * continuously and keeping the mount busy which we can't really do. We could reconstruct the
                         * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
                         * there), but given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way, leave the
                         * mask set, even if it is not minimized now, and ignore all events we aren't interested in
                         * anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        default:
                assert_not_reached();
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        if (!s->floating)
                sd_event_unref(event);
}
static sd_event_source* source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                s->io.fd = safe_close(s->io.fd);

        if (s->type == SOURCE_CHILD) {
                /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */

                if (s->child.process_owned) {

                        if (!s->child.exited) {
                                bool sent = false;

                                if (s->child.pidfd >= 0) {
                                        if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
                                                if (errno == ESRCH) /* Already dead */
                                                        sent = true;
                                                else if (!ERRNO_IS_NOT_SUPPORTED(errno))
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
                                                                        s->child.pid);
                                        } else
                                                sent = true;
                                }

                                if (!sent)
                                        if (kill(s->child.pid, SIGKILL) < 0)
                                                if (errno != ESRCH) /* Already dead */
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
                                                                        s->child.pid);
                        }

                        if (!s->child.waited) {
                                siginfo_t si = {};

                                /* Reap the child if we can */
                                (void) waitid(P_PID, s->child.pid, &si, WEXITED);
                        }
                }

                if (s->child.pidfd_owned)
                        s->child.pidfd = safe_close(s->child.pidfd);
        }

        if (s->destroy_callback)
                s->destroy_callback(s->userdata);

        free(s->description);
        return mfree(s);
}
DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_reshuffle(s);

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        if (s->type == SOURCE_INOTIFY) {

                assert(s->inotify.inode_data);
                assert(s->inotify.inode_data->inotify_data);

                if (b)
                        s->inotify.inode_data->inotify_data->n_pending++;
                else {
                        assert(s->inotify.inode_data->inotify_data->n_pending > 0);
                        s->inotify.inode_data->inotify_data->n_pending--;
                }
        }

        return 1;
}
static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
        sd_event_source *s;

        assert(e);

        s = new(sd_event_source, 1);
        if (!s)
                return NULL;

        *s = (struct sd_event_source) {
                .n_ref = 1,
                .event = e,
                .floating = floating,
                .type = type,
                .pending_index = PRIOQ_IDX_NULL,
                .prepare_index = PRIOQ_IDX_NULL,
        };

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}
static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = io_exit_callback;

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
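/* Usage sketch (added for illustration, not from the upstream file): a minimal loop that watches an fd
 * for readability; error handling is abbreviated:
 *
 *         static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *                 char buf[256];
 *                 ssize_t n = read(fd, buf, sizeof(buf));  // consume data to quiesce level-triggered EPOLLIN
 *                 return n < 0 ? -errno : 0;
 *         }
 *
 *         int watch_fd(int fd) {
 *                 _cleanup_(sd_event_unrefp) sd_event *e = NULL;
 *                 int r;
 *
 *                 r = sd_event_default(&e);
 *                 if (r < 0)
 *                         return r;
 *                 r = sd_event_add_io(e, NULL, fd, EPOLLIN, on_io, NULL);
 *                 if (r < 0)
 *                         return r;
 *                 return sd_event_loop(e);
 *         }
 */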
static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}
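/* Sketch of how the offset gets used (added for illustration; the code that actually applies it lives
 * later in this file, beyond this excerpt, and this is a simplified rounding, not a verbatim copy):
 *
 *         usec_t granularity = USEC_PER_MINUTE;
 *         usec_t aligned = (next / granularity) * granularity + (e->perturb % granularity);
 *         if (aligned < next)
 *                 aligned += granularity;  // snap forward to the perturbed boundary
 *
 * so all timers on one machine coalesce onto the same instant, while machines with synced clocks still
 * spread out from each other. */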
static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        _cleanup_close_ int fd = -1;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
                return -errno;

        d->fd = TAKE_FD(fd);
        return 0;
}

static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
        int r;

        assert(d);

        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        return 0;
}

static int event_source_time_prioq_put(
                sd_event_source *s,
                struct clock_data *d) {

        int r;

        assert(s);
        assert(d);
        assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

        r = prioq_put(d->earliest, s, &s->earliest_index);
        if (r < 0)
                return r;

        r = prioq_put(d->latest, s, &s->latest_index);
        if (r < 0) {
                assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
                s->earliest_index = PRIOQ_IDX_NULL;
                return r;
        }

        d->needs_rearm = true;
        return 0;
}
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != UINT64_MAX, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        assert_se(d = event_get_clock_data(e, type));

        r = setup_clock_data(e, d, clock);
        if (r < 0)
                return r;

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = event_source_time_prioq_put(s, d);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
_public_ int sd_event_add_time_relative(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        usec_t t;
        int r;

        /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
         * checks for overflow. */

        r = sd_event_now(e, clock, &t);
        if (r < 0)
                return r;

        if (usec >= USEC_INFINITY - t)
                return -EOVERFLOW;

        return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
}
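/* Usage sketch (added for illustration): arm a one-shot timer 5s from now on the monotonic clock, with
 * the default accuracy (see DEFAULT_ACCURACY_USEC above):
 *
 *         r = sd_event_add_time_relative(e, &s, CLOCK_MONOTONIC, 5 * USEC_PER_SEC, 0, on_time, NULL);
 */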
static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        sigset_t new_ss;
        bool block_it;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Let's make sure our special flag stays outside of the valid signal range */
        assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);

        if (sig & SD_EVENT_SIGNAL_PROCMASK) {
                sig &= ~SD_EVENT_SIGNAL_PROCMASK;
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                block_it = true;
        } else {
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                r = signal_is_blocked(sig);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;

                block_it = false;
        }

        if (!callback)
                callback = signal_exit_callback;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        if (block_it) {
                sigset_t old_ss;

                if (sigemptyset(&new_ss) < 0)
                        return -errno;

                if (sigaddset(&new_ss, sig) < 0)
                        return -errno;

                r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
                if (r != 0)
                        return -r;

                r = sigismember(&old_ss, sig);
                if (r < 0)
                        return -errno;

                s->signal.unblock = !r;
        } else
                s->signal.unblock = false;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                if (s->signal.unblock)
                        (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);

                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
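/* Usage sketch (added for illustration): handle SIGTERM, letting sd-event adjust the calling thread's
 * signal mask itself via the SD_EVENT_SIGNAL_PROCMASK flag handled above. With a NULL callback the
 * default signal_exit_callback() is used, i.e. the loop simply exits when the signal arrives:
 *
 *         r = sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
 */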
static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static bool shall_use_pidfd(void) {
        /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
        return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
}

_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd and because we don't want to reap the child processes
                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
                 * take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* We always take a pidfd here if we can, even if we wait for anything other than WEXITED, so that we
         * pin the PID, and make regular waitid() handling race-free. */

        if (shall_use_pidfd()) {
                s->child.pidfd = pidfd_open(pid, 0);
                if (s->child.pidfd < 0) {
                        /* Propagate errors unless the syscall is not supported or blocked */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
        } else
                s->child.pidfd = -1;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We have a pidfd and we only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We have no pidfd or we shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. */
        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

_public_ int sd_event_add_child_pidfd(
                sd_event *e,
                sd_event_source **ret,
                int pidfd,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        pid_t pid;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pidfd >= 0, -EBADF);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        r = pidfd_get_pid(pidfd, &pid);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.pidfd = pidfd;
        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for WEXITED */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;
        } else {
                /* We shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}
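/* Usage sketch (added for illustration): watch a forked-off child for exit. As enforced above, SIGCHLD
 * must already be blocked (in all threads) before the first child source is added:
 *
 *         sigset_t ss;
 *         assert_se(sigemptyset(&ss) >= 0);
 *         assert_se(sigaddset(&ss, SIGCHLD) >= 0);
 *         assert_se(pthread_sigmask(SIG_BLOCK, &ss, NULL) == 0);
 *
 *         pid_t pid = fork();
 *         if (pid == 0)
 *                 _exit(EXIT_SUCCESS);                 // child
 *
 *         r = sd_event_add_child(e, &s, pid, WEXITED, on_child, NULL);
 */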
static int generic_exit_callback(sd_event_source *s, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_defer(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_DEFER);
        if (!s)
                return -ENOMEM;

        s->defer.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = source_set_pending(s, true);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
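/* Usage sketch (added for illustration): run a callback once, on the next event loop iteration. Defer
 * sources are created enabled as SD_EVENT_ONESHOT and marked pending immediately, so no extra arming
 * step is needed:
 *
 *         r = sd_event_add_defer(e, NULL, on_defer, NULL);
 */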
_public_ int sd_event_add_post(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_POST);
        if (!s)
                return -ENOMEM;

        s->post.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = set_ensure_put(&e->post_sources, NULL, s);
        if (r < 0)
                return r;
        assert(r > 0);

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_exit(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
        if (r < 0)
                return r;

        s = source_new(e, !ret, SOURCE_EXIT);
        if (!s)
                return -ENOMEM;

        s->exit.callback = callback;
        s->userdata = userdata;
        s->exit.prioq_index = PRIOQ_IDX_NULL;
        s->enabled = SD_EVENT_ONESHOT;

        r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
        assert(e);

        if (!d)
                return;

        assert(hashmap_isempty(d->inodes));
        assert(hashmap_isempty(d->wd));

        if (d->buffer_filled > 0)
                LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);

        hashmap_free(d->inodes);
        hashmap_free(d->wd);

        assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);

        if (d->fd >= 0) {
                if (!event_pid_changed(e) &&
                    epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");

                safe_close(d->fd);
        }
        free(d);
}
static int event_make_inotify_data(
                sd_event *e,
                int64_t priority,
                struct inotify_data **ret) {

        _cleanup_close_ int fd = -1;
        struct inotify_data *d;
        int r;

        assert(e);

        d = hashmap_get(e->inotify_data, &priority);
        if (d) {
                if (ret)
                        *ret = d;
                return 0;
        }

        fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        d = new(struct inotify_data, 1);
        if (!d)
                return -ENOMEM;

        *d = (struct inotify_data) {
                .wakeup = WAKEUP_INOTIFY_DATA,
                .fd = TAKE_FD(fd),
                .priority = priority,
        };

        r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
        if (r < 0) {
                d->fd = safe_close(d->fd);
                free(d);
                return r;
        }

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
                                            * remove the fd from the epoll first, which we don't want as we couldn't
                                            * add it in the first place. */
                event_free_inotify_data(e, d);
                return r;
        }

        if (ret)
                *ret = d;

        return 1;
}
static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
        int r;

        assert(x);
        assert(y);

        r = CMP(x->dev, y->dev);
        if (r != 0)
                return r;

        return CMP(x->ino, y->ino);
}

static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
        assert(d);

        siphash24_compress(&d->dev, sizeof(d->dev), state);
        siphash24_compress(&d->ino, sizeof(d->ino), state);
}

DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
1860static void event_free_inode_data(
1861 sd_event *e,
1862 struct inode_data *d) {
1863
1864 assert(e);
1865
1866 if (!d)
1867 return;
1868
64903d18 1869 assert(!d->event_sources);
97ef5391
LP
1870
1871 if (d->fd >= 0) {
ed828563 1872 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
97ef5391
LP
1873 safe_close(d->fd);
1874 }
1875
1876 if (d->inotify_data) {
1877
1878 if (d->wd >= 0) {
fbae5090 1879 if (d->inotify_data->fd >= 0 && !event_pid_changed(e)) {
97ef5391
LP
1880 /* So here's a problem. At the time this runs the watch descriptor might already be
1881 * invalidated, because an IN_IGNORED event might be queued right at the moment we enter
1882 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it is quite
1883 * likely to happen. */
1884
1885 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
1886 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
1887 }
1888
1889 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
1890 }
1891
1892 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
1893 }
1894
1895 free(d);
1896}
1897
53baf2ef
LP
1898static void event_gc_inotify_data(
1899 sd_event *e,
1900 struct inotify_data *d) {
1901
1902 assert(e);
1903
1904 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
1905 * any inode with it anymore, which in turn happens if no event source of this priority is interested
1906 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
1907 * (under the expectation that the GC is called again once the counter is decremented). */
1908
1909 if (!d)
1910 return;
1911
1912 if (!hashmap_isempty(d->inodes))
1913 return;
1914
1915 if (d->n_busy > 0)
1916 return;
1917
1918 event_free_inotify_data(e, d);
1919}
1920
97ef5391
LP
1921static void event_gc_inode_data(
1922 sd_event *e,
1923 struct inode_data *d) {
1924
1925 struct inotify_data *inotify_data;
1926
1927 assert(e);
1928
1929 if (!d)
1930 return;
1931
64903d18 1932 if (d->event_sources)
97ef5391
LP
1933 return;
1934
1935 inotify_data = d->inotify_data;
1936 event_free_inode_data(e, d);
1937
53baf2ef 1938 event_gc_inotify_data(e, inotify_data);
97ef5391
LP
1939}
1940
1941static int event_make_inode_data(
1942 sd_event *e,
1943 struct inotify_data *inotify_data,
1944 dev_t dev,
1945 ino_t ino,
1946 struct inode_data **ret) {
1947
1948 struct inode_data *d, key;
1949 int r;
1950
1951 assert(e);
1952 assert(inotify_data);
1953
1954 key = (struct inode_data) {
1955 .ino = ino,
1956 .dev = dev,
1957 };
1958
1959 d = hashmap_get(inotify_data->inodes, &key);
1960 if (d) {
1961 if (ret)
1962 *ret = d;
1963
1964 return 0;
1965 }
1966
1967 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
1968 if (r < 0)
1969 return r;
1970
1971 d = new(struct inode_data, 1);
1972 if (!d)
1973 return -ENOMEM;
1974
1975 *d = (struct inode_data) {
1976 .dev = dev,
1977 .ino = ino,
1978 .wd = -1,
1979 .fd = -1,
1980 .inotify_data = inotify_data,
1981 };
1982
1983 r = hashmap_put(inotify_data->inodes, d, d);
1984 if (r < 0) {
1985 free(d);
1986 return r;
1987 }
1988
1989 if (ret)
1990 *ret = d;
1991
1992 return 1;
1993}
1994
1995static uint32_t inode_data_determine_mask(struct inode_data *d) {
1996 bool excl_unlink = true;
1997 uint32_t combined = 0;
97ef5391
LP
1998
1999 assert(d);
2000
2001 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2002 * the IN_EXCL_UNLINK flag is ANDed instead.
2003 *
2004 * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or oneshot. That's
2005 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
f21f31b2 2006 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
97ef5391
LP
2007 * events we don't care for client-side. */
2008
2009 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2010
2011 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2012 excl_unlink = false;
2013
2014 combined |= s->inotify.mask;
2015 }
2016
2017 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2018}
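/* Worked example for the combination rule above (values are illustrative):
 *
 *   source A: IN_CREATE|IN_EXCL_UNLINK
 *   source B: IN_DELETE|IN_ONESHOT|IN_EXCL_UNLINK
 *   combined: IN_CREATE|IN_DELETE|IN_EXCL_UNLINK
 *
 * The event bits are ORed, the per-watch control bits (IN_ONESHOT here) are
 * stripped, and IN_EXCL_UNLINK survives only because both sources requested
 * it; had only one of them set it, it would be dropped. */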
2019
2020static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2021 uint32_t combined_mask;
2022 int wd, r;
2023
2024 assert(d);
2025 assert(d->fd >= 0);
2026
2027 combined_mask = inode_data_determine_mask(d);
2028
2029 if (d->wd >= 0 && combined_mask == d->combined_mask)
2030 return 0;
2031
2032 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2033 if (r < 0)
2034 return r;
2035
2036 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2037 if (wd < 0)
2038 return -errno;
2039
2040 if (d->wd < 0) {
2041 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2042 if (r < 0) {
2043 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2044 return r;
2045 }
2046
2047 d->wd = wd;
2048
2049 } else if (d->wd != wd) {
2050
2051 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2052 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2053 return -EINVAL;
2054 }
2055
2056 d->combined_mask = combined_mask;
2057 return 1;
2058}
2059
b9350e70
LP
2060static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2061 assert(s);
2062
2063 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2064}
2065
e67d738a 2066static int event_add_inotify_fd_internal(
97ef5391
LP
2067 sd_event *e,
2068 sd_event_source **ret,
e67d738a
LP
2069 int fd,
2070 bool donate,
97ef5391
LP
2071 uint32_t mask,
2072 sd_event_inotify_handler_t callback,
2073 void *userdata) {
2074
e67d738a
LP
2075 _cleanup_close_ int donated_fd = donate ? fd : -1;
2076 _cleanup_(source_freep) sd_event_source *s = NULL;
97ef5391
LP
2077 struct inotify_data *inotify_data = NULL;
2078 struct inode_data *inode_data = NULL;
97ef5391
LP
2079 struct stat st;
2080 int r;
2081
2082 assert_return(e, -EINVAL);
2083 assert_return(e = event_resolve(e), -ENOPKG);
e67d738a 2084 assert_return(fd >= 0, -EBADF);
97ef5391
LP
2085 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2086 assert_return(!event_pid_changed(e), -ECHILD);
2087
b9350e70
LP
2088 if (!callback)
2089 callback = inotify_exit_callback;
2090
97ef5391
LP
2091 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2092 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
2093 * the caller may not pass the flag in directly. */
2094 if (mask & IN_MASK_ADD)
2095 return -EINVAL;
2096
97ef5391
LP
2097 if (fstat(fd, &st) < 0)
2098 return -errno;
2099
2100 s = source_new(e, !ret, SOURCE_INOTIFY);
2101 if (!s)
2102 return -ENOMEM;
2103
2104 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2105 s->inotify.mask = mask;
2106 s->inotify.callback = callback;
2107 s->userdata = userdata;
2108
2109 /* Allocate an inotify object for this priority, and an inode object within it */
2110 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2111 if (r < 0)
8c75fe17 2112 return r;
97ef5391
LP
2113
2114 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
8c75fe17 2115 if (r < 0) {
e67d738a 2116 event_gc_inotify_data(e, inotify_data);
8c75fe17
ZJS
2117 return r;
2118 }
97ef5391
LP
2119
2120 /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2121 * the event source, until then, for which we need the original inode. */
2122 if (inode_data->fd < 0) {
e67d738a
LP
2123 if (donated_fd >= 0)
2124 inode_data->fd = TAKE_FD(donated_fd);
2125 else {
2126 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2127 if (inode_data->fd < 0) {
2128 r = -errno;
2129 event_gc_inode_data(e, inode_data);
2130 return r;
2131 }
2132 }
2133
ed828563 2134 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
97ef5391
LP
2135 }
2136
2137 /* Link our event source to the inode data object */
2138 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2139 s->inotify.inode_data = inode_data;
2140
97ef5391
LP
2141 /* Actually realize the watch now */
2142 r = inode_data_realize_watch(e, inode_data);
2143 if (r < 0)
8c75fe17 2144 return r;
97ef5391 2145
97ef5391
LP
2146 if (ret)
2147 *ret = s;
8c75fe17 2148 TAKE_PTR(s);
97ef5391
LP
2149
2150 return 0;
97ef5391
LP
2151}
2152
e67d738a
LP
2153_public_ int sd_event_add_inotify_fd(
2154 sd_event *e,
2155 sd_event_source **ret,
2156 int fd,
2157 uint32_t mask,
2158 sd_event_inotify_handler_t callback,
2159 void *userdata) {
2160
2161 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2162}
2163
2164_public_ int sd_event_add_inotify(
2165 sd_event *e,
2166 sd_event_source **ret,
2167 const char *path,
2168 uint32_t mask,
2169 sd_event_inotify_handler_t callback,
2170 void *userdata) {
2171
2091c779 2172 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
e67d738a
LP
2173 int fd, r;
2174
2175 assert_return(path, -EINVAL);
2176
586c8cee
ZJS
2177 fd = open(path, O_PATH | O_CLOEXEC |
2178 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2179 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
e67d738a
LP
2180 if (fd < 0)
2181 return -errno;
2182
2183 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2184 if (r < 0)
2185 return r;
2186
2187 (void) sd_event_source_set_description(s, path);
2188
2189 if (ret)
2190 *ret = s;
2191
2192 return r;
2193}
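/* Illustrative usage sketch (not part of this file): watching a directory with
 * the public API above. The handler name and the "/tmp" path are hypothetical;
 * assumes <systemd/sd-event.h>, <sys/inotify.h> and <stdio.h> are included. */
static int example_inotify_handler(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
        printf("event mask=%#x name=%s\n", (unsigned) ev->mask, ev->len > 0 ? ev->name : "(watched inode)");
        return 0;
}

static int example_watch_directory(void) {
        sd_event *e = NULL;
        int r;

        r = sd_event_default(&e);
        if (r < 0)
                return r;

        /* A NULL source pointer makes the new source "floating", i.e. owned by the loop. */
        r = sd_event_add_inotify(e, NULL, "/tmp", IN_CREATE|IN_DELETE|IN_MOVED_TO, example_inotify_handler, NULL);
        if (r >= 0)
                r = sd_event_loop(e);

        sd_event_unref(e);
        return r;
}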
2194
8301aa0b 2195static sd_event_source* event_source_free(sd_event_source *s) {
6680dd6b
LP
2196 if (!s)
2197 return NULL;
da7e457c 2198
8301aa0b
YW
2199 /* Here's a special hack: when we are called from a
2200 * dispatch handler we won't free the event source
2201 * immediately, but we will detach the fd from the
2202 * epoll. This way it is safe for the caller to unref
2203 * the event source and immediately close the fd, but
2204 * we still retain a valid event source object after
2205 * the callback. */
fd38203a 2206
76d04c3a 2207 if (s->dispatching)
8301aa0b 2208 source_disconnect(s);
76d04c3a 2209 else
8301aa0b 2210 source_free(s);
fd38203a
LP
2211
2212 return NULL;
2213}
2214
8301aa0b
YW
2215DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2216
356779df 2217_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
f7f53e9e 2218 assert_return(s, -EINVAL);
f4b2933e 2219 assert_return(!event_pid_changed(s->event), -ECHILD);
f7f53e9e 2220
356779df 2221 return free_and_strdup(&s->description, description);
f7f53e9e
TG
2222}
2223
356779df 2224_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
f7f53e9e 2225 assert_return(s, -EINVAL);
356779df 2226 assert_return(description, -EINVAL);
f4b2933e 2227 assert_return(!event_pid_changed(s->event), -ECHILD);
f7f53e9e 2228
7d92a1a4
ZJS
2229 if (!s->description)
2230 return -ENXIO;
2231
356779df 2232 *description = s->description;
f7f53e9e
TG
2233 return 0;
2234}
2235
adcc4ca3 2236_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
305f78bf 2237 assert_return(s, NULL);
eaa3cbef
LP
2238
2239 return s->event;
2240}
2241
f7262a9f 2242_public_ int sd_event_source_get_pending(sd_event_source *s) {
305f78bf 2243 assert_return(s, -EINVAL);
6203e07a 2244 assert_return(s->type != SOURCE_EXIT, -EDOM);
da7e457c 2245 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2246 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2247
2248 return s->pending;
2249}
2250
f7262a9f 2251_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
305f78bf
LP
2252 assert_return(s, -EINVAL);
2253 assert_return(s->type == SOURCE_IO, -EDOM);
2254 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2255
2256 return s->io.fd;
2257}
2258
30caf8f3
LP
2259_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2260 int r;
2261
2262 assert_return(s, -EINVAL);
8ac43fee 2263 assert_return(fd >= 0, -EBADF);
30caf8f3
LP
2264 assert_return(s->type == SOURCE_IO, -EDOM);
2265 assert_return(!event_pid_changed(s->event), -ECHILD);
2266
2267 if (s->io.fd == fd)
2268 return 0;
2269
b6d5481b 2270 if (event_source_is_offline(s)) {
30caf8f3
LP
2271 s->io.fd = fd;
2272 s->io.registered = false;
2273 } else {
2274 int saved_fd;
2275
2276 saved_fd = s->io.fd;
2277 assert(s->io.registered);
2278
2279 s->io.fd = fd;
2280 s->io.registered = false;
2281
2282 r = source_io_register(s, s->enabled, s->io.events);
2283 if (r < 0) {
2284 s->io.fd = saved_fd;
2285 s->io.registered = true;
2286 return r;
2287 }
2288
5a795bff 2289 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
30caf8f3
LP
2290 }
2291
2292 return 0;
2293}
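/* Sketch: after reconnecting a socket, an existing IO source can be pointed at
 * the new fd instead of being destroyed and recreated ('new_fd' is assumed to
 * be a freshly connected, non-blocking socket). Note that, as implemented
 * above, the old fd is only deregistered from epoll, never closed; closing it
 * remains the caller's job: */
static int example_swap_fd(sd_event_source *s, int old_fd, int new_fd) {
        int r;

        r = sd_event_source_set_io_fd(s, new_fd);
        if (r < 0)
                return r;

        safe_close(old_fd); /* same close helper used elsewhere in this file */
        return 0;
}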
2294
ab93297c
NM
2295_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2296 assert_return(s, -EINVAL);
2297 assert_return(s->type == SOURCE_IO, -EDOM);
2298
2299 return s->io.owned;
2300}
2301
2302_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2303 assert_return(s, -EINVAL);
2304 assert_return(s->type == SOURCE_IO, -EDOM);
2305
2306 s->io.owned = own;
2307 return 0;
2308}
2309
f7262a9f 2310_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
305f78bf
LP
2311 assert_return(s, -EINVAL);
2312 assert_return(events, -EINVAL);
2313 assert_return(s->type == SOURCE_IO, -EDOM);
2314 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2315
2316 *events = s->io.events;
2317 return 0;
2318}
2319
f7262a9f 2320_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
fd38203a
LP
2321 int r;
2322
305f78bf
LP
2323 assert_return(s, -EINVAL);
2324 assert_return(s->type == SOURCE_IO, -EDOM);
2a16a986 2325 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
da7e457c 2326 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2327 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2328
b63c8d4f
DH
2329 /* edge-triggered updates are never skipped, so we can reset edges */
2330 if (s->io.events == events && !(events & EPOLLET))
fd38203a
LP
2331 return 0;
2332
2a0dc6cd
LP
2333 r = source_set_pending(s, false);
2334 if (r < 0)
2335 return r;
2336
b6d5481b 2337 if (event_source_is_online(s)) {
e4715127 2338 r = source_io_register(s, s->enabled, events);
fd38203a
LP
2339 if (r < 0)
2340 return r;
2341 }
2342
2343 s->io.events = events;
2344
2345 return 0;
2346}
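/* Sketch: because identical masks that include EPOLLET are deliberately not
 * short-circuited (see the comment above), re-setting the same mask goes
 * through epoll again and can be used to re-arm an edge-triggered source,
 * e.g. after a deliberately partial read: */
static int example_rearm_edge(sd_event_source *s) {
        return sd_event_source_set_io_events(s, EPOLLIN|EPOLLET);
}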
2347
f7262a9f 2348_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
305f78bf
LP
2349 assert_return(s, -EINVAL);
2350 assert_return(revents, -EINVAL);
2351 assert_return(s->type == SOURCE_IO, -EDOM);
2352 assert_return(s->pending, -ENODATA);
2353 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2354
2355 *revents = s->io.revents;
2356 return 0;
2357}
2358
f7262a9f 2359_public_ int sd_event_source_get_signal(sd_event_source *s) {
305f78bf
LP
2360 assert_return(s, -EINVAL);
2361 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2362 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2363
2364 return s->signal.sig;
2365}
2366
31927c16 2367_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
305f78bf
LP
2368 assert_return(s, -EINVAL);
2369 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2370
6680b8d1
ME
2371 *priority = s->priority;
2372 return 0;
fd38203a
LP
2373}
2374
31927c16 2375_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
97ef5391
LP
2376 bool rm_inotify = false, rm_inode = false;
2377 struct inotify_data *new_inotify_data = NULL;
2378 struct inode_data *new_inode_data = NULL;
9da4cb2b
LP
2379 int r;
2380
305f78bf 2381 assert_return(s, -EINVAL);
da7e457c 2382 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2383 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2384
2385 if (s->priority == priority)
2386 return 0;
2387
97ef5391
LP
2388 if (s->type == SOURCE_INOTIFY) {
2389 struct inode_data *old_inode_data;
2390
2391 assert(s->inotify.inode_data);
2392 old_inode_data = s->inotify.inode_data;
2393
2394 * We need the original fd to change the priority. If we don't have it we can't change the priority
2395 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2396 * events we allow priority changes only until the first following iteration. */
2397 if (old_inode_data->fd < 0)
2398 return -EOPNOTSUPP;
2399
2400 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2401 if (r < 0)
2402 return r;
2403 rm_inotify = r > 0;
2404
2405 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2406 if (r < 0)
2407 goto fail;
2408 rm_inode = r > 0;
2409
2410 if (new_inode_data->fd < 0) {
2411 /* Duplicate the fd for the new inode object if we don't have any yet */
2412 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2413 if (new_inode_data->fd < 0) {
2414 r = -errno;
2415 goto fail;
2416 }
2417
ed828563 2418 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
97ef5391
LP
2419 }
2420
2421 /* Move the event source to the new inode data structure */
2422 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2423 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2424 s->inotify.inode_data = new_inode_data;
2425
2426 /* Now create the new watch */
2427 r = inode_data_realize_watch(s->event, new_inode_data);
2428 if (r < 0) {
2429 /* Move it back */
2430 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2431 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2432 s->inotify.inode_data = old_inode_data;
2433 goto fail;
2434 }
2435
2436 s->priority = priority;
2437
2438 event_gc_inode_data(s->event, old_inode_data);
2439
b6d5481b 2440 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
9da4cb2b
LP
2441 struct signal_data *old, *d;
2442
2443 /* Move us from the signalfd belonging to the old
2444 * priority to the signalfd of the new priority */
2445
2446 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2447
2448 s->priority = priority;
2449
2450 r = event_make_signal_data(s->event, s->signal.sig, &d);
2451 if (r < 0) {
2452 s->priority = old->priority;
2453 return r;
2454 }
2455
2456 event_unmask_signal_data(s->event, old, s->signal.sig);
2457 } else
2458 s->priority = priority;
fd38203a 2459
e1951c16 2460 event_source_pp_prioq_reshuffle(s);
fd38203a 2461
6203e07a
LP
2462 if (s->type == SOURCE_EXIT)
2463 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
305f78bf 2464
fd38203a 2465 return 0;
97ef5391
LP
2466
2467fail:
2468 if (rm_inode)
2469 event_free_inode_data(s->event, new_inode_data);
2470
2471 if (rm_inotify)
2472 event_free_inotify_data(s->event, new_inotify_data);
2473
2474 return r;
fd38203a
LP
2475}
2476
cad143a8 2477_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
71193c0b
ZJS
2478 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2479 if (!s && !ret)
2480 return false;
2481
305f78bf 2482 assert_return(s, -EINVAL);
305f78bf 2483 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2484
cad143a8
LP
2485 if (ret)
2486 *ret = s->enabled;
2487
08c1eb0e 2488 return s->enabled != SD_EVENT_OFF;
fd38203a
LP
2489}
2490
b6d5481b
LP
2491static int event_source_offline(
2492 sd_event_source *s,
2493 int enabled,
2494 bool ratelimited) {
2495
2496 bool was_offline;
fd38203a
LP
2497 int r;
2498
ddfde737 2499 assert(s);
b6d5481b 2500 assert(enabled == SD_EVENT_OFF || ratelimited);
fd38203a 2501
ddfde737 2502 /* Unset the pending flag when this event source is disabled */
b6d5481b
LP
2503 if (s->enabled != SD_EVENT_OFF &&
2504 enabled == SD_EVENT_OFF &&
2505 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
ddfde737
LP
2506 r = source_set_pending(s, false);
2507 if (r < 0)
2508 return r;
2509 }
cc567911 2510
b6d5481b
LP
2511 was_offline = event_source_is_offline(s);
2512 s->enabled = enabled;
2513 s->ratelimited = ratelimited;
fd38203a 2514
ddfde737 2515 switch (s->type) {
fd38203a 2516
ddfde737
LP
2517 case SOURCE_IO:
2518 source_io_unregister(s);
2519 break;
ac989a78 2520
ddfde737
LP
2521 case SOURCE_SIGNAL:
2522 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2523 break;
fd38203a 2524
ddfde737 2525 case SOURCE_CHILD:
b6d5481b
LP
2526 if (!was_offline) {
2527 assert(s->event->n_online_child_sources > 0);
2528 s->event->n_online_child_sources--;
2529 }
fd38203a 2530
ddfde737
LP
2531 if (EVENT_SOURCE_WATCH_PIDFD(s))
2532 source_child_pidfd_unregister(s);
2533 else
2534 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2535 break;
4807d2d0 2536
ddfde737
LP
2537 case SOURCE_EXIT:
2538 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2539 break;
fd38203a 2540
2115b9b6
YW
2541 case SOURCE_TIME_REALTIME:
2542 case SOURCE_TIME_BOOTTIME:
2543 case SOURCE_TIME_MONOTONIC:
2544 case SOURCE_TIME_REALTIME_ALARM:
2545 case SOURCE_TIME_BOOTTIME_ALARM:
ddfde737
LP
2546 case SOURCE_DEFER:
2547 case SOURCE_POST:
2548 case SOURCE_INOTIFY:
2549 break;
fd38203a 2550
ddfde737 2551 default:
04499a70 2552 assert_not_reached();
ddfde737 2553 }
fd38203a 2554
2115b9b6
YW
2555 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2556 event_source_time_prioq_reshuffle(s);
2557
b6d5481b 2558 return 1;
ddfde737 2559}
f8f3f926 2560
b6d5481b
LP
2561static int event_source_online(
2562 sd_event_source *s,
2563 int enabled,
2564 bool ratelimited) {
2565
2566 bool was_online;
ddfde737 2567 int r;
fd38203a 2568
ddfde737 2569 assert(s);
b6d5481b 2570 assert(enabled != SD_EVENT_OFF || !ratelimited);
305f78bf 2571
ddfde737 2572 /* Unset the pending flag when this event source is enabled */
b6d5481b
LP
2573 if (s->enabled == SD_EVENT_OFF &&
2574 enabled != SD_EVENT_OFF &&
2575 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
ddfde737
LP
2576 r = source_set_pending(s, false);
2577 if (r < 0)
2578 return r;
2579 }
9d3e3aa5 2580
b6d5481b
LP
2581 /* Are we really ready for onlining? */
2582 if (enabled == SD_EVENT_OFF || ratelimited) {
2583 /* Nope, we are not ready for onlining, then just update the precise state and exit */
2584 s->enabled = enabled;
2585 s->ratelimited = ratelimited;
2586 return 0;
2587 }
2588
2589 was_online = event_source_is_online(s);
2590
ddfde737 2591 switch (s->type) {
ddfde737 2592 case SOURCE_IO:
b6d5481b 2593 r = source_io_register(s, enabled, s->io.events);
d2eafe61 2594 if (r < 0)
ddfde737 2595 return r;
ddfde737 2596 break;
fd38203a 2597
ddfde737
LP
2598 case SOURCE_SIGNAL:
2599 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2600 if (r < 0) {
ddfde737
LP
2601 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2602 return r;
2603 }
fd38203a 2604
ddfde737 2605 break;
fd38203a 2606
ddfde737 2607 case SOURCE_CHILD:
ddfde737
LP
2608 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2609 /* yes, we have pidfd */
9da4cb2b 2610
b6d5481b 2611 r = source_child_pidfd_register(s, enabled);
ac9f2640 2612 if (r < 0)
9da4cb2b 2613 return r;
ddfde737
LP
2614 } else {
2615 /* no pidfd, or something other than WEXITED to watch for */
9da4cb2b 2616
ddfde737
LP
2617 r = event_make_signal_data(s->event, SIGCHLD, NULL);
2618 if (r < 0) {
ddfde737
LP
2619 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2620 return r;
2621 }
2622 }
fd38203a 2623
b6d5481b
LP
2624 if (!was_online)
2625 s->event->n_online_child_sources++;
ddfde737 2626 break;
4807d2d0 2627
d2eafe61
ZJS
2628 case SOURCE_TIME_REALTIME:
2629 case SOURCE_TIME_BOOTTIME:
2630 case SOURCE_TIME_MONOTONIC:
2631 case SOURCE_TIME_REALTIME_ALARM:
2632 case SOURCE_TIME_BOOTTIME_ALARM:
ddfde737 2633 case SOURCE_EXIT:
ddfde737
LP
2634 case SOURCE_DEFER:
2635 case SOURCE_POST:
2636 case SOURCE_INOTIFY:
2637 break;
9da4cb2b 2638
ddfde737 2639 default:
04499a70 2640 assert_not_reached();
ddfde737 2641 }
f8f3f926 2642
b6d5481b
LP
2643 s->enabled = enabled;
2644 s->ratelimited = ratelimited;
d2eafe61
ZJS
2645
2646 /* Non-failing operations below */
2115b9b6 2647 if (s->type == SOURCE_EXIT)
d2eafe61 2648 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
d2eafe61 2649
2115b9b6
YW
2650 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2651 event_source_time_prioq_reshuffle(s);
d2eafe61 2652
b6d5481b 2653 return 1;
ddfde737
LP
2654}
2655
2656_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
2657 int r;
9da4cb2b 2658
ddfde737 2659 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
7e922b05
ZJS
2660
2661 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
2662 if (m == SD_EVENT_OFF && !s)
2663 return 0;
2664
2665 assert_return(s, -EINVAL);
ddfde737 2666 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2667
ddfde737
LP
2668 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
2669 if (s->event->state == SD_EVENT_FINISHED)
2670 return m == SD_EVENT_OFF ? 0 : -ESTALE;
305f78bf 2671
ddfde737
LP
2672 if (s->enabled == m) /* No change? */
2673 return 0;
9d3e3aa5 2674
ddfde737 2675 if (m == SD_EVENT_OFF)
b6d5481b 2676 r = event_source_offline(s, m, s->ratelimited);
ddfde737
LP
2677 else {
2678 if (s->enabled != SD_EVENT_OFF) {
2679 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
2680 * event source is already enabled after all. */
2681 s->enabled = m;
2682 return 0;
fd38203a 2683 }
ddfde737 2684
b6d5481b 2685 r = event_source_online(s, m, s->ratelimited);
fd38203a 2686 }
ddfde737
LP
2687 if (r < 0)
2688 return r;
fd38203a 2689
e1951c16 2690 event_source_pp_prioq_reshuffle(s);
fd38203a
LP
2691 return 0;
2692}
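/* Sketch: ONESHOT sources are switched to SD_EVENT_OFF by the dispatch path
 * after firing (see source_dispatch() further down); re-enabling from within
 * the callback keeps the source alive for one more round. 'example_oneshot_cb'
 * is a hypothetical defer-source callback, with a caller-owned flag as
 * userdata: */
static int example_oneshot_cb(sd_event_source *s, void *userdata) {
        bool *more_work = userdata;

        if (*more_work)
                return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);

        return 0;
}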
2693
f7262a9f 2694_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
305f78bf
LP
2695 assert_return(s, -EINVAL);
2696 assert_return(usec, -EINVAL);
6a0f1f6d 2697 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
305f78bf 2698 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2699
2700 *usec = s->time.next;
2701 return 0;
2702}
2703
f7262a9f 2704_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
2a0dc6cd 2705 int r;
6a0f1f6d 2706
305f78bf 2707 assert_return(s, -EINVAL);
6a0f1f6d 2708 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 2709 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2710 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2711
2a0dc6cd
LP
2712 r = source_set_pending(s, false);
2713 if (r < 0)
2714 return r;
2576a19e 2715
2a0dc6cd 2716 s->time.next = usec;
fd38203a 2717
e1951c16 2718 event_source_time_prioq_reshuffle(s);
fd38203a
LP
2719 return 0;
2720}
2721
d6a83dc4
LP
2722_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
2723 usec_t t;
2724 int r;
2725
2726 assert_return(s, -EINVAL);
2727 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2728
2729 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
2730 if (r < 0)
2731 return r;
2732
496db330
YW
2733 usec = usec_add(t, usec);
2734 if (usec == USEC_INFINITY)
d6a83dc4
LP
2735 return -EOVERFLOW;
2736
496db330 2737 return sd_event_source_set_time(s, usec);
d6a83dc4
LP
2738}
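/* Sketch: the usual re-arm idiom for oneshot timers, pushing the wakeup five
 * seconds past the clock's current time and enabling the source again: */
static int example_rearm_timer_5s(sd_event_source *s) {
        int r;

        r = sd_event_source_set_time_relative(s, 5 * USEC_PER_SEC);
        if (r < 0)
                return r;

        return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
}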
2739
f7262a9f 2740_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
305f78bf
LP
2741 assert_return(s, -EINVAL);
2742 assert_return(usec, -EINVAL);
6a0f1f6d 2743 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
305f78bf
LP
2744 assert_return(!event_pid_changed(s->event), -ECHILD);
2745
2746 *usec = s->time.accuracy;
2747 return 0;
2748}
2749
f7262a9f 2750_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
2a0dc6cd 2751 int r;
6a0f1f6d 2752
305f78bf 2753 assert_return(s, -EINVAL);
f5fbe71d 2754 assert_return(usec != UINT64_MAX, -EINVAL);
6a0f1f6d 2755 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 2756 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2757 assert_return(!event_pid_changed(s->event), -ECHILD);
eaa3cbef 2758
2a0dc6cd
LP
2759 r = source_set_pending(s, false);
2760 if (r < 0)
2761 return r;
2762
eaa3cbef
LP
2763 if (usec == 0)
2764 usec = DEFAULT_ACCURACY_USEC;
2765
eaa3cbef
LP
2766 s->time.accuracy = usec;
2767
e1951c16 2768 event_source_time_prioq_reshuffle(s);
6a0f1f6d
LP
2769 return 0;
2770}
2771
2772_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
2773 assert_return(s, -EINVAL);
2774 assert_return(clock, -EINVAL);
2775 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2776 assert_return(!event_pid_changed(s->event), -ECHILD);
eaa3cbef 2777
6a0f1f6d 2778 *clock = event_source_type_to_clock(s->type);
eaa3cbef
LP
2779 return 0;
2780}
2781
f7262a9f 2782_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
4bee8012
LP
2783 assert_return(s, -EINVAL);
2784 assert_return(pid, -EINVAL);
2785 assert_return(s->type == SOURCE_CHILD, -EDOM);
2786 assert_return(!event_pid_changed(s->event), -ECHILD);
2787
2788 *pid = s->child.pid;
2789 return 0;
2790}
2791
f8f3f926
LP
2792_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
2793 assert_return(s, -EINVAL);
2794 assert_return(s->type == SOURCE_CHILD, -EDOM);
2795 assert_return(!event_pid_changed(s->event), -ECHILD);
2796
2797 if (s->child.pidfd < 0)
2798 return -EOPNOTSUPP;
2799
2800 return s->child.pidfd;
2801}
2802
2803_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
2804 assert_return(s, -EINVAL);
2805 assert_return(s->type == SOURCE_CHILD, -EDOM);
2806 assert_return(!event_pid_changed(s->event), -ECHILD);
2807 assert_return(SIGNAL_VALID(sig), -EINVAL);
2808
2809 /* If we already have seen indication the process exited refuse sending a signal early. This way we
2810 * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
2811 * available. */
2812 if (s->child.exited)
2813 return -ESRCH;
2814
2815 if (s->child.pidfd >= 0) {
2816 siginfo_t copy;
2817
2818 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
2819 * structure here */
2820 if (si)
2821 copy = *si;
2822
2823 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
2824 /* Let's propagate the error only if the system call is not implemented or prohibited */
2825 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
2826 return -errno;
2827 } else
2828 return 0;
2829 }
2830
2831 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
2832 * this here. */
2833 if (flags != 0)
2834 return -EOPNOTSUPP;
2835
2836 if (si) {
2837 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
2838 siginfo_t copy = *si;
2839
2840 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
2841 return -errno;
2842 } else if (kill(s->child.pid, sig) < 0)
2843 return -errno;
2844
2845 return 0;
2846}
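/* Sketch: politely stopping a watched child; -ESRCH here means the child
 * already exited and merely awaits reaping by the dispatch path, which a
 * caller will usually treat as success: */
static int example_stop_child(sd_event_source *s) {
        int r;

        r = sd_event_source_send_child_signal(s, SIGTERM, NULL, 0);
        if (r == -ESRCH)
                return 0;

        return r;
}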
2847
2848_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
2849 assert_return(s, -EINVAL);
2850 assert_return(s->type == SOURCE_CHILD, -EDOM);
2851
2852 if (s->child.pidfd < 0)
2853 return -EOPNOTSUPP;
2854
2855 return s->child.pidfd_owned;
2856}
2857
2858_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
2859 assert_return(s, -EINVAL);
2860 assert_return(s->type == SOURCE_CHILD, -EDOM);
2861
2862 if (s->child.pidfd < 0)
2863 return -EOPNOTSUPP;
2864
2865 s->child.pidfd_owned = own;
2866 return 0;
2867}
2868
2869_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
2870 assert_return(s, -EINVAL);
2871 assert_return(s->type == SOURCE_CHILD, -EDOM);
2872
2873 return s->child.process_owned;
2874}
2875
2876_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
2877 assert_return(s, -EINVAL);
2878 assert_return(s->type == SOURCE_CHILD, -EDOM);
2879
2880 s->child.process_owned = own;
2881 return 0;
2882}
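/* Sketch: handing both the process and (where available) the pidfd over to the
 * event loop, so that freeing the source is expected to also kill and reap the
 * child; -EOPNOTSUPP from the pidfd call simply means no pidfd exists for this
 * source: */
static int example_own_child(sd_event_source *s) {
        int r;

        r = sd_event_source_set_child_process_own(s, true);
        if (r < 0)
                return r;

        r = sd_event_source_set_child_pidfd_own(s, true);
        if (r == -EOPNOTSUPP)
                return 0;

        return r;
}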
2883
97ef5391
LP
2884_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
2885 assert_return(s, -EINVAL);
2886 assert_return(mask, -EINVAL);
2887 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
2888 assert_return(!event_pid_changed(s->event), -ECHILD);
2889
2890 *mask = s->inotify.mask;
2891 return 0;
2892}
2893
718db961 2894_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
fd38203a
LP
2895 int r;
2896
da7e457c 2897 assert_return(s, -EINVAL);
6203e07a 2898 assert_return(s->type != SOURCE_EXIT, -EDOM);
da7e457c
LP
2899 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2900 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2901
2902 if (s->prepare == callback)
2903 return 0;
2904
2905 if (callback && s->prepare) {
2906 s->prepare = callback;
2907 return 0;
2908 }
2909
2910 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
2911 if (r < 0)
2912 return r;
2913
2914 s->prepare = callback;
2915
2916 if (callback) {
2917 r = prioq_put(s->event->prepare, s, &s->prepare_index);
2918 if (r < 0)
2919 return r;
2920 } else
2921 prioq_remove(s->event->prepare, s, &s->prepare_index);
2922
2923 return 0;
2924}
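/* Sketch: prepare callbacks run right before the loop polls, which makes them
 * a good place to adjust a source to the program's current state. The
 * 'example_ctx' structure and its fields are hypothetical: */
struct example_ctx {
        sd_event_source *io;
        bool out_pending;
};

static int example_prepare(sd_event_source *s, void *userdata) {
        struct example_ctx *c = userdata;

        /* Poll for writability only while we actually have output queued. */
        return sd_event_source_set_io_events(c->io, EPOLLIN | (c->out_pending ? EPOLLOUT : 0));
}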
2925
f7262a9f 2926_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
da7e457c 2927 assert_return(s, NULL);
fd38203a
LP
2928
2929 return s->userdata;
2930}
2931
8f726607
LP
2932_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
2933 void *ret;
2934
2935 assert_return(s, NULL);
2936
2937 ret = s->userdata;
2938 s->userdata = userdata;
2939
2940 return ret;
2941}
2942
b6d5481b
LP
2943static int event_source_enter_ratelimited(sd_event_source *s) {
2944 int r;
2945
2946 assert(s);
2947
2948 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
2949 * the end of the rate limit time window, much as if it was a timer event source. */
2950
2951 if (s->ratelimited)
2952 return 0; /* Already ratelimited, hence this is a NOP */
2953
2954 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
2955 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
2956 if (r < 0)
2957 return r;
2958
2959 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
2960 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
2961 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
2962 if (EVENT_SOURCE_IS_TIME(s->type))
2963 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
2964
2965 /* Now, let's add the event source to the monotonic clock instead */
2966 r = event_source_time_prioq_put(s, &s->event->monotonic);
2967 if (r < 0)
2968 goto fail;
2969
2970 /* And let's take the event source officially offline */
2971 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
2972 if (r < 0) {
2973 event_source_time_prioq_remove(s, &s->event->monotonic);
2974 goto fail;
2975 }
2976
2977 event_source_pp_prioq_reshuffle(s);
2978
2979 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
2980 return 0;
2981
2982fail:
2983 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
2984 * space for it should already be allocated. */
2985 if (EVENT_SOURCE_IS_TIME(s->type))
2986 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
2987
2988 return r;
2989}
2990
fd69f224 2991static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
b6d5481b
LP
2992 int r;
2993
2994 assert(s);
2995
2996 if (!s->ratelimited)
2997 return 0;
2998
2999 /* Let's take the event source out of the monotonic prioq first. */
3000 event_source_time_prioq_remove(s, &s->event->monotonic);
3001
3002 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3003 if (EVENT_SOURCE_IS_TIME(s->type)) {
3004 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3005 if (r < 0)
3006 goto fail;
3007 }
3008
3009 /* Let's try to take it online again. */
3010 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3011 if (r < 0) {
3012 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3013 if (EVENT_SOURCE_IS_TIME(s->type))
3014 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3015
3016 goto fail;
3017 }
3018
3019 event_source_pp_prioq_reshuffle(s);
3020 ratelimit_reset(&s->rate_limit);
3021
3022 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
fd69f224
MS
3023
3024 if (run_callback && s->ratelimit_expire_callback) {
3025 s->dispatching = true;
3026 r = s->ratelimit_expire_callback(s, s->userdata);
3027 s->dispatching = false;
3028
3029 if (r < 0) {
3030 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3031 strna(s->description),
3032 event_source_type_to_string(s->type),
3033 s->exit_on_failure ? "exiting" : "disabling");
3034
3035 if (s->exit_on_failure)
3036 (void) sd_event_exit(s->event, r);
3037 }
3038
3039 if (s->n_ref == 0)
3040 source_free(s);
3041 else if (r < 0)
0a040e64 3042 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
fd69f224
MS
3043
3044 return 1;
3045 }
3046
b6d5481b
LP
3047 return 0;
3048
3049fail:
3050 /* Do something somewhat reasonable when we cannot move an event sources out of ratelimited mode:
3051 * simply put it back in it, maybe we can then process it more successfully next iteration. */
3052 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3053
3054 return r;
3055}
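/* Sketch: the public entry point into this machinery is
 * sd_event_source_set_ratelimit(); here a chatty IO source is capped at five
 * dispatches per second, after which it is taken offline until the interval
 * elapses and the expiry callback (if one was set) runs as it comes back
 * online: */
static int example_limit_source(sd_event_source *s) {
        return sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 5);
}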
3056
c2ba3ad6
LP
3057static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3058 usec_t c;
3059 assert(e);
3060 assert(a <= b);
3061
3062 if (a <= 0)
3063 return 0;
393003e1
LP
3064 if (a >= USEC_INFINITY)
3065 return USEC_INFINITY;
c2ba3ad6
LP
3066
3067 if (b <= a + 1)
3068 return a;
3069
52444dc4
LP
3070 initialize_perturb(e);
3071
c2ba3ad6
LP
3072 /*
3073 Find a good time to wake up again between times a and b. We
3074 have two goals here:
3075
3076 a) We want to wake up as seldom as possible, hence prefer
3077 later times over earlier times.
3078
3079 b) But if we have to wake up, then let's make sure to
3080 dispatch as much as possible on the entire system.
3081
3082 We implement this by waking up everywhere at the same time
850516e0 3083 within any given minute if we can, synchronised via the
c2ba3ad6 3084 perturbation value determined from the boot ID. If we can't,
ba276c81
LP
3085 then we try to find the same spot in every 10s, then 1s and
3086 then 250ms step. Otherwise, we pick the last possible time
3087 to wake up.
c2ba3ad6
LP
3088 */
3089
850516e0
LP
3090 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3091 if (c >= b) {
3092 if (_unlikely_(c < USEC_PER_MINUTE))
3093 return b;
3094
3095 c -= USEC_PER_MINUTE;
3096 }
3097
ba276c81
LP
3098 if (c >= a)
3099 return c;
3100
3101 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3102 if (c >= b) {
3103 if (_unlikely_(c < USEC_PER_SEC*10))
3104 return b;
3105
3106 c -= USEC_PER_SEC*10;
3107 }
3108
850516e0
LP
3109 if (c >= a)
3110 return c;
3111
3112 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
c2ba3ad6
LP
3113 if (c >= b) {
3114 if (_unlikely_(c < USEC_PER_SEC))
3115 return b;
3116
3117 c -= USEC_PER_SEC;
3118 }
3119
3120 if (c >= a)
3121 return c;
3122
3123 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3124 if (c >= b) {
3125 if (_unlikely_(c < USEC_PER_MSEC*250))
3126 return b;
3127
3128 c -= USEC_PER_MSEC*250;
3129 }
3130
3131 if (c >= a)
3132 return c;
3133
3134 return b;
3135}
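/* Worked example (illustrative numbers): with e->perturb == 18 * USEC_PER_SEC
 * and a window of a = 12:00:10 to b = 12:00:40, the first candidate is
 *
 *   c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + perturb = 12:00:18
 *
 * which lies within [a, b], so it is returned: every loop on this machine
 * whose window covers 12:00:18 wakes at that same instant. Only if c fell
 * outside the window would the 10s, 1s and 250ms steps be tried, with b
 * itself as the final fallback. */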
3136
fd38203a
LP
3137static int event_arm_timer(
3138 sd_event *e,
6a0f1f6d 3139 struct clock_data *d) {
fd38203a
LP
3140
3141 struct itimerspec its = {};
c2ba3ad6
LP
3142 sd_event_source *a, *b;
3143 usec_t t;
fd38203a 3144
cde93897 3145 assert(e);
6a0f1f6d 3146 assert(d);
fd38203a 3147
d06441da 3148 if (!d->needs_rearm)
212bbb17 3149 return 0;
7e2bf71c
YW
3150
3151 d->needs_rearm = false;
212bbb17 3152
6a0f1f6d 3153 a = prioq_peek(d->earliest);
19947509 3154 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
b6d5481b 3155 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
72aedc1e 3156
6a0f1f6d 3157 if (d->fd < 0)
c57b5ca3
LP
3158 return 0;
3159
3a43da28 3160 if (d->next == USEC_INFINITY)
72aedc1e
LP
3161 return 0;
3162
3163 /* disarm */
15c689d7
LP
3164 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3165 return -errno;
72aedc1e 3166
3a43da28 3167 d->next = USEC_INFINITY;
fd38203a 3168 return 0;
72aedc1e 3169 }
fd38203a 3170
6a0f1f6d 3171 b = prioq_peek(d->latest);
19947509
ZJS
3172 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3173 assert(b && b->enabled != SD_EVENT_OFF);
c2ba3ad6 3174
b6d5481b 3175 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
6a0f1f6d 3176 if (d->next == t)
fd38203a
LP
3177 return 0;
3178
6a0f1f6d 3179 assert_se(d->fd >= 0);
fd38203a 3180
c2ba3ad6 3181 if (t == 0) {
1751bdde 3182 /* We don't want to disarm here, just mean some time looooong ago. */
fd38203a
LP
3183 its.it_value.tv_sec = 0;
3184 its.it_value.tv_nsec = 1;
3185 } else
c2ba3ad6 3186 timespec_store(&its.it_value, t);
fd38203a 3187
15c689d7 3188 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
cde93897 3189 return -errno;
fd38203a 3190
6a0f1f6d 3191 d->next = t;
fd38203a
LP
3192 return 0;
3193}
3194
9a800b56 3195static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
fd38203a
LP
3196 assert(e);
3197 assert(s);
3198 assert(s->type == SOURCE_IO);
3199
9a800b56
LP
3200 /* If the event source was already pending, we just OR in the
3201 * new revents, otherwise we reset the value. The ORing is
3202 * necessary to handle EPOLLONESHOT events properly where
3203 * readability might happen independently of writability, and
3204 * we need to keep track of both */
3205
3206 if (s->pending)
3207 s->io.revents |= revents;
3208 else
3209 s->io.revents = revents;
fd38203a 3210
fd38203a
LP
3211 return source_set_pending(s, true);
3212}
3213
72aedc1e 3214static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
fd38203a
LP
3215 uint64_t x;
3216 ssize_t ss;
3217
3218 assert(e);
da7e457c 3219 assert(fd >= 0);
72aedc1e 3220
305f78bf 3221 assert_return(events == EPOLLIN, -EIO);
fd38203a
LP
3222
3223 ss = read(fd, &x, sizeof(x));
3224 if (ss < 0) {
8add30a0 3225 if (ERRNO_IS_TRANSIENT(errno))
fd38203a
LP
3226 return 0;
3227
3228 return -errno;
3229 }
3230
8d35dae7 3231 if (_unlikely_(ss != sizeof(x)))
fd38203a
LP
3232 return -EIO;
3233
cde93897 3234 if (next)
3a43da28 3235 *next = USEC_INFINITY;
72aedc1e 3236
fd38203a
LP
3237 return 0;
3238}
3239
305f78bf
LP
3240static int process_timer(
3241 sd_event *e,
3242 usec_t n,
6a0f1f6d 3243 struct clock_data *d) {
305f78bf 3244
fd38203a 3245 sd_event_source *s;
fd69f224 3246 bool callback_invoked = false;
fd38203a
LP
3247 int r;
3248
3249 assert(e);
6a0f1f6d 3250 assert(d);
fd38203a
LP
3251
3252 for (;;) {
6a0f1f6d 3253 s = prioq_peek(d->earliest);
19947509
ZJS
3254 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3255
b6d5481b
LP
3256 if (!s || time_event_source_next(s) > n)
3257 break;
3258
3259 if (s->ratelimited) {
3260 /* This is an event source whose ratelimit window has ended. Let's turn it on
3261 * again. */
3262 assert(s->ratelimited);
3263
fd69f224 3264 r = event_source_leave_ratelimit(s, /* run_callback */ true);
b6d5481b
LP
3265 if (r < 0)
3266 return r;
fd69f224
MS
3267 else if (r == 1)
3268 callback_invoked = true;
b6d5481b
LP
3269
3270 continue;
3271 }
3272
3273 if (s->enabled == SD_EVENT_OFF || s->pending)
fd38203a
LP
3274 break;
3275
3276 r = source_set_pending(s, true);
3277 if (r < 0)
3278 return r;
3279
e1951c16 3280 event_source_time_prioq_reshuffle(s);
fd38203a
LP
3281 }
3282
fd69f224 3283 return callback_invoked;
fd38203a
LP
3284}
3285
efd3be9d
YW
3286static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3287 int64_t min_priority = threshold;
3288 bool something_new = false;
fd38203a 3289 sd_event_source *s;
fd38203a
LP
3290 int r;
3291
3292 assert(e);
efd3be9d
YW
3293 assert(ret_min_priority);
3294
3295 if (!e->need_process_child) {
3296 *ret_min_priority = min_priority;
3297 return 0;
3298 }
fd38203a 3299
c2ba3ad6
LP
3300 e->need_process_child = false;
3301
91c70071
YW
3302 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3303 * for, instead of using P_ALL. This is because we only want to get child information of very
3304 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3305 * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3306 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3307 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3308 * to handle SIGCHLD yourself.
3309 *
3310 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3311 * source is dispatched so that the callback still sees the process as a zombie. */
fd38203a 3312
90e74a66 3313 HASHMAP_FOREACH(s, e->child_sources) {
fd38203a
LP
3314 assert(s->type == SOURCE_CHILD);
3315
efd3be9d
YW
3316 if (s->priority > threshold)
3317 continue;
3318
fd38203a
LP
3319 if (s->pending)
3320 continue;
3321
b6d5481b 3322 if (event_source_is_offline(s))
fd38203a
LP
3323 continue;
3324
f8f3f926
LP
3325 if (s->child.exited)
3326 continue;
3327
91c70071
YW
3328 if (EVENT_SOURCE_WATCH_PIDFD(s))
3329 /* There's a usable pidfd known for this event source? Then don't waitid() for
3330 * it here */
f8f3f926
LP
3331 continue;
3332
fd38203a 3333 zero(s->child.siginfo);
15c689d7
LP
3334 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3335 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
bfd9bfcc 3336 return negative_errno();
fd38203a
LP
3337
3338 if (s->child.siginfo.si_pid != 0) {
945c2931 3339 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
08cd1552 3340
f8f3f926
LP
3341 if (zombie)
3342 s->child.exited = true;
3343
08cd1552 3344 if (!zombie && (s->child.options & WEXITED)) {
91c70071
YW
3345 /* If the child isn't dead then let's immediately remove the state
3346 * change from the queue, since there's no benefit in leaving it
3347 * queued. */
08cd1552
LP
3348
3349 assert(s->child.options & (WSTOPPED|WCONTINUED));
a5d27871 3350 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
08cd1552
LP
3351 }
3352
fd38203a
LP
3353 r = source_set_pending(s, true);
3354 if (r < 0)
3355 return r;
efd3be9d
YW
3356 if (r > 0) {
3357 something_new = true;
3358 min_priority = MIN(min_priority, s->priority);
3359 }
fd38203a
LP
3360 }
3361 }
3362
efd3be9d
YW
3363 *ret_min_priority = min_priority;
3364 return something_new;
fd38203a
LP
3365}
3366
f8f3f926
LP
3367static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3368 assert(e);
3369 assert(s);
3370 assert(s->type == SOURCE_CHILD);
3371
3372 if (s->pending)
3373 return 0;
3374
b6d5481b 3375 if (event_source_is_offline(s))
f8f3f926
LP
3376 return 0;
3377
3378 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3379 return 0;
3380
3381 zero(s->child.siginfo);
3382 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3383 return -errno;
3384
3385 if (s->child.siginfo.si_pid == 0)
3386 return 0;
3387
3388 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3389 s->child.exited = true;
3390
3391 return source_set_pending(s, true);
3392}
3393
efd3be9d 3394static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
fd38203a
LP
3395 int r;
3396
da7e457c 3397 assert(e);
97ef5391 3398 assert(d);
305f78bf 3399 assert_return(events == EPOLLIN, -EIO);
efd3be9d 3400 assert(min_priority);
fd38203a 3401
91c70071
YW
3402 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3403 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3404 * per priority: if we dequeue some other signal and SIGCHLD is enqueued later, we wouldn't
3405 * notice, even though we might have higher priority children we care about, hence we need
3406 * to check for them explicitly. */
9da4cb2b
LP
3407
3408 if (sigismember(&d->sigset, SIGCHLD))
3409 e->need_process_child = true;
3410
91c70071 3411 /* If there's already an event source pending for this priority we don't read another */
9da4cb2b
LP
3412 if (d->current)
3413 return 0;
3414
fd38203a 3415 for (;;) {
0eb2e0e3 3416 struct signalfd_siginfo si;
7057bd99 3417 ssize_t n;
92daebc0 3418 sd_event_source *s = NULL;
fd38203a 3419
9da4cb2b 3420 n = read(d->fd, &si, sizeof(si));
7057bd99 3421 if (n < 0) {
8add30a0 3422 if (ERRNO_IS_TRANSIENT(errno))
efd3be9d 3423 return 0;
fd38203a
LP
3424
3425 return -errno;
3426 }
3427
7057bd99 3428 if (_unlikely_(n != sizeof(si)))
fd38203a
LP
3429 return -EIO;
3430
6eb7c172 3431 assert(SIGNAL_VALID(si.ssi_signo));
7057bd99 3432
92daebc0
LP
3433 if (e->signal_sources)
3434 s = e->signal_sources[si.ssi_signo];
92daebc0
LP
3435 if (!s)
3436 continue;
9da4cb2b
LP
3437 if (s->pending)
3438 continue;
fd38203a
LP
3439
3440 s->signal.siginfo = si;
9da4cb2b
LP
3441 d->current = s;
3442
fd38203a
LP
3443 r = source_set_pending(s, true);
3444 if (r < 0)
3445 return r;
efd3be9d
YW
3446 if (r > 0 && *min_priority >= s->priority) {
3447 *min_priority = s->priority;
3448 return 1; /* an event source with smaller priority is queued. */
3449 }
9da4cb2b 3450
efd3be9d 3451 return 0;
fd38203a 3452 }
fd38203a
LP
3453}
3454
efd3be9d 3455static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
97ef5391
LP
3456 ssize_t n;
3457
3458 assert(e);
3459 assert(d);
3460
3461 assert_return(revents == EPOLLIN, -EIO);
3462
3463 /* If there's already an event source pending for this priority, don't read another */
3464 if (d->n_pending > 0)
3465 return 0;
3466
3467 /* Is the read buffer non-empty? If so, let's not read more */
3468 if (d->buffer_filled > 0)
3469 return 0;
3470
efd3be9d
YW
3471 if (d->priority > threshold)
3472 return 0;
3473
97ef5391
LP
3474 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3475 if (n < 0) {
8add30a0 3476 if (ERRNO_IS_TRANSIENT(errno))
97ef5391
LP
3477 return 0;
3478
3479 return -errno;
3480 }
3481
3482 assert(n > 0);
3483 d->buffer_filled = (size_t) n;
0601b958 3484 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
3485
3486 return 1;
3487}
3488
3489static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3490 assert(e);
3491 assert(d);
3492 assert(sz <= d->buffer_filled);
3493
3494 if (sz == 0)
3495 return;
3496
3497 /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3498 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3499 d->buffer_filled -= sz;
3500
3501 if (d->buffer_filled == 0)
0601b958 3502 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
3503}
3504
3505static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3506 int r;
3507
3508 assert(e);
3509 assert(d);
3510
3511 /* If there's already an event source pending for this priority, don't read another */
3512 if (d->n_pending > 0)
3513 return 0;
3514
3515 while (d->buffer_filled > 0) {
3516 size_t sz;
3517
3518 /* Let's validate that the event structures are complete */
3519 if (d->buffer_filled < offsetof(struct inotify_event, name))
3520 return -EIO;
3521
3522 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3523 if (d->buffer_filled < sz)
3524 return -EIO;
3525
3526 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3527 struct inode_data *inode_data;
97ef5391
LP
3528
3529 /* The queue overran, let's pass this event to all event sources connected to this inotify
3530 * object */
3531
03677889 3532 HASHMAP_FOREACH(inode_data, d->inodes)
97ef5391
LP
3533 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3534
b6d5481b 3535 if (event_source_is_offline(s))
97ef5391
LP
3536 continue;
3537
3538 r = source_set_pending(s, true);
3539 if (r < 0)
3540 return r;
3541 }
97ef5391
LP
3542 } else {
3543 struct inode_data *inode_data;
97ef5391
LP
3544
3545 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3546 * our watch descriptor table. */
3547 if (d->buffer.ev.mask & IN_IGNORED) {
3548
3549 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3550 if (!inode_data) {
3551 event_inotify_data_drop(e, d, sz);
3552 continue;
3553 }
3554
3555 /* The watch descriptor was removed by the kernel, let's drop it here too */
3556 inode_data->wd = -1;
3557 } else {
3558 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3559 if (!inode_data) {
3560 event_inotify_data_drop(e, d, sz);
3561 continue;
3562 }
3563 }
3564
3565 /* Trigger all event sources that are interested in these events. Also trigger all event
3566 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3567 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3568
b6d5481b 3569 if (event_source_is_offline(s))
97ef5391
LP
3570 continue;
3571
3572 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3573 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3574 continue;
3575
3576 r = source_set_pending(s, true);
3577 if (r < 0)
3578 return r;
3579 }
3580 }
3581
3582 /* Something pending now? If so, let's finish, otherwise let's read more. */
3583 if (d->n_pending > 0)
3584 return 1;
3585 }
3586
3587 return 0;
3588}
3589
3590static int process_inotify(sd_event *e) {
97ef5391
LP
3591 int r, done = 0;
3592
3593 assert(e);
3594
0601b958 3595 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
97ef5391
LP
3596 r = event_inotify_data_process(e, d);
3597 if (r < 0)
3598 return r;
3599 if (r > 0)
3600 done++;
3601 }
3602
3603 return done;
3604}
3605
static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        sd_event *saved_event;
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type here, so that we still know it after the event callback, which might
         * invalidate the event source. */
        saved_type = s->type;

        /* Similarly, store a reference to the event loop object, so that we can still access it after the
         * callback might have invalidated/disconnected the event source. */
        saved_event = s->event;
        PROTECT_EVENT(saved_event);

        /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
        assert(!s->ratelimited);
        if (!ratelimit_below(&s->rate_limit)) {
                r = event_source_enter_ratelimited(s);
                if (r < 0)
                        return r;

                return 1;
        }

        if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;

                /* If we execute a non-post source, let's mark all post sources as pending. */

                SET_FOREACH(z, s->event->post_sources) {
                        if (event_source_is_offline(z))
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie) {
                        (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
                        s->child.waited = true;
                }

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_INOTIFY: {
                struct sd_event *e = s->event;
                struct inotify_data *d;
                size_t sz;

                assert(s->inotify.inode_data);
                assert_se(d = s->inotify.inode_data->inotify_data);

                assert(d->buffer_filled >= offsetof(struct inotify_event, name));
                sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
                assert(d->buffer_filled >= sz);

                /* If the inotify callback destroys the event source then this likely means we don't need to
                 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
                 * free it immediately, then we couldn't drop the event from the inotify event queue without
                 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
                 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
                 * explicitly GC it after we are done dropping the inotify event from the buffer. */
                d->n_busy++;
                r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
                d->n_busy--;

                /* When no event is pending anymore on this inotify object, then let's drop the event from
                 * the inotify event queue buffer. */
                if (d->n_pending == 0)
                        event_inotify_data_drop(e, d, sz);

                /* Now we don't want to access 'd' anymore, it's OK to GC now. */
                event_gc_inotify_data(e, d);
                break;
        }

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached();
        }

        s->dispatching = false;

        if (r < 0) {
                log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
                                strna(s->description),
                                event_source_type_to_string(saved_type),
                                s->exit_on_failure ? "exiting" : "disabling");

                if (s->exit_on_failure)
                        (void) sd_event_exit(saved_event, r);
        }

        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);

        return 1;
}

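/* Behavior sketch (illustrative only, not part of the original file): what the dispatch logic above means
 * for a user callback. The handler name is hypothetical.
 *
 *     static int on_defer(sd_event_source *s, void *userdata) {
 *             // An SD_EVENT_ONESHOT source was already switched to SD_EVENT_OFF before this runs, so
 *             // re-enabling it here is the way to keep it alive:
 *             // (void) sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
 *
 *             // Returning a negative errno disables the source, or terminates the whole loop with that
 *             // code if sd_event_source_set_exit_on_failure(s, true) was called earlier.
 *             return 0;
 *     }
 */
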
static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
                        break;

                s->prepare_iteration = e->iteration;
                r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;

                assert(s->prepare);

                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0) {
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
                                        strna(s->description),
                                        event_source_type_to_string(s->type),
                                        s->exit_on_failure ? "exiting" : "disabling");

                        if (s->exit_on_failure)
                                (void) sd_event_exit(e, r);
                }

                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
        }

        return 0;
}

static int dispatch_exit(sd_event *e) {
        sd_event_source *p;
        int r;

        assert(e);

        p = prioq_peek(e->exit);
        assert(!p || p->type == SOURCE_EXIT);

        if (!p || event_source_is_offline(p)) {
                e->state = SD_EVENT_FINISHED;
                return 0;
        }

        PROTECT_EVENT(e);
        e->iteration++;
        e->state = SD_EVENT_EXITING;
        r = source_dispatch(p);
        e->state = SD_EVENT_INITIAL;
        return r;
}

static sd_event_source* event_next_pending(sd_event *e) {
        sd_event_source *p;

        assert(e);

        p = prioq_peek(e->pending);
        if (!p)
                return NULL;

        if (event_source_is_offline(p))
                return NULL;

        return p;
}

static int arm_watchdog(sd_event *e) {
        struct itimerspec its = {};
        usec_t t;

        assert(e);
        assert(e->watchdog_fd >= 0);

        t = sleep_between(e,
                          usec_add(e->watchdog_last, (e->watchdog_period / 2)),
                          usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));

        timespec_store(&its.it_value, t);

        /* Make sure we never set the watchdog to 0, which tells the
         * kernel to disable it. */
        if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
                its.it_value.tv_nsec = 1;

        return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
}

static int process_watchdog(sd_event *e) {
        assert(e);

        if (!e->watchdog)
                return 0;

        /* Don't notify watchdog too often */
        if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
                return 0;

        sd_notify(false, "WATCHDOG=1");
        e->watchdog_last = e->timestamp.monotonic;

        return arm_watchdog(e);
}

static void event_close_inode_data_fds(sd_event *e) {
        struct inode_data *d;

        assert(e);

        /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
         * filesystems. But we can't close them right away as we need them as long as the user still wants to make
         * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
         * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
         * compromise. */

        while ((d = e->inode_data_to_close_list)) {
                assert(d->fd >= 0);
                d->fd = safe_close(d->fd);

                LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
        }
}

_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
         * this check here once, since gettid() is typically not cached, and we thus want to minimize
         * syscalls. */
        assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        event_close_inode_data_fds(e);

        if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}

static int epoll_wait_usec(
                int fd,
                struct epoll_event *events,
                int maxevents,
                usec_t timeout) {

        int msec;

        /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */

#if HAVE_EPOLL_PWAIT2
        static bool epoll_pwait2_absent = false;
        int r;

        /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
         * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
         * is not that obvious to implement given the libc and kernel definitions differ in the last
         * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
         * biggie), let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
         * missing. */

        if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
                r = epoll_pwait2(fd,
                                 events,
                                 maxevents,
                                 TIMESPEC_STORE(timeout),
                                 NULL);
                if (r >= 0)
                        return r;
                if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                        return -errno; /* Only fall back to old epoll_wait() if the syscall is masked or not
                                        * supported. */

                epoll_pwait2_absent = true;
        }
#endif

        if (timeout == USEC_INFINITY)
                msec = -1;
        else {
                usec_t k;

                k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
                if (k >= INT_MAX)
                        msec = INT_MAX; /* Saturate */
                else
                        msec = (int) k;
        }

        return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
}

static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
        size_t n_event_queue, m, n_event_max;
        int64_t min_priority = threshold;
        bool something_new = false;
        int r;

        assert(e);
        assert(ret_min_priority);

        n_event_queue = MAX(e->n_sources, 1u);
        if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
                return -ENOMEM;

        n_event_max = MALLOC_ELEMENTSOF(e->event_queue);

        /* If we still have inotify data buffered, then query the other fds, but don't block */
        if (e->buffered_inotify_data_list)
                timeout = 0;

        for (;;) {
                r = epoll_wait_usec(
                                e->epoll_fd,
                                e->event_queue,
                                n_event_max,
                                timeout);
                if (r < 0)
                        return r;

                m = (size_t) r;

                if (m < n_event_max)
                        break;

                if (n_event_max >= n_event_queue * 10)
                        break;

                if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
                        return -ENOMEM;

                n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
                timeout = 0;
        }

        /* Set the timestamp only when this is called the first time. */
        if (threshold == INT64_MAX)
                triple_timestamp_get(&e->timestamp);

        for (size_t i = 0; i < m; i++) {

                if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
                else {
                        WakeupType *t = e->event_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE: {
                                sd_event_source *s = e->event_queue[i].data.ptr;

                                assert(s);

                                if (s->priority > threshold)
                                        continue;

                                min_priority = MIN(min_priority, s->priority);

                                switch (s->type) {

                                case SOURCE_IO:
                                        r = process_io(e, s, e->event_queue[i].events);
                                        break;

                                case SOURCE_CHILD:
                                        r = process_pidfd(e, s, e->event_queue[i].events);
                                        break;

                                default:
                                        assert_not_reached();
                                }

                                break;
                        }

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = e->event_queue[i].data.ptr;

                                assert(d);

                                r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
                                break;

                        case WAKEUP_INOTIFY_DATA:
                                r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
                                break;

                        default:
                                assert_not_reached();
                        }
                }
                if (r < 0)
                        return r;
                if (r > 0)
                        something_new = true;
        }

        *ret_min_priority = min_priority;
        return something_new;
}

_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        for (int64_t threshold = INT64_MAX; ; threshold--) {
                int64_t epoll_min_priority, child_min_priority;

                /* It is possible that new epoll (especially IO) and child events are triggered just after
                 * the process_epoll() call but before process_child(), and the new IO events may have
                 * higher priority than the child events. To salvage these events, let's call epoll_wait()
                 * again, but accept only events with higher priority than the previous ones. See issue
                 * https://github.com/systemd/systemd/issues/18190 and comments
                 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
                 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */

                r = process_epoll(e, timeout, threshold, &epoll_min_priority);
                if (r == -EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }
                if (r < 0)
                        goto finish;
                if (r == 0 && threshold < INT64_MAX)
                        /* No new epoll event. */
                        break;

                r = process_child(e, threshold, &child_min_priority);
                if (r < 0)
                        goto finish;
                if (r == 0)
                        /* No new child event. */
                        break;

                threshold = MIN(epoll_min_priority, child_min_priority);
                if (threshold == INT64_MIN)
                        break;

                timeout = 0;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        r = process_inotify(e);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;
        else if (r == 1) {
                /* The ratelimit expiry callback was called. Let's postpone processing pending sources and
                 * put the loop in the initial state, in order to evaluate (in the next iteration) also
                 * sources that were potentially re-enabled by the callback.
                 *
                 * Wondering why we treat only this invocation of process_timer() differently? Once an event
                 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
                 * ratelimit expiry callback is never called for any other timer type. */
                r = 0;
                goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}

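/* Priority sketch (illustrative only, not part of the original file): the threshold logic above only
 * matters when sources have distinct priorities, where smaller values are dispatched first. The source
 * variable names are hypothetical:
 *
 *     (void) sd_event_source_set_priority(io_source, SD_EVENT_PRIORITY_IMPORTANT);    // -100
 *     (void) sd_event_source_set_priority(child_source, SD_EVENT_PRIORITY_NORMAL);    //    0
 *
 * With this setup, an IO event that arrives between process_epoll() and process_child() is still
 * dispatched before the child event, thanks to the re-polling loop above. */
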
_public_ int sd_event_dispatch(sd_event *e) {
        sd_event_source *p;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_PENDING, -EBUSY);

        if (e->exit_requested)
                return dispatch_exit(e);

        p = event_next_pending(e);
        if (p) {
                PROTECT_EVENT(e);

                e->state = SD_EVENT_RUNNING;
                r = source_dispatch(p);
                e->state = SD_EVENT_INITIAL;
                return r;
        }

        e->state = SD_EVENT_INITIAL;

        return 1;
}

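/* Usage sketch (illustrative only, not part of the original file): sd_event_prepare(), sd_event_wait() and
 * sd_event_dispatch() can be combined into a hand-rolled loop, which is essentially what sd_event_run()
 * below does:
 *
 *     for (;;) {
 *             r = sd_event_prepare(e);
 *             if (r < 0)
 *                     return r;
 *             if (r == 0) {
 *                     r = sd_event_wait(e, UINT64_MAX);
 *                     if (r < 0)
 *                             return r;
 *             }
 *             if (r > 0) {
 *                     r = sd_event_dispatch(e);
 *                     if (r < 0)
 *                             return r;
 *             }
 *     }
 */
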
static void event_log_delays(sd_event *e) {
        char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
        size_t l, i;

        p = b;
        l = sizeof(b);
        for (i = 0; i < ELEMENTSOF(e->delays); i++) {
                l = strpcpyf(&p, l, "%u ", e->delays[i]);
                e->delays[i] = 0;
        }
        log_debug("Event loop iterations: %s", b);
}

_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        if (e->profile_delays && e->last_run_usec != 0) {
                usec_t this_run;
                unsigned l;

                this_run = now(CLOCK_MONOTONIC);

                l = log2u64(this_run - e->last_run_usec);
                assert(l < ELEMENTSOF(e->delays));
                e->delays[l]++;

                if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
                        event_log_delays(e);
                        e->last_log_usec = this_run;
                }
        }

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        r = sd_event_prepare(e);
        if (r == 0)
                /* There was nothing? Then wait... */
                r = sd_event_wait(e, timeout);

        if (e->profile_delays)
                e->last_run_usec = now(CLOCK_MONOTONIC);

        if (r > 0) {
                /* There's something now, then let's dispatch it */
                r = sd_event_dispatch(e);
                if (r < 0)
                        return r;

                return 1;
        }

        return r;
}

_public_ int sd_event_loop(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        PROTECT_EVENT(e);

        while (e->state != SD_EVENT_FINISHED) {
                r = sd_event_run(e, UINT64_MAX);
                if (r < 0)
                        return r;
        }

        return e->exit_code;
}

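/* Usage sketch (illustrative only, not part of the original file): a minimal program built on
 * sd_event_loop(). The callback name is hypothetical.
 *
 *     static int on_time(sd_event_source *s, uint64_t usec, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     int main(void) {
 *             _cleanup_(sd_event_unrefp) sd_event *e = NULL;
 *             int r;
 *
 *             r = sd_event_default(&e);
 *             if (r < 0)
 *                     return EXIT_FAILURE;
 *
 *             r = sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC, 5 * USEC_PER_SEC, 0, on_time, NULL);
 *             if (r < 0)
 *                     return EXIT_FAILURE;
 *
 *             return sd_event_loop(e) < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
 *     }
 */
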
_public_ int sd_event_get_fd(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->epoll_fd;
}

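/* Usage sketch (illustrative only, not part of the original file): the fd returned above can be embedded
 * into a foreign poll loop; when it turns readable, run one non-blocking loop iteration:
 *
 *     struct pollfd pfd = { .fd = sd_event_get_fd(e), .events = POLLIN };
 *
 *     for (;;) {
 *             if (poll(&pfd, 1, -1) < 0)
 *                     return -errno;
 *
 *             r = sd_event_run(e, 0);
 *             if (r < 0)
 *                     return r;
 *     }
 */
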
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->state;
}

_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(code, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!e->exit_requested)
                return -ENODATA;

        *code = e->exit_code;
        return 0;
}

_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;

        return 0;
}

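/* Usage sketch (illustrative only, not part of the original file): sd_event_exit() is typically called
 * from within a callback; the loop then dispatches its exit sources (see dispatch_exit() above) and
 * sd_event_loop() returns the given code. The handler name is hypothetical.
 *
 *     static int on_sigterm(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 */
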
_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(usec, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
                return -EOPNOTSUPP;

        if (!triple_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
                *usec = now(clock);
                return 1;
        }

        *usec = triple_timestamp_by_clock(&e->timestamp, clock);
        return 0;
}

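/* Usage sketch (illustrative only, not part of the original file): sd_event_now() is the recommended base
 * for relative timers, since it returns the cached timestamp of the current loop iteration rather than a
 * fresh clock read. The 'on_time' handler is hypothetical:
 *
 *     uint64_t usec;
 *
 *     r = sd_event_now(e, CLOCK_MONOTONIC, &usec);
 *     if (r < 0)
 *             return r;
 *
 *     r = sd_event_add_time(e, NULL, CLOCK_MONOTONIC, usec + 30 * USEC_PER_SEC, 0, on_time, NULL);
 *     if (r < 0)
 *             return r;
 */
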
_public_ int sd_event_default(sd_event **ret) {
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}

_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(tid, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (e->tid != 0) {
                *tid = e->tid;
                return 0;
        }

        return -ENXIO;
}

_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                struct epoll_event ev = {
                        .events = EPOLLIN,
                        .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
                };

                if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}

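/* Usage sketch (illustrative only, not part of the original file): in a service started with WatchdogSec=
 * the service manager sets $WATCHDOG_USEC/$WATCHDOG_PID, sd_watchdog_enabled() picks that up, and the loop
 * then pings the manager automatically from process_watchdog() above:
 *
 *     r = sd_event_set_watchdog(e, true);
 *     if (r < 0)
 *             return r;                  // setting up the watchdog timer failed
 *     if (r == 0)
 *             log_debug("Watchdog not requested by the service manager.");
 */
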
_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->watchdog;
}

_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        *ret = e->iteration;
        return 0;
}

_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
        assert_return(s, -EINVAL);

        s->destroy_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
        assert_return(s, -EINVAL);

        if (ret)
                *ret = s->destroy_callback;

        return !!s->destroy_callback;
}
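
/* Usage sketch (illustrative only, not part of the original file): the destroy callback is a convenient
 * hook for releasing userdata exactly once, no matter how the source ends up freed. 'struct ctx' and
 * 'on_defer' are hypothetical:
 *
 *     struct ctx *c = new0(struct ctx, 1);
 *     if (!c)
 *             return -ENOMEM;
 *
 *     r = sd_event_add_defer(e, &s, on_defer, c);
 *     if (r < 0) {
 *             free(c);
 *             return r;
 *     }
 *
 *     (void) sd_event_source_set_destroy_callback(s, free);   // frees 'c' when 's' is destroyed
 */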

_public_ int sd_event_source_get_floating(sd_event_source *s) {
        assert_return(s, -EINVAL);

        return s->floating;
}

_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);

        if (s->floating == !!b)
                return 0;

        if (!s->event) /* Already disconnected */
                return -ESTALE;

        s->floating = b;

        if (b) {
                sd_event_source_ref(s);
                sd_event_unref(s->event);
        } else {
                sd_event_ref(s->event);
                sd_event_source_unref(s);
        }

        return 1;
}
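
/* Usage sketch (illustrative only, not part of the original file): marking a source floating transfers
 * ownership to the event loop, so the caller need not keep a reference of its own:
 *
 *     r = sd_event_add_defer(e, &s, on_defer, NULL);   // 'on_defer' is hypothetical
 *     if (r < 0)
 *             return r;
 *
 *     (void) sd_event_source_set_floating(s, true);
 *     s = sd_event_source_unref(s);    // the loop keeps the source alive from here on
 */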

_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);

        return s->exit_on_failure;
}

_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);

        if (s->exit_on_failure == !!b)
                return 0;

        s->exit_on_failure = b;
        return 1;
}

_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
        int r;

        assert_return(s, -EINVAL);

        /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
         * so is a programming error. */
        assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);

        /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
         * non-ratelimited. */
        r = event_source_leave_ratelimit(s, /* run_callback */ false);
        if (r < 0)
                return r;

        s->rate_limit = (RateLimit) { interval, burst };
        return 0;
}

_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
        assert_return(s, -EINVAL);

        s->ratelimit_expire_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
        assert_return(s, -EINVAL);

        /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
         * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return -EDOM;

        if (!ratelimit_configured(&s->rate_limit))
                return -ENOEXEC;

        if (ret_interval)
                *ret_interval = s->rate_limit.interval;
        if (ret_burst)
                *ret_burst = s->rate_limit.burst;

        return 0;
}

_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
        assert_return(s, -EINVAL);

        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return false;

        if (!ratelimit_configured(&s->rate_limit))
                return false;

        return s->ratelimited;
}

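/* Usage sketch (illustrative only, not part of the original file): allow at most 10 dispatches per
 * 5 seconds for a busy source; once the interval elapses, the source automatically comes back online and
 * the optional expiry callback (a hypothetical 'on_ratelimit_expire' here) is invoked:
 *
 *     r = sd_event_source_set_ratelimit(s, 5 * USEC_PER_SEC, 10);
 *     if (r < 0)
 *             return r;
 *
 *     (void) sd_event_source_set_ratelimit_expire_callback(s, on_ratelimit_expire);
 */
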
_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
        bool change = false;
        int r;

        assert_return(e, -EINVAL);

        if (b) {
                /* We want to maintain pointers to these event sources, so that we can destroy them when told
                 * so. But we also don't want them to pin the event loop itself. Hence we mark them as
                 * floating after creation (and undo this before deleting them again). */

                if (!e->sigint_event_source) {
                        r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0)
                                return r;

                        assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
                        change = true;
                }

                if (!e->sigterm_event_source) {
                        r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0) {
                                if (change) {
                                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                                }

                                return r;
                        }

                        assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
                        change = true;
                }

        } else {
                if (e->sigint_event_source) {
                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                        change = true;
                }

                if (e->sigterm_event_source) {
                        assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
                        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
                        change = true;
                }
        }

        return change;
}
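
/* Usage sketch (illustrative only, not part of the original file): the common "exit cleanly on
 * SIGINT/SIGTERM" pattern collapses into a single call with the helper above:
 *
 *     r = sd_event_set_signal_exit(e, true);
 *     if (r < 0)
 *             return r;
 *
 *     return sd_event_loop(e);   // Ctrl-C now terminates the loop instead of killing the process
 */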