/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sys/wait.h>

#include "sd-daemon.h"
#include "sd-event.h"
#include "sd-id128.h"

#include "alloc-util.h"
#include "env-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "fs-util.h"
#include "glyph-util.h"
#include "hashmap.h"
#include "list.h"
#include "logarithm.h"
#include "macro.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "prioq.h"
#include "process-util.h"
#include "set.h"
#include "signal-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strxcpyx.h"
#include "time-util.h"

#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)

static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
        /* Returns true if this is a PID event source that can be implemented by watching EPOLLIN */
        return s &&
                s->type == SOURCE_CHILD &&
                s->child.pidfd >= 0 &&
                s->child.options == WEXITED;
}

static bool event_source_is_online(sd_event_source *s) {
        assert(s);
        return s->enabled != SD_EVENT_OFF && !s->ratelimited;
}

static bool event_source_is_offline(sd_event_source *s) {
        assert(s);
        return s->enabled == SD_EVENT_OFF || s->ratelimited;
}

static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
        [SOURCE_IO] = "io",
        [SOURCE_TIME_REALTIME] = "realtime",
        [SOURCE_TIME_BOOTTIME] = "boottime",
        [SOURCE_TIME_MONOTONIC] = "monotonic",
        [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
        [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
        [SOURCE_SIGNAL] = "signal",
        [SOURCE_CHILD] = "child",
        [SOURCE_DEFER] = "defer",
        [SOURCE_POST] = "post",
        [SOURCE_EXIT] = "exit",
        [SOURCE_WATCHDOG] = "watchdog",
        [SOURCE_INOTIFY] = "inotify",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);

#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)

#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_IO,                       \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_SIGNAL,                   \
               SOURCE_DEFER,                    \
               SOURCE_INOTIFY)

/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)

struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;

        Prioq *pending;
        Prioq *prepare;

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources;

        Set *post_sources;

        Prioq *exit;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close_list);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, buffered_inotify_data_list);

        pid_t original_pid;

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        sd_event_source *sigint_event_source, *sigterm_event_source;

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8];
};

static thread_local sd_event *default_event = NULL;

static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);

static sd_event *event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}

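/* Note: SD_EVENT_DEFAULT is a magic pointer value rather than a real object. event_resolve() maps it to
 * the calling thread's default event loop, so the public functions below accept either an explicit loop
 * pointer or SD_EVENT_DEFAULT. If no default loop exists yet this resolves to NULL, which the
 * "assert_return(e = event_resolve(e), -ENOPKG)" checks below turn into -ENOPKG. */
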
static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->pending);
        assert(y->pending);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Lower priority values first */
        r = CMP(x->priority, y->priority);
        if (r != 0)
                return r;

        /* Older entries first */
        return CMP(x->pending_iteration, y->pending_iteration);
}

static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->prepare);
        assert(y->prepare);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Move most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        r = CMP(x->prepare_iteration, y->prepare_iteration);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static usec_t time_event_source_next(const sd_event_source *s) {
        assert(s);

        /* We have two kinds of event sources that have elapsation times associated with them: the actual
         * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
         * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
         * looking at here. */

        if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Otherwise this must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return s->time.next;

        return USEC_INFINITY;
}

static usec_t time_event_source_latest(const sd_event_source *s) {
        assert(s);

        if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
                               * same, as we should avoid adding additional inaccuracy on an inaccuracy time
                               * window */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return usec_add(s->time.next, s->time.accuracy);

        return USEC_INFINITY;
}

static bool event_source_timer_candidate(const sd_event_source *s) {
        assert(s);

        /* Returns true for event sources that either are not pending yet (i.e. where it's worth marking
         * them pending) or which are currently ratelimited (i.e. where it's worth leaving the ratelimited
         * state) */
        return !s->pending || s->ratelimited;
}

static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
        r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
        if (r != 0)
                return r;

        /* Order by time */
        return CMP(time_func(x), time_func(y));
}

static int earliest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_next);
}

static int latest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_latest);
}
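
/* The comparators above back the per-clock "earliest" and "latest" priority queues: "earliest" is ordered
 * by the next elapse time, "latest" by the next elapse time plus the configured accuracy. The timer
 * arming code elsewhere in this file peeks at both queues and may pick any wakeup point within the
 * [next, next + accuracy] window shared by all armed timers, which is what allows several timer events
 * to be coalesced into a single timerfd wakeup. */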

static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}

static sd_event *event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->inotify_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);

        free(e->event_queue);

        return mfree(e);
}

_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new(sd_event, 1);
        if (!e)
                return -ENOMEM;

        *e = (sd_event) {
                .n_ref = 1,
                .epoll_fd = -EBADF,
                .watchdog_fd = -EBADF,
                .realtime.wakeup = WAKEUP_CLOCK_DATA,
                .realtime.fd = -EBADF,
                .realtime.next = USEC_INFINITY,
                .boottime.wakeup = WAKEUP_CLOCK_DATA,
                .boottime.fd = -EBADF,
                .boottime.next = USEC_INFINITY,
                .monotonic.wakeup = WAKEUP_CLOCK_DATA,
                .monotonic.fd = -EBADF,
                .monotonic.next = USEC_INFINITY,
                .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .realtime_alarm.fd = -EBADF,
                .realtime_alarm.next = USEC_INFINITY,
                .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .boottime_alarm.fd = -EBADF,
                .boottime_alarm.next = USEC_INFINITY,
                .perturb = USEC_INFINITY,
                .original_pid = getpid_cached(),
        };

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
                          special_glyph(SPECIAL_GLYPH_ELLIPSIS));
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}

DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event, sd_event, event_free);
#define PROTECT_EVENT(e)                                                \
        _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
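
/* PROTECT_EVENT() pins a reference to the event loop for the rest of the current scope: the extra
 * reference is dropped automatically via the _cleanup_ attribute when the scope is left. This guards
 * against user callbacks dropping the last external reference while the loop object is still in use.
 * Typical use is a bare "PROTECT_EVENT(e);" at the top of a dispatching function. */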

_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
        if (s)
                (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
        return sd_event_source_unref(s);
}

static bool event_pid_changed(sd_event *e) {
        assert(e);

        /* We don't support people creating an event loop and keeping
         * it around over a fork(). Let's complain. */

        return e->original_pid != getpid_cached();
}

static void source_io_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}

static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->io.fd, &ev) < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}

static void source_child_pidfd_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (event_pid_changed(s->event))
                return;

        if (!s->child.registered)
                return;

        if (EVENT_SOURCE_WATCH_PIDFD(s))
                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

        s->child.registered = false;
}

static int source_child_pidfd_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_CHILD);
        assert(enabled != SD_EVENT_OFF);

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                struct epoll_event ev = {
                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                        .data.ptr = s,
                };

                if (epoll_ctl(s->event->epoll_fd,
                              s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                              s->child.pidfd, &ev) < 0)
                        return -errno;
        }

        s->child.registered = true;
        return 0;
}

static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}

static EventSourceType clock_to_event_source_type(clockid_t clock) {

        switch (clock) {

        case CLOCK_REALTIME:
                return SOURCE_TIME_REALTIME;

        case CLOCK_BOOTTIME:
                return SOURCE_TIME_BOOTTIME;

        case CLOCK_MONOTONIC:
                return SOURCE_TIME_MONOTONIC;

        case CLOCK_REALTIME_ALARM:
                return SOURCE_TIME_REALTIME_ALARM;

        case CLOCK_BOOTTIME_ALARM:
                return SOURCE_TIME_BOOTTIME_ALARM;

        default:
                return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        }
}

static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        assert(e);

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}

static void event_free_signal_data(sd_event *e, struct signal_data *d) {
        assert(e);

        if (!d)
                return;

        hashmap_remove(e->signal_data, &d->priority);
        safe_close(d->fd);
        free(d);
}

static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                d = new(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                *d = (struct signal_data) {
                        .wakeup = WAKEUP_SIGNAL_DATA,
                        .fd = -EBADF,
                        .priority = priority,
                };

                r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
                     &ss_copy,
                     SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        if (added)
                event_free_signal_data(e, d);

        return r;
}
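
/* Note the signalfd design: one struct signal_data (and thus one signalfd) exists per event source
 * priority rather than per signal, with the signal_data hashmap indexed by the 64-bit priority value.
 * That way signals subscribed at different priorities arrive on different file descriptors, and the
 * event loop's priority ordering applies to signal handling, too. */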

static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data object. If the signal mask of the object
         * becomes empty that way, the object is removed entirely. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {
                /* If the mask is all-zero we can get rid of the structure */
                event_free_signal_data(e, d);
                return;
        }

        if (event_pid_changed(e))
                return;

        assert(d->fd >= 0);

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}

static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
         * and possibly drop the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_online_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            event_source_is_online(e->signal_sources[sig]))
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}

static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
        assert(s);

        /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
         * they are enabled/disabled or marked pending and such. */

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
}

static void event_source_time_prioq_reshuffle(sd_event_source *s) {
        struct clock_data *d;

        assert(s);

        /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
         * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
         * properly again. */

        if (s->ratelimited)
                d = &s->event->monotonic;
        else if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(d = event_get_clock_data(s->event, s->type));
        else
                return; /* no-op for an event source which is neither a timer nor ratelimited. */

        prioq_reshuffle(d->earliest, s, &s->earliest_index);
        prioq_reshuffle(d->latest, s, &s->latest_index);
        d->needs_rearm = true;
}

static void event_source_time_prioq_remove(
                sd_event_source *s,
                struct clock_data *d) {

        assert(s);
        assert(d);

        prioq_remove(d->earliest, s, &s->earliest_index);
        prioq_remove(d->latest, s, &s->latest_index);
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        d->needs_rearm = true;
}

static void source_disconnect(sd_event_source *s) {
        sd_event *event;
        int r;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from the time event source here if it is not ratelimited. If
                 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                        if (s->signal.unblock) {
                                sigset_t new_ss;

                                if (sigemptyset(&new_ss) < 0)
                                        log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
                                else if (sigaddset(&new_ss, s->signal.sig) < 0)
                                        log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
                                else {
                                        r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
                                        if (r != 0)
                                                log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
                                }
                        }
                }

                break;

        case SOURCE_CHILD:
                if (event_pid_changed(s->event))
                        s->child.process_owned = false;

                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
                         * continues to be watched. That's because inotify doesn't really have an API for that: we
                         * can only change watch masks with access to the original inode either by fd or by path. But
                         * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
                         * continuously and keeping the mount busy which we can't really do. We could reconstruct the
                         * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
                         * there), but given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way, leave the
                         * mask set, even if it is not minimized now, and ignore all events we aren't interested in
                         * anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        default:
                assert_not_reached();
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        if (!s->floating)
                sd_event_unref(event);
}

973
75db809a 974static sd_event_source* source_free(sd_event_source *s) {
a71fe8b8 975 assert(s);
fd38203a 976
a71fe8b8 977 source_disconnect(s);
ab93297c
NM
978
979 if (s->type == SOURCE_IO && s->io.owned)
15723a1d
LP
980 s->io.fd = safe_close(s->io.fd);
981
f8f3f926
LP
982 if (s->type == SOURCE_CHILD) {
983 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
984
985 if (s->child.process_owned) {
986
987 if (!s->child.exited) {
988 bool sent = false;
989
990 if (s->child.pidfd >= 0) {
991 if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
992 if (errno == ESRCH) /* Already dead */
993 sent = true;
994 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
995 log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
996 s->child.pid);
997 } else
998 sent = true;
999 }
1000
1001 if (!sent)
1002 if (kill(s->child.pid, SIGKILL) < 0)
1003 if (errno != ESRCH) /* Already dead */
1004 log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
1005 s->child.pid);
1006 }
1007
1008 if (!s->child.waited) {
1009 siginfo_t si = {};
1010
1011 /* Reap the child if we can */
1012 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
1013 }
1014 }
1015
1016 if (s->child.pidfd_owned)
1017 s->child.pidfd = safe_close(s->child.pidfd);
1018 }
1019
15723a1d
LP
1020 if (s->destroy_callback)
1021 s->destroy_callback(s->userdata);
ab93297c 1022
356779df 1023 free(s->description);
75db809a 1024 return mfree(s);
fd38203a 1025}
8c75fe17 1026DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);

static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_reshuffle(s);

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        if (s->type == SOURCE_INOTIFY) {

                assert(s->inotify.inode_data);
                assert(s->inotify.inode_data->inotify_data);

                if (b)
                        s->inotify.inode_data->inotify_data->n_pending++;
                else {
                        assert(s->inotify.inode_data->inotify_data->n_pending > 0);
                        s->inotify.inode_data->inotify_data->n_pending--;
                }
        }

        return 1;
}

static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {

        /* Let's allocate exactly what we need. Note that the difference of the smallest event source
         * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
         * lines. */
        static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
                [SOURCE_IO]                  = endoffsetof_field(sd_event_source, io),
                [SOURCE_TIME_REALTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_MONOTONIC]      = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_SIGNAL]              = endoffsetof_field(sd_event_source, signal),
                [SOURCE_CHILD]               = endoffsetof_field(sd_event_source, child),
                [SOURCE_DEFER]               = endoffsetof_field(sd_event_source, defer),
                [SOURCE_POST]                = endoffsetof_field(sd_event_source, post),
                [SOURCE_EXIT]                = endoffsetof_field(sd_event_source, exit),
                [SOURCE_INOTIFY]             = endoffsetof_field(sd_event_source, inotify),
        };

        sd_event_source *s;

        assert(e);
        assert(type >= 0);
        assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
        assert(size_table[type] > 0);

        /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
         * size, even if we only allocate the initial part we need. */
        s = expand_to_usable(malloc0(size_table[type]), sizeof(sd_event_source));
        if (!s)
                return NULL;

        /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
         * than what we allocated here. */
        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = PRIOQ_IDX_NULL;
        s->prepare_index = PRIOQ_IDX_NULL;

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}

static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = io_exit_callback;

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
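
/* Illustrative use (not part of this file): watch a file descriptor for input on an event loop, with
 * "on_input" standing in for a user-supplied handler:
 *
 *         static int on_input(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *                 ...
 *                 return 0;
 *         }
 *
 *         sd_event *e = NULL;
 *         assert_se(sd_event_default(&e) >= 0);
 *         assert_se(sd_event_add_io(e, NULL, fd, EPOLLIN, on_input, NULL) >= 0);
 *         (void) sd_event_loop(e);
 *
 * Passing a NULL callback installs io_exit_callback() above, i.e. the loop exits once the fd is ready. */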

static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}
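
/* As a sketch of the effect (numbers are illustrative, not taken from this code): with a per-boot perturb
 * of, say, 250ms, a wakeup that may be coalesced to full minutes fires at second 0.25 of each minute on
 * this machine, while a host with a different boot ID might use second 37.5. Each machine still batches
 * its own wakeups, but a fleet with synchronized clocks won't all wake at the exact same instant. */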

static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        _cleanup_close_ int fd = -EBADF;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
                return -errno;

        d->fd = TAKE_FD(fd);
        return 0;
}

static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
        int r;

        assert(d);

        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        return 0;
}

static int event_source_time_prioq_put(
                sd_event_source *s,
                struct clock_data *d) {

        int r;

        assert(s);
        assert(d);
        assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

        r = prioq_put(d->earliest, s, &s->earliest_index);
        if (r < 0)
                return r;

        r = prioq_put(d->latest, s, &s->latest_index);
        if (r < 0) {
                assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
                s->earliest_index = PRIOQ_IDX_NULL;
                return r;
        }

        d->needs_rearm = true;
        return 0;
}

_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != UINT64_MAX, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        assert_se(d = event_get_clock_data(e, type));

        r = setup_clock_data(e, d, clock);
        if (r < 0)
                return r;

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = event_source_time_prioq_put(s, d);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_time_relative(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        usec_t t;
        int r;

        /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
         * checks for overflow. */

        r = sd_event_now(e, clock, &t);
        if (r < 0)
                return r;

        if (usec >= USEC_INFINITY - t)
                return -EOVERFLOW;

        return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
}
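
/* Illustrative use (not part of this file): run a handler once, five seconds from now, on the monotonic
 * clock with default accuracy, where "on_time" stands in for a user-supplied sd_event_time_handler_t:
 *
 *         assert_se(sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC,
 *                                              5 * USEC_PER_SEC, 0, on_time, NULL) >= 0);
 */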

static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        sigset_t new_ss;
        bool block_it;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Let's make sure our special flag stays outside of the valid signal range */
        assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);

        if (sig & SD_EVENT_SIGNAL_PROCMASK) {
                sig &= ~SD_EVENT_SIGNAL_PROCMASK;
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                block_it = true;
        } else {
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                r = signal_is_blocked(sig);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;

                block_it = false;
        }

        if (!callback)
                callback = signal_exit_callback;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        if (block_it) {
                sigset_t old_ss;

                if (sigemptyset(&new_ss) < 0)
                        return -errno;

                if (sigaddset(&new_ss, sig) < 0)
                        return -errno;

                r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
                if (r != 0)
                        return -r;

                r = sigismember(&old_ss, sig);
                if (r < 0)
                        return -errno;

                s->signal.unblock = !r;
        } else
                s->signal.unblock = false;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                if (s->signal.unblock)
                        (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);

                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
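
/* Usage note: plain sd_event_add_signal() insists that the caller has blocked the signal already and
 * returns -EBUSY otherwise (see above). Passing "sig | SD_EVENT_SIGNAL_PROCMASK" instead asks the event
 * loop to block the signal for the calling thread itself, and to unblock it again when the event source
 * goes away. An illustrative one-liner for "exit the loop on SIGTERM", relying on the default
 * signal_exit_callback() installed for a NULL callback:
 *
 *         assert_se(sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL) >= 0);
 */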

static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static bool shall_use_pidfd(void) {
        /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
        return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
}

_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd and because we don't want to reap the child processes
                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
                 * take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* We always take a pidfd here if we can, even if we wait for anything other than WEXITED, so that we
         * pin the PID, and make regular waitid() handling race-free. */

        if (shall_use_pidfd()) {
                s->child.pidfd = pidfd_open(pid, 0);
                if (s->child.pidfd < 0) {
                        /* Propagate errors unless the syscall is not supported or blocked */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
        } else
                s->child.pidfd = -EBADF;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We have a pidfd and we only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We have no pidfd or we shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. */
        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}
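
/* Illustrative use (not part of this file): SIGCHLD must be blocked in the calling thread before the
 * child is forked off and before this call, since the check above refuses to watch children otherwise:
 *
 *         sigset_t ss;
 *         assert_se(sigemptyset(&ss) >= 0);
 *         assert_se(sigaddset(&ss, SIGCHLD) >= 0);
 *         assert_se(sigprocmask(SIG_BLOCK, &ss, NULL) >= 0);
 *
 *         pid_t pid = fork();
 *         ...
 *         assert_se(sd_event_add_child(e, NULL, pid, WEXITED, on_child, NULL) >= 0);
 *
 * with "on_child" standing in for a user-supplied sd_event_child_handler_t. */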

_public_ int sd_event_add_child_pidfd(
                sd_event *e,
                sd_event_source **ret,
                int pidfd,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        pid_t pid;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pidfd >= 0, -EBADF);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        r = pidfd_get_pid(pidfd, &pid);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.pidfd = pidfd;
        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for WEXITED */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;
        } else {
                /* We shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

static int generic_exit_callback(sd_event_source *s, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_defer(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_DEFER);
        if (!s)
                return -ENOMEM;

        s->defer.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = source_set_pending(s, true);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_post(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_POST);
        if (!s)
                return -ENOMEM;

        s->post.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = set_ensure_put(&e->post_sources, NULL, s);
        if (r < 0)
                return r;
        assert(r > 0);

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_exit(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
        if (r < 0)
                return r;

        s = source_new(e, !ret, SOURCE_EXIT);
        if (!s)
                return -ENOMEM;

        s->exit.callback = callback;
        s->userdata = userdata;
        s->exit.prioq_index = PRIOQ_IDX_NULL;
        s->enabled = SD_EVENT_ONESHOT;

        r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

1773
97ef5391
LP
1774static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
1775 assert(e);
1776
1777 if (!d)
1778 return;
1779
1780 assert(hashmap_isempty(d->inodes));
1781 assert(hashmap_isempty(d->wd));
1782
1783 if (d->buffer_filled > 0)
0601b958 1784 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
1785
1786 hashmap_free(d->inodes);
1787 hashmap_free(d->wd);
1788
1789 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
1790
1791 if (d->fd >= 0) {
fbae5090
YW
1792 if (!event_pid_changed(e) &&
1793 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
97ef5391
LP
1794 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
1795
1796 safe_close(d->fd);
1797 }
1798 free(d);
1799}
1800
1801static int event_make_inotify_data(
1802 sd_event *e,
1803 int64_t priority,
1804 struct inotify_data **ret) {
1805
254d1313 1806 _cleanup_close_ int fd = -EBADF;
97ef5391 1807 struct inotify_data *d;
97ef5391
LP
1808 int r;
1809
1810 assert(e);
1811
1812 d = hashmap_get(e->inotify_data, &priority);
1813 if (d) {
1814 if (ret)
1815 *ret = d;
1816 return 0;
1817 }
1818
1819 fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC);
1820 if (fd < 0)
1821 return -errno;
1822
1823 fd = fd_move_above_stdio(fd);
1824
97ef5391
LP
1825 d = new(struct inotify_data, 1);
1826 if (!d)
1827 return -ENOMEM;
1828
1829 *d = (struct inotify_data) {
1830 .wakeup = WAKEUP_INOTIFY_DATA,
1831 .fd = TAKE_FD(fd),
1832 .priority = priority,
1833 };
1834
c2484a75 1835 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
97ef5391
LP
1836 if (r < 0) {
1837 d->fd = safe_close(d->fd);
1838 free(d);
1839 return r;
1840 }
1841
1eac7948 1842 struct epoll_event ev = {
97ef5391
LP
1843 .events = EPOLLIN,
1844 .data.ptr = d,
1845 };
1846
1847 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
1848 r = -errno;
1849 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
1850 * remove the fd from the epoll first, which we don't want as we couldn't
1851 * add it in the first place. */
1852 event_free_inotify_data(e, d);
1853 return r;
1854 }
1855
1856 if (ret)
1857 *ret = d;
1858
1859 return 1;
1860}

static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
        int r;

        assert(x);
        assert(y);

        r = CMP(x->dev, y->dev);
        if (r != 0)
                return r;

        return CMP(x->ino, y->ino);
}

static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
        assert(d);

        siphash24_compress(&d->dev, sizeof(d->dev), state);
        siphash24_compress(&d->ino, sizeof(d->ino), state);
}

DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);

static void event_free_inode_data(
                sd_event *e,
                struct inode_data *d) {

        assert(e);

        if (!d)
                return;

        assert(!d->event_sources);

        if (d->fd >= 0) {
                LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
                safe_close(d->fd);
        }

        if (d->inotify_data) {

                if (d->wd >= 0) {
                        if (d->inotify_data->fd >= 0 && !event_pid_changed(e)) {
                                /* So here's a problem. At the time this runs the watch descriptor might already be
                                 * invalidated, because an IN_IGNORED event might be queued right at the moment we
                                 * enter the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a
                                 * very likely case to happen. */

                                if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
                                        log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
                        }

                        assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
                }

                assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
        }

        free(d);
}

static void event_gc_inotify_data(
                sd_event *e,
                struct inotify_data *d) {

        assert(e);

        /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
         * any inode with it anymore, which in turn happens if no event source of this priority is interested
         * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
         * (under the expectation that the GC is called again once the counter is decremented). */

        if (!d)
                return;

        if (!hashmap_isempty(d->inodes))
                return;

        if (d->n_busy > 0)
                return;

        event_free_inotify_data(e, d);
}

static void event_gc_inode_data(
                sd_event *e,
                struct inode_data *d) {

        struct inotify_data *inotify_data;

        assert(e);

        if (!d)
                return;

        if (d->event_sources)
                return;

        inotify_data = d->inotify_data;
        event_free_inode_data(e, d);

        event_gc_inotify_data(e, inotify_data);
}

static int event_make_inode_data(
                sd_event *e,
                struct inotify_data *inotify_data,
                dev_t dev,
                ino_t ino,
                struct inode_data **ret) {

        struct inode_data *d, key;
        int r;

        assert(e);
        assert(inotify_data);

        key = (struct inode_data) {
                .ino = ino,
                .dev = dev,
        };

        d = hashmap_get(inotify_data->inodes, &key);
        if (d) {
                if (ret)
                        *ret = d;

                return 0;
        }

        r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
        if (r < 0)
                return r;

        d = new(struct inode_data, 1);
        if (!d)
                return -ENOMEM;

        *d = (struct inode_data) {
                .dev = dev,
                .ino = ino,
                .wd = -1,
                .fd = -EBADF,
                .inotify_data = inotify_data,
        };

        r = hashmap_put(inotify_data->inodes, d, d);
        if (r < 0) {
                free(d);
                return r;
        }

        if (ret)
                *ret = d;

        return 1;
}

static uint32_t inode_data_determine_mask(struct inode_data *d) {
        bool excl_unlink = true;
        uint32_t combined = 0;

        assert(d);

        /* Combines the watch masks of all event sources watching this inode. We generally just OR them
         * together, but the IN_EXCL_UNLINK flag is ANDed instead.
         *
         * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or
         * oneshot. That's because we cannot change the mask anymore after the event source was created once,
         * since the kernel has no API for that. Hence we need to subscribe to the maximum mask we ever might
         * be interested in, and suppress events we don't care for client-side. */

        LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {

                if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
                        excl_unlink = false;

                combined |= s->inotify.mask;
        }

        return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
}
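
/* Worked example (illustrative): two sources watch the same inode, one with
 * IN_CREATE|IN_EXCL_UNLINK and one with IN_DELETE. ORing yields IN_CREATE|IN_EXCL_UNLINK|IN_DELETE,
 * but since IN_EXCL_UNLINK is not set on *both* sources it is dropped, so the kernel watch is
 * realized with IN_CREATE|IN_DELETE and events keep being reported for unlinked-but-open files. */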

static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
        uint32_t combined_mask;
        int wd, r;

        assert(d);
        assert(d->fd >= 0);

        combined_mask = inode_data_determine_mask(d);

        if (d->wd >= 0 && combined_mask == d->combined_mask)
                return 0;

        r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
        if (r < 0)
                return r;

        wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
        if (wd < 0)
                return -errno;

        if (d->wd < 0) {
                r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
                if (r < 0) {
                        (void) inotify_rm_watch(d->inotify_data->fd, wd);
                        return r;
                }

                d->wd = wd;

        } else if (d->wd != wd) {

                log_debug("Weird, the watch descriptor we already knew for this inode changed?");
                (void) inotify_rm_watch(d->inotify_data->fd, wd);
                return -EINVAL;
        }

        d->combined_mask = combined_mask;
        return 1;
}

static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int event_add_inotify_fd_internal(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                bool donate,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct inotify_data *inotify_data = NULL;
        struct inode_data *inode_data = NULL;
        struct stat st;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = inotify_exit_callback;

        /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to
         * merge masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations
         * for you, hence the caller can't use them for us. */
        if (mask & IN_MASK_ADD)
                return -EINVAL;

        if (fstat(fd, &st) < 0)
                return -errno;

        s = source_new(e, !ret, SOURCE_INOTIFY);
        if (!s)
                return -ENOMEM;

        s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
        s->inotify.mask = mask;
        s->inotify.callback = callback;
        s->userdata = userdata;

        /* Allocate an inotify object for this priority, and an inode object within it */
        r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
        if (r < 0)
                return r;

        r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
        if (r < 0) {
                event_gc_inotify_data(e, inotify_data);
                return r;
        }

        /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the
         * priority of the event source until then; for that we need the original inode. */
        if (inode_data->fd < 0) {
                if (donated_fd >= 0)
                        inode_data->fd = TAKE_FD(donated_fd);
                else {
                        inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
                        if (inode_data->fd < 0) {
                                r = -errno;
                                event_gc_inode_data(e, inode_data);
                                return r;
                        }
                }

                LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
        }

        /* Link our event source to the inode data object */
        LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
        s->inotify.inode_data = inode_data;

        /* Actually realize the watch now */
        r = inode_data_realize_watch(e, inode_data);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_inotify_fd(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
}

_public_ int sd_event_add_inotify(
                sd_event *e,
                sd_event_source **ret,
                const char *path,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
        int fd, r;

        assert_return(path, -EINVAL);

        fd = open(path, O_PATH | O_CLOEXEC |
                  (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
                  (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
        if (fd < 0)
                return -errno;

        r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
        if (r < 0)
                return r;

        (void) sd_event_source_set_description(s, path);

        if (ret)
                *ret = s;

        return r;
}
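
/* Usage sketch (illustrative): watch a directory for new and deleted entries. The path and
 * handler are hypothetical; the handler receives each dequeued struct inotify_event.
 *
 *     static int on_dir_change(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
 *             if (ev->len > 0)
 *                     log_debug("Entry '%s' changed (mask=%" PRIx32 ")", ev->name, ev->mask);
 *             return 0;
 *     }
 *
 *     r = sd_event_add_inotify(e, NULL, "/run/mydir", IN_CREATE|IN_DELETE, on_dir_change, NULL);
 *     if (r < 0)
 *             return r;
 */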

static sd_event_source* event_source_free(sd_event_source *s) {
        if (!s)
                return NULL;

        /* Here's a special hack: when we are called from a dispatch handler we won't free the event source
         * immediately, but we will detach the fd from the epoll. This way it is safe for the caller to unref
         * the event source and immediately close the fd, but we still retain a valid event source object
         * after the callback. */

        if (s->dispatching)
                source_disconnect(s);
        else
                source_free(s);

        return NULL;
}

DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);

_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}

_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
        assert_return(s, -EINVAL);
        assert_return(description, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (!s->description)
                return -ENXIO;

        *description = s->description;
        return 0;
}

_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);

        return s->event;
}

_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->pending;
}

_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->io.fd;
}

_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->io.fd == fd)
                return 0;

        if (event_source_is_offline(s)) {
                s->io.fd = fd;
                s->io.registered = false;
        } else {
                int saved_fd;

                saved_fd = s->io.fd;
                assert(s->io.registered);

                s->io.fd = fd;
                s->io.registered = false;

                r = source_io_register(s, s->enabled, s->io.events);
                if (r < 0) {
                        s->io.fd = saved_fd;
                        s->io.registered = true;
                        return r;
                }

                (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
        }

        return 0;
}
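
/* Usage sketch (illustrative): swap the fd of a live IO source after reconnecting a socket,
 * keeping callback, priority and enablement intact. connect_again() is a hypothetical helper.
 *
 *     int new_fd = connect_again();
 *     r = sd_event_source_set_io_fd(source, new_fd);
 *     if (r < 0)
 *             return r;          // on failure the old fd stays registered
 *     safe_close(old_fd);        // the call does not close the previous fd for us
 */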

_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);

        return s->io.owned;
}

_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);

        s->io.owned = own;
        return 0;
}

_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
        assert_return(s, -EINVAL);
        assert_return(events, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *events = s->io.events;
        return 0;
}

_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* edge-triggered updates are never skipped, so we can reset edges */
        if (s->io.events == events && !(events & EPOLLET))
                return 0;

        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        if (event_source_is_online(s)) {
                r = source_io_register(s, s->enabled, events);
                if (r < 0)
                        return r;
        }

        s->io.events = events;

        return 0;
}

_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
        assert_return(s, -EINVAL);
        assert_return(revents, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(s->pending, -ENODATA);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *revents = s->io.revents;
        return 0;
}

_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        return s->signal.sig;
}

_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *priority = s->priority;
        return 0;
}

_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        bool rm_inotify = false, rm_inode = false;
        struct inotify_data *new_inotify_data = NULL;
        struct inode_data *new_inode_data = NULL;
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_INOTIFY) {
                struct inode_data *old_inode_data;

                assert(s->inotify.inode_data);
                old_inode_data = s->inotify.inode_data;

                /* We need the original fd to change the priority. If we don't have it we can't change the
                 * priority anymore. Note that we close any fds when entering the next event loop iteration,
                 * i.e. for inotify events we allow priority changes only until the first following
                 * iteration. */
                if (old_inode_data->fd < 0)
                        return -EOPNOTSUPP;

                r = event_make_inotify_data(s->event, priority, &new_inotify_data);
                if (r < 0)
                        return r;
                rm_inotify = r > 0;

                r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
                if (r < 0)
                        goto fail;
                rm_inode = r > 0;

                if (new_inode_data->fd < 0) {
                        /* Duplicate the fd for the new inode object if we don't have any yet */
                        new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
                        if (new_inode_data->fd < 0) {
                                r = -errno;
                                goto fail;
                        }

                        LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
                }

                /* Move the event source to the new inode data structure */
                LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
                LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
                s->inotify.inode_data = new_inode_data;

                /* Now create the new watch */
                r = inode_data_realize_watch(s->event, new_inode_data);
                if (r < 0) {
                        /* Move it back */
                        LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
                        LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
                        s->inotify.inode_data = old_inode_data;
                        goto fail;
                }

                s->priority = priority;

                event_gc_inode_data(s->event, old_inode_data);

        } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old priority to the signalfd of the new
                 * priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        s->priority = old->priority;
                        return r;
                }

                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        event_source_pp_prioq_reshuffle(s);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;

fail:
        if (rm_inode)
                event_free_inode_data(s->event, new_inode_data);

        if (rm_inotify)
                event_free_inotify_data(s->event, new_inotify_data);

        return r;
}

_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
        /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
        if (!s && !ret)
                return false;

        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (ret)
                *ret = s->enabled;

        return s->enabled != SD_EVENT_OFF;
}

static int event_source_offline(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        bool was_offline;
        int r;

        assert(s);
        assert(enabled == SD_EVENT_OFF || ratelimited);

        /* Unset the pending flag when this event source is disabled */
        if (s->enabled != SD_EVENT_OFF &&
            enabled == SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        was_offline = event_source_is_offline(s);
        s->enabled = enabled;
        s->ratelimited = ratelimited;

        switch (s->type) {

        case SOURCE_IO:
                source_io_unregister(s);
                break;

        case SOURCE_SIGNAL:
                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                break;

        case SOURCE_CHILD:
                if (!was_offline) {
                        assert(s->event->n_online_child_sources > 0);
                        s->event->n_online_child_sources--;
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                break;

        case SOURCE_EXIT:
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
        case SOURCE_DEFER:
        case SOURCE_POST:
        case SOURCE_INOTIFY:
                break;

        default:
                assert_not_reached();
        }

        /* Always reshuffle the time prioq, as the ratelimited flag may have changed. */
        event_source_time_prioq_reshuffle(s);

        return 1;
}

static int event_source_online(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        bool was_online;
        int r;

        assert(s);
        assert(enabled != SD_EVENT_OFF || !ratelimited);

        /* Unset the pending flag when this event source is enabled */
        if (s->enabled == SD_EVENT_OFF &&
            enabled != SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        /* Are we really ready for onlining? */
        if (enabled == SD_EVENT_OFF || ratelimited) {
                /* Nope, we are not ready for onlining, then just update the precise state and exit */
                s->enabled = enabled;
                s->ratelimited = ratelimited;
                return 0;
        }

        was_online = event_source_is_online(s);

        switch (s->type) {
        case SOURCE_IO:
                r = source_io_register(s, enabled, s->io.events);
                if (r < 0)
                        return r;
                break;

        case SOURCE_SIGNAL:
                r = event_make_signal_data(s->event, s->signal.sig, NULL);
                if (r < 0) {
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        return r;
                }

                break;

        case SOURCE_CHILD:
                if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                        /* yes, we have pidfd */

                        r = source_child_pidfd_register(s, enabled);
                        if (r < 0)
                                return r;
                } else {
                        /* no pidfd, or something other than WEXITED to watch for */

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }
                }

                if (!was_online)
                        s->event->n_online_child_sources++;
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
        case SOURCE_EXIT:
        case SOURCE_DEFER:
        case SOURCE_POST:
        case SOURCE_INOTIFY:
                break;

        default:
                assert_not_reached();
        }

        s->enabled = enabled;
        s->ratelimited = ratelimited;

        /* Non-failing operations below */
        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        /* Always reshuffle the time prioq, as the ratelimited flag may have changed. */
        event_source_time_prioq_reshuffle(s);

        return 1;
}

_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
        int r;

        assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);

        /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
        if (m == SD_EVENT_OFF && !s)
                return 0;

        assert_return(s, -EINVAL);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
        if (s->event->state == SD_EVENT_FINISHED)
                return m == SD_EVENT_OFF ? 0 : -ESTALE;

        if (s->enabled == m) /* No change? */
                return 0;

        if (m == SD_EVENT_OFF)
                r = event_source_offline(s, m, s->ratelimited);
        else {
                if (s->enabled != SD_EVENT_OFF) {
                        /* Switching from "on" to "oneshot" or back? If that's the case, we can take a
                         * shortcut, the event source is already enabled after all. */
                        s->enabled = m;
                        return 0;
                }

                r = event_source_online(s, m, s->ratelimited);
        }
        if (r < 0)
                return r;

        event_source_pp_prioq_reshuffle(s);
        return 0;
}
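
/* Usage sketch (illustrative): an SD_EVENT_ONESHOT source is switched to SD_EVENT_OFF right
 * before its callback runs (see source_dispatch() below), so a handler that wants to fire again
 * re-enables itself:
 *
 *     static int on_ready(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             // ... consume one unit of work ...
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);  // re-arm for the next wakeup
 *     }
 */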

_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.next;
        return 0;
}

_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        s->time.next = usec;

        event_source_time_prioq_reshuffle(s);
        return 0;
}

_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
        usec_t t;
        int r;

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);

        if (usec == USEC_INFINITY)
                return sd_event_source_set_time(s, USEC_INFINITY);

        r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
        if (r < 0)
                return r;

        usec = usec_add(t, usec);
        if (usec == USEC_INFINITY)
                return -EOVERFLOW;

        return sd_event_source_set_time(s, usec);
}
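
/* Usage sketch (illustrative): re-arm a monotonic timer from its own callback to get a periodic
 * tick. The handler name and interval are hypothetical; timers default to oneshot, hence the
 * explicit re-enable.
 *
 *     static int on_tick(sd_event_source *s, uint64_t usec, void *userdata) {
 *             // ... periodic work ...
 *             (void) sd_event_source_set_time_relative(s, 5 * USEC_PER_SEC);
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
 *     }
 */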

_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
        assert_return(s, -EINVAL);
        assert_return(usec, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *usec = s->time.accuracy;
        return 0;
}

_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(usec != UINT64_MAX, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        r = source_set_pending(s, false);
        if (r < 0)
                return r;

        if (usec == 0)
                usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        event_source_time_prioq_reshuffle(s);
        return 0;
}

_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
        assert_return(s, -EINVAL);
        assert_return(clock, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *clock = event_source_type_to_clock(s->type);
        return 0;
}

_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
        assert_return(s, -EINVAL);
        assert_return(pid, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *pid = s->child.pid;
        return 0;
}

_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->child.pidfd < 0)
                return -EOPNOTSUPP;

        return s->child.pidfd;
}

_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);
        assert_return(SIGNAL_VALID(sig), -EINVAL);

        /* If we already have seen indication the process exited refuse sending a signal early. This way we
         * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
         * available. */
        if (s->child.exited)
                return -ESRCH;

        if (s->child.pidfd >= 0) {
                siginfo_t copy;

                /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
                 * structure here */
                if (si)
                        copy = *si;

                if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
                        /* Let's propagate the error only if the system call is not implemented or prohibited */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        return 0;
        }

        /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
         * this here. */
        if (flags != 0)
                return -EOPNOTSUPP;

        if (si) {
                /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
                siginfo_t copy = *si;

                if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
                        return -errno;
        } else if (kill(s->child.pid, sig) < 0)
                return -errno;

        return 0;
}
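
/* Usage sketch (illustrative): watch a forked child and later terminate it without racing
 * against PID reuse. The pid variable and handler are hypothetical.
 *
 *     static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *             log_debug("Child " PID_FMT " exited with status %i", si->si_pid, si->si_status);
 *             return 0;
 *     }
 *
 *     r = sd_event_add_child(e, &child_source, pid, WEXITED, on_child, NULL);
 *     if (r < 0)
 *             return r;
 *
 *     // later: prefers pidfd_send_signal() when a pidfd is available
 *     (void) sd_event_source_send_child_signal(child_source, SIGTERM, NULL, 0);
 */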

_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);

        if (s->child.pidfd < 0)
                return -EOPNOTSUPP;

        return s->child.pidfd_owned;
}

_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);

        if (s->child.pidfd < 0)
                return -EOPNOTSUPP;

        s->child.pidfd_owned = own;
        return 0;
}

_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);

        return s->child.process_owned;
}

_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);

        s->child.process_owned = own;
        return 0;
}

_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
        assert_return(s, -EINVAL);
        assert_return(mask, -EINVAL);
        assert_return(s->type == SOURCE_INOTIFY, -EDOM);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        *mask = s->inotify.mask;
        return 0;
}

_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->prepare == callback)
                return 0;

        if (callback && s->prepare) {
                s->prepare = callback;
                return 0;
        }

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
        if (r < 0)
                return r;

        s->prepare = callback;

        if (callback) {
                r = prioq_put(s->event->prepare, s, &s->prepare_index);
                if (r < 0)
                        return r;
        } else
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        return 0;
}
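
/* Usage sketch (illustrative): a prepare callback runs just before the loop polls, which is
 * useful when state must be refreshed right before waiting. compute_next_delay() is a
 * hypothetical helper.
 *
 *     static int on_prepare(sd_event_source *s, void *userdata) {
 *             // push the timer forward based on external state, right before we sleep
 *             return sd_event_source_set_time_relative(s, compute_next_delay());
 *     }
 *
 *     r = sd_event_source_set_prepare(timer_source, on_prepare);
 *     if (r < 0)
 *             return r;
 */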

_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);

        return s->userdata;
}

_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
        void *ret;

        assert_return(s, NULL);

        ret = s->userdata;
        s->userdata = userdata;

        return ret;
}

static int event_source_enter_ratelimited(sd_event_source *s) {
        int r;

        assert(s);

        /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
         * the end of the rate limit time window, much as if it was a timer event source. */

        if (s->ratelimited)
                return 0; /* Already ratelimited, this is a NOP hence */

        /* Make sure we can install a CLOCK_MONOTONIC event further down. */
        r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
        if (r < 0)
                return r;

        /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
         * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
         * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));

        /* Now, let's add the event source to the monotonic clock instead */
        r = event_source_time_prioq_put(s, &s->event->monotonic);
        if (r < 0)
                goto fail;

        /* And let's take the event source officially offline */
        r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
        if (r < 0) {
                event_source_time_prioq_remove(s, &s->event->monotonic);
                goto fail;
        }

        event_source_pp_prioq_reshuffle(s);

        log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
        return 0;

fail:
        /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
         * space for it should already be allocated. */
        if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);

        return r;
}

static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
        int r;

        assert(s);

        if (!s->ratelimited)
                return 0;

        /* Let's take the event source out of the monotonic prioq first. */
        event_source_time_prioq_remove(s, &s->event->monotonic);

        /* Let's then add the event source to its native clock prioq again, if this is a timer event source */
        if (EVENT_SOURCE_IS_TIME(s->type)) {
                r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
                if (r < 0)
                        goto fail;
        }

        /* Let's try to take it online again. */
        r = event_source_online(s, s->enabled, /* ratelimited= */ false);
        if (r < 0) {
                /* Do something roughly sensible when this failed: undo the two prioq ops above */
                if (EVENT_SOURCE_IS_TIME(s->type))
                        event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));

                goto fail;
        }

        event_source_pp_prioq_reshuffle(s);
        ratelimit_reset(&s->rate_limit);

        log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));

        if (run_callback && s->ratelimit_expire_callback) {
                s->dispatching = true;
                r = s->ratelimit_expire_callback(s, s->userdata);
                s->dispatching = false;

                if (r < 0) {
                        log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
                                        strna(s->description),
                                        event_source_type_to_string(s->type),
                                        s->exit_on_failure ? "exiting" : "disabling");

                        if (s->exit_on_failure)
                                (void) sd_event_exit(s->event, r);
                }

                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);

                return 1;
        }

        return 0;

fail:
        /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
         * simply put it back in it, maybe we can then process it more successfully next iteration. */
        assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);

        return r;
}
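
/* Usage sketch (illustrative, assuming the public rate limit API that this machinery backs,
 * sd_event_source_set_ratelimit() and sd_event_source_set_ratelimit_expire_callback()): cap a
 * chatty IO source at 100 dispatches per second and get notified when the limit expires. The
 * callback name is hypothetical.
 *
 *     static int on_ratelimit_expire(sd_event_source *s, void *userdata) {
 *             log_debug("Source is no longer ratelimited, events flow again.");
 *             return 0;
 *     }
 *
 *     r = sd_event_source_set_ratelimit(source, USEC_PER_SEC, 100);
 *     if (r < 0)
 *             return r;
 *
 *     r = sd_event_source_set_ratelimit_expire_callback(source, on_ratelimit_expire);
 *     if (r < 0)
 *             return r;
 */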

static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
        usec_t c;
        assert(e);
        assert(a <= b);

        if (a <= 0)
                return 0;
        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        if (b <= a + 1)
                return a;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We have two goals here:

          a) We want to wake up as seldom as possible, hence prefer later times over earlier times.

          b) But if we have to wake up, then let's make sure to dispatch as much as possible on the entire
             system.

          We implement this by waking up everywhere at the same time within any given minute if we can,
          synchronised via the perturbation value determined from the boot ID. If we can't, then we try to
          find the same spot in every 10s, then 1s and then 250ms step. Otherwise, we pick the last possible
          time to wake up.
        */

        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MINUTE))
                        return b;

                c -= USEC_PER_MINUTE;
        }

        if (c >= a)
                return c;

        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC*10))
                        return b;

                c -= USEC_PER_SEC*10;
        }

        if (c >= a)
                return c;

        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_SEC))
                        return b;

                c -= USEC_PER_SEC;
        }

        if (c >= a)
                return c;

        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (c >= b) {
                if (_unlikely_(c < USEC_PER_MSEC*250))
                        return b;

                c -= USEC_PER_MSEC*250;
        }

        if (c >= a)
                return c;

        return b;
}
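
/* Worked example (illustrative numbers): assume perturb = 7.5s and a window of a = :02.000 and
 * b = :59.000 within some minute. The minute-granularity candidate is the start of that minute
 * plus 7.5s, i.e. :07.500; it lies within [a, b], so every process with the same boot ID wakes
 * at the same spot in the minute. Had a been :30.000 instead, that candidate would precede a,
 * and the 10s step would apply: floor(b to 10s) = :50.000 plus (7.5s mod 10s) = :57.500, which
 * is still inside the window and is returned; only if no step fits do we return b itself. */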

static int event_arm_timer(
                sd_event *e,
                struct clock_data *d) {

        struct itimerspec its = {};
        sd_event_source *a, *b;
        usec_t t;

        assert(e);
        assert(d);

        if (!d->needs_rearm)
                return 0;

        d->needs_rearm = false;

        a = prioq_peek(d->earliest);
        assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
        if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {

                if (d->fd < 0)
                        return 0;

                if (d->next == USEC_INFINITY)
                        return 0;

                /* disarm */
                if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
                        return -errno;

                d->next = USEC_INFINITY;
                return 0;
        }

        b = prioq_peek(d->latest);
        assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
        assert(b && b->enabled != SD_EVENT_OFF);

        t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
        if (d->next == t)
                return 0;

        assert_se(d->fd >= 0);

        if (t == 0) {
                /* We don't want to disarm here, just mean some time looooong ago. */
                its.it_value.tv_sec = 0;
                its.it_value.tv_nsec = 1;
        } else
                timespec_store(&its.it_value, t);

        if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
                return -errno;

        d->next = t;
        return 0;
}

static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
        assert(e);
        assert(s);
        assert(s->type == SOURCE_IO);

        /* If the event source was already pending, we just OR in the new revents, otherwise we reset the
         * value. The ORing is necessary to handle EPOLLONESHOT events properly where readability might
         * happen independently of writability, and we need to keep track of both */

        if (s->pending)
                s->io.revents |= revents;
        else
                s->io.revents = revents;

        return source_set_pending(s, true);
}

static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
        uint64_t x;
        ssize_t ss;

        assert(e);
        assert(fd >= 0);

        assert_return(events == EPOLLIN, -EIO);

        ss = read(fd, &x, sizeof(x));
        if (ss < 0) {
                if (ERRNO_IS_TRANSIENT(errno))
                        return 0;

                return -errno;
        }

        if (_unlikely_(ss != sizeof(x)))
                return -EIO;

        if (next)
                *next = USEC_INFINITY;

        return 0;
}

static int process_timer(
                sd_event *e,
                usec_t n,
                struct clock_data *d) {

        sd_event_source *s;
        bool callback_invoked = false;
        int r;

        assert(e);
        assert(d);

        for (;;) {
                s = prioq_peek(d->earliest);
                assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

                if (!s || time_event_source_next(s) > n)
                        break;

                if (s->ratelimited) {
                        /* This is an event source whose ratelimit window has ended. Let's turn it on
                         * again. */
                        assert(s->ratelimited);

                        r = event_source_leave_ratelimit(s, /* run_callback */ true);
                        if (r < 0)
                                return r;
                        else if (r == 1)
                                callback_invoked = true;

                        continue;
                }

                if (s->enabled == SD_EVENT_OFF || s->pending)
                        break;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                event_source_time_prioq_reshuffle(s);
        }

        return callback_invoked;
}

static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
        int64_t min_priority = threshold;
        bool something_new = false;
        sd_event_source *s;
        int r;

        assert(e);
        assert(ret_min_priority);

        if (!e->need_process_child) {
                *ret_min_priority = min_priority;
                return 0;
        }

        e->need_process_child = false;

        /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
         * for, instead of using P_ALL. This is because we only want to get child information of very
         * specific child processes, and not all of them. We might not have processed the SIGCHLD event
         * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
         * hence we really don't want anything flushed out of the kernel's queue that we don't care
         * about. Since this is O(n) this means that if you have a lot of processes you probably want
         * to handle SIGCHLD yourself.
         *
         * We do not reap the children here (by using WNOWAIT), this is only done after the event
         * source is dispatched so that the callback still sees the process as a zombie. */

        HASHMAP_FOREACH(s, e->child_sources) {
                assert(s->type == SOURCE_CHILD);

                if (s->priority > threshold)
                        continue;

                if (s->pending)
                        continue;

                if (event_source_is_offline(s))
                        continue;

                if (s->child.exited)
                        continue;

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        /* There's a usable pidfd known for this event source? Then don't waitid() for
                         * it here */
                        continue;

                zero(s->child.siginfo);
                if (waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
                        return negative_errno();

                if (s->child.siginfo.si_pid != 0) {
                        bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                        if (zombie)
                                s->child.exited = true;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's immediately remove the state change
                                 * from the queue, since there's no benefit in leaving it queued. */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                        if (r > 0) {
                                something_new = true;
                                min_priority = MIN(min_priority, s->priority);
                        }
                }
        }

        *ret_min_priority = min_priority;
        return something_new;
}

static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
        assert(e);
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (s->pending)
                return 0;

        if (event_source_is_offline(s))
                return 0;

        if (!EVENT_SOURCE_WATCH_PIDFD(s))
                return 0;

        zero(s->child.siginfo);
        if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
                return -errno;

        if (s->child.siginfo.si_pid == 0)
                return 0;

        if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
                s->child.exited = true;

        return source_set_pending(s, true);
}

static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
        int r;

        assert(e);
        assert(d);
        assert_return(events == EPOLLIN, -EIO);
        assert(min_priority);

        /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
         * sure to recheck the children we watch. This is because we only ever dequeue the first signal
         * per priority, and if we dequeue one, SIGCHLD might still be enqueued later and we wouldn't
         * know, but we might have higher priority children we care about, hence we need to check that
         * explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this priority we don't read another */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        if (ERRNO_IS_TRANSIENT(errno))
                                return 0;

                        return -errno;
                }

                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;
                if (r > 0 && *min_priority >= s->priority) {
                        *min_priority = s->priority;
                        return 1; /* an event source with smaller priority is queued. */
                }

                return 0;
        }
}

static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
        ssize_t n;

        assert(e);
        assert(d);

        assert_return(revents == EPOLLIN, -EIO);

        /* If there's already an event source pending for this priority, don't read another */
        if (d->n_pending > 0)
                return 0;

        /* Is the read buffer non-empty? If so, let's not read more */
        if (d->buffer_filled > 0)
                return 0;

        if (d->priority > threshold)
                return 0;

        n = read(d->fd, &d->buffer, sizeof(d->buffer));
        if (n < 0) {
                if (ERRNO_IS_TRANSIENT(errno))
                        return 0;

                return -errno;
        }

        assert(n > 0);
        d->buffer_filled = (size_t) n;
        LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);

        return 1;
}

static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
        assert(e);
        assert(d);
        assert(sz <= d->buffer_filled);

        if (sz == 0)
                return;

        /* Move the rest of the buffer to the front, in order to get things properly aligned again */
        memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
        d->buffer_filled -= sz;

        if (d->buffer_filled == 0)
                LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
}

static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
        int r;

        assert(e);
        assert(d);

        /* If there's already an event source pending for this priority, don't read another */
        if (d->n_pending > 0)
                return 0;

        while (d->buffer_filled > 0) {
                size_t sz;

                /* Let's validate that the event structures are complete */
                if (d->buffer_filled < offsetof(struct inotify_event, name))
                        return -EIO;

                sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
                if (d->buffer_filled < sz)
                        return -EIO;

                if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
                        struct inode_data *inode_data;

                        /* The queue overran, let's pass this event to all event sources connected to this
                         * inotify object */

                        HASHMAP_FOREACH(inode_data, d->inodes)
                                LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {

                                        if (event_source_is_offline(s))
                                                continue;

                                        r = source_set_pending(s, true);
                                        if (r < 0)
                                                return r;
                                }
                } else {
                        struct inode_data *inode_data;

                        /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also
                         * remove it from our watch descriptor table. */
                        if (d->buffer.ev.mask & IN_IGNORED) {

                                inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
                                if (!inode_data) {
                                        event_inotify_data_drop(e, d, sz);
                                        continue;
                                }

                                /* The watch descriptor was removed by the kernel, let's drop it here too */
                                inode_data->wd = -1;
                        } else {
                                inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
                                if (!inode_data) {
                                        event_inotify_data_drop(e, d, sz);
                                        continue;
                                }
                        }

                        /* Trigger all event sources that are interested in these events. Also trigger all
                         * event sources if IN_IGNORED or IN_UNMOUNT is set. */
                        LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {

                                if (event_source_is_offline(s))
                                        continue;

                                if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
                                    (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
                                        continue;

                                r = source_set_pending(s, true);
                                if (r < 0)
                                        return r;
                        }
                }

                /* Something pending now? If so, let's finish, otherwise let's read more. */
                if (d->n_pending > 0)
                        return 1;
        }

        return 0;
}

static int process_inotify(sd_event *e) {
        int r, done = 0;

        assert(e);

        LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
                r = event_inotify_data_process(e, d);
                if (r < 0)
                        return r;
                if (r > 0)
                        done++;
        }

        return done;
}

static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        sd_event *saved_event;
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type, here, so that we still know it after the event callback which might
         * invalidate the event. */
        saved_type = s->type;

        /* Similarly, store a reference to the event loop object, so that we can still access it after the
         * callback might have invalidated/disconnected the event source. */
        saved_event = s->event;
        PROTECT_EVENT(saved_event);

        /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
        assert(!s->ratelimited);
        if (!ratelimit_below(&s->rate_limit)) {
                r = event_source_enter_ratelimited(s);
                if (r < 0)
                        return r;

                return 1;
        }

        if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;

                /* If we execute a non-post source, let's mark all post sources as pending. */

                SET_FOREACH(z, s->event->post_sources) {
                        if (event_source_is_offline(z))
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie) {
                        (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
                        s->child.waited = true;
                }

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_INOTIFY: {
                struct sd_event *e = s->event;
                struct inotify_data *d;
                size_t sz;

                assert(s->inotify.inode_data);
                assert_se(d = s->inotify.inode_data->inotify_data);

                assert(d->buffer_filled >= offsetof(struct inotify_event, name));
                sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
                assert(d->buffer_filled >= sz);

                /* If the inotify callback destroys the event source then this likely means we don't need to
                 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
                 * free it immediately, then we couldn't drop the event from the inotify event queue without
                 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
                 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
                 * explicitly GC it after we are done dropping the inotify event from the buffer. */
                d->n_busy++;
                r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
                d->n_busy--;

                /* When no event is pending anymore on this inotify object, then let's drop the event from
                 * the inotify event queue buffer. */
                if (d->n_pending == 0)
                        event_inotify_data_drop(e, d, sz);

                /* Now we don't want to access 'd' anymore, it's OK to GC now. */
                event_gc_inotify_data(e, d);
                break;
        }

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached();
        }

        s->dispatching = false;

        if (r < 0) {
                log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
                                strna(s->description),
                                event_source_type_to_string(saved_type),
                                s->exit_on_failure ? "exiting" : "disabling");

                if (s->exit_on_failure)
                        (void) sd_event_exit(saved_event, r);
        }

        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);

        return 1;
}

static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
                        break;

                s->prepare_iteration = e->iteration;
                prioq_reshuffle(e->prepare, s, &s->prepare_index);

                assert(s->prepare);
                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0) {
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
                                        strna(s->description),
                                        event_source_type_to_string(s->type),
                                        s->exit_on_failure ? "exiting" : "disabling");

                        if (s->exit_on_failure)
                                (void) sd_event_exit(e, r);
                }

                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
        }

        return 0;
}

static int dispatch_exit(sd_event *e) {
        sd_event_source *p;
        int r;

        assert(e);

        p = prioq_peek(e->exit);
        assert(!p || p->type == SOURCE_EXIT);

        if (!p || event_source_is_offline(p)) {
                e->state = SD_EVENT_FINISHED;
                return 0;
        }

        PROTECT_EVENT(e);
        e->iteration++;
        e->state = SD_EVENT_EXITING;
        r = source_dispatch(p);
        e->state = SD_EVENT_INITIAL;
        return r;
}
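
/* Example: a hedged sketch of how exit sources pair with dispatch_exit() above. Exit callbacks run
 * only after sd_event_exit() has been requested, in priority order, while the loop is in the
 * SD_EVENT_EXITING state (handler name illustrative, error handling elided):
 *
 *         static int on_exit_cb(sd_event_source *s, void *userdata) {
 *                 // a good place for cleanup work before the loop finishes
 *                 return 0;
 *         }
 *
 *         r = sd_event_add_exit(e, NULL, on_exit_cb, NULL);
 */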

static sd_event_source* event_next_pending(sd_event *e) {
        sd_event_source *p;

        assert(e);

        p = prioq_peek(e->pending);
        if (!p)
                return NULL;

        if (event_source_is_offline(p))
                return NULL;

        return p;
}

static int arm_watchdog(sd_event *e) {
        struct itimerspec its = {};
        usec_t t;

        assert(e);
        assert(e->watchdog_fd >= 0);

        t = sleep_between(e,
                          usec_add(e->watchdog_last, (e->watchdog_period / 2)),
                          usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));

        timespec_store(&its.it_value, t);

        /* Make sure we never set the watchdog to 0, which tells the
         * kernel to disable it. */
        if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
                its.it_value.tv_nsec = 1;

        return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
}

static int process_watchdog(sd_event *e) {
        assert(e);

        if (!e->watchdog)
                return 0;

        /* Don't notify watchdog too often */
        if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
                return 0;

        sd_notify(false, "WATCHDOG=1");
        e->watchdog_last = e->timestamp.monotonic;

        return arm_watchdog(e);
}
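
/* Worked example of the arming window in arm_watchdog() above, assuming WatchdogSec=20s: the service
 * manager expects WATCHDOG=1 at least every 20s, process_watchdog() refuses to ping more often than
 * every 5s (period/4), and the timerfd is armed to fire between 10s (period/2) and 15s (period*3/4)
 * after the last ping, always leaving at least a quarter of the period as safety margin. */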

static void event_close_inode_data_fds(sd_event *e) {
        struct inode_data *d;

        assert(e);

        /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
         * filesystems. But we can't close them right away as we need them as long as the user still wants to make
         * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
         * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
         * compromise. */

        while ((d = e->inode_data_to_close_list)) {
                assert(d->fd >= 0);
                d->fd = safe_close(d->fd);

                LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
        }
}

_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
         * this check here once, since gettid() is typically not cached, and we thus want to minimize
         * syscalls. */
        assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        event_close_inode_data_fds(e);

        if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}
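
/* Example: a minimal sketch of driving the loop manually with the prepare/wait/dispatch triplet, as
 * one would when embedding sd-event into a foreign poll loop (error handling elided):
 *
 *         for (;;) {
 *                 r = sd_event_prepare(e);
 *                 if (r == 0)           // nothing pending yet: poll on sd_event_get_fd(), then
 *                         r = sd_event_wait(e, UINT64_MAX);
 *                 if (r > 0)
 *                         r = sd_event_dispatch(e);
 *                 if (r <= 0)
 *                         break;        // error, or the loop finished
 *         }
 */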

static int epoll_wait_usec(
                int fd,
                struct epoll_event *events,
                int maxevents,
                usec_t timeout) {

        int msec;
        /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */

#if HAVE_EPOLL_PWAIT2
        static bool epoll_pwait2_absent = false;
        int r;

        /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
         * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
         * is not that obvious to implement given the libc and kernel definitions differ in the last
         * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
         * biggie), let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
         * missing. */

        if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
                r = epoll_pwait2(fd,
                                 events,
                                 maxevents,
                                 TIMESPEC_STORE(timeout),
                                 NULL);
                if (r >= 0)
                        return r;
                if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                        return -errno; /* Only fall back to old epoll_wait() if the syscall is masked or not
                                        * supported. */

                epoll_pwait2_absent = true;
        }
#endif

        if (timeout == USEC_INFINITY)
                msec = -1;
        else {
                usec_t k;

                k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
                if (k >= INT_MAX)
                        msec = INT_MAX; /* Saturate */
                else
                        msec = (int) k;
        }

        return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
}
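
/* Rounding example for the millisecond fallback above: a timeout of 1500us becomes
 * DIV_ROUND_UP(1500, 1000) = 2ms, i.e. the timeout is always rounded up so that we never wake up
 * earlier than requested; USEC_INFINITY maps to -1, which makes epoll_wait() block indefinitely. */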

static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
        size_t n_event_queue, m, n_event_max;
        int64_t min_priority = threshold;
        bool something_new = false;
        int r;

        assert(e);
        assert(ret_min_priority);

        n_event_queue = MAX(e->n_sources, 1u);
        if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
                return -ENOMEM;

        n_event_max = MALLOC_ELEMENTSOF(e->event_queue);

        /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
        if (e->buffered_inotify_data_list)
                timeout = 0;

        for (;;) {
                r = epoll_wait_usec(
                                e->epoll_fd,
                                e->event_queue,
                                n_event_max,
                                timeout);
                if (r < 0)
                        return r;

                m = (size_t) r;

                if (m < n_event_max)
                        break;

                if (n_event_max >= n_event_queue * 10)
                        break;

                if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
                        return -ENOMEM;

                n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
                timeout = 0;
        }

        /* Set timestamp only when this is called first time. */
        if (threshold == INT64_MAX)
                triple_timestamp_get(&e->timestamp);

        for (size_t i = 0; i < m; i++) {

                if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
                else {
                        WakeupType *t = e->event_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE: {
                                sd_event_source *s = e->event_queue[i].data.ptr;

                                assert(s);

                                if (s->priority > threshold)
                                        continue;

                                min_priority = MIN(min_priority, s->priority);

                                switch (s->type) {

                                case SOURCE_IO:
                                        r = process_io(e, s, e->event_queue[i].events);
                                        break;

                                case SOURCE_CHILD:
                                        r = process_pidfd(e, s, e->event_queue[i].events);
                                        break;

                                default:
                                        assert_not_reached();
                                }

                                break;
                        }

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = e->event_queue[i].data.ptr;

                                assert(d);

                                r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
                                break;

                        case WAKEUP_INOTIFY_DATA:
                                r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
                                break;

                        default:
                                assert_not_reached();
                        }
                }
                if (r < 0)
                        return r;
                if (r > 0)
                        something_new = true;
        }

        *ret_min_priority = min_priority;
        return something_new;
}

_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        for (int64_t threshold = INT64_MAX; ; threshold--) {
                int64_t epoll_min_priority, child_min_priority;

                /* There is a possibility that new epoll (especially IO) and child events are triggered
                 * just after the process_epoll() call but before process_child(), and the new IO
                 * events may have higher priority than the child events. To salvage these events,
                 * let's call epoll_wait() again, but accept only events with higher priority than the
                 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
                 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
                 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */

                r = process_epoll(e, timeout, threshold, &epoll_min_priority);
                if (r == -EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }
                if (r < 0)
                        goto finish;
                if (r == 0 && threshold < INT64_MAX)
                        /* No new epoll event. */
                        break;

                r = process_child(e, threshold, &child_min_priority);
                if (r < 0)
                        goto finish;
                if (r == 0)
                        /* No new child event. */
                        break;

                threshold = MIN(epoll_min_priority, child_min_priority);
                if (threshold == INT64_MIN)
                        break;

                timeout = 0;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        r = process_inotify(e);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;
        else if (r == 1) {
                /* Ratelimit expiry callback was called. Let's postpone processing pending sources and
                 * put the loop in the initial state in order to evaluate (in the next iteration) also
                 * sources that were potentially re-enabled by the callback.
                 *
                 * Wondering why we treat only this invocation of process_timer() differently? Once an
                 * event source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer,
                 * hence the ratelimit expiry callback is never called for any other timer type. */
                r = 0;
                goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}

_public_ int sd_event_dispatch(sd_event *e) {
        sd_event_source *p;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_PENDING, -EBUSY);

        if (e->exit_requested)
                return dispatch_exit(e);

        p = event_next_pending(e);
        if (p) {
                PROTECT_EVENT(e);

                e->state = SD_EVENT_RUNNING;
                r = source_dispatch(p);
                e->state = SD_EVENT_INITIAL;
                return r;
        }

        e->state = SD_EVENT_INITIAL;

        return 1;
}

static void event_log_delays(sd_event *e) {
        char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
        size_t l, i;

        p = b;
        l = sizeof(b);
        for (i = 0; i < ELEMENTSOF(e->delays); i++) {
                l = strpcpyf(&p, l, "%u ", e->delays[i]);
                e->delays[i] = 0;
        }
        log_debug("Event loop iterations: %s", b);
}

_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        if (e->profile_delays && e->last_run_usec != 0) {
                usec_t this_run;
                unsigned l;

                this_run = now(CLOCK_MONOTONIC);

                l = log2u64(this_run - e->last_run_usec);
                assert(l < ELEMENTSOF(e->delays));
                e->delays[l]++;

                if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
                        event_log_delays(e);
                        e->last_log_usec = this_run;
                }
        }

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        r = sd_event_prepare(e);
        if (r == 0)
                /* There was nothing? Then wait... */
                r = sd_event_wait(e, timeout);

        if (e->profile_delays)
                e->last_run_usec = now(CLOCK_MONOTONIC);

        if (r > 0) {
                /* There's something now, then let's dispatch it */
                r = sd_event_dispatch(e);
                if (r < 0)
                        return r;

                return 1;
        }

        return r;
}

_public_ int sd_event_loop(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        PROTECT_EVENT(e);

        while (e->state != SD_EVENT_FINISHED) {
                r = sd_event_run(e, UINT64_MAX);
                if (r < 0)
                        return r;
        }

        return e->exit_code;
}
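
/* Example: a hedged sketch of the canonical way a service runs this loop (fd and handler names are
 * illustrative, error handling elided):
 *
 *         sd_event *e = NULL;
 *
 *         r = sd_event_default(&e);
 *         r = sd_event_set_signal_exit(e, true);                     // SIGINT/SIGTERM end the loop
 *         r = sd_event_add_io(e, NULL, my_fd, EPOLLIN, on_io, NULL);
 *         r = sd_event_loop(e);                                      // returns the exit code
 *
 *         e = sd_event_unref(e);
 */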

_public_ int sd_event_get_fd(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->epoll_fd;
}

_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->state;
}

_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(code, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!e->exit_requested)
                return -ENODATA;

        *code = e->exit_code;
        return 0;
}

_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;

        return 0;
}

_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(usec, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
                return -EOPNOTSUPP;

        if (!triple_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
                *usec = now(clock);
                return 1;
        }

        *usec = triple_timestamp_by_clock(&e->timestamp, clock);
        return 0;
}
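
/* Example: sd_event_now() is the natural clock source for scheduling relative timers, since it
 * prefers the timestamp cached at the last wakeup over a fresh syscall. A sketch (handler name
 * illustrative) that fires once, five seconds from "now":
 *
 *         uint64_t usec;
 *
 *         r = sd_event_now(e, CLOCK_MONOTONIC, &usec);
 *         r = sd_event_add_time(e, NULL, CLOCK_MONOTONIC, usec + 5 * USEC_PER_SEC, 0, on_timer, NULL);
 */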

_public_ int sd_event_default(sd_event **ret) {
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}

_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(tid, -EINVAL);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (e->tid != 0) {
                *tid = e->tid;
                return 0;
        }

        return -ENXIO;
}

_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                struct epoll_event ev = {
                        .events = EPOLLIN,
                        .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
                };

                if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
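
/* Example: a minimal sketch of opting into watchdog support. With WatchdogSec= set in the unit file
 * the manager passes WATCHDOG_USEC= to the service, sd_watchdog_enabled() picks it up, and the loop
 * starts pinging automatically:
 *
 *         r = sd_event_set_watchdog(e, true);
 *         // r > 0: watchdog armed; r == 0: no watchdog requested, nothing to do
 */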

_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->watchdog;
}

_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        *ret = e->iteration;
        return 0;
}

_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
        assert_return(s, -EINVAL);

        s->destroy_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
        assert_return(s, -EINVAL);

        if (ret)
                *ret = s->destroy_callback;

        return !!s->destroy_callback;
}
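
/* Example: a sketch of tying userdata lifetime to the source via the destroy callback, so that the
 * context is freed no matter how the source goes away (struct and handler names are illustrative):
 *
 *         struct ctx *c = new0(struct ctx, 1);
 *
 *         r = sd_event_add_defer(e, &s, on_defer, c);
 *         r = sd_event_source_set_destroy_callback(s, free);
 */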

_public_ int sd_event_source_get_floating(sd_event_source *s) {
        assert_return(s, -EINVAL);

        return s->floating;
}

_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);

        if (s->floating == !!b)
                return 0;

        if (!s->event) /* Already disconnected */
                return -ESTALE;

        s->floating = b;

        if (b) {
                sd_event_source_ref(s);
                sd_event_unref(s->event);
        } else {
                sd_event_ref(s->event);
                sd_event_source_unref(s);
        }

        return 1;
}
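
/* Reference-direction note for the above: a non-floating source holds a reference on its event loop
 * (the source pins the loop), while a floating source inverts that and the loop holds the reference
 * on the source (the loop pins the source). This is why sd_event_set_signal_exit() below marks its
 * internal SIGINT/SIGTERM sources as floating: callers get a clean shutdown without having to manage
 * those sources themselves. */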

_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);

        return s->exit_on_failure;
}

_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);

        if (s->exit_on_failure == !!b)
                return 0;

        s->exit_on_failure = b;
        return 1;
}

_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
        int r;

        assert_return(s, -EINVAL);

        /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
         * so is a programming error. */
        assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);

        /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
         * non-ratelimited. */
        r = event_source_leave_ratelimit(s, /* run_callback */ false);
        if (r < 0)
                return r;

        s->rate_limit = (RateLimit) { interval, burst };
        return 0;
}
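
/* Example: a hedged sketch of throttling a busy IO source to at most 100 dispatches per second
 * (values illustrative):
 *
 *         r = sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 100);
 *
 * Once the limit trips, the source is taken offline and brought back when the interval expires;
 * sd_event_source_is_ratelimited() below reports which state it is in. */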

_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
        assert_return(s, -EINVAL);

        s->ratelimit_expire_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
        assert_return(s, -EINVAL);

        /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
         * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return -EDOM;

        if (!ratelimit_configured(&s->rate_limit))
                return -ENOEXEC;

        if (ret_interval)
                *ret_interval = s->rate_limit.interval;
        if (ret_burst)
                *ret_burst = s->rate_limit.burst;

        return 0;
}

_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
        assert_return(s, -EINVAL);

        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return false;

        if (!ratelimit_configured(&s->rate_limit))
                return false;

        return s->ratelimited;
}

_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
        bool change = false;
        int r;

        assert_return(e, -EINVAL);

        if (b) {
                /* We want to maintain pointers to these event sources, so that we can destroy them when told
                 * so. But we also don't want them to pin the event loop itself. Hence we mark them as
                 * floating after creation (and undo this before deleting them again). */

                if (!e->sigint_event_source) {
                        r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0)
                                return r;

                        assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
                        change = true;
                }

                if (!e->sigterm_event_source) {
                        r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0) {
                                if (change) {
                                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                                }

                                return r;
                        }

                        assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
                        change = true;
                }

        } else {
                if (e->sigint_event_source) {
                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                        change = true;
                }

                if (e->sigterm_event_source) {
                        assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
                        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
                        change = true;
                }
        }

        return change;
}
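
/* Example: typical use of the above from a service's main() (error handling elided); the
 * SD_EVENT_SIGNAL_PROCMASK flag makes the loop block SIGINT/SIGTERM delivery itself:
 *
 *         r = sd_event_set_signal_exit(e, true);   // SIGINT/SIGTERM now request a clean exit
 *         r = sd_event_loop(e);                    // returns once such a signal arrives
 */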