/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sys/wait.h>

#include "sd-daemon.h"
#include "sd-event.h"
#include "sd-id128.h"
#include "sd-messages.h"

#include "alloc-util.h"
#include "env-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "fs-util.h"
#include "glyph-util.h"
#include "hashmap.h"
#include "hexdecoct.h"
#include "list.h"
#include "logarithm.h"
#include "macro.h"
#include "mallinfo-util.h"
#include "memory-util.h"
#include "missing_magic.h"
#include "missing_syscall.h"
#include "missing_threads.h"
#include "origin-id.h"
#include "path-util.h"
#include "prioq.h"
#include "process-util.h"
#include "psi-util.h"
#include "set.h"
#include "signal-util.h"
#include "socket-util.h"
#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strxcpyx.h"
#include "time-util.h"

#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)

static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
        /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
        return s &&
                s->type == SOURCE_CHILD &&
                s->child.pidfd >= 0 &&
                s->child.options == WEXITED;
}

static bool event_source_is_online(sd_event_source *s) {
        assert(s);
        return s->enabled != SD_EVENT_OFF && !s->ratelimited;
}

static bool event_source_is_offline(sd_event_source *s) {
        assert(s);
        return s->enabled == SD_EVENT_OFF || s->ratelimited;
}

static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
        [SOURCE_IO] = "io",
        [SOURCE_TIME_REALTIME] = "realtime",
        [SOURCE_TIME_BOOTTIME] = "boottime",
        [SOURCE_TIME_MONOTONIC] = "monotonic",
        [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
        [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
        [SOURCE_SIGNAL] = "signal",
        [SOURCE_CHILD] = "child",
        [SOURCE_DEFER] = "defer",
        [SOURCE_POST] = "post",
        [SOURCE_EXIT] = "exit",
        [SOURCE_WATCHDOG] = "watchdog",
        [SOURCE_INOTIFY] = "inotify",
        [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);

#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)

#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_IO,                       \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_SIGNAL,                   \
               SOURCE_DEFER,                    \
               SOURCE_INOTIFY,                  \
               SOURCE_MEMORY_PRESSURE)

/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)

struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;

        Prioq *pending;
        Prioq *prepare;

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources;

        Set *post_sources;

        Prioq *exit;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close_list);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, buffered_inotify_data_list);

        /* A list of memory pressure event sources that still need their subscription string written */
        LIST_HEAD(sd_event_source, memory_pressure_write_list);

        uint64_t origin_id;

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        sd_event_source *sigint_event_source, *sigterm_event_source;

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8];
};

DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);

static thread_local sd_event *default_event = NULL;

static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);

static sd_event *event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}

static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->pending);
        assert(y->pending);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Lower priority values first */
        r = CMP(x->priority, y->priority);
        if (r != 0)
                return r;

        /* Older entries first */
        return CMP(x->pending_iteration, y->pending_iteration);
}

static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->prepare);
        assert(y->prepare);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Move most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        r = CMP(x->prepare_iteration, y->prepare_iteration);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static usec_t time_event_source_next(const sd_event_source *s) {
        assert(s);

        /* We have two kinds of event sources that have elapsation times associated with them: the actual
         * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
         * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
         * looking at here. */

        if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Otherwise this must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return s->time.next;

        return USEC_INFINITY;
}

static usec_t time_event_source_latest(const sd_event_source *s) {
        assert(s);

        if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
                               * same, as we should avoid adding additional inaccuracy on an inaccuracy time
                               * window */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return usec_add(s->time.next, s->time.accuracy);

        return USEC_INFINITY;
}

static bool event_source_timer_candidate(const sd_event_source *s) {
        assert(s);

        /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
         * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
        return !s->pending || s->ratelimited;
}

static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
        r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
        if (r != 0)
                return r;

        /* Order by time */
        return CMP(time_func(x), time_func(y));
}

static int earliest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_next);
}

static int latest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_latest);
}

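/* A worked example of the two-prioq scheme above (with invented numbers): suppose source A has
 * time.next = 1000ms and accuracy = 250ms, so it may fire anywhere in [1000ms, 1250ms], while
 * source B has time.next = 1100ms and accuracy = 50ms, i.e. [1100ms, 1150ms]. The "earliest"
 * prioq (ordered by time_event_source_next()) yields A as the first candidate wakeup, and the
 * "latest" prioq (ordered by time_event_source_latest()) yields B as the tightest deadline; any
 * single wakeup chosen in [1100ms, 1150ms] satisfies both windows, so one timerfd expiration can
 * dispatch both sources. */
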
static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}

static sd_event *event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->inotify_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);

        free(e->event_queue);

        return mfree(e);
}

_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new(sd_event, 1);
        if (!e)
                return -ENOMEM;

        *e = (sd_event) {
                .n_ref = 1,
                .epoll_fd = -EBADF,
                .watchdog_fd = -EBADF,
                .realtime.wakeup = WAKEUP_CLOCK_DATA,
                .realtime.fd = -EBADF,
                .realtime.next = USEC_INFINITY,
                .boottime.wakeup = WAKEUP_CLOCK_DATA,
                .boottime.fd = -EBADF,
                .boottime.next = USEC_INFINITY,
                .monotonic.wakeup = WAKEUP_CLOCK_DATA,
                .monotonic.fd = -EBADF,
                .monotonic.next = USEC_INFINITY,
                .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .realtime_alarm.fd = -EBADF,
                .realtime_alarm.next = USEC_INFINITY,
                .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .boottime_alarm.fd = -EBADF,
                .boottime_alarm.next = USEC_INFINITY,
                .perturb = USEC_INFINITY,
                .origin_id = origin_id_query(),
        };

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
                          special_glyph(SPECIAL_GLYPH_ELLIPSIS));
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}

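/* A minimal usage sketch for the constructor above, using only the public API from
 * <systemd/sd-event.h> (illustrative, not part of the library):
 *
 *     #include <stdint.h>
 *     #include <systemd/sd-event.h>
 *
 *     int run_one_iteration(void) {
 *             sd_event *e = NULL;
 *             int r;
 *
 *             r = sd_event_new(&e);
 *             if (r < 0)
 *                     return r;
 *
 *             r = sd_event_run(e, UINT64_MAX);  // wait for and dispatch one iteration
 *
 *             sd_event_unref(e);                // drops the reference sd_event_new() handed out
 *             return r;
 *     }
 */
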
/* Define manually so we can add the origin check */
_public_ sd_event *sd_event_ref(sd_event *e) {
        if (!e)
                return NULL;
        if (event_origin_changed(e))
                return NULL;

        e->n_ref++;

        return e;
}

_public_ sd_event* sd_event_unref(sd_event *e) {
        if (!e)
                return NULL;
        if (event_origin_changed(e))
                return NULL;

        assert(e->n_ref > 0);
        if (--e->n_ref > 0)
                return NULL;

        return event_free(e);
}

#define PROTECT_EVENT(e)                                                \
        _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);

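/* A sketch of how PROTECT_EVENT() is meant to be used (the dispatch code applying it lies outside
 * this excerpt): it pins the loop for the current scope, so that a callback which drops the last
 * user reference cannot free the sd_event object mid-dispatch:
 *
 *     static int example_dispatch(sd_event *e) {
 *             PROTECT_EVENT(e);  // takes a ref, released automatically on scope exit
 *             ...
 *             return 0;
 *     }
 */
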
_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
        if (s)
                (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
        return sd_event_source_unref(s);
}

static void source_io_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_origin_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}

static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->io.fd, &ev) < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}

static void source_child_pidfd_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (event_origin_changed(s->event))
                return;

        if (!s->child.registered)
                return;

        if (EVENT_SOURCE_WATCH_PIDFD(s))
                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

        s->child.registered = false;
}

static int source_child_pidfd_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_CHILD);
        assert(enabled != SD_EVENT_OFF);

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                struct epoll_event ev = {
                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                        .data.ptr = s,
                };

                if (epoll_ctl(s->event->epoll_fd,
                              s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                              s->child.pidfd, &ev) < 0)
                        return -errno;
        }

        s->child.registered = true;
        return 0;
}

static void source_memory_pressure_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (event_origin_changed(s->event))
                return;

        if (!s->memory_pressure.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->memory_pressure.registered = false;
}

static int source_memory_pressure_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
                          (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->memory_pressure.fd, &ev) < 0)
                return -errno;

        s->memory_pressure.registered = true;
        return 0;
}

static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (s->memory_pressure.in_write_list)
                return;

        LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = true;
}

static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (!s->memory_pressure.in_write_list)
                return;

        LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = false;
}

static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}

static EventSourceType clock_to_event_source_type(clockid_t clock) {

        switch (clock) {

        case CLOCK_REALTIME:
                return SOURCE_TIME_REALTIME;

        case CLOCK_BOOTTIME:
                return SOURCE_TIME_BOOTTIME;

        case CLOCK_MONOTONIC:
                return SOURCE_TIME_MONOTONIC;

        case CLOCK_REALTIME_ALARM:
                return SOURCE_TIME_REALTIME_ALARM;

        case CLOCK_BOOTTIME_ALARM:
                return SOURCE_TIME_BOOTTIME_ALARM;

        default:
                return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        }
}

static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        assert(e);

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}

static void event_free_signal_data(sd_event *e, struct signal_data *d) {
        assert(e);

        if (!d)
                return;

        hashmap_remove(e->signal_data, &d->priority);
        safe_close(d->fd);
        free(d);
}

static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_origin_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                d = new(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                *d = (struct signal_data) {
                        .wakeup = WAKEUP_SIGNAL_DATA,
                        .fd = -EBADF,
                        .priority = priority,
                };

                r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
                     &ss_copy,
                     SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        if (added)
                event_free_signal_data(e, d);

        return r;
}

static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty
         * that way, the object is removed entirely. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {
                /* If the mask is all-zero we can get rid of the structure */
                event_free_signal_data(e, d);
                return;
        }

        if (event_origin_changed(e))
                return;

        assert(d->fd >= 0);

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}

static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
         * and possibly drop the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_online_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            event_source_is_online(e->signal_sources[sig]))
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}

static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
        assert(s);

        /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
         * they are enabled/disabled or marked pending and such. */

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
}

static void event_source_time_prioq_reshuffle(sd_event_source *s) {
        struct clock_data *d;

        assert(s);

        /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
         * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
         * properly again. */

        if (s->ratelimited)
                d = &s->event->monotonic;
        else if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(d = event_get_clock_data(s->event, s->type));
        else
                return; /* no-op for an event source which is neither a timer nor ratelimited. */

        prioq_reshuffle(d->earliest, s, &s->earliest_index);
        prioq_reshuffle(d->latest, s, &s->latest_index);
        d->needs_rearm = true;
}

static void event_source_time_prioq_remove(
                sd_event_source *s,
                struct clock_data *d) {

        assert(s);
        assert(d);

        prioq_remove(d->earliest, s, &s->earliest_index);
        prioq_remove(d->latest, s, &s->latest_index);
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        d->needs_rearm = true;
}

static void source_disconnect(sd_event_source *s) {
        sd_event *event;
        int r;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from the time event source here if it is not ratelimited. If
                 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                        if (s->signal.unblock) {
                                sigset_t new_ss;

                                if (sigemptyset(&new_ss) < 0)
                                        log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
                                else if (sigaddset(&new_ss, s->signal.sig) < 0)
                                        log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
                                else {
                                        r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
                                        if (r != 0)
                                                log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
                                }
                        }
                }

                break;

        case SOURCE_CHILD:
                if (event_origin_changed(s->event))
                        s->child.process_owned = false;

                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
                         * continues to be watched. That's because inotify doesn't really have an API for that: we
                         * can only change watch masks with access to the original inode either by fd or by path. But
                         * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
                         * continuously and keeping the mount busy which we can't really do. We could reconstruct the
                         * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
                         * there), but given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way, leave the
                         * mask set, even if it is not minimized now, and ignore all events we aren't interested in
                         * anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_remove_from_write_list(s);
                source_memory_pressure_unregister(s);
                break;

        default:
                assert_not_reached();
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        if (!s->floating)
                sd_event_unref(event);
}

static sd_event_source* source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                s->io.fd = safe_close(s->io.fd);

        if (s->type == SOURCE_CHILD) {
                /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */

                if (s->child.process_owned) {

                        if (!s->child.exited) {
                                bool sent = false;

                                if (s->child.pidfd >= 0) {
                                        if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
                                                if (errno == ESRCH) /* Already dead */
                                                        sent = true;
                                                else if (!ERRNO_IS_NOT_SUPPORTED(errno))
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
                                                                        s->child.pid);
                                        } else
                                                sent = true;
                                }

                                if (!sent)
                                        if (kill(s->child.pid, SIGKILL) < 0)
                                                if (errno != ESRCH) /* Already dead */
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
                                                                        s->child.pid);
                        }

                        if (!s->child.waited) {
                                siginfo_t si = {};

                                /* Reap the child if we can */
                                (void) waitid(P_PID, s->child.pid, &si, WEXITED);
                        }
                }

                if (s->child.pidfd_owned)
                        s->child.pidfd = safe_close(s->child.pidfd);
        }

        if (s->type == SOURCE_MEMORY_PRESSURE) {
                s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
                s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
        }

        if (s->destroy_callback)
                s->destroy_callback(s->userdata);

        free(s->description);
        return mfree(s);
}
DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);

static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_reshuffle(s);

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        if (s->type == SOURCE_INOTIFY) {

                assert(s->inotify.inode_data);
                assert(s->inotify.inode_data->inotify_data);

                if (b)
                        s->inotify.inode_data->inotify_data->n_pending++;
                else {
                        assert(s->inotify.inode_data->inotify_data->n_pending > 0);
                        s->inotify.inode_data->inotify_data->n_pending--;
                }
        }

        return 1;
}

static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {

        /* Let's allocate exactly what we need. Note that the difference of the smallest event source
         * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
         * lines. */
        static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
                [SOURCE_IO]                  = endoffsetof_field(sd_event_source, io),
                [SOURCE_TIME_REALTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_MONOTONIC]      = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_SIGNAL]              = endoffsetof_field(sd_event_source, signal),
                [SOURCE_CHILD]               = endoffsetof_field(sd_event_source, child),
                [SOURCE_DEFER]               = endoffsetof_field(sd_event_source, defer),
                [SOURCE_POST]                = endoffsetof_field(sd_event_source, post),
                [SOURCE_EXIT]                = endoffsetof_field(sd_event_source, exit),
                [SOURCE_INOTIFY]             = endoffsetof_field(sd_event_source, inotify),
                [SOURCE_MEMORY_PRESSURE]     = endoffsetof_field(sd_event_source, memory_pressure),
        };

        sd_event_source *s;

        assert(e);
        assert(type >= 0);
        assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
        assert(size_table[type] > 0);

        /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
         * size, even if we only allocate the initial part we need. */
        s = expand_to_usable(malloc0(size_table[type]), sizeof(sd_event_source));
        if (!s)
                return NULL;

        /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
         * than what we allocated here. */
        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = PRIOQ_IDX_NULL;
        s->prepare_index = PRIOQ_IDX_NULL;

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}

static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = io_exit_callback;

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

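/* A minimal usage sketch for sd_event_add_io() (illustrative, not part of the original file):
 * watch a file descriptor for readability and stop the loop the first time it fires. Assumes an
 * existing loop "e" and a valid fd "fd":
 *
 *     #include <sys/epoll.h>
 *     #include <systemd/sd-event.h>
 *
 *     static int on_readable(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     sd_event_source *src = NULL;
 *     int r = sd_event_add_io(e, &src, fd, EPOLLIN, on_readable, NULL);
 */
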
static void initialize_perturb(sd_event *e) {
        sd_id128_t id = {};

        /* When we sleep for longer, we try to realign the wakeup to the same time within each
         * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
         * wakeup. However, let's take some system-specific randomness for this value, so that in a network
         * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
         * perturbation usec offset from the boot ID (or the machine ID if that fails, e.g. because /proc is
         * not mounted). */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
                e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
        else
                e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
}

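/* To make the effect above concrete (with an invented value): if the boot ID hashes to a perturb
 * of 17300000 usec (17.3s), then a timer whose accuracy window spans a whole minute is realigned
 * to fire at second 17.3 of each minute on this machine. All such timers on one machine thereby
 * coalesce into shared wakeups, while a fleet of machines with synchronized clocks spreads its
 * wakeups across the minute instead of firing in lockstep. */
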
static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        _cleanup_close_ int fd = -EBADF;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
                return -errno;

        d->fd = TAKE_FD(fd);
        return 0;
}

static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
        int r;

        assert(d);

        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        return 0;
}

static int event_source_time_prioq_put(
                sd_event_source *s,
                struct clock_data *d) {

        int r;

        assert(s);
        assert(d);
        assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

        r = prioq_put(d->earliest, s, &s->earliest_index);
        if (r < 0)
                return r;

        r = prioq_put(d->latest, s, &s->latest_index);
        if (r < 0) {
                assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
                s->earliest_index = PRIOQ_IDX_NULL;
                return r;
        }

        d->needs_rearm = true;
        return 0;
}

_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != UINT64_MAX, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        assert_se(d = event_get_clock_data(e, type));

        r = setup_clock_data(e, d, clock);
        if (r < 0)
                return r;

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = event_source_time_prioq_put(s, d);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_time_relative(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        usec_t t;
        int r;

        /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
         * checks for overflow. */

        r = sd_event_now(e, clock, &t);
        if (r < 0)
                return r;

        if (usec >= USEC_INFINITY - t)
                return -EOVERFLOW;

        return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
}

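/* A minimal usage sketch for sd_event_add_time_relative() (illustrative, not part of the original
 * file): arrange for a callback 5s from now on the monotonic clock, with 100ms of coalescing
 * slack, given an existing loop "e" (USEC_PER_SEC and USEC_PER_MSEC are systemd's time-util.h
 * constants):
 *
 *     static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     sd_event_source *src = NULL;
 *     int r = sd_event_add_time_relative(e, &src, CLOCK_MONOTONIC,
 *                                        5 * USEC_PER_SEC, 100 * USEC_PER_MSEC,
 *                                        on_timer, NULL);
 *
 * Note that timer sources are created SD_EVENT_ONESHOT (see above): they fire once and must be
 * re-enabled or rearmed to fire again. */
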
static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        sigset_t new_ss;
        bool block_it;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        /* Let's make sure our special flag stays outside of the valid signal range */
        assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);

        if (sig & SD_EVENT_SIGNAL_PROCMASK) {
                sig &= ~SD_EVENT_SIGNAL_PROCMASK;
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                block_it = true;
        } else {
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                r = signal_is_blocked(sig);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;

                block_it = false;
        }

        if (!callback)
                callback = signal_exit_callback;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        if (block_it) {
                sigset_t old_ss;

                if (sigemptyset(&new_ss) < 0)
                        return -errno;

                if (sigaddset(&new_ss, sig) < 0)
                        return -errno;

                r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
                if (r != 0)
                        return -r;

                r = sigismember(&old_ss, sig);
                if (r < 0)
                        return -errno;

                s->signal.unblock = !r;
        } else
                s->signal.unblock = false;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                if (s->signal.unblock)
                        (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);

                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

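/* A minimal usage sketch for sd_event_add_signal() (illustrative, not part of the original file):
 * with the SD_EVENT_SIGNAL_PROCMASK flag the loop blocks the signal itself; without it the caller
 * must already have blocked the signal, or -EBUSY is returned (see the signal_is_blocked() check
 * above). A NULL callback installs signal_exit_callback(), i.e. the loop simply exits:
 *
 *     sd_event_source *src = NULL;
 *     int r = sd_event_add_signal(e, &src, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
 */
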
static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static bool shall_use_pidfd(void) {
        /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with
         * and once without pidfd */
        return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
}

_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd and because we don't want to reap the child processes
                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
                 * take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
         * pin the PID, and make regular waitid() handling race-free. */

        if (shall_use_pidfd()) {
                s->child.pidfd = pidfd_open(pid, 0);
                if (s->child.pidfd < 0) {
                        /* Propagate errors unless the syscall is not supported or blocked */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
        } else
                s->child.pidfd = -EBADF;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We have a pidfd and we only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We have no pidfd or we shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. */
        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

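/* A minimal usage sketch for sd_event_add_child() (illustrative, not part of the original file):
 * per the check above, SIGCHLD must be blocked before the first child source is added, ideally
 * before the child is forked:
 *
 *     static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), si->si_status);
 *     }
 *
 *     sigset_t ss;
 *     sigemptyset(&ss);
 *     sigaddset(&ss, SIGCHLD);
 *     pthread_sigmask(SIG_BLOCK, &ss, NULL);  // before fork()
 *     ...
 *     sd_event_source *src = NULL;
 *     int r = sd_event_add_child(e, &src, pid, WEXITED, on_child, NULL);
 */
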
_public_ int sd_event_add_child_pidfd(
                sd_event *e,
                sd_event_source **ret,
                int pidfd,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        pid_t pid;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pidfd >= 0, -EBADF);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        r = pidfd_get_pid(pidfd, &pid);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.pidfd = pidfd;
        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for WEXITED */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;
        } else {
                /* We shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

b9350e70
LP
1757static int generic_exit_callback(sd_event_source *s, void *userdata) {
1758 assert(s);
1759
1760 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1761}
1762
f7262a9f 1763_public_ int sd_event_add_defer(
305f78bf 1764 sd_event *e,
151b9b96 1765 sd_event_source **ret,
718db961 1766 sd_event_handler_t callback,
151b9b96 1767 void *userdata) {
305f78bf 1768
ec766a51 1769 _cleanup_(source_freep) sd_event_source *s = NULL;
fd38203a
LP
1770 int r;
1771
305f78bf 1772 assert_return(e, -EINVAL);
b937d761 1773 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c 1774 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 1775 assert_return(!event_origin_changed(e), -ECHILD);
fd38203a 1776
b9350e70
LP
1777 if (!callback)
1778 callback = generic_exit_callback;
1779
a71fe8b8 1780 s = source_new(e, !ret, SOURCE_DEFER);
fd38203a
LP
1781 if (!s)
1782 return -ENOMEM;
1783
1784 s->defer.callback = callback;
1785 s->userdata = userdata;
baf76283 1786 s->enabled = SD_EVENT_ONESHOT;
1787
1788 r = source_set_pending(s, true);
ec766a51 1789 if (r < 0)
fd38203a 1790 return r;
fd38203a 1791
1792 if (ret)
1793 *ret = s;
ec766a51 1794 TAKE_PTR(s);
a71fe8b8 1795
1796 return 0;
1797}
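
/* Illustrative sketch (disabled): since the source is created pending and
 * SD_EVENT_ONESHOT, a defer callback runs exactly once, on the next event loop
 * iteration. The handler name is made up. */
#if 0
static int run_once_soon(sd_event_source *s, void *userdata) {
        log_debug("Dispatched on the next event loop iteration.");
        return 0;
}

static int arm_deferred_work(sd_event *e) {
        return sd_event_add_defer(e, NULL, run_once_soon, NULL);
}
#endif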
1798
1799_public_ int sd_event_add_post(
1800 sd_event *e,
1801 sd_event_source **ret,
1802 sd_event_handler_t callback,
1803 void *userdata) {
1804
ec766a51 1805 _cleanup_(source_freep) sd_event_source *s = NULL;
1806 int r;
1807
1808 assert_return(e, -EINVAL);
b937d761 1809 assert_return(e = event_resolve(e), -ENOPKG);
6e9feda3 1810 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 1811 assert_return(!event_origin_changed(e), -ECHILD);
6e9feda3 1812
1813 if (!callback)
1814 callback = generic_exit_callback;
1815
a71fe8b8 1816 s = source_new(e, !ret, SOURCE_POST);
1817 if (!s)
1818 return -ENOMEM;
1819
1820 s->post.callback = callback;
1821 s->userdata = userdata;
1822 s->enabled = SD_EVENT_ON;
1823
de7fef4b 1824 r = set_ensure_put(&e->post_sources, NULL, s);
ec766a51 1825 if (r < 0)
6e9feda3 1826 return r;
de7fef4b 1827 assert(r > 0);
6e9feda3 1828
1829 if (ret)
1830 *ret = s;
ec766a51 1831 TAKE_PTR(s);
a71fe8b8 1832
1833 return 0;
1834}
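
/* Note: in contrast to defer sources, post sources are created SD_EVENT_ON and are
 * dispatched at the end of any event loop iteration in which at least one other,
 * non-post event source was dispatched, which makes them useful for "something
 * happened, re-check state" logic. */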
1835
6203e07a 1836_public_ int sd_event_add_exit(
305f78bf 1837 sd_event *e,
151b9b96 1838 sd_event_source **ret,
718db961 1839 sd_event_handler_t callback,
151b9b96 1840 void *userdata) {
305f78bf 1841
ec766a51 1842 _cleanup_(source_freep) sd_event_source *s = NULL;
1843 int r;
1844
1845 assert_return(e, -EINVAL);
b937d761 1846 assert_return(e = event_resolve(e), -ENOPKG);
1847 assert_return(callback, -EINVAL);
1848 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 1849 assert_return(!event_origin_changed(e), -ECHILD);
da7e457c 1850
1851 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1852 if (r < 0)
1853 return r;
da7e457c 1854
a71fe8b8 1855 s = source_new(e, !ret, SOURCE_EXIT);
fd38203a 1856 if (!s)
da7e457c 1857 return -ENOMEM;
fd38203a 1858
6203e07a 1859 s->exit.callback = callback;
da7e457c 1860 s->userdata = userdata;
6203e07a 1861 s->exit.prioq_index = PRIOQ_IDX_NULL;
baf76283 1862 s->enabled = SD_EVENT_ONESHOT;
da7e457c 1863
6203e07a 1864 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
ec766a51 1865 if (r < 0)
da7e457c 1866 return r;
da7e457c 1867
1868 if (ret)
1869 *ret = s;
ec766a51 1870 TAKE_PTR(s);
a71fe8b8 1871
1872 return 0;
1873}
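
/* Illustrative sketch (disabled): exit sources only run after sd_event_exit() has been
 * called, in priority order, which makes them a natural hook for cleanup work. The
 * handler name is made up. */
#if 0
static int on_loop_exit(sd_event_source *s, void *userdata) {
        log_debug("Event loop is shutting down, cleaning up.");
        return 0;
}

static int arm_cleanup(sd_event *e) {
        return sd_event_add_exit(e, NULL, on_loop_exit, NULL);
}
#endif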
1874
9857de4f 1875_public_ int sd_event_trim_memory(void) {
1876 int r;
1877
1878 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1879 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1880 * NULL callback parameter. */
1881
1882 log_debug("Memory pressure event, trimming malloc() memory.");
1883
1884#if HAVE_GENERIC_MALLINFO
1885 generic_mallinfo before_mallinfo = generic_mallinfo_get();
1886#endif
1887
1888 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1889 hashmap_trim_pools();
1890 r = malloc_trim(0);
1891 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1892
1893 if (r > 0)
1894 log_debug("Successfully trimmed some memory.");
1895 else
1896 log_debug("Couldn't trim any memory.");
1897
1898 usec_t period = after_timestamp - before_timestamp;
1899
1900#if HAVE_GENERIC_MALLINFO
1901 generic_mallinfo after_mallinfo = generic_mallinfo_get();
1902 size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1903 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1904 log_struct(LOG_DEBUG,
1905 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1906 FORMAT_TIMESPAN(period, 0),
1907 FORMAT_BYTES(l)),
1908 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1909 "TRIMMED_BYTES=%zu", l,
1910 "TRIMMED_USEC=" USEC_FMT, period);
1911#else
1912 log_struct(LOG_DEBUG,
1913 LOG_MESSAGE("Memory trimming took %s.",
1914 FORMAT_TIMESPAN(period, 0)),
1915 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1916 "TRIMMED_USEC=" USEC_FMT, period);
1917#endif
1918
1919 return 0;
1920}
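
/* Illustrative sketch (disabled): wiring up the default behaviour documented above; with
 * a NULL callback the loop invokes sd_event_trim_memory() whenever pressure is signalled.
 * Treating -EOPNOTSUPP/-EHOSTDOWN as non-fatal is an assumption based on the error codes
 * used by sd_event_add_memory_pressure() below. */
#if 0
static int enable_memory_trimming(sd_event *e) {
        int r;

        r = sd_event_add_memory_pressure(e, NULL, NULL, NULL);
        if (r < 0 && !IN_SET(r, -EOPNOTSUPP, -EHOSTDOWN)) /* tolerate missing PSI or explicit disabling */
                return r;

        return 0;
}
#endif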
1921
1922static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1923 assert(s);
1924
1925 sd_event_trim_memory();
1926 return 0;
1927}
1928
1929_public_ int sd_event_add_memory_pressure(
1930 sd_event *e,
1931 sd_event_source **ret,
1932 sd_event_handler_t callback,
1933 void *userdata) {
1934
1935 _cleanup_free_ char *w = NULL;
1936 _cleanup_(source_freep) sd_event_source *s = NULL;
92651a7a 1937 _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
158fe190 1938 _cleanup_free_ void *write_buffer = NULL;
40c5d5d2 1939 const char *watch, *watch_fallback = NULL, *env;
1940 size_t write_buffer_size = 0;
1941 struct stat st;
1942 uint32_t events;
1943 bool locked;
1944 int r;
1945
1946 assert_return(e, -EINVAL);
1947 assert_return(e = event_resolve(e), -ENOPKG);
1948 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 1949 assert_return(!event_origin_changed(e), -ECHILD);
1950
1951 if (!callback)
1952 callback = memory_pressure_callback;
1953
1954 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1955 if (!s)
1956 return -ENOMEM;
1957
1958 s->wakeup = WAKEUP_EVENT_SOURCE;
1959 s->memory_pressure.callback = callback;
1960 s->userdata = userdata;
1961 s->enabled = SD_EVENT_ON;
1962 s->memory_pressure.fd = -EBADF;
1963
1964 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1965 if (env) {
1966 if (isempty(env) || path_equal(env, "/dev/null"))
1967 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1968 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1969
1970 if (!path_is_absolute(env) || !path_is_normalized(env))
1971 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1972 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1973
1974 watch = env;
1975
1976 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1977 if (env) {
1978 r = unbase64mem(env, SIZE_MAX, &write_buffer, &write_buffer_size);
1979 if (r < 0)
1980 return r;
1981 }
1982
1983 locked = true;
1984 } else {
1985
1986 r = is_pressure_supported();
1987 if (r < 0)
1988 return r;
1989 if (r == 0)
1990 return -EOPNOTSUPP;
1991
1992 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
 1993                  * the system-wide pressure if for some reason we cannot (which could be: memory controller
1994 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1995 * only use the system-wide logic. */
1996 r = cg_all_unified();
1997 if (r < 0)
1998 return r;
1999 if (r == 0)
2000 watch = "/proc/pressure/memory";
2001 else {
2002 _cleanup_free_ char *cg = NULL;
2003
2004 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
2005 if (r < 0)
2006 return r;
2007
2008 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
2009 if (!w)
2010 return -ENOMEM;
2011
2012 watch = w;
2013 watch_fallback = "/proc/pressure/memory";
2014 }
2015
2016 /* Android uses three levels in its userspace low memory killer logic:
2017 * some 70000 1000000
2018 * some 100000 1000000
2019 * full 70000 1000000
2020 *
2021 * GNOME's low memory monitor uses:
2022 * some 70000 1000000
2023 * some 100000 1000000
2024 * full 100000 1000000
2025 *
 2026                  * We'll default to the middle level that both agree on, except on a 2s window
 2027                  * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration
 2028                  * the kernel allows for unprivileged use, now and going forward. */
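                /* With the default constants the buffer built below comes out as the PSI
                 * trigger string "some 200000 2000000", i.e. 200ms of total stall time
                 * within a 2s window (assuming MEMORY_PRESSURE_DEFAULT_* carry the values
                 * the comment above describes). */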
2029 if (asprintf((char**) &write_buffer,
2030 "%s " USEC_FMT " " USEC_FMT,
2031 MEMORY_PRESSURE_DEFAULT_TYPE,
2032 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2033 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2034 return -ENOMEM;
2035
2036 write_buffer_size = strlen(write_buffer) + 1;
2037 locked = false;
2038 }
2039
2040 path_fd = open(watch, O_PATH|O_CLOEXEC);
2041 if (path_fd < 0) {
2042 if (errno != ENOENT)
2043 return -errno;
2044
2045 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2046 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2047 * the PSI service apparently is not supported) */
2048 if (!watch_fallback)
2049 return locked ? -ENOENT : -EOPNOTSUPP;
2050
2051 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
2052 if (path_fd < 0) {
2053 if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2054 return -EOPNOTSUPP;
158fe190 2055 return -errno;
63b1e67e 2056 }
2057 }
2058
2059 if (fstat(path_fd, &st) < 0)
2060 return -errno;
2061
2062 if (S_ISSOCK(st.st_mode)) {
2063 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2064 if (fd < 0)
2065 return -errno;
2066
2067 r = connect_unix_path(fd, path_fd, NULL);
2068 if (r < 0)
2069 return r;
2070
2071 events = EPOLLIN;
2072
2073 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2074 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2075 if (fd < 0)
2076 return fd;
2077
2078 if (S_ISREG(st.st_mode)) {
2079 struct statfs sfs;
2080
2081 /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
2082
2083 if (fstatfs(fd, &sfs) < 0)
2084 return -errno;
2085
2086 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2087 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2088 return -ENOTTY;
2089
2090 events = EPOLLPRI;
2091 } else
2092 /* For fifos and char devices just watch for EPOLLIN */
2093 events = EPOLLIN;
2094
2095 } else if (S_ISDIR(st.st_mode))
2096 return -EISDIR;
2097 else
2098 return -EBADF;
2099
2100 s->memory_pressure.fd = TAKE_FD(fd);
2101 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2102 s->memory_pressure.write_buffer_size = write_buffer_size;
2103 s->memory_pressure.events = events;
2104 s->memory_pressure.locked = locked;
2105
2106 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2107 * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
 2108          * fd with the epoll right away. Instead, we just add the event source to a list of memory pressure
 2109          * event sources on which writes must be executed before the first event loop iteration is
 2110          * executed. (We could also write the data here, right away, but we want to give the caller the
 2111          * freedom to call sd_event_source_set_memory_pressure_type() and
 2112          * sd_event_source_set_memory_pressure_rate() before we write it.) */
2113
2114 if (s->memory_pressure.write_buffer_size > 0)
2115 source_memory_pressure_add_to_write_list(s);
2116 else {
2117 r = source_memory_pressure_register(s, s->enabled);
2118 if (r < 0)
2119 return r;
2120 }
2121
2122 if (ret)
2123 *ret = s;
2124 TAKE_PTR(s);
2125
2126 return 0;
2127}
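
/* Illustrative environment overrides (example values, not defaults): the watch path and
 * trigger string can be forced rather than auto-detected, the latter base64-encoded since
 * it is treated as a raw byte sequence:
 *
 *     MEMORY_PRESSURE_WATCH=/proc/pressure/memory
 *     MEMORY_PRESSURE_WRITE=$(echo -n "some 150000 1000000" | base64)
 *
 * Setting $MEMORY_PRESSURE_WATCH to the empty string or /dev/null disables the logic
 * altogether, surfacing as -EHOSTDOWN above. */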
2128
2129static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2130 assert(e);
2131
2132 if (!d)
2133 return;
2134
2135 assert(hashmap_isempty(d->inodes));
2136 assert(hashmap_isempty(d->wd));
2137
2138 if (d->buffer_filled > 0)
0601b958 2139 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
2140
2141 hashmap_free(d->inodes);
2142 hashmap_free(d->wd);
2143
2144 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2145
2146 if (d->fd >= 0) {
2eeff0f4 2147 if (!event_origin_changed(e) &&
fbae5090 2148 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
2149 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2150
2151 safe_close(d->fd);
2152 }
2153 free(d);
2154}
2155
2156static int event_make_inotify_data(
2157 sd_event *e,
2158 int64_t priority,
2159 struct inotify_data **ret) {
2160
254d1313 2161 _cleanup_close_ int fd = -EBADF;
97ef5391 2162 struct inotify_data *d;
2163 int r;
2164
2165 assert(e);
2166
2167 d = hashmap_get(e->inotify_data, &priority);
2168 if (d) {
2169 if (ret)
2170 *ret = d;
2171 return 0;
2172 }
2173
 2174         fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2175 if (fd < 0)
2176 return -errno;
2177
2178 fd = fd_move_above_stdio(fd);
2179
2180 d = new(struct inotify_data, 1);
2181 if (!d)
2182 return -ENOMEM;
2183
2184 *d = (struct inotify_data) {
2185 .wakeup = WAKEUP_INOTIFY_DATA,
2186 .fd = TAKE_FD(fd),
2187 .priority = priority,
2188 };
2189
c2484a75 2190 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
2191 if (r < 0) {
2192 d->fd = safe_close(d->fd);
2193 free(d);
2194 return r;
2195 }
2196
1eac7948 2197 struct epoll_event ev = {
2198 .events = EPOLLIN,
2199 .data.ptr = d,
2200 };
2201
2202 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2203 r = -errno;
2204 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2205 * remove the fd from the epoll first, which we don't want as we couldn't
2206 * add it in the first place. */
2207 event_free_inotify_data(e, d);
2208 return r;
2209 }
2210
2211 if (ret)
2212 *ret = d;
2213
2214 return 1;
2215}
2216
7a08d314 2217static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
90c88092 2218 int r;
2219
2220 assert(x);
2221 assert(y);
2222
2223 r = CMP(x->dev, y->dev);
2224 if (r != 0)
2225 return r;
97ef5391 2226
6dd91b36 2227 return CMP(x->ino, y->ino);
2228}
2229
2230static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2231 assert(d);
2232
2233 siphash24_compress(&d->dev, sizeof(d->dev), state);
2234 siphash24_compress(&d->ino, sizeof(d->ino), state);
2235}
2236
7a08d314 2237DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
2238
2239static void event_free_inode_data(
2240 sd_event *e,
2241 struct inode_data *d) {
2242
2243 assert(e);
2244
2245 if (!d)
2246 return;
2247
64903d18 2248 assert(!d->event_sources);
2249
2250 if (d->fd >= 0) {
ed828563 2251 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
2252 safe_close(d->fd);
2253 }
2254
2255 if (d->inotify_data) {
2256
2257 if (d->wd >= 0) {
2eeff0f4 2258 if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
2259 /* So here's a problem. At the time this runs the watch descriptor might already be
 2260                          * invalidated, because an IN_IGNORED event might be queued right at the moment we
 2261                          * enter the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's
 2262                          * quite likely to happen. */
2263
2264 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2265 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2266 }
2267
2268 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2269 }
2270
2271 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2272 }
2273
2274 free(d);
2275}
2276
2277static void event_gc_inotify_data(
2278 sd_event *e,
2279 struct inotify_data *d) {
2280
2281 assert(e);
2282
2283 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2284 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2285 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2286 * (under the expectation that the GC is called again once the counter is decremented). */
2287
2288 if (!d)
2289 return;
2290
2291 if (!hashmap_isempty(d->inodes))
2292 return;
2293
2294 if (d->n_busy > 0)
2295 return;
2296
2297 event_free_inotify_data(e, d);
2298}
2299
2300static void event_gc_inode_data(
2301 sd_event *e,
2302 struct inode_data *d) {
2303
2304 struct inotify_data *inotify_data;
2305
2306 assert(e);
2307
2308 if (!d)
2309 return;
2310
64903d18 2311 if (d->event_sources)
2312 return;
2313
2314 inotify_data = d->inotify_data;
2315 event_free_inode_data(e, d);
2316
53baf2ef 2317 event_gc_inotify_data(e, inotify_data);
2318}
2319
2320static int event_make_inode_data(
2321 sd_event *e,
2322 struct inotify_data *inotify_data,
2323 dev_t dev,
2324 ino_t ino,
2325 struct inode_data **ret) {
2326
2327 struct inode_data *d, key;
2328 int r;
2329
2330 assert(e);
2331 assert(inotify_data);
2332
2333 key = (struct inode_data) {
2334 .ino = ino,
2335 .dev = dev,
2336 };
2337
2338 d = hashmap_get(inotify_data->inodes, &key);
2339 if (d) {
2340 if (ret)
2341 *ret = d;
2342
2343 return 0;
2344 }
2345
2346 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2347 if (r < 0)
2348 return r;
2349
2350 d = new(struct inode_data, 1);
2351 if (!d)
2352 return -ENOMEM;
2353
2354 *d = (struct inode_data) {
2355 .dev = dev,
2356 .ino = ino,
2357 .wd = -1,
254d1313 2358 .fd = -EBADF,
2359 .inotify_data = inotify_data,
2360 };
2361
2362 r = hashmap_put(inotify_data->inodes, d, d);
2363 if (r < 0) {
2364 free(d);
2365 return r;
2366 }
2367
2368 if (ret)
2369 *ret = d;
2370
2371 return 1;
2372}
2373
2374static uint32_t inode_data_determine_mask(struct inode_data *d) {
2375 bool excl_unlink = true;
2376 uint32_t combined = 0;
2377
2378 assert(d);
2379
2380 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2381 * the IN_EXCL_UNLINK flag is ANDed instead.
2382 *
 2383          * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or oneshot. That's
2384 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
f21f31b2 2385 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
2386 * events we don't care for client-side. */
2387
2388 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2389
2390 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2391 excl_unlink = false;
2392
2393 combined |= s->inotify.mask;
2394 }
2395
2396 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2397}
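
/* Worked example for the combination rule above: one source watching with
 * IN_CLOSE_WRITE|IN_EXCL_UNLINK plus another watching with IN_MOVED_TO yields a kernel
 * watch of IN_CLOSE_WRITE|IN_MOVED_TO; IN_EXCL_UNLINK is dropped because it is ANDed
 * and only one of the two sources requested it. */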
2398
2399static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2400 uint32_t combined_mask;
2401 int wd, r;
2402
2403 assert(d);
2404 assert(d->fd >= 0);
2405
2406 combined_mask = inode_data_determine_mask(d);
2407
2408 if (d->wd >= 0 && combined_mask == d->combined_mask)
2409 return 0;
2410
2411 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2412 if (r < 0)
2413 return r;
2414
2415 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2416 if (wd < 0)
2417 return -errno;
2418
2419 if (d->wd < 0) {
2420 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2421 if (r < 0) {
2422 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2423 return r;
2424 }
2425
2426 d->wd = wd;
2427
2428 } else if (d->wd != wd) {
2429
2430 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
 2431                 (void) inotify_rm_watch(d->inotify_data->fd, wd); /* inotify_rm_watch() expects the inotify instance fd */
2432 return -EINVAL;
2433 }
2434
2435 d->combined_mask = combined_mask;
2436 return 1;
2437}
2438
2439static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2440 assert(s);
2441
2442 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2443}
2444
e67d738a 2445static int event_add_inotify_fd_internal(
2446 sd_event *e,
2447 sd_event_source **ret,
2448 int fd,
2449 bool donate,
2450 uint32_t mask,
2451 sd_event_inotify_handler_t callback,
2452 void *userdata) {
2453
5bb1d7fb 2454 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
e67d738a 2455 _cleanup_(source_freep) sd_event_source *s = NULL;
2456 struct inotify_data *inotify_data = NULL;
2457 struct inode_data *inode_data = NULL;
2458 struct stat st;
2459 int r;
2460
2461 assert_return(e, -EINVAL);
2462 assert_return(e = event_resolve(e), -ENOPKG);
e67d738a 2463 assert_return(fd >= 0, -EBADF);
97ef5391 2464 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 2465 assert_return(!event_origin_changed(e), -ECHILD);
97ef5391 2466
2467 if (!callback)
2468 callback = inotify_exit_callback;
2469
2470 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2471 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
 2472          * callers can't use it themselves. */
2473 if (mask & IN_MASK_ADD)
2474 return -EINVAL;
2475
2476 if (fstat(fd, &st) < 0)
2477 return -errno;
2478
2479 s = source_new(e, !ret, SOURCE_INOTIFY);
2480 if (!s)
2481 return -ENOMEM;
2482
2483 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2484 s->inotify.mask = mask;
2485 s->inotify.callback = callback;
2486 s->userdata = userdata;
2487
2488 /* Allocate an inotify object for this priority, and an inode object within it */
2489 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2490 if (r < 0)
8c75fe17 2491 return r;
2492
2493 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
8c75fe17 2494 if (r < 0) {
e67d738a 2495 event_gc_inotify_data(e, inotify_data);
2496 return r;
2497 }
2498
 2499         /* Keep the O_PATH fd around until the first iteration of the loop, so that the priority of the
 2500          * event source can still be changed until then; for that we need the original inode fd. */
2501 if (inode_data->fd < 0) {
2502 if (donated_fd >= 0)
2503 inode_data->fd = TAKE_FD(donated_fd);
2504 else {
2505 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2506 if (inode_data->fd < 0) {
2507 r = -errno;
2508 event_gc_inode_data(e, inode_data);
2509 return r;
2510 }
2511 }
2512
ed828563 2513 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
2514 }
2515
2516 /* Link our event source to the inode data object */
2517 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2518 s->inotify.inode_data = inode_data;
2519
2520 /* Actually realize the watch now */
2521 r = inode_data_realize_watch(e, inode_data);
2522 if (r < 0)
8c75fe17 2523 return r;
97ef5391 2524
2525 if (ret)
2526 *ret = s;
8c75fe17 2527 TAKE_PTR(s);
2528
2529 return 0;
2530}
2531
2532_public_ int sd_event_add_inotify_fd(
2533 sd_event *e,
2534 sd_event_source **ret,
2535 int fd,
2536 uint32_t mask,
2537 sd_event_inotify_handler_t callback,
2538 void *userdata) {
2539
2540 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2541}
2542
2543_public_ int sd_event_add_inotify(
2544 sd_event *e,
2545 sd_event_source **ret,
2546 const char *path,
2547 uint32_t mask,
2548 sd_event_inotify_handler_t callback,
2549 void *userdata) {
2550
2091c779 2551 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2552 int fd, r;
2553
2554 assert_return(path, -EINVAL);
2555
2556 fd = open(path, O_PATH | O_CLOEXEC |
2557 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2558 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2559 if (fd < 0)
2560 return -errno;
2561
2562 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2563 if (r < 0)
2564 return r;
2565
2566 (void) sd_event_source_set_description(s, path);
2567
2568 if (ret)
2569 *ret = s;
2570
2571 return r;
2572}
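
/* Illustrative sketch (disabled): watching a file for completed writes. The handler name
 * and path are made up. */
#if 0
static int on_modified(sd_event_source *s, const struct inotify_event *event, void *userdata) {
        log_debug("File was closed after writing.");
        return 0;
}

static int watch_hosts(sd_event *e) {
        return sd_event_add_inotify(e, NULL, "/etc/hosts", IN_CLOSE_WRITE, on_modified, NULL);
}
#endif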
2573
8301aa0b 2574static sd_event_source* event_source_free(sd_event_source *s) {
2575 if (!s)
2576 return NULL;
da7e457c 2577
2578 /* Here's a special hack: when we are called from a
2579 * dispatch handler we won't free the event source
2580 * immediately, but we will detach the fd from the
2581 * epoll. This way it is safe for the caller to unref
2582 * the event source and immediately close the fd, but
2583 * we still retain a valid event source object after
2584 * the callback. */
fd38203a 2585
76d04c3a 2586 if (s->dispatching)
8301aa0b 2587 source_disconnect(s);
76d04c3a 2588 else
8301aa0b 2589 source_free(s);
2590
2591 return NULL;
2592}
2593
2594DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2595
356779df 2596_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
f7f53e9e 2597 assert_return(s, -EINVAL);
2eeff0f4 2598 assert_return(!event_origin_changed(s->event), -ECHILD);
f7f53e9e 2599
356779df 2600 return free_and_strdup(&s->description, description);
2601}
2602
356779df 2603_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
f7f53e9e 2604 assert_return(s, -EINVAL);
356779df 2605 assert_return(description, -EINVAL);
f7f53e9e 2606
2607 if (!s->description)
2608 return -ENXIO;
2609
356779df 2610 *description = s->description;
2611 return 0;
2612}
2613
adcc4ca3 2614_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
305f78bf 2615 assert_return(s, NULL);
2eeff0f4 2616 assert_return(!event_origin_changed(s->event), NULL);
2617
2618 return s->event;
2619}
2620
f7262a9f 2621_public_ int sd_event_source_get_pending(sd_event_source *s) {
305f78bf 2622 assert_return(s, -EINVAL);
6203e07a 2623 assert_return(s->type != SOURCE_EXIT, -EDOM);
da7e457c 2624 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 2625 assert_return(!event_origin_changed(s->event), -ECHILD);
2626
2627 return s->pending;
2628}
2629
f7262a9f 2630_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2631 assert_return(s, -EINVAL);
2632 assert_return(s->type == SOURCE_IO, -EDOM);
2eeff0f4 2633 assert_return(!event_origin_changed(s->event), -ECHILD);
2634
2635 return s->io.fd;
2636}
2637
2638_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2639 int r;
2640
2641 assert_return(s, -EINVAL);
8ac43fee 2642 assert_return(fd >= 0, -EBADF);
30caf8f3 2643 assert_return(s->type == SOURCE_IO, -EDOM);
2eeff0f4 2644 assert_return(!event_origin_changed(s->event), -ECHILD);
2645
2646 if (s->io.fd == fd)
2647 return 0;
2648
b6d5481b 2649 if (event_source_is_offline(s)) {
2650 s->io.fd = fd;
2651 s->io.registered = false;
2652 } else {
2653 int saved_fd;
2654
2655 saved_fd = s->io.fd;
2656 assert(s->io.registered);
2657
2658 s->io.fd = fd;
2659 s->io.registered = false;
2660
2661 r = source_io_register(s, s->enabled, s->io.events);
2662 if (r < 0) {
2663 s->io.fd = saved_fd;
2664 s->io.registered = true;
2665 return r;
2666 }
2667
5a795bff 2668 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2669 }
2670
2671 return 0;
2672}
2673
2674_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2675 assert_return(s, -EINVAL);
2676 assert_return(s->type == SOURCE_IO, -EDOM);
2eeff0f4 2677 assert_return(!event_origin_changed(s->event), -ECHILD);
2678
2679 return s->io.owned;
2680}
2681
2682_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2683 assert_return(s, -EINVAL);
2684 assert_return(s->type == SOURCE_IO, -EDOM);
2eeff0f4 2685 assert_return(!event_origin_changed(s->event), -ECHILD);
2686
2687 s->io.owned = own;
2688 return 0;
2689}
2690
f7262a9f 2691_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2692 assert_return(s, -EINVAL);
2693 assert_return(events, -EINVAL);
2694 assert_return(s->type == SOURCE_IO, -EDOM);
2eeff0f4 2695 assert_return(!event_origin_changed(s->event), -ECHILD);
2696
2697 *events = s->io.events;
2698 return 0;
2699}
2700
f7262a9f 2701_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2702 int r;
2703
2704 assert_return(s, -EINVAL);
2705 assert_return(s->type == SOURCE_IO, -EDOM);
2a16a986 2706 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
da7e457c 2707 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 2708 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 2709
2710 /* edge-triggered updates are never skipped, so we can reset edges */
2711 if (s->io.events == events && !(events & EPOLLET))
2712 return 0;
2713
2714 r = source_set_pending(s, false);
2715 if (r < 0)
2716 return r;
2717
b6d5481b 2718 if (event_source_is_online(s)) {
e4715127 2719 r = source_io_register(s, s->enabled, events);
2720 if (r < 0)
2721 return r;
2722 }
2723
2724 s->io.events = events;
2725
2726 return 0;
2727}
2728
f7262a9f 2729_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2730 assert_return(s, -EINVAL);
2731 assert_return(revents, -EINVAL);
2732 assert_return(s->type == SOURCE_IO, -EDOM);
2733 assert_return(s->pending, -ENODATA);
2eeff0f4 2734 assert_return(!event_origin_changed(s->event), -ECHILD);
2735
2736 *revents = s->io.revents;
2737 return 0;
2738}
2739
f7262a9f 2740_public_ int sd_event_source_get_signal(sd_event_source *s) {
2741 assert_return(s, -EINVAL);
2742 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2eeff0f4 2743 assert_return(!event_origin_changed(s->event), -ECHILD);
2744
2745 return s->signal.sig;
2746}
2747
31927c16 2748_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
305f78bf 2749 assert_return(s, -EINVAL);
2eeff0f4 2750 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 2751
2752 *priority = s->priority;
2753 return 0;
2754}
2755
31927c16 2756_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2757 bool rm_inotify = false, rm_inode = false;
2758 struct inotify_data *new_inotify_data = NULL;
2759 struct inode_data *new_inode_data = NULL;
2760 int r;
2761
305f78bf 2762 assert_return(s, -EINVAL);
da7e457c 2763 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 2764 assert_return(!event_origin_changed(s->event), -ECHILD);
2765
2766 if (s->priority == priority)
2767 return 0;
2768
2769 if (s->type == SOURCE_INOTIFY) {
2770 struct inode_data *old_inode_data;
2771
2772 assert(s->inotify.inode_data);
2773 old_inode_data = s->inotify.inode_data;
2774
 2775                 /* We need the original fd to change the priority. If we don't have it we can't change the priority
2776 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2777 * events we allow priority changes only until the first following iteration. */
2778 if (old_inode_data->fd < 0)
2779 return -EOPNOTSUPP;
2780
2781 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2782 if (r < 0)
2783 return r;
2784 rm_inotify = r > 0;
2785
2786 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2787 if (r < 0)
2788 goto fail;
2789 rm_inode = r > 0;
2790
2791 if (new_inode_data->fd < 0) {
2792 /* Duplicate the fd for the new inode object if we don't have any yet */
2793 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2794 if (new_inode_data->fd < 0) {
2795 r = -errno;
2796 goto fail;
2797 }
2798
ed828563 2799 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
2800 }
2801
2802 /* Move the event source to the new inode data structure */
2803 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2804 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2805 s->inotify.inode_data = new_inode_data;
2806
2807 /* Now create the new watch */
2808 r = inode_data_realize_watch(s->event, new_inode_data);
2809 if (r < 0) {
2810 /* Move it back */
2811 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2812 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2813 s->inotify.inode_data = old_inode_data;
2814 goto fail;
2815 }
2816
2817 s->priority = priority;
2818
2819 event_gc_inode_data(s->event, old_inode_data);
2820
b6d5481b 2821 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2822 struct signal_data *old, *d;
2823
2824 /* Move us from the signalfd belonging to the old
2825 * priority to the signalfd of the new priority */
2826
2827 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2828
2829 s->priority = priority;
2830
2831 r = event_make_signal_data(s->event, s->signal.sig, &d);
2832 if (r < 0) {
2833 s->priority = old->priority;
2834 return r;
2835 }
2836
2837 event_unmask_signal_data(s->event, old, s->signal.sig);
2838 } else
2839 s->priority = priority;
fd38203a 2840
e1951c16 2841 event_source_pp_prioq_reshuffle(s);
fd38203a 2842
2843 if (s->type == SOURCE_EXIT)
2844 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
305f78bf 2845
fd38203a 2846 return 0;
2847
2848fail:
2849 if (rm_inode)
2850 event_free_inode_data(s->event, new_inode_data);
2851
2852 if (rm_inotify)
2853 event_free_inotify_data(s->event, new_inotify_data);
2854
2855 return r;
2856}
2857
cad143a8 2858_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2859 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2860 if (!s && !ret)
2861 return false;
2862
305f78bf 2863 assert_return(s, -EINVAL);
2eeff0f4 2864 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 2865
2866 if (ret)
2867 *ret = s->enabled;
2868
08c1eb0e 2869 return s->enabled != SD_EVENT_OFF;
2870}
2871
2872static int event_source_offline(
2873 sd_event_source *s,
2874 int enabled,
2875 bool ratelimited) {
2876
2877 bool was_offline;
fd38203a
LP
2878 int r;
2879
ddfde737 2880 assert(s);
b6d5481b 2881 assert(enabled == SD_EVENT_OFF || ratelimited);
fd38203a 2882
ddfde737 2883 /* Unset the pending flag when this event source is disabled */
2884 if (s->enabled != SD_EVENT_OFF &&
2885 enabled == SD_EVENT_OFF &&
2886 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2887 r = source_set_pending(s, false);
2888 if (r < 0)
2889 return r;
2890 }
cc567911 2891
2892 was_offline = event_source_is_offline(s);
2893 s->enabled = enabled;
2894 s->ratelimited = ratelimited;
fd38203a 2895
ddfde737
LP
2898 case SOURCE_IO:
2899 source_io_unregister(s);
2900 break;
ac989a78 2901
2902 case SOURCE_SIGNAL:
2903 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2904 break;
fd38203a 2905
LP
2907 if (!was_offline) {
2908 assert(s->event->n_online_child_sources > 0);
2909 s->event->n_online_child_sources--;
2910 }
fd38203a 2911
2912 if (EVENT_SOURCE_WATCH_PIDFD(s))
2913 source_child_pidfd_unregister(s);
2914 else
2915 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2916 break;
4807d2d0 2917
2918 case SOURCE_EXIT:
2919 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2920 break;
fd38203a 2921
2922 case SOURCE_MEMORY_PRESSURE:
2923 source_memory_pressure_unregister(s);
2924 break;
2925
2926 case SOURCE_TIME_REALTIME:
2927 case SOURCE_TIME_BOOTTIME:
2928 case SOURCE_TIME_MONOTONIC:
2929 case SOURCE_TIME_REALTIME_ALARM:
2930 case SOURCE_TIME_BOOTTIME_ALARM:
2931 case SOURCE_DEFER:
2932 case SOURCE_POST:
2933 case SOURCE_INOTIFY:
2934 break;
fd38203a 2935
ddfde737 2936 default:
04499a70 2937 assert_not_reached();
ddfde737 2938 }
fd38203a 2939
2940 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2941 event_source_time_prioq_reshuffle(s);
2942
b6d5481b 2943 return 1;
ddfde737 2944}
f8f3f926 2945
2946static int event_source_online(
2947 sd_event_source *s,
2948 int enabled,
2949 bool ratelimited) {
2950
2951 bool was_online;
ddfde737 2952 int r;
fd38203a 2953
ddfde737 2954 assert(s);
b6d5481b 2955 assert(enabled != SD_EVENT_OFF || !ratelimited);
305f78bf 2956
ddfde737 2957 /* Unset the pending flag when this event source is enabled */
b6d5481b
LP
2958 if (s->enabled == SD_EVENT_OFF &&
2959 enabled != SD_EVENT_OFF &&
2960 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2961 r = source_set_pending(s, false);
2962 if (r < 0)
2963 return r;
2964 }
9d3e3aa5 2965
2966 /* Are we really ready for onlining? */
2967 if (enabled == SD_EVENT_OFF || ratelimited) {
2968 /* Nope, we are not ready for onlining, then just update the precise state and exit */
2969 s->enabled = enabled;
2970 s->ratelimited = ratelimited;
2971 return 0;
2972 }
2973
2974 was_online = event_source_is_online(s);
2975
ddfde737 2976 switch (s->type) {
ddfde737 2977 case SOURCE_IO:
b6d5481b 2978 r = source_io_register(s, enabled, s->io.events);
d2eafe61 2979 if (r < 0)
ddfde737 2980 return r;
ddfde737 2981 break;
fd38203a 2982
2983 case SOURCE_SIGNAL:
2984 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2985 if (r < 0) {
2986 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2987 return r;
2988 }
fd38203a 2989
ddfde737 2990 break;
fd38203a 2991
ddfde737 2992 case SOURCE_CHILD:
2993 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2994 /* yes, we have pidfd */
9da4cb2b 2995
b6d5481b 2996 r = source_child_pidfd_register(s, enabled);
ac9f2640 2997 if (r < 0)
9da4cb2b 2998 return r;
2999 } else {
 3000                         /* no pidfd, or something other than WEXITED to watch for */
9da4cb2b 3001
3002 r = event_make_signal_data(s->event, SIGCHLD, NULL);
3003 if (r < 0) {
3004 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
3005 return r;
3006 }
3007 }
fd38203a 3008
3009 if (!was_online)
3010 s->event->n_online_child_sources++;
ddfde737 3011 break;
4807d2d0 3012
3013 case SOURCE_MEMORY_PRESSURE:
3014 r = source_memory_pressure_register(s, enabled);
3015 if (r < 0)
3016 return r;
3017
3018 break;
3019
3020 case SOURCE_TIME_REALTIME:
3021 case SOURCE_TIME_BOOTTIME:
3022 case SOURCE_TIME_MONOTONIC:
3023 case SOURCE_TIME_REALTIME_ALARM:
3024 case SOURCE_TIME_BOOTTIME_ALARM:
ddfde737 3025 case SOURCE_EXIT:
3026 case SOURCE_DEFER:
3027 case SOURCE_POST:
3028 case SOURCE_INOTIFY:
3029 break;
9da4cb2b 3030
ddfde737 3031 default:
04499a70 3032 assert_not_reached();
ddfde737 3033 }
f8f3f926 3034
3035 s->enabled = enabled;
3036 s->ratelimited = ratelimited;
3037
3038 /* Non-failing operations below */
2115b9b6 3039 if (s->type == SOURCE_EXIT)
d2eafe61 3040 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
d2eafe61 3041
3042 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3043 event_source_time_prioq_reshuffle(s);
d2eafe61 3044
b6d5481b 3045 return 1;
3046}
3047
3048_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3049 int r;
9da4cb2b 3050
ddfde737 3051 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
3052
3053 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3054 if (m == SD_EVENT_OFF && !s)
3055 return 0;
3056
3057 assert_return(s, -EINVAL);
2eeff0f4 3058 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 3059
3060 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3061 if (s->event->state == SD_EVENT_FINISHED)
3062 return m == SD_EVENT_OFF ? 0 : -ESTALE;
305f78bf 3063
3064 if (s->enabled == m) /* No change? */
3065 return 0;
9d3e3aa5 3066
ddfde737 3067 if (m == SD_EVENT_OFF)
b6d5481b 3068 r = event_source_offline(s, m, s->ratelimited);
3069 else {
3070 if (s->enabled != SD_EVENT_OFF) {
 3071                 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut: the
3072 * event source is already enabled after all. */
3073 s->enabled = m;
3074 return 0;
fd38203a 3075 }
ddfde737 3076
b6d5481b 3077 r = event_source_online(s, m, s->ratelimited);
fd38203a 3078 }
3079 if (r < 0)
3080 return r;
fd38203a 3081
e1951c16 3082 event_source_pp_prioq_reshuffle(s);
3083 return 0;
3084}
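
/* Note: SD_EVENT_ONESHOT sources flip themselves to SD_EVENT_OFF after a single dispatch;
 * calling sd_event_source_set_enabled(s, SD_EVENT_ONESHOT) again, typically from the
 * handler, is the idiomatic way to re-arm them. */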
3085
f7262a9f 3086_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
3087 assert_return(s, -EINVAL);
3088 assert_return(usec, -EINVAL);
6a0f1f6d 3089 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2eeff0f4 3090 assert_return(!event_origin_changed(s->event), -ECHILD);
3091
3092 *usec = s->time.next;
3093 return 0;
3094}
3095
f7262a9f 3096_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
2a0dc6cd 3097 int r;
6a0f1f6d 3098
305f78bf 3099 assert_return(s, -EINVAL);
6a0f1f6d 3100 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 3101 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 3102 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 3103
3104 r = source_set_pending(s, false);
3105 if (r < 0)
3106 return r;
2576a19e 3107
2a0dc6cd 3108 s->time.next = usec;
fd38203a 3109
e1951c16 3110 event_source_time_prioq_reshuffle(s);
3111 return 0;
3112}
3113
3114_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3115 usec_t t;
3116 int r;
3117
3118 assert_return(s, -EINVAL);
3119 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2eeff0f4 3120 assert_return(!event_origin_changed(s->event), -ECHILD);
d6a83dc4 3121
3122 if (usec == USEC_INFINITY)
3123 return sd_event_source_set_time(s, USEC_INFINITY);
3124
3125 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3126 if (r < 0)
3127 return r;
3128
3129 usec = usec_add(t, usec);
3130 if (usec == USEC_INFINITY)
3131 return -EOVERFLOW;
3132
496db330 3133 return sd_event_source_set_time(s, usec);
3134}
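
/* Illustrative sketch (disabled): re-arm an existing timer source to fire five seconds
 * from now on its own clock, then enable it for a single dispatch. */
#if 0
static int rearm_in_five_seconds(sd_event_source *s) {
        int r;

        r = sd_event_source_set_time_relative(s, 5 * USEC_PER_SEC);
        if (r < 0)
                return r;

        return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
}
#endif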
3135
f7262a9f 3136_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
3137 assert_return(s, -EINVAL);
3138 assert_return(usec, -EINVAL);
6a0f1f6d 3139 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2eeff0f4 3140 assert_return(!event_origin_changed(s->event), -ECHILD);
3141
3142 *usec = s->time.accuracy;
3143 return 0;
3144}
3145
f7262a9f 3146_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
2a0dc6cd 3147 int r;
6a0f1f6d 3148
305f78bf 3149 assert_return(s, -EINVAL);
f5fbe71d 3150 assert_return(usec != UINT64_MAX, -EINVAL);
6a0f1f6d 3151 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 3152 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 3153 assert_return(!event_origin_changed(s->event), -ECHILD);
eaa3cbef 3154
3155 r = source_set_pending(s, false);
3156 if (r < 0)
3157 return r;
3158
3159 if (usec == 0)
3160 usec = DEFAULT_ACCURACY_USEC;
3161
3162 s->time.accuracy = usec;
3163
e1951c16 3164 event_source_time_prioq_reshuffle(s);
3165 return 0;
3166}
3167
3168_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3169 assert_return(s, -EINVAL);
3170 assert_return(clock, -EINVAL);
3171 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2eeff0f4 3172 assert_return(!event_origin_changed(s->event), -ECHILD);
eaa3cbef 3173
6a0f1f6d 3174 *clock = event_source_type_to_clock(s->type);
3175 return 0;
3176}
3177
f7262a9f 3178_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
3179 assert_return(s, -EINVAL);
3180 assert_return(pid, -EINVAL);
3181 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3182 assert_return(!event_origin_changed(s->event), -ECHILD);
3183
3184 *pid = s->child.pid;
3185 return 0;
3186}
3187
3188_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3189 assert_return(s, -EINVAL);
3190 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3191 assert_return(!event_origin_changed(s->event), -ECHILD);
3192
3193 if (s->child.pidfd < 0)
3194 return -EOPNOTSUPP;
3195
3196 return s->child.pidfd;
3197}
3198
3199_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3200 assert_return(s, -EINVAL);
3201 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3202 assert_return(!event_origin_changed(s->event), -ECHILD);
3203 assert_return(SIGNAL_VALID(sig), -EINVAL);
3204
3205 /* If we already have seen indication the process exited refuse sending a signal early. This way we
3206 * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3207 * available. */
3208 if (s->child.exited)
3209 return -ESRCH;
3210
3211 if (s->child.pidfd >= 0) {
3212 siginfo_t copy;
3213
 3214                 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, hence let's copy the
 3215                  * structure here. */
3216 if (si)
3217 copy = *si;
3218
3219 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3220 /* Let's propagate the error only if the system call is not implemented or prohibited */
3221 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3222 return -errno;
3223 } else
3224 return 0;
3225 }
3226
3227 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3228 * this here. */
3229 if (flags != 0)
3230 return -EOPNOTSUPP;
3231
3232 if (si) {
3233 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3234 siginfo_t copy = *si;
3235
3236 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3237 return -errno;
3238 } else if (kill(s->child.pid, sig) < 0)
3239 return -errno;
3240
3241 return 0;
3242}
3243
3244_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3245 assert_return(s, -EINVAL);
3246 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3247 assert_return(!event_origin_changed(s->event), -ECHILD);
3248
3249 if (s->child.pidfd < 0)
3250 return -EOPNOTSUPP;
3251
3252 return s->child.pidfd_owned;
3253}
3254
3255_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3256 assert_return(s, -EINVAL);
3257 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3258 assert_return(!event_origin_changed(s->event), -ECHILD);
3259
3260 if (s->child.pidfd < 0)
3261 return -EOPNOTSUPP;
3262
3263 s->child.pidfd_owned = own;
3264 return 0;
3265}
3266
3267_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3268 assert_return(s, -EINVAL);
3269 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3270 assert_return(!event_origin_changed(s->event), -ECHILD);
3271
3272 return s->child.process_owned;
3273}
3274
3275_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3276 assert_return(s, -EINVAL);
3277 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3278 assert_return(!event_origin_changed(s->event), -ECHILD);
3279
3280 s->child.process_owned = own;
3281 return 0;
3282}
3283
3284_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
3285 assert_return(s, -EINVAL);
3286 assert_return(mask, -EINVAL);
3287 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
2eeff0f4 3288 assert_return(!event_origin_changed(s->event), -ECHILD);
3289
3290 *mask = s->inotify.mask;
3291 return 0;
3292}
3293
718db961 3294_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
3295 int r;
3296
da7e457c 3297 assert_return(s, -EINVAL);
6203e07a 3298 assert_return(s->type != SOURCE_EXIT, -EDOM);
da7e457c 3299 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 3300 assert_return(!event_origin_changed(s->event), -ECHILD);
3301
3302 if (s->prepare == callback)
3303 return 0;
3304
3305 if (callback && s->prepare) {
3306 s->prepare = callback;
3307 return 0;
3308 }
3309
3310 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3311 if (r < 0)
3312 return r;
3313
3314 s->prepare = callback;
3315
3316 if (callback) {
3317 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3318 if (r < 0)
3319 return r;
3320 } else
3321 prioq_remove(s->event->prepare, s, &s->prepare_index);
3322
3323 return 0;
3324}
3325
f7262a9f 3326_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
da7e457c 3327 assert_return(s, NULL);
2eeff0f4 3328 assert_return(!event_origin_changed(s->event), NULL);
3329
3330 return s->userdata;
3331}
3332
3333_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3334 void *ret;
3335
3336 assert_return(s, NULL);
2eeff0f4 3337 assert_return(!event_origin_changed(s->event), NULL);
3338
3339 ret = s->userdata;
3340 s->userdata = userdata;
3341
3342 return ret;
3343}
3344
3345static int event_source_enter_ratelimited(sd_event_source *s) {
3346 int r;
3347
3348 assert(s);
3349
3350 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
3351 * the end of the rate limit time window, much as if it was a timer event source. */
3352
3353 if (s->ratelimited)
3354 return 0; /* Already ratelimited, this is a NOP hence */
3355
3356 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3357 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3358 if (r < 0)
3359 return r;
3360
3361 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
3362 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
3363 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
3364 if (EVENT_SOURCE_IS_TIME(s->type))
3365 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3366
3367 /* Now, let's add the event source to the monotonic clock instead */
3368 r = event_source_time_prioq_put(s, &s->event->monotonic);
3369 if (r < 0)
3370 goto fail;
3371
3372 /* And let's take the event source officially offline */
3373 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3374 if (r < 0) {
3375 event_source_time_prioq_remove(s, &s->event->monotonic);
3376 goto fail;
3377 }
3378
3379 event_source_pp_prioq_reshuffle(s);
3380
3381 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3382 return 0;
3383
3384fail:
3385 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
3386 * space for it should already be allocated. */
3387 if (EVENT_SOURCE_IS_TIME(s->type))
3388 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3389
3390 return r;
3391}
3392
fd69f224 3393static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
3394 int r;
3395
3396 assert(s);
3397
3398 if (!s->ratelimited)
3399 return 0;
3400
3401 /* Let's take the event source out of the monotonic prioq first. */
3402 event_source_time_prioq_remove(s, &s->event->monotonic);
3403
3404 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3405 if (EVENT_SOURCE_IS_TIME(s->type)) {
3406 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3407 if (r < 0)
3408 goto fail;
3409 }
3410
3411 /* Let's try to take it online again. */
3412 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3413 if (r < 0) {
3414 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3415 if (EVENT_SOURCE_IS_TIME(s->type))
3416 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3417
3418 goto fail;
3419 }
3420
3421 event_source_pp_prioq_reshuffle(s);
3422 ratelimit_reset(&s->rate_limit);
3423
3424 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3425
3426 if (run_callback && s->ratelimit_expire_callback) {
3427 s->dispatching = true;
3428 r = s->ratelimit_expire_callback(s, s->userdata);
3429 s->dispatching = false;
3430
3431 if (r < 0) {
3432 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3433 strna(s->description),
3434 event_source_type_to_string(s->type),
3435 s->exit_on_failure ? "exiting" : "disabling");
3436
3437 if (s->exit_on_failure)
3438 (void) sd_event_exit(s->event, r);
3439 }
3440
3441 if (s->n_ref == 0)
3442 source_free(s);
3443 else if (r < 0)
0a040e64 3444 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3445
3446 return 1;
3447 }
3448
3449 return 0;
3450
3451fail:
 3452         /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
3453 * simply put it back in it, maybe we can then process it more successfully next iteration. */
3454 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3455
3456 return r;
3457}
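
/* Illustrative sketch (disabled): the public entry point into the rate limit machinery
 * above is sd_event_source_set_ratelimit(). Here a source is limited to at most 10
 * dispatches per 1s interval; once exceeded it is taken offline until the window ends. */
#if 0
static int limit_source(sd_event_source *s) {
        return sd_event_source_set_ratelimit(s, USEC_PER_SEC, 10);
}
#endif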
3458
3459static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3460 usec_t c;
3461 assert(e);
3462 assert(a <= b);
3463
3464 if (a <= 0)
3465 return 0;
3466 if (a >= USEC_INFINITY)
3467 return USEC_INFINITY;
3468
3469 if (b <= a + 1)
3470 return a;
3471
3472 initialize_perturb(e);
3473
3474 /*
3475 Find a good time to wake up again between times a and b. We
3476 have two goals here:
3477
3478 a) We want to wake up as seldom as possible, hence prefer
3479 later times over earlier times.
3480
3481 b) But if we have to wake up, then let's make sure to
3482 dispatch as much as possible on the entire system.
3483
3484 We implement this by waking up everywhere at the same time
850516e0 3485 within any given minute if we can, synchronised via the
c2ba3ad6 3486 perturbation value determined from the boot ID. If we can't,
3487 then we try to find the same spot in every 10s, then 1s and
3488 then 250ms step. Otherwise, we pick the last possible time
3489 to wake up.
3490 */
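        /* Worked example with made-up numbers: for a = 80s, b = 125s and e->perturb = 30s,
         * the per-minute candidate is (125s / 60s) * 60s + 30s = 150s, which is >= b, so one
         * minute is subtracted, yielding 90s; since 90s >= a we wake at the 30s mark of the
         * minute, shared by every loop that derived the same perturbation from the boot ID. */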
3491
3492 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3493 if (c >= b) {
3494 if (_unlikely_(c < USEC_PER_MINUTE))
3495 return b;
3496
3497 c -= USEC_PER_MINUTE;
3498 }
3499
3500 if (c >= a)
3501 return c;
3502
3503 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3504 if (c >= b) {
3505 if (_unlikely_(c < USEC_PER_SEC*10))
3506 return b;
3507
3508 c -= USEC_PER_SEC*10;
3509 }
3510
3511 if (c >= a)
3512 return c;
3513
3514 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3515 if (c >= b) {
3516 if (_unlikely_(c < USEC_PER_SEC))
3517 return b;
3518
3519 c -= USEC_PER_SEC;
3520 }
3521
3522 if (c >= a)
3523 return c;
3524
3525 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3526 if (c >= b) {
3527 if (_unlikely_(c < USEC_PER_MSEC*250))
3528 return b;
3529
3530 c -= USEC_PER_MSEC*250;
3531 }
3532
3533 if (c >= a)
3534 return c;
3535
3536 return b;
3537}
3538
3539static int event_arm_timer(
3540 sd_event *e,
6a0f1f6d 3541 struct clock_data *d) {
3542
3543 struct itimerspec its = {};
3544 sd_event_source *a, *b;
3545 usec_t t;
fd38203a 3546
cde93897 3547 assert(e);
6a0f1f6d 3548 assert(d);
fd38203a 3549
d06441da 3550 if (!d->needs_rearm)
212bbb17 3551 return 0;
3552
3553 d->needs_rearm = false;
212bbb17 3554
6a0f1f6d 3555 a = prioq_peek(d->earliest);
19947509 3556 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
b6d5481b 3557 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
72aedc1e 3558
6a0f1f6d 3559 if (d->fd < 0)
3560 return 0;
3561
3a43da28 3562 if (d->next == USEC_INFINITY)
3563 return 0;
3564
3565 /* disarm */
3566 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3567 return -errno;
72aedc1e 3568
3a43da28 3569 d->next = USEC_INFINITY;
fd38203a 3570 return 0;
72aedc1e 3571 }
fd38203a 3572
6a0f1f6d 3573 b = prioq_peek(d->latest);
3574 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3575 assert(b && b->enabled != SD_EVENT_OFF);
c2ba3ad6 3576
b6d5481b 3577 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
6a0f1f6d 3578 if (d->next == t)
fd38203a
LP
3579 return 0;
3580
6a0f1f6d 3581 assert_se(d->fd >= 0);
fd38203a 3582
c2ba3ad6 3583 if (t == 0) {
1751bdde 3584 /* We don't want to disarm here, just mean some time looooong ago. */
fd38203a
LP
3585 its.it_value.tv_sec = 0;
3586 its.it_value.tv_nsec = 1;
3587 } else
c2ba3ad6 3588 timespec_store(&its.it_value, t);
fd38203a 3589
15c689d7 3590 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
cde93897 3591 return -errno;
fd38203a 3592
6a0f1f6d 3593 d->next = t;
fd38203a
LP
3594 return 0;
3595}
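
/* Illustration of the two priority queues used above: d->earliest orders the sources by their next
 * elapse time, d->latest by the end of their accuracy window. For a single timer due at t == 5s with
 * the default 250ms accuracy, a == 5s and b == 5.25s, and sleep_between() picks one coalesced wakeup
 * somewhere within that window. */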

static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
        assert(e);
        assert(s);
        assert(s->type == SOURCE_IO);

        /* If the event source was already pending, we just OR in the
         * new revents, otherwise we reset the value. The ORing is
         * necessary to handle EPOLLONESHOT events properly where
         * readability might happen independently of writability, and
         * we need to keep track of both. */

        if (s->pending)
                s->io.revents |= revents;
        else
                s->io.revents = revents;

        return source_set_pending(s, true);
}

static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
        uint64_t x;
        ssize_t ss;

        assert(e);
        assert(fd >= 0);

        assert_return(events == EPOLLIN, -EIO);

        ss = read(fd, &x, sizeof(x));
        if (ss < 0) {
                if (ERRNO_IS_TRANSIENT(errno))
                        return 0;

                return -errno;
        }

        if (_unlikely_(ss != sizeof(x)))
                return -EIO;

        if (next)
                *next = USEC_INFINITY;

        return 0;
}
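
/* Note on the read() above: a timerfd delivers an 8-byte native-endian counter of elapsed
 * expirations. We only read it to clear the fd's readable state; the value itself is discarded, and
 * the cached next-expiry time is invalidated so that event_arm_timer() rearms the fd on the next
 * iteration. */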

static int process_timer(
                sd_event *e,
                usec_t n,
                struct clock_data *d) {

        sd_event_source *s;
        bool callback_invoked = false;
        int r;

        assert(e);
        assert(d);

        for (;;) {
                s = prioq_peek(d->earliest);
                assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

                if (!s || time_event_source_next(s) > n)
                        break;

                if (s->ratelimited) {
                        /* This is an event source whose ratelimit window has ended. Let's turn it on
                         * again. */
                        assert(s->ratelimited);

                        r = event_source_leave_ratelimit(s, /* run_callback */ true);
                        if (r < 0)
                                return r;
                        else if (r == 1)
                                callback_invoked = true;

                        continue;
                }

                if (s->enabled == SD_EVENT_OFF || s->pending)
                        break;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;

                event_source_time_prioq_reshuffle(s);
        }

        return callback_invoked;
}

static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
        int64_t min_priority = threshold;
        bool something_new = false;
        sd_event_source *s;
        int r;

        assert(e);
        assert(ret_min_priority);

        if (!e->need_process_child) {
                *ret_min_priority = min_priority;
                return 0;
        }

        e->need_process_child = false;

        /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
         * for, instead of using P_ALL. This is because we only want to get child information of very
         * specific child processes, and not all of them. We might not have processed the SIGCHLD event
         * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
         * hence we really don't want anything flushed out of the kernel's queue that we don't care
         * about. Since this is O(n) this means that if you have a lot of processes you probably want
         * to handle SIGCHLD yourself.
         *
         * We do not reap the children here (by using WNOWAIT), this is only done after the event
         * source is dispatched so that the callback still sees the process as a zombie. */

        HASHMAP_FOREACH(s, e->child_sources) {
                assert(s->type == SOURCE_CHILD);

                if (s->priority > threshold)
                        continue;

                if (s->pending)
                        continue;

                if (event_source_is_offline(s))
                        continue;

                if (s->child.exited)
                        continue;

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        /* There's a usable pidfd known for this event source? Then don't waitid() for
                         * it here. */
                        continue;

                zero(s->child.siginfo);
                if (waitid(P_PID, s->child.pid, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
                        return negative_errno();

                if (s->child.siginfo.si_pid != 0) {
                        bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                        if (zombie)
                                s->child.exited = true;

                        if (!zombie && (s->child.options & WEXITED)) {
                                /* If the child isn't dead then let's immediately remove the state
                                 * change from the queue, since there's no benefit in leaving it
                                 * queued. */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
                        }

                        r = source_set_pending(s, true);
                        if (r < 0)
                                return r;
                        if (r > 0) {
                                something_new = true;
                                min_priority = MIN(min_priority, s->priority);
                        }
                }
        }

        *ret_min_priority = min_priority;
        return something_new;
}
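
/* Caller-side sketch (illustrative only, not part of this file) of the public API served by the
 * machinery above; on_child_exit is a hypothetical handler. sd-event requires SIGCHLD to be blocked
 * before a child source is added:
 *
 *     sigset_t mask;
 *     assert_se(sigemptyset(&mask) >= 0);
 *     assert_se(sigaddset(&mask, SIGCHLD) >= 0);
 *     assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) >= 0);
 *
 *     pid_t pid = fork();
 *     if (pid == 0)
 *             _exit(EXIT_SUCCESS);
 *
 *     assert_se(sd_event_add_child(e, NULL, pid, WEXITED, on_child_exit, NULL) >= 0);
 *
 * The handler still sees the process as a zombie (courtesy of WNOWAIT above); it is reaped for good
 * right after dispatch. */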

static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
        assert(e);
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (s->pending)
                return 0;

        if (event_source_is_offline(s))
                return 0;

        if (!EVENT_SOURCE_WATCH_PIDFD(s))
                return 0;

        zero(s->child.siginfo);
        if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
                return -errno;

        if (s->child.siginfo.si_pid == 0)
                return 0;

        if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
                s->child.exited = true;

        return source_set_pending(s, true);
}

static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
        int r;

        assert(e);
        assert(d);
        assert_return(events == EPOLLIN, -EIO);
        assert(min_priority);

        /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
         * sure to recheck the children we watch. This is because we only ever dequeue the first signal
         * per priority, and if we dequeue one while SIGCHLD is enqueued behind it we wouldn't know,
         * but we might have higher-priority children we care about, hence we need to check for them
         * explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this priority we don't read another. */
        if (d->current)
                return 0;

        for (;;) {
                struct signalfd_siginfo si;
                ssize_t n;
                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));
                if (n < 0) {
                        if (ERRNO_IS_TRANSIENT(errno))
                                return 0;

                        return -errno;
                }

                if (_unlikely_(n != sizeof(si)))
                        return -EIO;

                assert(SIGNAL_VALID(si.ssi_signo));

                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];
                if (!s)
                        continue;
                if (s->pending)
                        continue;

                s->signal.siginfo = si;
                d->current = s;

                r = source_set_pending(s, true);
                if (r < 0)
                        return r;
                if (r > 0 && *min_priority >= s->priority) {
                        *min_priority = s->priority;
                        return 1; /* an event source with smaller priority is queued. */
                }

                return 0;
        }
}
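
/* Design note on the above: there is one signalfd per priority bucket (struct signal_data), and
 * d->current holds the single signal currently in flight for that bucket. Dequeuing at most one
 * signal per bucket at a time is what keeps signal dispatching ordered by source priority. */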

static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
        ssize_t n;

        assert(e);
        assert(d);

        assert_return(revents == EPOLLIN, -EIO);

        /* If there's already an event source pending for this priority, don't read another */
        if (d->n_pending > 0)
                return 0;

        /* Is the read buffer non-empty? If so, let's not read more */
        if (d->buffer_filled > 0)
                return 0;

        if (d->priority > threshold)
                return 0;

        n = read(d->fd, &d->buffer, sizeof(d->buffer));
        if (n < 0) {
                if (ERRNO_IS_TRANSIENT(errno))
                        return 0;

                return -errno;
        }

        assert(n > 0);
        d->buffer_filled = (size_t) n;
        LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);

        return 1;
}

static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
        assert(e);
        assert(d);
        assert(sz <= d->buffer_filled);

        if (sz == 0)
                return;

        /* Move the rest of the buffer to the front, in order to get things properly aligned again */
        memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
        d->buffer_filled -= sz;

        if (d->buffer_filled == 0)
                LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
}

static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
        int r;

        assert(e);
        assert(d);

        /* If there's already an event source pending for this priority, don't read another */
        if (d->n_pending > 0)
                return 0;

        while (d->buffer_filled > 0) {
                size_t sz;

                /* Let's validate that the event structures are complete */
                if (d->buffer_filled < offsetof(struct inotify_event, name))
                        return -EIO;

                sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
                if (d->buffer_filled < sz)
                        return -EIO;

                if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
                        struct inode_data *inode_data;

                        /* The queue overran, let's pass this event to all event sources connected to this inotify
                         * object */

                        HASHMAP_FOREACH(inode_data, d->inodes)
                                LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {

                                        if (event_source_is_offline(s))
                                                continue;

                                        r = source_set_pending(s, true);
                                        if (r < 0)
                                                return r;
                                }
                } else {
                        struct inode_data *inode_data;

                        /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
                         * our watch descriptor table. */
                        if (d->buffer.ev.mask & IN_IGNORED) {

                                inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
                                if (!inode_data) {
                                        event_inotify_data_drop(e, d, sz);
                                        continue;
                                }

                                /* The watch descriptor was removed by the kernel, let's drop it here too */
                                inode_data->wd = -1;
                        } else {
                                inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
                                if (!inode_data) {
                                        event_inotify_data_drop(e, d, sz);
                                        continue;
                                }
                        }

                        /* Trigger all event sources that are interested in these events. Also trigger all event
                         * sources if IN_IGNORED or IN_UNMOUNT is set. */
                        LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {

                                if (event_source_is_offline(s))
                                        continue;

                                if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
                                    (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
                                        continue;

                                r = source_set_pending(s, true);
                                if (r < 0)
                                        return r;
                        }
                }

                /* Something pending now? If so, let's finish, otherwise let's read more. */
                if (d->n_pending > 0)
                        return 1;
        }

        return 0;
}
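
/* Layout reminder for the parsing above: the kernel hands out variable-length records, i.e. a fixed
 * struct inotify_event header directly followed by ev.len bytes of NUL-padded name, hence
 * sz == offsetof(struct inotify_event, name) + ev.len, and a record must never be split across
 * reads. */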

static int process_inotify(sd_event *e) {
        int r, done = 0;

        assert(e);

        LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
                r = event_inotify_data_process(e, d);
                if (r < 0)
                        return r;
                if (r > 0)
                        done++;
        }

        return done;
}

static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (s->pending)
                s->memory_pressure.revents |= revents;
        else
                s->memory_pressure.revents = revents;

        return source_set_pending(s, true);
}

static int source_memory_pressure_write(sd_event_source *s) {
        ssize_t n;
        int r;

        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        /* Once we start writing, the buffer is locked, we allow no further changes. */
        s->memory_pressure.locked = true;

        if (s->memory_pressure.write_buffer_size > 0) {
                n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
                if (n < 0) {
                        if (!ERRNO_IS_TRANSIENT(errno)) {
                                /* If the kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
                                 * files, but then generates EOPNOTSUPP on read() and write() (instead of on
                                 * open()!). This sucks hard, since we can only detect this kind of failure
                                 * so late. Let's make the best of it, and turn off the event source like we
                                 * do for failed event source handlers. */

                                log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
                                assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
                                return 0;
                        }

                        n = 0;
                }
        } else
                n = 0;

        assert(n >= 0);

        if ((size_t) n == s->memory_pressure.write_buffer_size) {
                s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);

                if (n > 0) {
                        s->memory_pressure.write_buffer_size = 0;

                        /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
                        r = source_memory_pressure_register(s, s->enabled);
                        if (r < 0)
                                return r;
                }
        } else if (n > 0) {
                _cleanup_free_ void *c = NULL;

                assert((size_t) n < s->memory_pressure.write_buffer_size);

                c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
                if (!c)
                        return -ENOMEM;

                free_and_replace(s->memory_pressure.write_buffer, c);
                s->memory_pressure.write_buffer_size -= n;
                return 1;
        }

        return 0;
}

static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
        int r;

        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        r = source_memory_pressure_write(s);
        if (r < 0)
                return r;
        if (r > 0)
                return 1; /* If we wrote something, then don't continue with dispatching the user
                           * callback. Instead, shortcut it so that we wait for the next EPOLLOUT
                           * immediately. */

        /* No pending incoming IO? Then let's not continue further */
        if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {

                /* Treat IO errors on the notifier the same way as errors returned from a callback */
                if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
                        return -EIO;

                return 1; /* leave dispatch, we already processed everything */
        }

        if (s->memory_pressure.revents & EPOLLIN) {
                uint8_t pipe_buf[PIPE_BUF];
                ssize_t n;

                /* If the fd is readable, then flush out anything that might be queued */

                n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
                if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
                        return -errno;
        }

        return 0; /* go on, dispatch to user callback */
}

static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        sd_event *saved_event;
        int r = 0;

        assert(s);
        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type here, so that we still know it after the event callback, which
         * might invalidate the event. */
        saved_type = s->type;

        /* Similarly, store a reference to the event loop object, so that we can still access it after the
         * callback might have invalidated/disconnected the event source. */
        saved_event = s->event;
        PROTECT_EVENT(saved_event);

        /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
        assert(!s->ratelimited);
        if (!ratelimit_below(&s->rate_limit)) {
                r = event_source_enter_ratelimited(s);
                if (r < 0)
                        return r;

                return 1;
        }

        if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        if (s->type != SOURCE_POST) {
                sd_event_source *z;

                /* If we execute a non-post source, let's mark all post sources as pending. */

                SET_FOREACH(z, s->event->post_sources) {
                        if (event_source_is_offline(z))
                                continue;

                        r = source_set_pending(z, true);
                        if (r < 0)
                                return r;
                }
        }

        if (s->type == SOURCE_MEMORY_PRESSURE) {
                r = source_memory_pressure_initiate_dispatch(s);
                if (r == -EIO) /* handle EIO errors similar to callback errors */
                        goto finish;
                if (r < 0)
                        return r;
                if (r > 0) /* already handled */
                        return 1;
        }

        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
                if (r < 0)
                        return r;
        }

        s->dispatching = true;

        switch (s->type) {

        case SOURCE_IO:
                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);
                break;

        case SOURCE_SIGNAL:
                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
                break;

        case SOURCE_CHILD: {
                bool zombie;

                zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */
                if (zombie) {
                        (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
                        s->child.waited = true;
                }

                break;
        }

        case SOURCE_DEFER:
                r = s->defer.callback(s, s->userdata);
                break;

        case SOURCE_POST:
                r = s->post.callback(s, s->userdata);
                break;

        case SOURCE_EXIT:
                r = s->exit.callback(s, s->userdata);
                break;

        case SOURCE_INOTIFY: {
                struct sd_event *e = s->event;
                struct inotify_data *d;
                size_t sz;

                assert(s->inotify.inode_data);
                assert_se(d = s->inotify.inode_data->inotify_data);

                assert(d->buffer_filled >= offsetof(struct inotify_event, name));
                sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
                assert(d->buffer_filled >= sz);

                /* If the inotify callback destroys the event source then this likely means we don't need to
                 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
                 * free it immediately, then we couldn't drop the event from the inotify event queue without
                 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
                 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
                 * explicitly GC it after we are done dropping the inotify event from the buffer. */
                d->n_busy++;
                r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
                d->n_busy--;

                /* When no event is pending anymore on this inotify object, then let's drop the event from
                 * the inotify event queue buffer. */
                if (d->n_pending == 0)
                        event_inotify_data_drop(e, d, sz);

                /* Now we don't want to access 'd' anymore, it's OK to GC now. */
                event_gc_inotify_data(e, d);
                break;
        }

        case SOURCE_MEMORY_PRESSURE:
                r = s->memory_pressure.callback(s, s->userdata);
                break;

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached();
        }

        s->dispatching = false;

finish:
        if (r < 0) {
                log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
                                strna(s->description),
                                event_source_type_to_string(saved_type),
                                s->exit_on_failure ? "exiting" : "disabling");

                if (s->exit_on_failure)
                        (void) sd_event_exit(saved_event, r);
        }

        if (s->n_ref == 0)
                source_free(s);
        else if (r < 0)
                assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);

        return 1;
}
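
/* Note on the return convention above: source_dispatch() returns 1 on success even if the user
 * callback failed; a negative callback return is logged and turns the source off (or exits the loop
 * if exit-on-failure is set), but is not propagated to sd_event_dispatch()'s caller. */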

static int event_prepare(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
                        break;

                s->prepare_iteration = e->iteration;
                prioq_reshuffle(e->prepare, s, &s->prepare_index);

                assert(s->prepare);
                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                if (r < 0) {
                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
                                        strna(s->description),
                                        event_source_type_to_string(s->type),
                                        s->exit_on_failure ? "exiting" : "disabling");

                        if (s->exit_on_failure)
                                (void) sd_event_exit(e, r);
                }

                if (s->n_ref == 0)
                        source_free(s);
                else if (r < 0)
                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
        }

        return 0;
}

static int dispatch_exit(sd_event *e) {
        sd_event_source *p;
        int r;

        assert(e);

        p = prioq_peek(e->exit);
        assert(!p || p->type == SOURCE_EXIT);

        if (!p || event_source_is_offline(p)) {
                e->state = SD_EVENT_FINISHED;
                return 0;
        }

        PROTECT_EVENT(e);
        e->iteration++;
        e->state = SD_EVENT_EXITING;
        r = source_dispatch(p);
        e->state = SD_EVENT_INITIAL;
        return r;
}

static sd_event_source* event_next_pending(sd_event *e) {
        sd_event_source *p;

        assert(e);

        p = prioq_peek(e->pending);
        if (!p)
                return NULL;

        if (event_source_is_offline(p))
                return NULL;

        return p;
}

static int arm_watchdog(sd_event *e) {
        struct itimerspec its = {};
        usec_t t;

        assert(e);
        assert(e->watchdog_fd >= 0);

        t = sleep_between(e,
                          usec_add(e->watchdog_last, (e->watchdog_period / 2)),
                          usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));

        timespec_store(&its.it_value, t);

        /* Make sure we never set the watchdog to 0, which tells the
         * kernel to disable it. */
        if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
                its.it_value.tv_nsec = 1;

        return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
}

static int process_watchdog(sd_event *e) {
        assert(e);

        if (!e->watchdog)
                return 0;

        /* Don't notify watchdog too often */
        if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
                return 0;

        sd_notify(false, "WATCHDOG=1");
        e->watchdog_last = e->timestamp.monotonic;

        return arm_watchdog(e);
}

static void event_close_inode_data_fds(sd_event *e) {
        struct inode_data *d;

        assert(e);

        /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
         * filesystems. But we can't close them right away as we need them as long as the user still wants to make
         * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
         * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
         * compromise. */

        while ((d = e->inode_data_to_close_list)) {
                assert(d->fd >= 0);
                d->fd = safe_close(d->fd);

                LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
        }
}

static int event_memory_pressure_write_list(sd_event *e) {
        int r;

        assert(e);

        for (;;) {
                sd_event_source *s;

                s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
                if (!s)
                        break;

                assert(s->type == SOURCE_MEMORY_PRESSURE);
                assert(s->memory_pressure.write_buffer_size > 0);
                s->memory_pressure.in_write_list = false;

                r = source_memory_pressure_write(s);
                if (r < 0)
                        return r;
        }

        return 0;
}

_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
         * this check here once, since gettid() is typically not cached, and thus we want to minimize
         * syscalls. */
        assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        if (e->exit_requested)
                goto pending;

        e->iteration++;

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        r = event_memory_pressure_write_list(e);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        event_close_inode_data_fds(e);

        if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}

static int epoll_wait_usec(
                int fd,
                struct epoll_event *events,
                int maxevents,
                usec_t timeout) {

        int msec;
        /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */

#if HAVE_EPOLL_PWAIT2
        static bool epoll_pwait2_absent = false;
        int r;

        /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
         * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
         * is not that obvious to implement given the libc and kernel definitions differ in the last
         * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
         * biggie), let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
         * missing. */

        if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
                r = epoll_pwait2(fd,
                                 events,
                                 maxevents,
                                 TIMESPEC_STORE(timeout),
                                 NULL);
                if (r >= 0)
                        return r;
                if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                        return -errno; /* Only fall back to old epoll_wait() if the syscall is masked or not
                                        * supported. */

                epoll_pwait2_absent = true;
        }
#endif

        if (timeout == USEC_INFINITY)
                msec = -1;
        else {
                usec_t k;

                k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
                if (k >= INT_MAX)
                        msec = INT_MAX; /* Saturate */
                else
                        msec = (int) k;
        }

        return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
}
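
/* Example of the precision difference mentioned above: a timeout of 1500µs is passed verbatim to
 * epoll_pwait2() as { .tv_nsec = 1500000 }, while the epoll_wait() fallback must round up to 2ms.
 * DIV_ROUND_UP is used so that we never wake up early and then spin. */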

static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
        size_t n_event_queue, m, n_event_max;
        int64_t min_priority = threshold;
        bool something_new = false;
        int r;

        assert(e);
        assert(ret_min_priority);

        n_event_queue = MAX(e->n_sources, 1u);
        if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
                return -ENOMEM;

        n_event_max = MALLOC_ELEMENTSOF(e->event_queue);

        /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
        if (e->buffered_inotify_data_list)
                timeout = 0;

        for (;;) {
                r = epoll_wait_usec(
                                e->epoll_fd,
                                e->event_queue,
                                n_event_max,
                                timeout);
                if (r < 0)
                        return r;

                m = (size_t) r;

                if (m < n_event_max)
                        break;

                if (n_event_max >= n_event_queue * 10)
                        break;

                if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
                        return -ENOMEM;

                n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
                timeout = 0;
        }

        /* Set the timestamp only when this is called for the first time. */
        if (threshold == INT64_MAX)
                triple_timestamp_get(&e->timestamp);

        for (size_t i = 0; i < m; i++) {

                if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
                else {
                        WakeupType *t = e->event_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE: {
                                sd_event_source *s = e->event_queue[i].data.ptr;

                                assert(s);

                                if (s->priority > threshold)
                                        continue;

                                min_priority = MIN(min_priority, s->priority);

                                switch (s->type) {

                                case SOURCE_IO:
                                        r = process_io(e, s, e->event_queue[i].events);
                                        break;

                                case SOURCE_CHILD:
                                        r = process_pidfd(e, s, e->event_queue[i].events);
                                        break;

                                case SOURCE_MEMORY_PRESSURE:
                                        r = process_memory_pressure(s, e->event_queue[i].events);
                                        break;

                                default:
                                        assert_not_reached();
                                }

                                break;
                        }

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = e->event_queue[i].data.ptr;

                                assert(d);

                                r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
                                break;

                        case WAKEUP_INOTIFY_DATA:
                                r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
                                break;

                        default:
                                assert_not_reached();
                        }
                }
                if (r < 0)
                        return r;
                if (r > 0)
                        something_new = true;
        }

        *ret_min_priority = min_priority;
        return something_new;
}

_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        for (int64_t threshold = INT64_MAX; ; threshold--) {
                int64_t epoll_min_priority, child_min_priority;

                /* There is a possibility that new epoll (especially IO) and child events are
                 * triggered just after the process_epoll() call but before process_child(), and the
                 * new IO events may have a higher priority than the child events. To salvage these
                 * events, let's call epoll_wait() again, but accept only events with a higher
                 * priority than the previous ones. See issue
                 * https://github.com/systemd/systemd/issues/18190 and comments
                 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
                 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */

                r = process_epoll(e, timeout, threshold, &epoll_min_priority);
                if (r == -EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }
                if (r < 0)
                        goto finish;
                if (r == 0 && threshold < INT64_MAX)
                        /* No new epoll event. */
                        break;

                r = process_child(e, threshold, &child_min_priority);
                if (r < 0)
                        goto finish;
                if (r == 0)
                        /* No new child event. */
                        break;

                threshold = MIN(epoll_min_priority, child_min_priority);
                if (threshold == INT64_MIN)
                        break;

                timeout = 0;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        r = process_inotify(e);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;
        else if (r == 1) {
                /* The ratelimit expiry callback was called. Let's postpone processing pending sources and
                 * put the loop in the initial state in order to evaluate (in the next iteration) also sources
                 * that were potentially re-enabled by the callback.
                 *
                 * Wondering why we treat only this invocation of process_timer() differently? Once an event
                 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
                 * ratelimit expiry callback is never called for any other timer type. */
                r = 0;
                goto finish;
        }

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        r = 0;

finish:
        e->state = SD_EVENT_INITIAL;

        return r;
}
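
/* Illustration of the threshold loop above (made-up priorities): the first pass runs with
 * threshold == INT64_MAX and accepts everything; say it queues an IO event at priority 10 and a
 * child event at priority 5. The next pass then runs with timeout 0 and threshold == 4, i.e. it only
 * salvages events that would be dispatched before everything queued so far. The loop stops as soon
 * as a pass produces nothing new (or threshold bottoms out at INT64_MIN). */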

_public_ int sd_event_dispatch(sd_event *e) {
        sd_event_source *p;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_PENDING, -EBUSY);

        if (e->exit_requested)
                return dispatch_exit(e);

        p = event_next_pending(e);
        if (p) {
                PROTECT_EVENT(e);

                e->state = SD_EVENT_RUNNING;
                r = source_dispatch(p);
                e->state = SD_EVENT_INITIAL;
                return r;
        }

        e->state = SD_EVENT_INITIAL;

        return 1;
}

static void event_log_delays(sd_event *e) {
        char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
        size_t l, i;

        p = b;
        l = sizeof(b);
        for (i = 0; i < ELEMENTSOF(e->delays); i++) {
                l = strpcpyf(&p, l, "%u ", e->delays[i]);
                e->delays[i] = 0;
        }
        log_debug("Event loop iterations: %s", b);
}

_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        if (e->profile_delays && e->last_run_usec != 0) {
                usec_t this_run;
                unsigned l;

                this_run = now(CLOCK_MONOTONIC);

                l = log2u64(this_run - e->last_run_usec);
                assert(l < ELEMENTSOF(e->delays));
                e->delays[l]++;

                if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
                        event_log_delays(e);
                        e->last_log_usec = this_run;
                }
        }

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
        PROTECT_EVENT(e);

        r = sd_event_prepare(e);
        if (r == 0)
                /* There was nothing? Then wait... */
                r = sd_event_wait(e, timeout);

        if (e->profile_delays)
                e->last_run_usec = now(CLOCK_MONOTONIC);

        if (r > 0) {
                /* There's something now, so let's dispatch it */
                r = sd_event_dispatch(e);
                if (r < 0)
                        return r;

                return 1;
        }

        return r;
}

_public_ int sd_event_loop(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        PROTECT_EVENT(e);

        while (e->state != SD_EVENT_FINISHED) {
                r = sd_event_run(e, UINT64_MAX);
                if (r < 0)
                        return r;
        }

        return e->exit_code;
}
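
/* Caller-side sketch (illustrative only, not part of this file): the canonical way to drive the
 * state machine above. sd_event_loop() alternates sd_event_prepare(), sd_event_wait() and
 * sd_event_dispatch() via sd_event_run() until something calls sd_event_exit(). my_handler is a
 * hypothetical callback that eventually calls sd_event_exit():
 *
 *     sd_event *e = NULL;
 *     assert_se(sd_event_default(&e) >= 0);
 *     assert_se(sd_event_add_defer(e, NULL, my_handler, NULL) >= 0);
 *     int code = sd_event_loop(e);   // returns the code passed to sd_event_exit()
 *     sd_event_unref(e);
 */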

_public_ int sd_event_get_fd(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        return e->epoll_fd;
}

_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        return e->state;
}

_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(code, -EINVAL);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!e->exit_requested)
                return -ENODATA;

        *code = e->exit_code;
        return 0;
}

_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;

        return 0;
}

_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(usec, -EINVAL);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
                return -EOPNOTSUPP;

        if (!triple_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
                *usec = now(clock);
                return 1;
        }

        *usec = triple_timestamp_by_clock(&e->timestamp, clock);
        return 0;
}

_public_ int sd_event_default(sd_event **ret) {
        sd_event *e = NULL;
        int r;

        if (!ret)
                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);
                return 0;
        }

        r = sd_event_new(&e);
        if (r < 0)
                return r;

        e->default_event_ptr = &default_event;
        e->tid = gettid();
        default_event = e;

        *ret = e;
        return 1;
}

_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(tid, -EINVAL);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (e->tid != 0) {
                *tid = e->tid;
                return 0;
        }

        return -ENXIO;
}

_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                struct epoll_event ev = {
                        .events = EPOLLIN,
                        .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
                };

                if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}
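
/* Note on the interplay above: sd_watchdog_enabled() reads the $WATCHDOG_USEC environment variable
 * as set by the service manager (WatchdogSec=). Pings are then scheduled via sleep_between() into
 * the window between 1/2 and 3/4 of the period (see arm_watchdog()), so event loops across the
 * system can coalesce their keep-alive wakeups just like regular timers. */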

_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        return e->watchdog;
}

_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        *ret = e->iteration;
        return 0;
}

_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
        assert_return(s, -EINVAL);
        assert_return(s->event, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->destroy_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (ret)
                *ret = s->destroy_callback;

        return !!s->destroy_callback;
}

_public_ int sd_event_source_get_floating(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->floating;
}

_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->floating == !!b)
                return 0;

        if (!s->event) /* Already disconnected */
                return -ESTALE;

        s->floating = b;

        if (b) {
                sd_event_source_ref(s);
                sd_event_unref(s->event);
        } else {
                sd_event_ref(s->event);
                sd_event_source_unref(s);
        }

        return 1;
}

_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->exit_on_failure;
}

_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->exit_on_failure == !!b)
                return 0;

        s->exit_on_failure = b;
        return 1;
}

_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* Turning on ratelimiting on event source types that don't support it is a loggable offense.
         * Doing so is a programming error. */
        assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);

        /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
         * non-ratelimited. */
        r = event_source_leave_ratelimit(s, /* run_callback */ false);
        if (r < 0)
                return r;

        s->rate_limit = (RateLimit) { interval, burst };
        return 0;
}

_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->ratelimit_expire_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
         * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return -EDOM;

        if (!ratelimit_configured(&s->rate_limit))
                return -ENOEXEC;

        if (ret_interval)
                *ret_interval = s->rate_limit.interval;
        if (ret_burst)
                *ret_burst = s->rate_limit.burst;

        return 0;
}

_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return false;

        if (!ratelimit_configured(&s->rate_limit))
                return false;

        return s->ratelimited;
}
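
/* Usage sketch (illustrative only; on_ratelimit_expired is a hypothetical callback): cap a chatty
 * source at 5 dispatches per second. Once the limit trips, the source goes offline until the
 * interval elapses; the expiry callback then runs and the source is turned back on:
 *
 *     assert_se(sd_event_source_set_ratelimit(s, USEC_PER_SEC, 5) >= 0);
 *     assert_se(sd_event_source_set_ratelimit_expire_callback(s, on_ratelimit_expired) >= 0);
 */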

_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
        bool change = false;
        int r;

        assert_return(e, -EINVAL);

        if (b) {
                /* We want to maintain pointers to these event sources, so that we can destroy them when told
                 * so. But we also don't want them to pin the event loop itself. Hence we mark them as
                 * floating after creation (and undo this before deleting them again). */

                if (!e->sigint_event_source) {
                        r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0)
                                return r;

                        assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
                        change = true;
                }

                if (!e->sigterm_event_source) {
                        r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0) {
                                if (change) {
                                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                                }

                                return r;
                        }

                        assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
                        change = true;
                }

        } else {
                if (e->sigint_event_source) {
                        assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                        change = true;
                }

                if (e->sigterm_event_source) {
                        assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
                        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
                        change = true;
                }
        }

        return change;
}

_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
        _cleanup_free_ char *b = NULL;
        _cleanup_free_ void *w = NULL;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
        assert_return(ty, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (!STR_IN_SET(ty, "some", "full"))
                return -EINVAL;

        if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
                return -EBUSY;

        char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
        if (!space)
                return -EINVAL;

        size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
        b = memdup_suffix0(s->memory_pressure.write_buffer, l);
        if (!b)
                return -ENOMEM;
        if (!STR_IN_SET(b, "some", "full"))
                return -EINVAL;

        if (streq(b, ty))
                return 0;

        size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
        w = new(char, nl);
        if (!w)
                return -ENOMEM;

        memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));

        free_and_replace(s->memory_pressure.write_buffer, w);
        s->memory_pressure.write_buffer_size = nl;
        s->memory_pressure.locked = false;

        return 1;
}

_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
        _cleanup_free_ char *b = NULL;
        _cleanup_free_ void *w = NULL;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
                return -ERANGE;
        if (window_usec <= 0 || window_usec >= UINT64_MAX)
                return -ERANGE;
        if (threshold_usec > window_usec)
                return -EINVAL;

        if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
                return -EBUSY;

        char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
        if (!space)
                return -EINVAL;

        size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
        b = memdup_suffix0(s->memory_pressure.write_buffer, l);
        if (!b)
                return -ENOMEM;
        if (!STR_IN_SET(b, "some", "full"))
                return -EINVAL;

        if (asprintf((char**) &w,
                     "%s " USEC_FMT " " USEC_FMT "",
                     b,
                     threshold_usec,
                     window_usec) < 0)
                return -EINVAL;

        l = strlen(w) + 1;
        if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
                return 0;

        free_and_replace(s->memory_pressure.write_buffer, w);
        s->memory_pressure.write_buffer_size = l;
        s->memory_pressure.locked = false;

        return 1;
}
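
/* Format note for the two setters above: the PSI trigger they assemble ends up as e.g.
 * "some 150000 1000000" (type, then threshold and window in µs), which is what
 * /proc/pressure/memory and cgroup memory.pressure files expect. Illustrative call sequence with
 * arbitrary values:
 *
 *     assert_se(sd_event_source_set_memory_pressure_type(s, "full") >= 0);
 *     assert_se(sd_event_source_set_memory_pressure_period(s, 150 * USEC_PER_MSEC, USEC_PER_SEC) >= 0);
 */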