/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sys/wait.h>

#include "sd-daemon.h"
#include "sd-event.h"
#include "sd-id128.h"
#include "sd-messages.h"

#include "alloc-util.h"
#include "env-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "fs-util.h"
#include "glyph-util.h"
#include "hashmap.h"
#include "hexdecoct.h"
#include "list.h"
#include "logarithm.h"
#include "macro.h"
#include "mallinfo-util.h"
#include "memory-util.h"
#include "missing_magic.h"
#include "missing_syscall.h"
#include "path-util.h"
#include "prioq.h"
#include "process-util.h"
#include "psi-util.h"
#include "set.h"
#include "signal-util.h"
#include "socket-util.h"
#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strxcpyx.h"
#include "time-util.h"

#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)

static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
        /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
        return s &&
                s->type == SOURCE_CHILD &&
                s->child.pidfd >= 0 &&
                s->child.options == WEXITED;
}

static bool event_source_is_online(sd_event_source *s) {
        assert(s);
        return s->enabled != SD_EVENT_OFF && !s->ratelimited;
}

static bool event_source_is_offline(sd_event_source *s) {
        assert(s);
        return s->enabled == SD_EVENT_OFF || s->ratelimited;
}

static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
        [SOURCE_IO] = "io",
        [SOURCE_TIME_REALTIME] = "realtime",
        [SOURCE_TIME_BOOTTIME] = "boottime",
        [SOURCE_TIME_MONOTONIC] = "monotonic",
        [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
        [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
        [SOURCE_SIGNAL] = "signal",
        [SOURCE_CHILD] = "child",
        [SOURCE_DEFER] = "defer",
        [SOURCE_POST] = "post",
        [SOURCE_EXIT] = "exit",
        [SOURCE_WATCHDOG] = "watchdog",
        [SOURCE_INOTIFY] = "inotify",
        [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);

#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)

#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_IO,                       \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_SIGNAL,                   \
               SOURCE_DEFER,                    \
               SOURCE_INOTIFY,                  \
               SOURCE_MEMORY_PRESSURE)

/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)

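/* Illustrative note (editor addition, not part of the original file): callers opt into rate limiting on
 * one of the source types listed in EVENT_SOURCE_CAN_RATE_LIMIT() via sd_event_source_set_ratelimit(),
 * e.g. to allow at most 10 dispatches per second:
 *
 *     r = sd_event_source_set_ratelimit(s, USEC_PER_SEC, 10);
 *
 * While the limit is exceeded the source counts as offline and is re-armed through the monotonic clock
 * prioq, which is why ratelimited sources share the time prioq machinery below. */
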
struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;

        Prioq *pending;
        Prioq *prepare;

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources;

        Set *post_sources;

        Prioq *exit;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close_list);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, buffered_inotify_data_list);

        /* A list of memory pressure event sources that still need their subscription string written */
        LIST_HEAD(sd_event_source, memory_pressure_write_list);

        pid_t original_pid;

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        sd_event_source *sigint_event_source, *sigterm_event_source;

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8];
};

static thread_local sd_event *default_event = NULL;

static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);

static sd_event *event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}

static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->pending);
        assert(y->pending);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Lower priority values first */
        r = CMP(x->priority, y->priority);
        if (r != 0)
                return r;

        /* Older entries first */
        return CMP(x->pending_iteration, y->pending_iteration);
}

static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->prepare);
        assert(y->prepare);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Move most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        r = CMP(x->prepare_iteration, y->prepare_iteration);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static usec_t time_event_source_next(const sd_event_source *s) {
        assert(s);

        /* We have two kinds of event sources that have elapsation times associated with them: the actual
         * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
         * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
         * looking at here. */

        if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Otherwise this must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return s->time.next;

        return USEC_INFINITY;
}

static usec_t time_event_source_latest(const sd_event_source *s) {
        assert(s);

        if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
                               * same, as we should avoid adding additional inaccuracy on an inaccuracy time
                               * window */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return usec_add(s->time.next, s->time.accuracy);

        return USEC_INFINITY;
}

static bool event_source_timer_candidate(const sd_event_source *s) {
        assert(s);

        /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
         * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
        return !s->pending || s->ratelimited;
}

static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
        r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
        if (r != 0)
                return r;

        /* Order by time */
        return CMP(time_func(x), time_func(y));
}

static int earliest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_next);
}

static int latest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_latest);
}

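/* Explanatory note (editor addition, not part of the original file): each clock keeps two priority
 * queues, one ordered by the earliest possible wake-up (time.next) and one by the latest admissible
 * wake-up (time.next + accuracy). When the timerfd is armed, a wake-up time is chosen between the heads
 * of the two queues, so several timers with overlapping accuracy windows can be served by a single
 * kernel wake-up. */
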
static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}

static sd_event *event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->inotify_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);

        free(e->event_queue);

        return mfree(e);
}

_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new(sd_event, 1);
        if (!e)
                return -ENOMEM;

        *e = (sd_event) {
                .n_ref = 1,
                .epoll_fd = -EBADF,
                .watchdog_fd = -EBADF,
                .realtime.wakeup = WAKEUP_CLOCK_DATA,
                .realtime.fd = -EBADF,
                .realtime.next = USEC_INFINITY,
                .boottime.wakeup = WAKEUP_CLOCK_DATA,
                .boottime.fd = -EBADF,
                .boottime.next = USEC_INFINITY,
                .monotonic.wakeup = WAKEUP_CLOCK_DATA,
                .monotonic.fd = -EBADF,
                .monotonic.next = USEC_INFINITY,
                .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .realtime_alarm.fd = -EBADF,
                .realtime_alarm.next = USEC_INFINITY,
                .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .boottime_alarm.fd = -EBADF,
                .boottime_alarm.next = USEC_INFINITY,
                .perturb = USEC_INFINITY,
                .original_pid = getpid_cached(),
        };

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
                          special_glyph(SPECIAL_GLYPH_ELLIPSIS));
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}

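/* Illustrative usage sketch (editor addition, not part of the original file): the typical lifecycle of
 * a loop allocated with sd_event_new() is:
 *
 *     sd_event *e = NULL;
 *     int r;
 *
 *     r = sd_event_new(&e);
 *     if (r < 0)
 *             return r;
 *
 *     ... attach IO/timer/signal/... sources here ...
 *
 *     r = sd_event_loop(e);        (runs until sd_event_exit() is called)
 *     sd_event_unref(e);
 *     return r;
 */
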
DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event, sd_event, event_free);
#define PROTECT_EVENT(e)                                                \
        _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);

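/* Explanatory note (editor addition, not part of the original file): PROTECT_EVENT() pins the loop for
 * the current scope, so a callback that drops the caller's last reference cannot free the sd_event
 * object while it is still being dispatched. It expands to roughly:
 *
 *     sd_event *_ref = sd_event_ref(e);        (dropped automatically when the scope ends, via _cleanup_)
 */
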
_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
        if (s)
                (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
        return sd_event_source_unref(s);
}

static bool event_pid_changed(sd_event *e) {
        assert(e);

        /* We don't support people creating an event loop and keeping
         * it around over a fork(). Let's complain. */

        return e->original_pid != getpid_cached();
}

static void source_io_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}

static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->io.fd, &ev) < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}

static void source_child_pidfd_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (event_pid_changed(s->event))
                return;

        if (!s->child.registered)
                return;

        if (EVENT_SOURCE_WATCH_PIDFD(s))
                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

        s->child.registered = false;
}

static int source_child_pidfd_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_CHILD);
        assert(enabled != SD_EVENT_OFF);

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                struct epoll_event ev = {
                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                        .data.ptr = s,
                };

                if (epoll_ctl(s->event->epoll_fd,
                              s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                              s->child.pidfd, &ev) < 0)
                        return -errno;
        }

        s->child.registered = true;
        return 0;
}

static void source_memory_pressure_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (event_pid_changed(s->event))
                return;

        if (!s->memory_pressure.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->memory_pressure.registered = false;
}

static int source_memory_pressure_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
                          (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->memory_pressure.fd, &ev) < 0)
                return -errno;

        s->memory_pressure.registered = true;
        return 0;
}

static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (s->memory_pressure.in_write_list)
                return;

        LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = true;
}

static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (!s->memory_pressure.in_write_list)
                return;

        LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = false;
}

static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}

static EventSourceType clock_to_event_source_type(clockid_t clock) {

        switch (clock) {

        case CLOCK_REALTIME:
                return SOURCE_TIME_REALTIME;

        case CLOCK_BOOTTIME:
                return SOURCE_TIME_BOOTTIME;

        case CLOCK_MONOTONIC:
                return SOURCE_TIME_MONOTONIC;

        case CLOCK_REALTIME_ALARM:
                return SOURCE_TIME_REALTIME_ALARM;

        case CLOCK_BOOTTIME_ALARM:
                return SOURCE_TIME_BOOTTIME_ALARM;

        default:
                return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        }
}

static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        assert(e);

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}

static void event_free_signal_data(sd_event *e, struct signal_data *d) {
        assert(e);

        if (!d)
                return;

        hashmap_remove(e->signal_data, &d->priority);
        safe_close(d->fd);
        free(d);
}

static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                d = new(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                *d = (struct signal_data) {
                        .wakeup = WAKEUP_SIGNAL_DATA,
                        .fd = -EBADF,
                        .priority = priority,
                };

                r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
                     &ss_copy,
                     SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        if (added)
                event_free_signal_data(e, d);

        return r;
}

static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way removes it. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {
                /* If all the mask is all-zero we can get rid of the structure */
                event_free_signal_data(e, d);
                return;
        }

        if (event_pid_changed(e))
                return;

        assert(d->fd >= 0);

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}

static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
         * and possibly drop the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_online_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            event_source_is_online(e->signal_sources[sig]))
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}

static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
        assert(s);

        /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
         * they are enabled/disabled or marked pending and such. */

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
}

static void event_source_time_prioq_reshuffle(sd_event_source *s) {
        struct clock_data *d;

        assert(s);

        /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
         * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
         * properly again. */

        if (s->ratelimited)
                d = &s->event->monotonic;
        else if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(d = event_get_clock_data(s->event, s->type));
        else
                return; /* no-op for an event source which is neither a timer nor ratelimited. */

        prioq_reshuffle(d->earliest, s, &s->earliest_index);
        prioq_reshuffle(d->latest, s, &s->latest_index);
        d->needs_rearm = true;
}

static void event_source_time_prioq_remove(
                sd_event_source *s,
                struct clock_data *d) {

        assert(s);
        assert(d);

        prioq_remove(d->earliest, s, &s->earliest_index);
        prioq_remove(d->latest, s, &s->latest_index);
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        d->needs_rearm = true;
}

static void source_disconnect(sd_event_source *s) {
        sd_event *event;
        int r;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from the time event source here if it is not ratelimited. If
                 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                        if (s->signal.unblock) {
                                sigset_t new_ss;

                                if (sigemptyset(&new_ss) < 0)
                                        log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
                                else if (sigaddset(&new_ss, s->signal.sig) < 0)
                                        log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
                                else {
                                        r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
                                        if (r != 0)
                                                log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
                                }
                        }
                }

                break;

        case SOURCE_CHILD:
                if (event_pid_changed(s->event))
                        s->child.process_owned = false;

                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is
                         * continued to being watched. That's because inotify doesn't really have an API for that: we
                         * can only change watch masks with access to the original inode either by fd or by path. But
                         * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
                         * continuously and keeping the mount busy which we can't really do. We could reconstruct the
                         * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
                         * there), but given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way, leave the
                         * mask set, even if it is not minimized now, and ignore all events we aren't interested in
                         * anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_remove_from_write_list(s);
                source_memory_pressure_unregister(s);
                break;

        default:
                assert_not_reached();
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        if (!s->floating)
                sd_event_unref(event);
}

static sd_event_source* source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                s->io.fd = safe_close(s->io.fd);

        if (s->type == SOURCE_CHILD) {
                /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */

                if (s->child.process_owned) {

                        if (!s->child.exited) {
                                bool sent = false;

                                if (s->child.pidfd >= 0) {
                                        if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
                                                if (errno == ESRCH) /* Already dead */
                                                        sent = true;
                                                else if (!ERRNO_IS_NOT_SUPPORTED(errno))
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
                                                                        s->child.pid);
                                        } else
                                                sent = true;
                                }

                                if (!sent)
                                        if (kill(s->child.pid, SIGKILL) < 0)
                                                if (errno != ESRCH) /* Already dead */
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
                                                                        s->child.pid);
                        }

                        if (!s->child.waited) {
                                siginfo_t si = {};

                                /* Reap the child if we can */
                                (void) waitid(P_PID, s->child.pid, &si, WEXITED);
                        }
                }

                if (s->child.pidfd_owned)
                        s->child.pidfd = safe_close(s->child.pidfd);
        }

        if (s->type == SOURCE_MEMORY_PRESSURE) {
                s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
                s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
        }

        if (s->destroy_callback)
                s->destroy_callback(s->userdata);

        free(s->description);
        return mfree(s);
}
DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);

static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_reshuffle(s);

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        if (s->type == SOURCE_INOTIFY) {

                assert(s->inotify.inode_data);
                assert(s->inotify.inode_data->inotify_data);

                if (b)
                        s->inotify.inode_data->inotify_data->n_pending ++;
                else {
                        assert(s->inotify.inode_data->inotify_data->n_pending > 0);
                        s->inotify.inode_data->inotify_data->n_pending --;
                }
        }

        return 1;
}

static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {

        /* Let's allocate exactly what we need. Note that the difference of the smallest event source
         * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
         * lines. */
        static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
                [SOURCE_IO] = endoffsetof_field(sd_event_source, io),
                [SOURCE_TIME_REALTIME] = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME] = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_MONOTONIC] = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_SIGNAL] = endoffsetof_field(sd_event_source, signal),
                [SOURCE_CHILD] = endoffsetof_field(sd_event_source, child),
                [SOURCE_DEFER] = endoffsetof_field(sd_event_source, defer),
                [SOURCE_POST] = endoffsetof_field(sd_event_source, post),
                [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit),
                [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify),
                [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure),
        };

        sd_event_source *s;

        assert(e);
        assert(type >= 0);
        assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
        assert(size_table[type] > 0);

        /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
         * size, even if we only allocate the initial part we need. */
        s = expand_to_usable(malloc0(size_table[type]), sizeof(sd_event_source));
        if (!s)
                return NULL;

        /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
         * than what we allocated here. */
        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = PRIOQ_IDX_NULL;
        s->prepare_index = PRIOQ_IDX_NULL;

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}

static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = io_exit_callback;

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

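/* Illustrative usage sketch (editor addition, not part of the original file): a caller watching a
 * socket fd for readability would typically do something like:
 *
 *     static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             ... read from fd, handle EPOLLHUP/EPOLLERR in revents ...
 *             return 0;
 *     }
 *
 *     r = sd_event_add_io(e, &source, fd, EPOLLIN, on_io, NULL);
 *     if (r < 0)
 *             return r;
 *
 * Passing a NULL callback (handled above) installs io_exit_callback(), i.e. the loop exits when the fd
 * becomes ready. */
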
static void initialize_perturb(sd_event *e) {
        sd_id128_t id = {};

        /* When we sleep for longer, we try to realign the wakeup to the same time within each
         * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
         * wakeup. However, let's take some system-specific randomness for this value, so that in a network
         * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
         * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
                e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
        else
                e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
}

LP
1278static int event_setup_timer_fd(
1279 sd_event *e,
6a0f1f6d
LP
1280 struct clock_data *d,
1281 clockid_t clock) {
fd38203a 1282
fd38203a 1283 assert(e);
6a0f1f6d 1284 assert(d);
fd38203a 1285
6a0f1f6d 1286 if (_likely_(d->fd >= 0))
fd38203a
LP
1287 return 0;
1288
254d1313 1289 _cleanup_close_ int fd = -EBADF;
b44d87e2 1290
6a0f1f6d 1291 fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
fd38203a
LP
1292 if (fd < 0)
1293 return -errno;
1294
7fe2903c
LP
1295 fd = fd_move_above_stdio(fd);
1296
1eac7948 1297 struct epoll_event ev = {
a82f89aa
LP
1298 .events = EPOLLIN,
1299 .data.ptr = d,
1300 };
fd38203a 1301
15c689d7 1302 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
fd38203a 1303 return -errno;
fd38203a 1304
b44d87e2 1305 d->fd = TAKE_FD(fd);
fd38203a
LP
1306 return 0;
1307}
1308
c4f1aff2
TG
1309static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1310 assert(s);
1311
1312 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1313}
1314
41c63f36
LP
1315static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1316 int r;
1317
1318 assert(d);
1319
1320 if (d->fd < 0) {
1321 r = event_setup_timer_fd(e, d, clock);
1322 if (r < 0)
1323 return r;
1324 }
1325
1326 r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1327 if (r < 0)
1328 return r;
1329
1330 r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1331 if (r < 0)
1332 return r;
1333
1334 return 0;
1335}
1336
1e45e3fe
LP
1337static int event_source_time_prioq_put(
1338 sd_event_source *s,
1339 struct clock_data *d) {
1340
1341 int r;
1342
1343 assert(s);
1344 assert(d);
19947509 1345 assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1e45e3fe 1346
f41315fc 1347 r = prioq_put(d->earliest, s, &s->earliest_index);
1e45e3fe
LP
1348 if (r < 0)
1349 return r;
1350
f41315fc 1351 r = prioq_put(d->latest, s, &s->latest_index);
1e45e3fe 1352 if (r < 0) {
f41315fc
LP
1353 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1354 s->earliest_index = PRIOQ_IDX_NULL;
1e45e3fe
LP
1355 return r;
1356 }
1357
1358 d->needs_rearm = true;
1359 return 0;
1360}
1361
6a0f1f6d 1362_public_ int sd_event_add_time(
fd38203a 1363 sd_event *e,
151b9b96 1364 sd_event_source **ret,
6a0f1f6d 1365 clockid_t clock,
fd38203a 1366 uint64_t usec,
c2ba3ad6 1367 uint64_t accuracy,
718db961 1368 sd_event_time_handler_t callback,
151b9b96 1369 void *userdata) {
fd38203a 1370
6a0f1f6d 1371 EventSourceType type;
ec766a51 1372 _cleanup_(source_freep) sd_event_source *s = NULL;
6a0f1f6d 1373 struct clock_data *d;
fd38203a
LP
1374 int r;
1375
305f78bf 1376 assert_return(e, -EINVAL);
b937d761 1377 assert_return(e = event_resolve(e), -ENOPKG);
f5fbe71d 1378 assert_return(accuracy != UINT64_MAX, -EINVAL);
da7e457c 1379 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 1380 assert_return(!event_pid_changed(e), -ECHILD);
fd38203a 1381
e475d10c
LP
1382 if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1383 return -EOPNOTSUPP;
1384
1385 type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1386 if (type < 0)
3411372e
LP
1387 return -EOPNOTSUPP;
1388
c4f1aff2
TG
1389 if (!callback)
1390 callback = time_exit_callback;
1391
1e45e3fe 1392 assert_se(d = event_get_clock_data(e, type));
c2ba3ad6 1393
41c63f36 1394 r = setup_clock_data(e, d, clock);
c983e776
EV
1395 if (r < 0)
1396 return r;
fd38203a 1397
a71fe8b8 1398 s = source_new(e, !ret, type);
fd38203a
LP
1399 if (!s)
1400 return -ENOMEM;
1401
1402 s->time.next = usec;
c2ba3ad6 1403 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
fd38203a 1404 s->time.callback = callback;
f41315fc 1405 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
fd38203a 1406 s->userdata = userdata;
baf76283 1407 s->enabled = SD_EVENT_ONESHOT;
fd38203a 1408
1e45e3fe 1409 r = event_source_time_prioq_put(s, d);
c2ba3ad6 1410 if (r < 0)
ec766a51 1411 return r;
fd38203a 1412
a71fe8b8
LP
1413 if (ret)
1414 *ret = s;
ec766a51 1415 TAKE_PTR(s);
a71fe8b8 1416
fd38203a
LP
1417 return 0;
1418}
1419
d6a83dc4
LP
1420_public_ int sd_event_add_time_relative(
1421 sd_event *e,
1422 sd_event_source **ret,
1423 clockid_t clock,
1424 uint64_t usec,
1425 uint64_t accuracy,
1426 sd_event_time_handler_t callback,
1427 void *userdata) {
1428
1429 usec_t t;
1430 int r;
1431
1432 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1433 * checks for overflow. */
1434
1435 r = sd_event_now(e, clock, &t);
1436 if (r < 0)
1437 return r;
1438
1439 if (usec >= USEC_INFINITY - t)
1440 return -EOVERFLOW;
1441
1442 return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1443}
1444
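/* Illustrative usage sketch (editor addition, not part of the original file): arming a one-shot timer
 * five seconds from now on the monotonic clock, with the default accuracy (250ms, see
 * DEFAULT_ACCURACY_USEC), could look like:
 *
 *     static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *             ... do the work; optionally re-arm via sd_event_source_set_time() ...
 *             return 0;
 *     }
 *
 *     r = sd_event_add_time_relative(e, &timer, CLOCK_MONOTONIC, 5 * USEC_PER_SEC, 0, on_timer, NULL);
 *     if (r < 0)
 *             return r;
 */
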
static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        sigset_t new_ss;
        bool block_it;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Let's make sure our special flag stays outside of the valid signal range */
        assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);

        if (sig & SD_EVENT_SIGNAL_PROCMASK) {
                sig &= ~SD_EVENT_SIGNAL_PROCMASK;
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                block_it = true;
        } else {
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                r = signal_is_blocked(sig);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;

                block_it = false;
        }

        if (!callback)
                callback = signal_exit_callback;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        if (block_it) {
                sigset_t old_ss;

                if (sigemptyset(&new_ss) < 0)
                        return -errno;

                if (sigaddset(&new_ss, sig) < 0)
                        return -errno;

                r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
                if (r != 0)
                        return -r;

                r = sigismember(&old_ss, sig);
                if (r < 0)
                        return -errno;

                s->signal.unblock = !r;
        } else
                s->signal.unblock = false;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                if (s->signal.unblock)
                        (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);

                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

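/* Illustrative usage sketch (editor addition, not part of the original file): ORing the signal number
 * with SD_EVENT_SIGNAL_PROCMASK asks the loop to block the signal itself; combined with a NULL callback
 * (which maps to signal_exit_callback() above) this yields the common "exit cleanly on SIGTERM" pattern:
 *
 *     r = sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
 *     if (r < 0)
 *             return r;
 *
 * Without the flag the caller must have blocked the signal beforehand, otherwise -EBUSY is returned. */
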
static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static bool shall_use_pidfd(void) {
        /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
        return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
}

_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd and because we don't want to reap the child processes
                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
                 * take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
         * pin the PID, and make regular waitid() handling race-free. */

        if (shall_use_pidfd()) {
                s->child.pidfd = pidfd_open(pid, 0);
                if (s->child.pidfd < 0) {
                        /* Propagate errors unless the syscall is not supported or blocked */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
        } else
                s->child.pidfd = -EBADF;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We have a pidfd and we only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We have no pidfd or we shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. */
        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

_public_ int sd_event_add_child_pidfd(
                sd_event *e,
                sd_event_source **ret,
                int pidfd,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        pid_t pid;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pidfd >= 0, -EBADF);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        r = pidfd_get_pid(pidfd, &pid);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.pidfd = pidfd;
        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for WEXITED */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;
        } else {
                /* We shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

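/* Illustrative usage sketch (editor addition, not part of the original file): the caller blocks SIGCHLD
 * before forking and then registers the child, e.g.:
 *
 *     sigset_t ss;
 *     sigemptyset(&ss);
 *     sigaddset(&ss, SIGCHLD);
 *     pthread_sigmask(SIG_BLOCK, &ss, NULL);          (required, see the signal_is_blocked() check above)
 *
 *     pid_t pid = fork();
 *     if (pid == 0)
 *             _exit(do_child_work());
 *
 *     r = sd_event_add_child(e, &child_source, pid, WEXITED, on_child, NULL);
 *     if (r < 0)
 *             return r;
 *
 * The handler gets the siginfo_t from waitid(); the source is created as SD_EVENT_ONESHOT. */
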
static int generic_exit_callback(sd_event_source *s, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_defer(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_DEFER);
        if (!s)
                return -ENOMEM;

        s->defer.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = source_set_pending(s, true);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

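/* Illustrative note (editor addition, not part of the original file): a defer source is marked pending
 * right away, so its callback runs on the next loop iteration; since it is created as SD_EVENT_ONESHOT
 * it is disabled again after dispatching unless re-enabled. A common "run this once from the event
 * loop" pattern is therefore:
 *
 *     r = sd_event_add_defer(e, NULL, do_deferred_work, userdata);
 */
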
6e9feda3
LP
1780_public_ int sd_event_add_post(
1781 sd_event *e,
1782 sd_event_source **ret,
1783 sd_event_handler_t callback,
1784 void *userdata) {
1785
ec766a51 1786 _cleanup_(source_freep) sd_event_source *s = NULL;
6e9feda3
LP
1787 int r;
1788
1789 assert_return(e, -EINVAL);
b937d761 1790 assert_return(e = event_resolve(e), -ENOPKG);
6e9feda3
LP
1791 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1792 assert_return(!event_pid_changed(e), -ECHILD);
1793
b9350e70
LP
1794 if (!callback)
1795 callback = generic_exit_callback;
1796
a71fe8b8 1797 s = source_new(e, !ret, SOURCE_POST);
6e9feda3
LP
1798 if (!s)
1799 return -ENOMEM;
1800
1801 s->post.callback = callback;
1802 s->userdata = userdata;
1803 s->enabled = SD_EVENT_ON;
1804
de7fef4b 1805 r = set_ensure_put(&e->post_sources, NULL, s);
ec766a51 1806 if (r < 0)
6e9feda3 1807 return r;
de7fef4b 1808 assert(r > 0);
6e9feda3 1809
a71fe8b8
LP
1810 if (ret)
1811 *ret = s;
ec766a51 1812 TAKE_PTR(s);
a71fe8b8 1813
6e9feda3
LP
1814 return 0;
1815}
1816
6203e07a 1817_public_ int sd_event_add_exit(
305f78bf 1818 sd_event *e,
151b9b96 1819 sd_event_source **ret,
718db961 1820 sd_event_handler_t callback,
151b9b96 1821 void *userdata) {
305f78bf 1822
ec766a51 1823 _cleanup_(source_freep) sd_event_source *s = NULL;
da7e457c
LP
1824 int r;
1825
1826 assert_return(e, -EINVAL);
b937d761 1827 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c
LP
1828 assert_return(callback, -EINVAL);
1829 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1830 assert_return(!event_pid_changed(e), -ECHILD);
1831
c983e776
EV
1832 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1833 if (r < 0)
1834 return r;
da7e457c 1835
a71fe8b8 1836 s = source_new(e, !ret, SOURCE_EXIT);
fd38203a 1837 if (!s)
da7e457c 1838 return -ENOMEM;
fd38203a 1839
6203e07a 1840 s->exit.callback = callback;
da7e457c 1841 s->userdata = userdata;
6203e07a 1842 s->exit.prioq_index = PRIOQ_IDX_NULL;
baf76283 1843 s->enabled = SD_EVENT_ONESHOT;
da7e457c 1844
6203e07a 1845 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
ec766a51 1846 if (r < 0)
da7e457c 1847 return r;
da7e457c 1848
a71fe8b8
LP
1849 if (ret)
1850 *ret = s;
ec766a51 1851 TAKE_PTR(s);
a71fe8b8 1852
da7e457c
LP
1853 return 0;
1854}
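/* Illustrative usage sketch, kept under #if 0 so it cannot affect the build: exit sources registered
 * with the call above are dispatched once sd_event_exit() has been called, which makes them a natural
 * hook for cleanup work. The example_*() names are hypothetical. */
#if 0
static int example_exit_cleanup(sd_event_source *s, void *userdata) {
        log_debug("Event loop is exiting, releasing resources.");
        return 0;
}

static int example_install_exit_handler(sd_event *e) {
        return sd_event_add_exit(e, /* ret= */ NULL, example_exit_cleanup, /* userdata= */ NULL);
}
#endif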
1855
158fe190
LP
1856int sd_event_trim_memory(void) {
1857 int r;
1858
1859 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1860 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1861 * NULL callback parameter. */
1862
1863 log_debug("Memory pressure event, trimming malloc() memory.");
1864
1865#if HAVE_GENERIC_MALLINFO
1866 generic_mallinfo before_mallinfo = generic_mallinfo_get();
1867#endif
1868
1869 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1870 hashmap_trim_pools();
1871 r = malloc_trim(0);
1872 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1873
1874 if (r > 0)
1875 log_debug("Successfully trimmed some memory.");
1876 else
1877 log_debug("Couldn't trim any memory.");
1878
1879 usec_t period = after_timestamp - before_timestamp;
1880
1881#if HAVE_GENERIC_MALLINFO
1882 generic_mallinfo after_mallinfo = generic_mallinfo_get();
1883 size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1884 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1885 log_struct(LOG_DEBUG,
1886 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1887 FORMAT_TIMESPAN(period, 0),
1888 FORMAT_BYTES(l)),
1889 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1890 "TRIMMED_BYTES=%zu", l,
1891 "TRIMMED_USEC=" USEC_FMT, period);
1892#else
1893 log_struct(LOG_DEBUG,
1894 LOG_MESSAGE("Memory trimming took %s.",
1895 FORMAT_TIMESPAN(period, 0)),
1896 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1897 "TRIMMED_USEC=" USEC_FMT, period);
1898#endif
1899
1900 return 0;
1901}
1902
1903static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1904 assert(s);
1905
1906 sd_event_trim_memory();
1907 return 0;
1908}
1909
1910_public_ int sd_event_add_memory_pressure(
1911 sd_event *e,
1912 sd_event_source **ret,
1913 sd_event_handler_t callback,
1914 void *userdata) {
1915
1916 _cleanup_free_ char *w = NULL;
1917 _cleanup_(source_freep) sd_event_source *s = NULL;
1918 _cleanup_close_ int path_fd = -1, fd = -1;
1919 _cleanup_free_ void *write_buffer = NULL;
40c5d5d2 1920 const char *watch, *watch_fallback = NULL, *env;
158fe190
LP
1921 size_t write_buffer_size = 0;
1922 struct stat st;
1923 uint32_t events;
1924 bool locked;
1925 int r;
1926
1927 assert_return(e, -EINVAL);
1928 assert_return(e = event_resolve(e), -ENOPKG);
1929 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1930 assert_return(!event_pid_changed(e), -ECHILD);
1931
1932 if (!callback)
1933 callback = memory_pressure_callback;
1934
1935 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1936 if (!s)
1937 return -ENOMEM;
1938
1939 s->wakeup = WAKEUP_EVENT_SOURCE;
1940 s->memory_pressure.callback = callback;
1941 s->userdata = userdata;
1942 s->enabled = SD_EVENT_ON;
1943 s->memory_pressure.fd = -EBADF;
1944
1945 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1946 if (env) {
1947 if (isempty(env) || path_equal(env, "/dev/null"))
1948 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1949 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1950
1951 if (!path_is_absolute(env) || !path_is_normalized(env))
1952 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1953 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1954
1955 watch = env;
1956
1957 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1958 if (env) {
1959 r = unbase64mem(env, SIZE_MAX, &write_buffer, &write_buffer_size);
1960 if (r < 0)
1961 return r;
1962 }
1963
1964 locked = true;
1965 } else {
1966
1967 r = is_pressure_supported();
1968 if (r < 0)
1969 return r;
1970 if (r == 0)
1971 return -EOPNOTSUPP;
1972
1973 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1974 * the system-wide pressure if for some reason we cannot (which could be: memory controller
1975 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1976 * only use the system-wide logic. */
1977 r = cg_all_unified();
1978 if (r < 0)
1979 return r;
1980 if (r == 0)
1981 watch = "/proc/pressure/memory";
1982 else {
1983 _cleanup_free_ char *cg = NULL;
1984
1985 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
1986 if (r < 0)
1987 return r;
1988
1989 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
1990 if (!w)
1991 return -ENOMEM;
1992
1993 watch = w;
1994 watch_fallback = "/proc/pressure/memory";
1995 }
1996
1997 /* Android uses three levels in its userspace low memory killer logic:
1998 * some 70000 1000000
1999 * some 100000 1000000
2000 * full 70000 1000000
2001 *
2002 * GNOME's low memory monitor uses:
2003 * some 70000 1000000
2004 * some 100000 1000000
2005 * full 100000 1000000
2006 *
2007 * We'll default to the middle level that both agree on */
2008 if (asprintf((char**) &write_buffer,
2009 "%s " USEC_FMT " " USEC_FMT,
2010 MEMORY_PRESSURE_DEFAULT_TYPE,
2011 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2012 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2013 return -ENOMEM;
2014
2015 write_buffer_size = strlen(write_buffer) + 1;
2016 locked = false;
2017 }
2018
2019 path_fd = open(watch, O_PATH|O_CLOEXEC);
2020 if (path_fd < 0) {
2021 if (errno != ENOENT)
2022 return -errno;
2023
2024 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2025 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2026 * the PSI service apparently is not supported) */
2027 if (!watch_fallback)
2028 return locked ? -ENOENT : -EOPNOTSUPP;
2029
2030 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
63b1e67e
YW
2031 if (path_fd < 0) {
2032 if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2033 return -EOPNOTSUPP;
158fe190 2034 return -errno;
63b1e67e 2035 }
158fe190
LP
2036 }
2037
2038 if (fstat(path_fd, &st) < 0)
2039 return -errno;
2040
2041 if (S_ISSOCK(st.st_mode)) {
2042 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2043 if (fd < 0)
2044 return -errno;
2045
2046 r = connect_unix_path(fd, path_fd, NULL);
2047 if (r < 0)
2048 return r;
2049
2050 events = EPOLLIN;
2051
2052 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2053 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2054 if (fd < 0)
2055 return fd;
2056
2057 if (S_ISREG(st.st_mode)) {
2058 struct statfs sfs;
2059
2060 /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
2061
2062 if (fstatfs(fd, &sfs) < 0)
2063 return -errno;
2064
2065 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2066 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2067 return -ENOTTY;
2068
2069 events = EPOLLPRI;
2070 } else
2071 /* For fifos and char devices just watch for EPOLLIN */
2072 events = EPOLLIN;
2073
2074 } else if (S_ISDIR(st.st_mode))
2075 return -EISDIR;
2076 else
2077 return -EBADF;
2078
2079 s->memory_pressure.fd = TAKE_FD(fd);
2080 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2081 s->memory_pressure.write_buffer_size = write_buffer_size;
2082 s->memory_pressure.events = events;
2083 s->memory_pressure.locked = locked;
2084
2085 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2086 * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2087 * fd with the epoll right away. Instead, we just add the event source to a list of memory pressure
2088 * event sources on which writes must be executed before the first event loop iteration is
2089 * executed. (We could also write the data here, right away, but we want to give the caller the
2090 * freedom to call sd_event_source_set_memory_pressure_type() and
2091 * sd_event_source_set_memory_pressure_rate() before we write it.) */
2092
2093 if (s->memory_pressure.write_buffer_size > 0)
2094 source_memory_pressure_add_to_write_list(s);
2095 else {
2096 r = source_memory_pressure_register(s, s->enabled);
2097 if (r < 0)
2098 return r;
2099 }
2100
2101 if (ret)
2102 *ret = s;
2103 TAKE_PTR(s);
2104
2105 return 0;
2106}
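/* Illustrative usage sketch, kept under #if 0 so it cannot affect the build: passing a NULL callback
 * to sd_event_add_memory_pressure() falls back to memory_pressure_callback() and hence
 * sd_event_trim_memory(). The wrapper name example_enable_memory_pressure() is hypothetical. */
#if 0
static int example_enable_memory_pressure(sd_event *e) {
        int r;

        r = sd_event_add_memory_pressure(e, /* ret= */ NULL, /* callback= */ NULL, /* userdata= */ NULL);
        if (r == -EOPNOTSUPP || r == -EHOSTDOWN)
                return 0; /* PSI not available, or explicitly disabled via $MEMORY_PRESSURE_WATCH */

        return r;
}
#endif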
2107
97ef5391
LP
2108static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2109 assert(e);
2110
2111 if (!d)
2112 return;
2113
2114 assert(hashmap_isempty(d->inodes));
2115 assert(hashmap_isempty(d->wd));
2116
2117 if (d->buffer_filled > 0)
0601b958 2118 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
2119
2120 hashmap_free(d->inodes);
2121 hashmap_free(d->wd);
2122
2123 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2124
2125 if (d->fd >= 0) {
fbae5090
YW
2126 if (!event_pid_changed(e) &&
2127 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
97ef5391
LP
2128 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2129
2130 safe_close(d->fd);
2131 }
2132 free(d);
2133}
2134
2135static int event_make_inotify_data(
2136 sd_event *e,
2137 int64_t priority,
2138 struct inotify_data **ret) {
2139
254d1313 2140 _cleanup_close_ int fd = -EBADF;
97ef5391 2141 struct inotify_data *d;
97ef5391
LP
2142 int r;
2143
2144 assert(e);
2145
2146 d = hashmap_get(e->inotify_data, &priority);
2147 if (d) {
2148 if (ret)
2149 *ret = d;
2150 return 0;
2151 }
2152
2153 fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC);
2154 if (fd < 0)
2155 return -errno;
2156
2157 fd = fd_move_above_stdio(fd);
2158
97ef5391
LP
2159 d = new(struct inotify_data, 1);
2160 if (!d)
2161 return -ENOMEM;
2162
2163 *d = (struct inotify_data) {
2164 .wakeup = WAKEUP_INOTIFY_DATA,
2165 .fd = TAKE_FD(fd),
2166 .priority = priority,
2167 };
2168
c2484a75 2169 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
97ef5391
LP
2170 if (r < 0) {
2171 d->fd = safe_close(d->fd);
2172 free(d);
2173 return r;
2174 }
2175
1eac7948 2176 struct epoll_event ev = {
97ef5391
LP
2177 .events = EPOLLIN,
2178 .data.ptr = d,
2179 };
2180
2181 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2182 r = -errno;
2183 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2184 * remove the fd from the epoll first, which we don't want as we couldn't
2185 * add it in the first place. */
2186 event_free_inotify_data(e, d);
2187 return r;
2188 }
2189
2190 if (ret)
2191 *ret = d;
2192
2193 return 1;
2194}
2195
7a08d314 2196static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
90c88092 2197 int r;
97ef5391
LP
2198
2199 assert(x);
2200 assert(y);
2201
90c88092
YW
2202 r = CMP(x->dev, y->dev);
2203 if (r != 0)
2204 return r;
97ef5391 2205
6dd91b36 2206 return CMP(x->ino, y->ino);
97ef5391
LP
2207}
2208
7a08d314
YW
2209static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2210 assert(d);
97ef5391
LP
2211
2212 siphash24_compress(&d->dev, sizeof(d->dev), state);
2213 siphash24_compress(&d->ino, sizeof(d->ino), state);
2214}
2215
7a08d314 2216DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
97ef5391
LP
2217
2218static void event_free_inode_data(
2219 sd_event *e,
2220 struct inode_data *d) {
2221
2222 assert(e);
2223
2224 if (!d)
2225 return;
2226
64903d18 2227 assert(!d->event_sources);
97ef5391
LP
2228
2229 if (d->fd >= 0) {
ed828563 2230 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
97ef5391
LP
2231 safe_close(d->fd);
2232 }
2233
2234 if (d->inotify_data) {
2235
2236 if (d->wd >= 0) {
fbae5090 2237 if (d->inotify_data->fd >= 0 && !event_pid_changed(e)) {
97ef5391
LP
2238 /* So here's a problem. At the time this runs the watch descriptor might already be
2239 * invalidated, because an IN_IGNORED event might be queued right at the moment we enter
2240 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since this is quite
2241 * likely to happen. */
2242
2243 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2244 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2245 }
2246
2247 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2248 }
2249
2250 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2251 }
2252
2253 free(d);
2254}
2255
53baf2ef
LP
2256static void event_gc_inotify_data(
2257 sd_event *e,
2258 struct inotify_data *d) {
2259
2260 assert(e);
2261
2262 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2263 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2264 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2265 * (under the expectation that the GC is called again once the counter is decremented). */
2266
2267 if (!d)
2268 return;
2269
2270 if (!hashmap_isempty(d->inodes))
2271 return;
2272
2273 if (d->n_busy > 0)
2274 return;
2275
2276 event_free_inotify_data(e, d);
2277}
2278
97ef5391
LP
2279static void event_gc_inode_data(
2280 sd_event *e,
2281 struct inode_data *d) {
2282
2283 struct inotify_data *inotify_data;
2284
2285 assert(e);
2286
2287 if (!d)
2288 return;
2289
64903d18 2290 if (d->event_sources)
97ef5391
LP
2291 return;
2292
2293 inotify_data = d->inotify_data;
2294 event_free_inode_data(e, d);
2295
53baf2ef 2296 event_gc_inotify_data(e, inotify_data);
97ef5391
LP
2297}
2298
2299static int event_make_inode_data(
2300 sd_event *e,
2301 struct inotify_data *inotify_data,
2302 dev_t dev,
2303 ino_t ino,
2304 struct inode_data **ret) {
2305
2306 struct inode_data *d, key;
2307 int r;
2308
2309 assert(e);
2310 assert(inotify_data);
2311
2312 key = (struct inode_data) {
2313 .ino = ino,
2314 .dev = dev,
2315 };
2316
2317 d = hashmap_get(inotify_data->inodes, &key);
2318 if (d) {
2319 if (ret)
2320 *ret = d;
2321
2322 return 0;
2323 }
2324
2325 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2326 if (r < 0)
2327 return r;
2328
2329 d = new(struct inode_data, 1);
2330 if (!d)
2331 return -ENOMEM;
2332
2333 *d = (struct inode_data) {
2334 .dev = dev,
2335 .ino = ino,
2336 .wd = -1,
254d1313 2337 .fd = -EBADF,
97ef5391
LP
2338 .inotify_data = inotify_data,
2339 };
2340
2341 r = hashmap_put(inotify_data->inodes, d, d);
2342 if (r < 0) {
2343 free(d);
2344 return r;
2345 }
2346
2347 if (ret)
2348 *ret = d;
2349
2350 return 1;
2351}
2352
2353static uint32_t inode_data_determine_mask(struct inode_data *d) {
2354 bool excl_unlink = true;
2355 uint32_t combined = 0;
97ef5391
LP
2356
2357 assert(d);
2358
2359 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2360 * the IN_EXCL_UNLINK flag is ANDed instead.
2361 *
2362 * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or oneshot. That's
2363 * because we cannot change the mask anymore after the event source has been created, since the kernel has no
f21f31b2 2364 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
97ef5391
LP
2365 * events we don't care for client-side. */
2366
2367 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2368
2369 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2370 excl_unlink = false;
2371
2372 combined |= s->inotify.mask;
2373 }
2374
2375 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2376}
2377
2378static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2379 uint32_t combined_mask;
2380 int wd, r;
2381
2382 assert(d);
2383 assert(d->fd >= 0);
2384
2385 combined_mask = inode_data_determine_mask(d);
2386
2387 if (d->wd >= 0 && combined_mask == d->combined_mask)
2388 return 0;
2389
2390 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2391 if (r < 0)
2392 return r;
2393
2394 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2395 if (wd < 0)
2396 return -errno;
2397
2398 if (d->wd < 0) {
2399 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2400 if (r < 0) {
2401 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2402 return r;
2403 }
2404
2405 d->wd = wd;
2406
2407 } else if (d->wd != wd) {
2408
2409 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2410 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2411 return -EINVAL;
2412 }
2413
2414 d->combined_mask = combined_mask;
2415 return 1;
2416}
2417
b9350e70
LP
2418static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2419 assert(s);
2420
2421 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2422}
2423
e67d738a 2424static int event_add_inotify_fd_internal(
97ef5391
LP
2425 sd_event *e,
2426 sd_event_source **ret,
e67d738a
LP
2427 int fd,
2428 bool donate,
97ef5391
LP
2429 uint32_t mask,
2430 sd_event_inotify_handler_t callback,
2431 void *userdata) {
2432
5bb1d7fb 2433 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
e67d738a 2434 _cleanup_(source_freep) sd_event_source *s = NULL;
97ef5391
LP
2435 struct inotify_data *inotify_data = NULL;
2436 struct inode_data *inode_data = NULL;
97ef5391
LP
2437 struct stat st;
2438 int r;
2439
2440 assert_return(e, -EINVAL);
2441 assert_return(e = event_resolve(e), -ENOPKG);
e67d738a 2442 assert_return(fd >= 0, -EBADF);
97ef5391
LP
2443 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2444 assert_return(!event_pid_changed(e), -ECHILD);
2445
b9350e70
LP
2446 if (!callback)
2447 callback = inotify_exit_callback;
2448
97ef5391
LP
2449 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2450 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
2451 * the user can't use them for us. */
2452 if (mask & IN_MASK_ADD)
2453 return -EINVAL;
2454
97ef5391
LP
2455 if (fstat(fd, &st) < 0)
2456 return -errno;
2457
2458 s = source_new(e, !ret, SOURCE_INOTIFY);
2459 if (!s)
2460 return -ENOMEM;
2461
2462 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2463 s->inotify.mask = mask;
2464 s->inotify.callback = callback;
2465 s->userdata = userdata;
2466
2467 /* Allocate an inotify object for this priority, and an inode object within it */
2468 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2469 if (r < 0)
8c75fe17 2470 return r;
97ef5391
LP
2471
2472 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
8c75fe17 2473 if (r < 0) {
e67d738a 2474 event_gc_inotify_data(e, inotify_data);
8c75fe17
ZJS
2475 return r;
2476 }
97ef5391
LP
2477
2478 /* Keep the O_PATH fd around until the first iteration of the loop, so that until then we can still change the
2479 * priority of the event source, for which we need the original inode. */
2480 if (inode_data->fd < 0) {
e67d738a
LP
2481 if (donated_fd >= 0)
2482 inode_data->fd = TAKE_FD(donated_fd);
2483 else {
2484 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2485 if (inode_data->fd < 0) {
2486 r = -errno;
2487 event_gc_inode_data(e, inode_data);
2488 return r;
2489 }
2490 }
2491
ed828563 2492 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
97ef5391
LP
2493 }
2494
2495 /* Link our event source to the inode data object */
2496 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2497 s->inotify.inode_data = inode_data;
2498
97ef5391
LP
2499 /* Actually realize the watch now */
2500 r = inode_data_realize_watch(e, inode_data);
2501 if (r < 0)
8c75fe17 2502 return r;
97ef5391 2503
97ef5391
LP
2504 if (ret)
2505 *ret = s;
8c75fe17 2506 TAKE_PTR(s);
97ef5391
LP
2507
2508 return 0;
97ef5391
LP
2509}
2510
e67d738a
LP
2511_public_ int sd_event_add_inotify_fd(
2512 sd_event *e,
2513 sd_event_source **ret,
2514 int fd,
2515 uint32_t mask,
2516 sd_event_inotify_handler_t callback,
2517 void *userdata) {
2518
2519 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2520}
2521
2522_public_ int sd_event_add_inotify(
2523 sd_event *e,
2524 sd_event_source **ret,
2525 const char *path,
2526 uint32_t mask,
2527 sd_event_inotify_handler_t callback,
2528 void *userdata) {
2529
2091c779 2530 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
e67d738a
LP
2531 int fd, r;
2532
2533 assert_return(path, -EINVAL);
2534
586c8cee
ZJS
2535 fd = open(path, O_PATH | O_CLOEXEC |
2536 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2537 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
e67d738a
LP
2538 if (fd < 0)
2539 return -errno;
2540
2541 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2542 if (r < 0)
2543 return r;
2544
2545 (void) sd_event_source_set_description(s, path);
2546
2547 if (ret)
2548 *ret = s;
2549
2550 return r;
2551}
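/* Illustrative usage sketch, kept under #if 0 so it cannot affect the build: watches a directory for
 * created/removed entries via the path-based wrapper above. The path "/run/example" and the
 * example_*() names are hypothetical. */
#if 0
static int example_on_inotify(sd_event_source *s, const struct inotify_event *event, void *userdata) {
        log_debug("inotify event, mask=0x%x, name=%s",
                  (unsigned) event->mask,
                  event->len > 0 ? event->name : "(none)");
        return 0;
}

static int example_watch_directory(sd_event *e, sd_event_source **ret) {
        return sd_event_add_inotify(e, ret, "/run/example",
                                    IN_CREATE|IN_DELETE|IN_ONLYDIR,
                                    example_on_inotify, /* userdata= */ NULL);
}
#endif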
2552
8301aa0b 2553static sd_event_source* event_source_free(sd_event_source *s) {
6680dd6b
LP
2554 if (!s)
2555 return NULL;
da7e457c 2556
8301aa0b
YW
2557 /* Here's a special hack: when we are called from a
2558 * dispatch handler we won't free the event source
2559 * immediately, but we will detach the fd from the
2560 * epoll. This way it is safe for the caller to unref
2561 * the event source and immediately close the fd, but
2562 * we still retain a valid event source object after
2563 * the callback. */
fd38203a 2564
76d04c3a 2565 if (s->dispatching)
8301aa0b 2566 source_disconnect(s);
76d04c3a 2567 else
8301aa0b 2568 source_free(s);
fd38203a
LP
2569
2570 return NULL;
2571}
2572
8301aa0b
YW
2573DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2574
356779df 2575_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
f7f53e9e 2576 assert_return(s, -EINVAL);
f4b2933e 2577 assert_return(!event_pid_changed(s->event), -ECHILD);
f7f53e9e 2578
356779df 2579 return free_and_strdup(&s->description, description);
f7f53e9e
TG
2580}
2581
356779df 2582_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
f7f53e9e 2583 assert_return(s, -EINVAL);
356779df 2584 assert_return(description, -EINVAL);
f4b2933e 2585 assert_return(!event_pid_changed(s->event), -ECHILD);
f7f53e9e 2586
7d92a1a4
ZJS
2587 if (!s->description)
2588 return -ENXIO;
2589
356779df 2590 *description = s->description;
f7f53e9e
TG
2591 return 0;
2592}
2593
adcc4ca3 2594_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
305f78bf 2595 assert_return(s, NULL);
eaa3cbef
LP
2596
2597 return s->event;
2598}
2599
f7262a9f 2600_public_ int sd_event_source_get_pending(sd_event_source *s) {
305f78bf 2601 assert_return(s, -EINVAL);
6203e07a 2602 assert_return(s->type != SOURCE_EXIT, -EDOM);
da7e457c 2603 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2604 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2605
2606 return s->pending;
2607}
2608
f7262a9f 2609_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
305f78bf
LP
2610 assert_return(s, -EINVAL);
2611 assert_return(s->type == SOURCE_IO, -EDOM);
2612 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2613
2614 return s->io.fd;
2615}
2616
30caf8f3
LP
2617_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2618 int r;
2619
2620 assert_return(s, -EINVAL);
8ac43fee 2621 assert_return(fd >= 0, -EBADF);
30caf8f3
LP
2622 assert_return(s->type == SOURCE_IO, -EDOM);
2623 assert_return(!event_pid_changed(s->event), -ECHILD);
2624
2625 if (s->io.fd == fd)
2626 return 0;
2627
b6d5481b 2628 if (event_source_is_offline(s)) {
30caf8f3
LP
2629 s->io.fd = fd;
2630 s->io.registered = false;
2631 } else {
2632 int saved_fd;
2633
2634 saved_fd = s->io.fd;
2635 assert(s->io.registered);
2636
2637 s->io.fd = fd;
2638 s->io.registered = false;
2639
2640 r = source_io_register(s, s->enabled, s->io.events);
2641 if (r < 0) {
2642 s->io.fd = saved_fd;
2643 s->io.registered = true;
2644 return r;
2645 }
2646
5a795bff 2647 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
30caf8f3
LP
2648 }
2649
2650 return 0;
2651}
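/* Illustrative usage sketch, kept under #if 0 so it cannot affect the build: swapping in a new fd
 * (e.g. after reconnecting a socket) while keeping the event source, its priority and its callback.
 * Note that with the implementation above the old fd is only removed from the epoll, not closed, so
 * the sketch closes it explicitly. example_swap_connection_fd() is hypothetical. */
#if 0
static int example_swap_connection_fd(sd_event_source *s, int new_fd) {
        int old_fd, r;

        old_fd = sd_event_source_get_io_fd(s);
        if (old_fd < 0)
                return old_fd;

        r = sd_event_source_set_io_fd(s, new_fd);
        if (r < 0)
                return r;

        safe_close(old_fd);
        return 0;
}
#endif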
2652
ab93297c
NM
2653_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2654 assert_return(s, -EINVAL);
2655 assert_return(s->type == SOURCE_IO, -EDOM);
2656
2657 return s->io.owned;
2658}
2659
2660_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2661 assert_return(s, -EINVAL);
2662 assert_return(s->type == SOURCE_IO, -EDOM);
2663
2664 s->io.owned = own;
2665 return 0;
2666}
2667
f7262a9f 2668_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
305f78bf
LP
2669 assert_return(s, -EINVAL);
2670 assert_return(events, -EINVAL);
2671 assert_return(s->type == SOURCE_IO, -EDOM);
2672 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2673
2674 *events = s->io.events;
2675 return 0;
2676}
2677
f7262a9f 2678_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
fd38203a
LP
2679 int r;
2680
305f78bf
LP
2681 assert_return(s, -EINVAL);
2682 assert_return(s->type == SOURCE_IO, -EDOM);
2a16a986 2683 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
da7e457c 2684 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2685 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2686
b63c8d4f
DH
2687 /* edge-triggered updates are never skipped, so we can reset edges */
2688 if (s->io.events == events && !(events & EPOLLET))
fd38203a
LP
2689 return 0;
2690
2a0dc6cd
LP
2691 r = source_set_pending(s, false);
2692 if (r < 0)
2693 return r;
2694
b6d5481b 2695 if (event_source_is_online(s)) {
e4715127 2696 r = source_io_register(s, s->enabled, events);
fd38203a
LP
2697 if (r < 0)
2698 return r;
2699 }
2700
2701 s->io.events = events;
2702
2703 return 0;
2704}
2705
f7262a9f 2706_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
305f78bf
LP
2707 assert_return(s, -EINVAL);
2708 assert_return(revents, -EINVAL);
2709 assert_return(s->type == SOURCE_IO, -EDOM);
2710 assert_return(s->pending, -ENODATA);
2711 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2712
2713 *revents = s->io.revents;
2714 return 0;
2715}
2716
f7262a9f 2717_public_ int sd_event_source_get_signal(sd_event_source *s) {
305f78bf
LP
2718 assert_return(s, -EINVAL);
2719 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2720 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2721
2722 return s->signal.sig;
2723}
2724
31927c16 2725_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
305f78bf
LP
2726 assert_return(s, -EINVAL);
2727 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2728
6680b8d1
ME
2729 *priority = s->priority;
2730 return 0;
fd38203a
LP
2731}
2732
31927c16 2733_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
97ef5391
LP
2734 bool rm_inotify = false, rm_inode = false;
2735 struct inotify_data *new_inotify_data = NULL;
2736 struct inode_data *new_inode_data = NULL;
9da4cb2b
LP
2737 int r;
2738
305f78bf 2739 assert_return(s, -EINVAL);
da7e457c 2740 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2741 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2742
2743 if (s->priority == priority)
2744 return 0;
2745
97ef5391
LP
2746 if (s->type == SOURCE_INOTIFY) {
2747 struct inode_data *old_inode_data;
2748
2749 assert(s->inotify.inode_data);
2750 old_inode_data = s->inotify.inode_data;
2751
2752 /* We need the original fd to change the priority. If we don't have it, we can't change the priority
2753 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2754 * events we allow priority changes only until the first following iteration. */
2755 if (old_inode_data->fd < 0)
2756 return -EOPNOTSUPP;
2757
2758 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2759 if (r < 0)
2760 return r;
2761 rm_inotify = r > 0;
2762
2763 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2764 if (r < 0)
2765 goto fail;
2766 rm_inode = r > 0;
2767
2768 if (new_inode_data->fd < 0) {
2769 /* Duplicate the fd for the new inode object if we don't have any yet */
2770 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2771 if (new_inode_data->fd < 0) {
2772 r = -errno;
2773 goto fail;
2774 }
2775
ed828563 2776 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
97ef5391
LP
2777 }
2778
2779 /* Move the event source to the new inode data structure */
2780 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2781 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2782 s->inotify.inode_data = new_inode_data;
2783
2784 /* Now create the new watch */
2785 r = inode_data_realize_watch(s->event, new_inode_data);
2786 if (r < 0) {
2787 /* Move it back */
2788 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2789 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2790 s->inotify.inode_data = old_inode_data;
2791 goto fail;
2792 }
2793
2794 s->priority = priority;
2795
2796 event_gc_inode_data(s->event, old_inode_data);
2797
b6d5481b 2798 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
9da4cb2b
LP
2799 struct signal_data *old, *d;
2800
2801 /* Move us from the signalfd belonging to the old
2802 * priority to the signalfd of the new priority */
2803
2804 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2805
2806 s->priority = priority;
2807
2808 r = event_make_signal_data(s->event, s->signal.sig, &d);
2809 if (r < 0) {
2810 s->priority = old->priority;
2811 return r;
2812 }
2813
2814 event_unmask_signal_data(s->event, old, s->signal.sig);
2815 } else
2816 s->priority = priority;
fd38203a 2817
e1951c16 2818 event_source_pp_prioq_reshuffle(s);
fd38203a 2819
6203e07a
LP
2820 if (s->type == SOURCE_EXIT)
2821 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
305f78bf 2822
fd38203a 2823 return 0;
97ef5391
LP
2824
2825fail:
2826 if (rm_inode)
2827 event_free_inode_data(s->event, new_inode_data);
2828
2829 if (rm_inotify)
2830 event_free_inotify_data(s->event, new_inotify_data);
2831
2832 return r;
fd38203a
LP
2833}
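/* Illustrative usage sketch, kept under #if 0 so it cannot affect the build: smaller priority values
 * are dispatched first, so bumping a source to SD_EVENT_PRIORITY_IMPORTANT makes it run before
 * sources left at SD_EVENT_PRIORITY_NORMAL. example_make_important() is hypothetical. */
#if 0
static int example_make_important(sd_event_source *s) {
        return sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IMPORTANT);
}
#endif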
2834
cad143a8 2835_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
71193c0b
ZJS
2836 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2837 if (!s && !ret)
2838 return false;
2839
305f78bf 2840 assert_return(s, -EINVAL);
305f78bf 2841 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2842
cad143a8
LP
2843 if (ret)
2844 *ret = s->enabled;
2845
08c1eb0e 2846 return s->enabled != SD_EVENT_OFF;
fd38203a
LP
2847}
2848
b6d5481b
LP
2849static int event_source_offline(
2850 sd_event_source *s,
2851 int enabled,
2852 bool ratelimited) {
2853
2854 bool was_offline;
fd38203a
LP
2855 int r;
2856
ddfde737 2857 assert(s);
b6d5481b 2858 assert(enabled == SD_EVENT_OFF || ratelimited);
fd38203a 2859
ddfde737 2860 /* Unset the pending flag when this event source is disabled */
b6d5481b
LP
2861 if (s->enabled != SD_EVENT_OFF &&
2862 enabled == SD_EVENT_OFF &&
2863 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
ddfde737
LP
2864 r = source_set_pending(s, false);
2865 if (r < 0)
2866 return r;
2867 }
cc567911 2868
b6d5481b
LP
2869 was_offline = event_source_is_offline(s);
2870 s->enabled = enabled;
2871 s->ratelimited = ratelimited;
fd38203a 2872
ddfde737 2873 switch (s->type) {
fd38203a 2874
ddfde737
LP
2875 case SOURCE_IO:
2876 source_io_unregister(s);
2877 break;
ac989a78 2878
ddfde737
LP
2879 case SOURCE_SIGNAL:
2880 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2881 break;
fd38203a 2882
ddfde737 2883 case SOURCE_CHILD:
b6d5481b
LP
2884 if (!was_offline) {
2885 assert(s->event->n_online_child_sources > 0);
2886 s->event->n_online_child_sources--;
2887 }
fd38203a 2888
ddfde737
LP
2889 if (EVENT_SOURCE_WATCH_PIDFD(s))
2890 source_child_pidfd_unregister(s);
2891 else
2892 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2893 break;
4807d2d0 2894
ddfde737
LP
2895 case SOURCE_EXIT:
2896 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2897 break;
fd38203a 2898
158fe190
LP
2899 case SOURCE_MEMORY_PRESSURE:
2900 source_memory_pressure_unregister(s);
2901 break;
2902
2115b9b6
YW
2903 case SOURCE_TIME_REALTIME:
2904 case SOURCE_TIME_BOOTTIME:
2905 case SOURCE_TIME_MONOTONIC:
2906 case SOURCE_TIME_REALTIME_ALARM:
2907 case SOURCE_TIME_BOOTTIME_ALARM:
ddfde737
LP
2908 case SOURCE_DEFER:
2909 case SOURCE_POST:
2910 case SOURCE_INOTIFY:
2911 break;
fd38203a 2912
ddfde737 2913 default:
04499a70 2914 assert_not_reached();
ddfde737 2915 }
fd38203a 2916
2115b9b6
YW
2917 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2918 event_source_time_prioq_reshuffle(s);
2919
b6d5481b 2920 return 1;
ddfde737 2921}
f8f3f926 2922
b6d5481b
LP
2923static int event_source_online(
2924 sd_event_source *s,
2925 int enabled,
2926 bool ratelimited) {
2927
2928 bool was_online;
ddfde737 2929 int r;
fd38203a 2930
ddfde737 2931 assert(s);
b6d5481b 2932 assert(enabled != SD_EVENT_OFF || !ratelimited);
305f78bf 2933
ddfde737 2934 /* Unset the pending flag when this event source is enabled */
b6d5481b
LP
2935 if (s->enabled == SD_EVENT_OFF &&
2936 enabled != SD_EVENT_OFF &&
2937 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
ddfde737
LP
2938 r = source_set_pending(s, false);
2939 if (r < 0)
2940 return r;
2941 }
9d3e3aa5 2942
b6d5481b
LP
2943 /* Are we really ready for onlining? */
2944 if (enabled == SD_EVENT_OFF || ratelimited) {
2945 /* Nope, we are not ready for onlining; just update the precise state and exit */
2946 s->enabled = enabled;
2947 s->ratelimited = ratelimited;
2948 return 0;
2949 }
2950
2951 was_online = event_source_is_online(s);
2952
ddfde737 2953 switch (s->type) {
ddfde737 2954 case SOURCE_IO:
b6d5481b 2955 r = source_io_register(s, enabled, s->io.events);
d2eafe61 2956 if (r < 0)
ddfde737 2957 return r;
ddfde737 2958 break;
fd38203a 2959
ddfde737
LP
2960 case SOURCE_SIGNAL:
2961 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2962 if (r < 0) {
ddfde737
LP
2963 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2964 return r;
2965 }
fd38203a 2966
ddfde737 2967 break;
fd38203a 2968
ddfde737 2969 case SOURCE_CHILD:
ddfde737
LP
2970 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2971 /* yes, we have a pidfd */
9da4cb2b 2972
b6d5481b 2973 r = source_child_pidfd_register(s, enabled);
ac9f2640 2974 if (r < 0)
9da4cb2b 2975 return r;
ddfde737
LP
2976 } else {
2977 /* no pidfd, or something other than WEXITED to watch for */
9da4cb2b 2978
ddfde737
LP
2979 r = event_make_signal_data(s->event, SIGCHLD, NULL);
2980 if (r < 0) {
ddfde737
LP
2981 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2982 return r;
2983 }
2984 }
fd38203a 2985
b6d5481b
LP
2986 if (!was_online)
2987 s->event->n_online_child_sources++;
ddfde737 2988 break;
4807d2d0 2989
158fe190
LP
2990 case SOURCE_MEMORY_PRESSURE:
2991 r = source_memory_pressure_register(s, enabled);
2992 if (r < 0)
2993 return r;
2994
2995 break;
2996
d2eafe61
ZJS
2997 case SOURCE_TIME_REALTIME:
2998 case SOURCE_TIME_BOOTTIME:
2999 case SOURCE_TIME_MONOTONIC:
3000 case SOURCE_TIME_REALTIME_ALARM:
3001 case SOURCE_TIME_BOOTTIME_ALARM:
ddfde737 3002 case SOURCE_EXIT:
ddfde737
LP
3003 case SOURCE_DEFER:
3004 case SOURCE_POST:
3005 case SOURCE_INOTIFY:
3006 break;
9da4cb2b 3007
ddfde737 3008 default:
04499a70 3009 assert_not_reached();
ddfde737 3010 }
f8f3f926 3011
b6d5481b
LP
3012 s->enabled = enabled;
3013 s->ratelimited = ratelimited;
d2eafe61
ZJS
3014
3015 /* Non-failing operations below */
2115b9b6 3016 if (s->type == SOURCE_EXIT)
d2eafe61 3017 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
d2eafe61 3018
2115b9b6
YW
3019 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3020 event_source_time_prioq_reshuffle(s);
d2eafe61 3021
b6d5481b 3022 return 1;
ddfde737
LP
3023}
3024
3025_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3026 int r;
9da4cb2b 3027
ddfde737 3028 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
7e922b05
ZJS
3029
3030 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3031 if (m == SD_EVENT_OFF && !s)
3032 return 0;
3033
3034 assert_return(s, -EINVAL);
ddfde737 3035 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 3036
ddfde737
LP
3037 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3038 if (s->event->state == SD_EVENT_FINISHED)
3039 return m == SD_EVENT_OFF ? 0 : -ESTALE;
305f78bf 3040
ddfde737
LP
3041 if (s->enabled == m) /* No change? */
3042 return 0;
9d3e3aa5 3043
ddfde737 3044 if (m == SD_EVENT_OFF)
b6d5481b 3045 r = event_source_offline(s, m, s->ratelimited);
ddfde737
LP
3046 else {
3047 if (s->enabled != SD_EVENT_OFF) {
3048 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, since the
3049 * event source is already enabled after all. */
3050 s->enabled = m;
3051 return 0;
fd38203a 3052 }
ddfde737 3053
b6d5481b 3054 r = event_source_online(s, m, s->ratelimited);
fd38203a 3055 }
ddfde737
LP
3056 if (r < 0)
3057 return r;
fd38203a 3058
e1951c16 3059 event_source_pp_prioq_reshuffle(s);
fd38203a
LP
3060 return 0;
3061}
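/* Illustrative usage sketch, kept under #if 0 so it cannot affect the build: a oneshot source is
 * automatically reset to SD_EVENT_OFF when it is dispatched, so its callback may simply re-enable
 * it to keep going. example_oneshot_handler() and the counter in userdata are hypothetical. */
#if 0
static int example_oneshot_handler(sd_event_source *s, void *userdata) {
        unsigned *remaining = userdata;

        if (*remaining == 0)
                return 0; /* stay disabled */

        (*remaining)--;
        return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
}
#endif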
3062
f7262a9f 3063_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
305f78bf
LP
3064 assert_return(s, -EINVAL);
3065 assert_return(usec, -EINVAL);
6a0f1f6d 3066 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
305f78bf 3067 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
3068
3069 *usec = s->time.next;
3070 return 0;
3071}
3072
f7262a9f 3073_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
2a0dc6cd 3074 int r;
6a0f1f6d 3075
305f78bf 3076 assert_return(s, -EINVAL);
6a0f1f6d 3077 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 3078 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 3079 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 3080
2a0dc6cd
LP
3081 r = source_set_pending(s, false);
3082 if (r < 0)
3083 return r;
2576a19e 3084
2a0dc6cd 3085 s->time.next = usec;
fd38203a 3086
e1951c16 3087 event_source_time_prioq_reshuffle(s);
fd38203a
LP
3088 return 0;
3089}
3090
d6a83dc4
LP
3091_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3092 usec_t t;
3093 int r;
3094
3095 assert_return(s, -EINVAL);
3096 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3097
ef859195
LP
3098 if (usec == USEC_INFINITY)
3099 return sd_event_source_set_time(s, USEC_INFINITY);
3100
d6a83dc4
LP
3101 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3102 if (r < 0)
3103 return r;
3104
496db330
YW
3105 usec = usec_add(t, usec);
3106 if (usec == USEC_INFINITY)
d6a83dc4
LP
3107 return -EOVERFLOW;
3108
496db330 3109 return sd_event_source_set_time(s, usec);
d6a83dc4
LP
3110}
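/* Illustrative usage sketch, kept under #if 0 so it cannot affect the build: re-arms an existing
 * timer event source five seconds from now via the relative helper above. The function name is
 * hypothetical. */
#if 0
static int example_rearm_in_five_seconds(sd_event_source *timer) {
        return sd_event_source_set_time_relative(timer, 5 * USEC_PER_SEC);
}
#endif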
3111
f7262a9f 3112_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
305f78bf
LP
3113 assert_return(s, -EINVAL);
3114 assert_return(usec, -EINVAL);
6a0f1f6d 3115 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
305f78bf
LP
3116 assert_return(!event_pid_changed(s->event), -ECHILD);
3117
3118 *usec = s->time.accuracy;
3119 return 0;
3120}
3121
f7262a9f 3122_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
2a0dc6cd 3123 int r;
6a0f1f6d 3124
305f78bf 3125 assert_return(s, -EINVAL);
f5fbe71d 3126 assert_return(usec != UINT64_MAX, -EINVAL);
6a0f1f6d 3127 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 3128 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 3129 assert_return(!event_pid_changed(s->event), -ECHILD);
eaa3cbef 3130
2a0dc6cd
LP
3131 r = source_set_pending(s, false);
3132 if (r < 0)
3133 return r;
3134
eaa3cbef
LP
3135 if (usec == 0)
3136 usec = DEFAULT_ACCURACY_USEC;
3137
eaa3cbef
LP
3138 s->time.accuracy = usec;
3139
e1951c16 3140 event_source_time_prioq_reshuffle(s);
6a0f1f6d
LP
3141 return 0;
3142}
3143
3144_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3145 assert_return(s, -EINVAL);
3146 assert_return(clock, -EINVAL);
3147 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3148 assert_return(!event_pid_changed(s->event), -ECHILD);
eaa3cbef 3149
6a0f1f6d 3150 *clock = event_source_type_to_clock(s->type);
eaa3cbef
LP
3151 return 0;
3152}
3153
f7262a9f 3154_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
4bee8012
LP
3155 assert_return(s, -EINVAL);
3156 assert_return(pid, -EINVAL);
3157 assert_return(s->type == SOURCE_CHILD, -EDOM);
3158 assert_return(!event_pid_changed(s->event), -ECHILD);
3159
3160 *pid = s->child.pid;
3161 return 0;
3162}
3163
f8f3f926
LP
3164_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3165 assert_return(s, -EINVAL);
3166 assert_return(s->type == SOURCE_CHILD, -EDOM);
3167 assert_return(!event_pid_changed(s->event), -ECHILD);
3168
3169 if (s->child.pidfd < 0)
3170 return -EOPNOTSUPP;
3171
3172 return s->child.pidfd;
3173}
3174
3175_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3176 assert_return(s, -EINVAL);
3177 assert_return(s->type == SOURCE_CHILD, -EDOM);
3178 assert_return(!event_pid_changed(s->event), -ECHILD);
3179 assert_return(SIGNAL_VALID(sig), -EINVAL);
3180
3181 /* If we have already seen an indication that the process exited, refuse to send a signal early. This way we
3182 * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3183 * available. */
3184 if (s->child.exited)
3185 return -ESRCH;
3186
3187 if (s->child.pidfd >= 0) {
3188 siginfo_t copy;
3189
3190 /* pidfd_send_signal() changes the siginfo_t argument. This is weird; hence let's copy the
3191 * structure here. */
3192 if (si)
3193 copy = *si;
3194
3195 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3196 /* Let's propagate the error only if the system call is not implemented or prohibited */
3197 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3198 return -errno;
3199 } else
3200 return 0;
3201 }
3202
3203 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3204 * this here. */
3205 if (flags != 0)
3206 return -EOPNOTSUPP;
3207
3208 if (si) {
3209 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3210 siginfo_t copy = *si;
3211
3212 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3213 return -errno;
3214 } else if (kill(s->child.pid, sig) < 0)
3215 return -errno;
3216
3217 return 0;
3218}
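/* Illustrative usage sketch, kept under #if 0 so it cannot affect the build: asks the child watched
 * by a SOURCE_CHILD event source to terminate, preferring the pidfd path handled above and falling
 * back to PID-based delivery otherwise. example_request_child_exit() is hypothetical. */
#if 0
static int example_request_child_exit(sd_event_source *child_source) {
        return sd_event_source_send_child_signal(child_source, SIGTERM, /* si= */ NULL, /* flags= */ 0);
}
#endif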
3219
3220_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3221 assert_return(s, -EINVAL);
3222 assert_return(s->type == SOURCE_CHILD, -EDOM);
3223
3224 if (s->child.pidfd < 0)
3225 return -EOPNOTSUPP;
3226
3227 return s->child.pidfd_owned;
3228}
3229
3230_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3231 assert_return(s, -EINVAL);
3232 assert_return(s->type == SOURCE_CHILD, -EDOM);
3233
3234 if (s->child.pidfd < 0)
3235 return -EOPNOTSUPP;
3236
3237 s->child.pidfd_owned = own;
3238 return 0;
3239}
3240
3241_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3242 assert_return(s, -EINVAL);
3243 assert_return(s->type == SOURCE_CHILD, -EDOM);
3244
3245 return s->child.process_owned;
3246}
3247
3248_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3249 assert_return(s, -EINVAL);
3250 assert_return(s->type == SOURCE_CHILD, -EDOM);
3251
3252 s->child.process_owned = own;
3253 return 0;
3254}
3255
97ef5391
LP
3256_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
3257 assert_return(s, -EINVAL);
3258 assert_return(mask, -EINVAL);
3259 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3260 assert_return(!event_pid_changed(s->event), -ECHILD);
3261
3262 *mask = s->inotify.mask;
3263 return 0;
3264}
3265
718db961 3266_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
fd38203a
LP
3267 int r;
3268
da7e457c 3269 assert_return(s, -EINVAL);
6203e07a 3270 assert_return(s->type != SOURCE_EXIT, -EDOM);
da7e457c
LP
3271 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3272 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
3273
3274 if (s->prepare == callback)
3275 return 0;
3276
3277 if (callback && s->prepare) {
3278 s->prepare = callback;
3279 return 0;
3280 }
3281
3282 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3283 if (r < 0)
3284 return r;
3285
3286 s->prepare = callback;
3287
3288 if (callback) {
3289 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3290 if (r < 0)
3291 return r;
3292 } else
3293 prioq_remove(s->event->prepare, s, &s->prepare_index);
3294
3295 return 0;
3296}
3297
f7262a9f 3298_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
da7e457c 3299 assert_return(s, NULL);
fd38203a
LP
3300
3301 return s->userdata;
3302}
3303
8f726607
LP
3304_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3305 void *ret;
3306
3307 assert_return(s, NULL);
3308
3309 ret = s->userdata;
3310 s->userdata = userdata;
3311
3312 return ret;
3313}
3314
b6d5481b
LP
3315static int event_source_enter_ratelimited(sd_event_source *s) {
3316 int r;
3317
3318 assert(s);
3319
3320 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
3321 * the end of the rate limit time window, much as if it were a timer event source. */
3322
3323 if (s->ratelimited)
3324 return 0; /* Already ratelimited, this is a NOP hence */
3325
3326 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3327 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3328 if (r < 0)
3329 return r;
3330
3331 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
3332 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
3333 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
3334 if (EVENT_SOURCE_IS_TIME(s->type))
3335 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3336
3337 /* Now, let's add the event source to the monotonic clock instead */
3338 r = event_source_time_prioq_put(s, &s->event->monotonic);
3339 if (r < 0)
3340 goto fail;
3341
3342 /* And let's take the event source officially offline */
3343 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3344 if (r < 0) {
3345 event_source_time_prioq_remove(s, &s->event->monotonic);
3346 goto fail;
3347 }
3348
3349 event_source_pp_prioq_reshuffle(s);
3350
3351 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3352 return 0;
3353
3354fail:
3355 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
3356 * space for it should already be allocated. */
3357 if (EVENT_SOURCE_IS_TIME(s->type))
3358 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3359
3360 return r;
3361}
3362
fd69f224 3363static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
b6d5481b
LP
3364 int r;
3365
3366 assert(s);
3367
3368 if (!s->ratelimited)
3369 return 0;
3370
3371 /* Let's take the event source out of the monotonic prioq first. */
3372 event_source_time_prioq_remove(s, &s->event->monotonic);
3373
3374 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3375 if (EVENT_SOURCE_IS_TIME(s->type)) {
3376 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3377 if (r < 0)
3378 goto fail;
3379 }
3380
3381 /* Let's try to take it online again. */
3382 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3383 if (r < 0) {
3384 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3385 if (EVENT_SOURCE_IS_TIME(s->type))
3386 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3387
3388 goto fail;
3389 }
3390
3391 event_source_pp_prioq_reshuffle(s);
3392 ratelimit_reset(&s->rate_limit);
3393
3394 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
fd69f224
MS
3395
3396 if (run_callback && s->ratelimit_expire_callback) {
3397 s->dispatching = true;
3398 r = s->ratelimit_expire_callback(s, s->userdata);
3399 s->dispatching = false;
3400
3401 if (r < 0) {
3402 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3403 strna(s->description),
3404 event_source_type_to_string(s->type),
3405 s->exit_on_failure ? "exiting" : "disabling");
3406
3407 if (s->exit_on_failure)
3408 (void) sd_event_exit(s->event, r);
3409 }
3410
3411 if (s->n_ref == 0)
3412 source_free(s);
3413 else if (r < 0)
0a040e64 3414 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
fd69f224
MS
3415
3416 return 1;
3417 }
3418
b6d5481b
LP
3419 return 0;
3420
3421fail:
3422 /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
3423 * simply put it back in it; maybe we can then process it more successfully next iteration. */
3424 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3425
3426 return r;
3427}
3428
c2ba3ad6
LP
3429static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3430 usec_t c;
3431 assert(e);
3432 assert(a <= b);
3433
3434 if (a <= 0)
3435 return 0;
393003e1
LP
3436 if (a >= USEC_INFINITY)
3437 return USEC_INFINITY;
c2ba3ad6
LP
3438
3439 if (b <= a + 1)
3440 return a;
3441
52444dc4
LP
3442 initialize_perturb(e);
3443
c2ba3ad6
LP
3444 /*
3445 Find a good time to wake up again between times a and b. We
3446 have two goals here:
3447
3448 a) We want to wake up as seldom as possible, hence prefer
3449 later times over earlier times.
3450
3451 b) But if we have to wake up, then let's make sure to
3452 dispatch as much as possible on the entire system.
3453
3454 We implement this by waking up everywhere at the same time
850516e0 3455 within any given minute if we can, synchronised via the
c2ba3ad6 3456 perturbation value determined from the boot ID. If we can't,
ba276c81
LP
3457 then we try to find the same spot within every 10s, then every 1s and
3458 then every 250ms window. Otherwise, we pick the last possible time
3459 to wake up.
c2ba3ad6
LP
3460 */
3461
850516e0
LP
3462 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3463 if (c >= b) {
3464 if (_unlikely_(c < USEC_PER_MINUTE))
3465 return b;
3466
3467 c -= USEC_PER_MINUTE;
3468 }
3469
ba276c81
LP
3470 if (c >= a)
3471 return c;
3472
3473 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3474 if (c >= b) {
3475 if (_unlikely_(c < USEC_PER_SEC*10))
3476 return b;
3477
3478 c -= USEC_PER_SEC*10;
3479 }
3480
850516e0
LP
3481 if (c >= a)
3482 return c;
3483
3484 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
c2ba3ad6
LP
3485 if (c >= b) {
3486 if (_unlikely_(c < USEC_PER_SEC))
3487 return b;
3488
3489 c -= USEC_PER_SEC;
3490 }
3491
3492 if (c >= a)
3493 return c;
3494
3495 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3496 if (c >= b) {
3497 if (_unlikely_(c < USEC_PER_MSEC*250))
3498 return b;
3499
3500 c -= USEC_PER_MSEC*250;
3501 }
3502
3503 if (c >= a)
3504 return c;
3505
3506 return b;
3507}
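/* Illustrative worked example of the coalescing above (not part of the original code): assume
 * e->perturb == 0, a == 65.3s and b == 130s. The minute-granularity candidate is
 * c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE = 120s, which is below b and above a, so we would
 * wake up at 120s rather than at the earliest possible 65.3s, giving timers elsewhere on the
 * system a chance to fire at the very same instant. */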
3508
fd38203a
LP
3509static int event_arm_timer(
3510 sd_event *e,
6a0f1f6d 3511 struct clock_data *d) {
fd38203a
LP
3512
3513 struct itimerspec its = {};
c2ba3ad6
LP
3514 sd_event_source *a, *b;
3515 usec_t t;
fd38203a 3516
cde93897 3517 assert(e);
6a0f1f6d 3518 assert(d);
fd38203a 3519
d06441da 3520 if (!d->needs_rearm)
212bbb17 3521 return 0;
7e2bf71c
YW
3522
3523 d->needs_rearm = false;
212bbb17 3524
6a0f1f6d 3525 a = prioq_peek(d->earliest);
19947509 3526 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
b6d5481b 3527 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
72aedc1e 3528
6a0f1f6d 3529 if (d->fd < 0)
c57b5ca3
LP
3530 return 0;
3531
3a43da28 3532 if (d->next == USEC_INFINITY)
72aedc1e
LP
3533 return 0;
3534
3535 /* disarm */
15c689d7
LP
3536 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3537 return -errno;
72aedc1e 3538
3a43da28 3539 d->next = USEC_INFINITY;
fd38203a 3540 return 0;
72aedc1e 3541 }
fd38203a 3542
6a0f1f6d 3543 b = prioq_peek(d->latest);
19947509
ZJS
3544 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3545 assert(b && b->enabled != SD_EVENT_OFF);
c2ba3ad6 3546
b6d5481b 3547 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
6a0f1f6d 3548 if (d->next == t)
fd38203a
LP
3549 return 0;
3550
6a0f1f6d 3551 assert_se(d->fd >= 0);
fd38203a 3552
c2ba3ad6 3553 if (t == 0) {
1751bdde 3554 /* We don't want to disarm here, just mean some time looooong ago. */
fd38203a
LP
3555 its.it_value.tv_sec = 0;
3556 its.it_value.tv_nsec = 1;
3557 } else
c2ba3ad6 3558 timespec_store(&its.it_value, t);
fd38203a 3559
15c689d7 3560 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
cde93897 3561 return -errno;
fd38203a 3562
6a0f1f6d 3563 d->next = t;
fd38203a
LP
3564 return 0;
3565}
3566
9a800b56 3567static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
fd38203a
LP
3568 assert(e);
3569 assert(s);
3570 assert(s->type == SOURCE_IO);
3571
9a800b56
LP
3572 /* If the event source was already pending, we just OR in the
3573 * new revents, otherwise we reset the value. The ORing is
3574 * necessary to handle EPOLLONESHOT events properly where
3575 * readability might happen independently of writability, and
3576 * we need to keep track of both */
3577
3578 if (s->pending)
3579 s->io.revents |= revents;
3580 else
3581 s->io.revents = revents;
fd38203a 3582
fd38203a
LP
3583 return source_set_pending(s, true);
3584}
3585
72aedc1e 3586static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
fd38203a
LP
3587 uint64_t x;
3588 ssize_t ss;
3589
3590 assert(e);
da7e457c 3591 assert(fd >= 0);
72aedc1e 3592
305f78bf 3593 assert_return(events == EPOLLIN, -EIO);
fd38203a
LP
3594
3595 ss = read(fd, &x, sizeof(x));
3596 if (ss < 0) {
8add30a0 3597 if (ERRNO_IS_TRANSIENT(errno))
fd38203a
LP
3598 return 0;
3599
3600 return -errno;
3601 }
3602
8d35dae7 3603 if (_unlikely_(ss != sizeof(x)))
fd38203a
LP
3604 return -EIO;
3605
cde93897 3606 if (next)
3a43da28 3607 *next = USEC_INFINITY;
72aedc1e 3608
fd38203a
LP
3609 return 0;
3610}
3611
305f78bf
LP
3612static int process_timer(
3613 sd_event *e,
3614 usec_t n,
6a0f1f6d 3615 struct clock_data *d) {
305f78bf 3616
fd38203a 3617 sd_event_source *s;
fd69f224 3618 bool callback_invoked = false;
fd38203a
LP
3619 int r;
3620
3621 assert(e);
6a0f1f6d 3622 assert(d);
fd38203a
LP
3623
3624 for (;;) {
6a0f1f6d 3625 s = prioq_peek(d->earliest);
19947509
ZJS
3626 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3627
b6d5481b
LP
3628 if (!s || time_event_source_next(s) > n)
3629 break;
3630
3631 if (s->ratelimited) {
3632 /* This is an event source whose ratelimit window has ended. Let's turn it on
3633 * again. */
3634 assert(s->ratelimited);
3635
fd69f224 3636 r = event_source_leave_ratelimit(s, /* run_callback */ true);
b6d5481b
LP
3637 if (r < 0)
3638 return r;
fd69f224
MS
3639 else if (r == 1)
3640 callback_invoked = true;
b6d5481b
LP
3641
3642 continue;
3643 }
3644
3645 if (s->enabled == SD_EVENT_OFF || s->pending)
fd38203a
LP
3646 break;
3647
3648 r = source_set_pending(s, true);
3649 if (r < 0)
3650 return r;
3651
e1951c16 3652 event_source_time_prioq_reshuffle(s);
fd38203a
LP
3653 }
3654
fd69f224 3655 return callback_invoked;
fd38203a
LP
3656}
3657
efd3be9d
YW
3658static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3659 int64_t min_priority = threshold;
3660 bool something_new = false;
fd38203a 3661 sd_event_source *s;
fd38203a
LP
3662 int r;
3663
3664 assert(e);
efd3be9d
YW
3665 assert(ret_min_priority);
3666
3667 if (!e->need_process_child) {
3668 *ret_min_priority = min_priority;
3669 return 0;
3670 }
fd38203a 3671
c2ba3ad6
LP
3672 e->need_process_child = false;
3673
91c70071
YW
3674 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3675 * for, instead of using P_ALL. This is because we only want to get child information of very
3676 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3677 * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3678 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3679 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3680 * to handle SIGCHLD yourself.
3681 *
3682 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3683 * source is dispatched so that the callback still sees the process as a zombie. */
fd38203a 3684
90e74a66 3685 HASHMAP_FOREACH(s, e->child_sources) {
fd38203a
LP
3686 assert(s->type == SOURCE_CHILD);
3687
efd3be9d
YW
3688 if (s->priority > threshold)
3689 continue;
3690
fd38203a
LP
3691 if (s->pending)
3692 continue;
3693
b6d5481b 3694 if (event_source_is_offline(s))
fd38203a
LP
3695 continue;
3696
f8f3f926
LP
3697 if (s->child.exited)
3698 continue;
3699
91c70071
YW
3700 if (EVENT_SOURCE_WATCH_PIDFD(s))
3701 /* There's a usable pidfd known for this event source? Then don't waitid() for
3702 * it here */
f8f3f926
LP
3703 continue;
3704
fd38203a 3705 zero(s->child.siginfo);
15c689d7
LP
3706 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3707 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
bfd9bfcc 3708 return negative_errno();
fd38203a
LP
3709
3710 if (s->child.siginfo.si_pid != 0) {
945c2931 3711 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
08cd1552 3712
f8f3f926
LP
3713 if (zombie)
3714 s->child.exited = true;
3715
08cd1552 3716 if (!zombie && (s->child.options & WEXITED)) {
91c70071
YW
3717 /* If the child isn't dead then let's immediately remove the state
3718 * change from the queue, since there's no benefit in leaving it
3719 * queued. */
08cd1552
LP
3720
3721 assert(s->child.options & (WSTOPPED|WCONTINUED));
a5d27871 3722 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
08cd1552
LP
3723 }
3724
fd38203a
LP
3725 r = source_set_pending(s, true);
3726 if (r < 0)
3727 return r;
efd3be9d
YW
3728 if (r > 0) {
3729 something_new = true;
3730 min_priority = MIN(min_priority, s->priority);
3731 }
fd38203a
LP
3732 }
3733 }
3734
efd3be9d
YW
3735 *ret_min_priority = min_priority;
3736 return something_new;
fd38203a
LP
3737}
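Because the state change is queried with WNOWAIT here and the child is only reaped after its callback ran (see source_dispatch() below), the callback still observes the process as a zombie and may inspect it. A rough illustrative sketch of the corresponding public call, sd_event_add_child() (hypothetical names; SIGCHLD generally has to be blocked in all threads before adding child sources):

/* Illustrative sketch only: watch one child for termination. */
#include <stdio.h>
#include <sys/wait.h>
#include <systemd/sd-event.h>

static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
        /* The child is still a zombie at this point; the loop reaps it after we return. */
        if (si->si_code == CLD_EXITED)
                printf("PID %i exited with status %i\n", (int) si->si_pid, si->si_status);
        return 0;
}

static int watch_child(sd_event *e, pid_t pid) {
        sd_event_source *s = NULL;
        /* With plain WEXITED the loop can use a pidfd instead of waitid() polling,
         * see EVENT_SOURCE_WATCH_PIDFD() above. */
        return sd_event_add_child(e, &s, pid, WEXITED, on_child, NULL);
}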
3738
f8f3f926
LP
3739static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3740 assert(e);
3741 assert(s);
3742 assert(s->type == SOURCE_CHILD);
3743
3744 if (s->pending)
3745 return 0;
3746
b6d5481b 3747 if (event_source_is_offline(s))
f8f3f926
LP
3748 return 0;
3749
3750 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3751 return 0;
3752
3753 zero(s->child.siginfo);
3754 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3755 return -errno;
3756
3757 if (s->child.siginfo.si_pid == 0)
3758 return 0;
3759
3760 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3761 s->child.exited = true;
3762
3763 return source_set_pending(s, true);
3764}
3765
efd3be9d 3766static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
fd38203a
LP
3767 int r;
3768
da7e457c 3769 assert(e);
97ef5391 3770 assert(d);
305f78bf 3771 assert_return(events == EPOLLIN, -EIO);
efd3be9d 3772 assert(min_priority);
fd38203a 3773
91c70071
YW
3774 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3775 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3776 * per priority, and if we dequeue one while SIGCHLD is (or later becomes) queued we
3777 * wouldn't know about it, but we might have higher-priority children we care about, hence
3778 * we need to check that explicitly. */
9da4cb2b
LP
3779
3780 if (sigismember(&d->sigset, SIGCHLD))
3781 e->need_process_child = true;
3782
91c70071 3783 /* If there's already an event source pending for this priority we don't read another */
9da4cb2b
LP
3784 if (d->current)
3785 return 0;
3786
fd38203a 3787 for (;;) {
0eb2e0e3 3788 struct signalfd_siginfo si;
7057bd99 3789 ssize_t n;
92daebc0 3790 sd_event_source *s = NULL;
fd38203a 3791
9da4cb2b 3792 n = read(d->fd, &si, sizeof(si));
7057bd99 3793 if (n < 0) {
8add30a0 3794 if (ERRNO_IS_TRANSIENT(errno))
efd3be9d 3795 return 0;
fd38203a
LP
3796
3797 return -errno;
3798 }
3799
7057bd99 3800 if (_unlikely_(n != sizeof(si)))
fd38203a
LP
3801 return -EIO;
3802
6eb7c172 3803 assert(SIGNAL_VALID(si.ssi_signo));
7057bd99 3804
92daebc0
LP
3805 if (e->signal_sources)
3806 s = e->signal_sources[si.ssi_signo];
92daebc0
LP
3807 if (!s)
3808 continue;
9da4cb2b
LP
3809 if (s->pending)
3810 continue;
fd38203a
LP
3811
3812 s->signal.siginfo = si;
9da4cb2b
LP
3813 d->current = s;
3814
fd38203a
LP
3815 r = source_set_pending(s, true);
3816 if (r < 0)
3817 return r;
efd3be9d
YW
3818 if (r > 0 && *min_priority >= s->priority) {
3819 *min_priority = s->priority;
3820 return 1; /* an event source with smaller priority is queued. */
3821 }
9da4cb2b 3822
efd3be9d 3823 return 0;
fd38203a 3824 }
fd38203a
LP
3825}
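On the API side this per-priority signal handling is driven by sd_event_add_signal(). A hedged illustrative sketch follows; the SD_EVENT_SIGNAL_PROCMASK flag is assumed to be available (it is a newer addition), otherwise the caller must block the signal with sigprocmask() beforehand:

/* Illustrative sketch only: route SIGUSR1 through the event loop. */
#include <signal.h>
#include <systemd/sd-event.h>

static int on_sigusr1(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        /* 'si' is the signalfd payload that process_signal() dequeued for this priority. */
        return 0;
}

static int watch_sigusr1(sd_event *e) {
        sd_event_source *s = NULL;
        return sd_event_add_signal(e, &s, SIGUSR1 | SD_EVENT_SIGNAL_PROCMASK, on_sigusr1, NULL);
}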
3826
efd3be9d 3827static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
97ef5391
LP
3828 ssize_t n;
3829
3830 assert(e);
3831 assert(d);
3832
3833 assert_return(revents == EPOLLIN, -EIO);
3834
3835 /* If there's already an event source pending for this priority, don't read another */
3836 if (d->n_pending > 0)
3837 return 0;
3838
3839 /* Is the read buffer non-empty? If so, let's not read more */
3840 if (d->buffer_filled > 0)
3841 return 0;
3842
efd3be9d
YW
3843 if (d->priority > threshold)
3844 return 0;
3845
97ef5391
LP
3846 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3847 if (n < 0) {
8add30a0 3848 if (ERRNO_IS_TRANSIENT(errno))
97ef5391
LP
3849 return 0;
3850
3851 return -errno;
3852 }
3853
3854 assert(n > 0);
3855 d->buffer_filled = (size_t) n;
0601b958 3856 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
3857
3858 return 1;
3859}
3860
3861static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3862 assert(e);
3863 assert(d);
3864 assert(sz <= d->buffer_filled);
3865
3866 if (sz == 0)
3867 return;
3868
3869 /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3870 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3871 d->buffer_filled -= sz;
3872
3873 if (d->buffer_filled == 0)
0601b958 3874 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
3875}
3876
3877static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3878 int r;
3879
3880 assert(e);
3881 assert(d);
3882
3883 /* If there's already an event source pending for this priority, don't read another */
3884 if (d->n_pending > 0)
3885 return 0;
3886
3887 while (d->buffer_filled > 0) {
3888 size_t sz;
3889
3890 /* Let's validate that the event structures are complete */
3891 if (d->buffer_filled < offsetof(struct inotify_event, name))
3892 return -EIO;
3893
3894 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3895 if (d->buffer_filled < sz)
3896 return -EIO;
3897
3898 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3899 struct inode_data *inode_data;
97ef5391
LP
3900
3901 /* The queue overran, let's pass this event to all event sources connected to this inotify
3902 * object */
3903
03677889 3904 HASHMAP_FOREACH(inode_data, d->inodes)
97ef5391
LP
3905 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3906
b6d5481b 3907 if (event_source_is_offline(s))
97ef5391
LP
3908 continue;
3909
3910 r = source_set_pending(s, true);
3911 if (r < 0)
3912 return r;
3913 }
97ef5391
LP
3914 } else {
3915 struct inode_data *inode_data;
97ef5391
LP
3916
3917 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3918 * our watch descriptor table. */
3919 if (d->buffer.ev.mask & IN_IGNORED) {
3920
3921 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3922 if (!inode_data) {
3923 event_inotify_data_drop(e, d, sz);
3924 continue;
3925 }
3926
3927 /* The watch descriptor was removed by the kernel, let's drop it here too */
3928 inode_data->wd = -1;
3929 } else {
3930 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3931 if (!inode_data) {
3932 event_inotify_data_drop(e, d, sz);
3933 continue;
3934 }
3935 }
3936
3937 /* Trigger all event sources that are interested in these events. Also trigger all event
3938 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3939 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3940
b6d5481b 3941 if (event_source_is_offline(s))
97ef5391
LP
3942 continue;
3943
3944 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3945 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3946 continue;
3947
3948 r = source_set_pending(s, true);
3949 if (r < 0)
3950 return r;
3951 }
3952 }
3953
3954 /* Something pending now? If so, let's finish, otherwise let's read more. */
3955 if (d->n_pending > 0)
3956 return 1;
3957 }
3958
3959 return 0;
3960}
3961
3962static int process_inotify(sd_event *e) {
97ef5391
LP
3963 int r, done = 0;
3964
3965 assert(e);
3966
0601b958 3967 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
97ef5391
LP
3968 r = event_inotify_data_process(e, d);
3969 if (r < 0)
3970 return r;
3971 if (r > 0)
3972 done++;
3973 }
3974
3975 return done;
3976}
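The inotify_data/inode_data machinery above is what backs the public sd_event_add_inotify() call. An illustrative sketch (hypothetical names) of watching a directory, including the queue-overflow case that process_inotify() fans out to all connected sources:

/* Illustrative sketch only: watch a directory for new/removed entries. */
#include <stdio.h>
#include <sys/inotify.h>
#include <systemd/sd-event.h>

static int on_inotify(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
        if (ev->mask & IN_Q_OVERFLOW)
                printf("inotify queue overflow, a full rescan is needed\n");
        else
                printf("mask 0x%x on %s\n", (unsigned) ev->mask, ev->len > 0 ? ev->name : "(watched inode)");
        return 0;
}

static int watch_directory(sd_event *e, const char *path) {
        sd_event_source *s = NULL;
        return sd_event_add_inotify(e, &s, path, IN_CREATE | IN_DELETE | IN_MOVED_TO, on_inotify, NULL);
}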
3977
158fe190
LP
3978static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
3979 assert(s);
3980 assert(s->type == SOURCE_MEMORY_PRESSURE);
3981
3982 if (s->pending)
3983 s->memory_pressure.revents |= revents;
3984 else
3985 s->memory_pressure.revents = revents;
3986
3987 return source_set_pending(s, true);
3988}
3989
3990static int source_memory_pressure_write(sd_event_source *s) {
3991 ssize_t n;
3992 int r;
3993
3994 assert(s);
3995 assert(s->type == SOURCE_MEMORY_PRESSURE);
3996
3997 /* Once we start writing, the buffer is locked; we allow no further changes. */
3998 s->memory_pressure.locked = true;
3999
4000 if (s->memory_pressure.write_buffer_size > 0) {
4001 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4002 if (n < 0) {
4003 if (!ERRNO_IS_TRANSIENT(errno))
4004 return -errno;
4005
4006 n = 0;
4007 }
4008 } else
4009 n = 0;
4010
4011 assert(n >= 0);
4012
4013 if ((size_t) n == s->memory_pressure.write_buffer_size) {
4014 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4015
4016 if (n > 0) {
4017 s->memory_pressure.write_buffer_size = 0;
4018
4019 /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4020 r = source_memory_pressure_register(s, s->enabled);
4021 if (r < 0)
4022 return r;
4023 }
4024 } else if (n > 0) {
4025 _cleanup_free_ void *c = NULL;
4026
4027 assert((size_t) n < s->memory_pressure.write_buffer_size);
4028
4029 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4030 if (!c)
4031 return -ENOMEM;
4032
4033 free_and_replace(s->memory_pressure.write_buffer, c);
4034 s->memory_pressure.write_buffer_size -= n;
4035 return 1;
4036 }
4037
4038 return 0;
4039}
4040
4041static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4042 int r;
4043
4044 assert(s);
4045 assert(s->type == SOURCE_MEMORY_PRESSURE);
4046
4047 r = source_memory_pressure_write(s);
4048 if (r < 0)
4049 return r;
4050 if (r > 0)
4051 return 1; /* If we wrote something, don't continue with dispatching the user callback.
4052 * Instead, shortcut it so that we wait for the next EPOLLOUT immediately. */
4053
4054 /* No pending incoming IO? Then let's not continue further */
4055 if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4056
4057 /* Treat IO errors on the notifier the same way as errors returned from a callback */
4058 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4059 return -EIO;
4060
4061 return 1; /* leave dispatch, we already processed everything */
4062 }
4063
4064 if (s->memory_pressure.revents & EPOLLIN) {
4065 uint8_t pipe_buf[PIPE_BUF];
4066 ssize_t n;
4067
4068 /* If the fd is readable, then flush out anything that might be queued */
4069
4070 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4071 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4072 return -errno;
4073 }
4074
4075 return 0; /* go on, dispatch to user callback */
4076}
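What the write buffer and the EPOLLIN/EPOLLPRI handling implement on the caller's behalf is the kernel's PSI trigger protocol: write "<some|full> <threshold-us> <window-us>" to a pressure file, then wait for POLLPRI. A standalone illustrative sketch of that raw protocol (assumes a kernel with PSI enabled and uses the system-wide /proc/pressure/memory file rather than a cgroup's memory.pressure, which is what the event loop would normally prefer):

/* Illustrative sketch only: block until a memory pressure trigger fires. */
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

static int wait_for_memory_pressure(void) {
        const char trigger[] = "some 150000 1000000"; /* 150 ms of stall within a 1 s window */
        int fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK | O_CLOEXEC);
        if (fd < 0)
                return -1;
        /* As in the kernel documentation's example, the trailing NUL is written too. */
        if (write(fd, trigger, strlen(trigger) + 1) < 0) {
                close(fd);
                return -1;
        }
        struct pollfd p = { .fd = fd, .events = POLLPRI };
        (void) poll(&p, 1, -1); /* POLLPRI is set once the trigger fires */
        close(fd);
        return 0;
}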
4077
fd38203a 4078static int source_dispatch(sd_event_source *s) {
8f5c235d 4079 EventSourceType saved_type;
c8e9d15c 4080 sd_event *saved_event;
fe8245eb 4081 int r = 0;
fd38203a
LP
4082
4083 assert(s);
6203e07a 4084 assert(s->pending || s->type == SOURCE_EXIT);
fd38203a 4085
b778cba4
LP
4086 /* Save the event source type here, so that we still know it after the event callback which might
4087 * invalidate the event. */
8f5c235d
LP
4088 saved_type = s->type;
4089
de02634c 4090 /* Similarly, store a reference to the event loop object, so that we can still access it after the
b778cba4 4091 * callback might have invalidated/disconnected the event source. */
c8e9d15c
YW
4092 saved_event = s->event;
4093 PROTECT_EVENT(saved_event);
b778cba4 4094
de02634c 4095 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
b6d5481b
LP
4096 assert(!s->ratelimited);
4097 if (!ratelimit_below(&s->rate_limit)) {
4098 r = event_source_enter_ratelimited(s);
4099 if (r < 0)
4100 return r;
4101
4102 return 1;
4103 }
4104
945c2931 4105 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
da7e457c
LP
4106 r = source_set_pending(s, false);
4107 if (r < 0)
4108 return r;
4109 }
fd38203a 4110
6e9feda3
LP
4111 if (s->type != SOURCE_POST) {
4112 sd_event_source *z;
6e9feda3 4113
de02634c 4114 /* If we execute a non-post source, let's mark all post sources as pending. */
6e9feda3 4115
90e74a66 4116 SET_FOREACH(z, s->event->post_sources) {
b6d5481b 4117 if (event_source_is_offline(z))
6e9feda3
LP
4118 continue;
4119
4120 r = source_set_pending(z, true);
4121 if (r < 0)
4122 return r;
4123 }
4124 }
4125
158fe190
LP
4126 if (s->type == SOURCE_MEMORY_PRESSURE) {
4127 r = source_memory_pressure_initiate_dispatch(s);
4128 if (r == -EIO) /* handle EIO errors similar to callback errors */
4129 goto finish;
4130 if (r < 0)
4131 return r;
4132 if (r > 0) /* already handled */
4133 return 1;
4134 }
4135
baf76283
LP
4136 if (s->enabled == SD_EVENT_ONESHOT) {
4137 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
fd38203a
LP
4138 if (r < 0)
4139 return r;
4140 }
4141
12179984 4142 s->dispatching = true;
b7484e2a 4143
fd38203a
LP
4144 switch (s->type) {
4145
4146 case SOURCE_IO:
4147 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4148 break;
4149
6a0f1f6d 4150 case SOURCE_TIME_REALTIME:
a8548816 4151 case SOURCE_TIME_BOOTTIME:
6a0f1f6d
LP
4152 case SOURCE_TIME_MONOTONIC:
4153 case SOURCE_TIME_REALTIME_ALARM:
4154 case SOURCE_TIME_BOOTTIME_ALARM:
fd38203a
LP
4155 r = s->time.callback(s, s->time.next, s->userdata);
4156 break;
4157
4158 case SOURCE_SIGNAL:
4159 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4160 break;
4161
08cd1552
LP
4162 case SOURCE_CHILD: {
4163 bool zombie;
4164
945c2931 4165 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
08cd1552 4166
fd38203a 4167 r = s->child.callback(s, &s->child.siginfo, s->userdata);
08cd1552
LP
4168
4169 /* Now, reap the PID for good. */
f8f3f926 4170 if (zombie) {
cc59d290 4171 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
f8f3f926
LP
4172 s->child.waited = true;
4173 }
08cd1552 4174
fd38203a 4175 break;
08cd1552 4176 }
fd38203a
LP
4177
4178 case SOURCE_DEFER:
4179 r = s->defer.callback(s, s->userdata);
4180 break;
da7e457c 4181
6e9feda3
LP
4182 case SOURCE_POST:
4183 r = s->post.callback(s, s->userdata);
4184 break;
4185
6203e07a
LP
4186 case SOURCE_EXIT:
4187 r = s->exit.callback(s, s->userdata);
da7e457c 4188 break;
9d3e3aa5 4189
97ef5391
LP
4190 case SOURCE_INOTIFY: {
4191 struct sd_event *e = s->event;
4192 struct inotify_data *d;
4193 size_t sz;
4194
4195 assert(s->inotify.inode_data);
4196 assert_se(d = s->inotify.inode_data->inotify_data);
4197
4198 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4199 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4200 assert(d->buffer_filled >= sz);
4201
53baf2ef
LP
4202 /* If the inotify callback destroys the event source then this likely means we don't need to
4203 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4204 * free it immediately, then we couldn't drop the event from the inotify event queue without
4205 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
4206 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4207 * explicitly GC it after we are done dropping the inotify event from the buffer. */
4208 d->n_busy++;
97ef5391 4209 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
53baf2ef 4210 d->n_busy--;
97ef5391 4211
53baf2ef
LP
4212 /* When no event is pending anymore on this inotify object, then let's drop the event from
4213 * the inotify event queue buffer. */
97ef5391
LP
4214 if (d->n_pending == 0)
4215 event_inotify_data_drop(e, d, sz);
4216
53baf2ef
LP
4217 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4218 event_gc_inotify_data(e, d);
97ef5391
LP
4219 break;
4220 }
4221
158fe190
LP
4222 case SOURCE_MEMORY_PRESSURE:
4223 r = s->memory_pressure.callback(s, s->userdata);
4224 break;
4225
9d3e3aa5 4226 case SOURCE_WATCHDOG:
a71fe8b8 4227 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
9f2a50a3 4228 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
04499a70 4229 assert_not_reached();
fd38203a
LP
4230 }
4231
12179984
LP
4232 s->dispatching = false;
4233
158fe190 4234finish:
b778cba4
LP
4235 if (r < 0) {
4236 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4237 strna(s->description),
4238 event_source_type_to_string(saved_type),
4239 s->exit_on_failure ? "exiting" : "disabling");
4240
4241 if (s->exit_on_failure)
4242 (void) sd_event_exit(saved_event, r);
4243 }
12179984
LP
4244
4245 if (s->n_ref == 0)
4246 source_free(s);
4247 else if (r < 0)
c3c50474 4248 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
b7484e2a 4249
6203e07a 4250 return 1;
fd38203a
LP
4251}
4252
4253static int event_prepare(sd_event *e) {
4254 int r;
4255
4256 assert(e);
4257
4258 for (;;) {
4259 sd_event_source *s;
4260
4261 s = prioq_peek(e->prepare);
b6d5481b 4262 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
fd38203a
LP
4263 break;
4264
4265 s->prepare_iteration = e->iteration;
8656f4a6 4266 prioq_reshuffle(e->prepare, s, &s->prepare_index);
fd38203a
LP
4267
4268 assert(s->prepare);
12179984 4269 s->dispatching = true;
fd38203a 4270 r = s->prepare(s, s->userdata);
12179984
LP
4271 s->dispatching = false;
4272
b778cba4
LP
4273 if (r < 0) {
4274 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4275 strna(s->description),
4276 event_source_type_to_string(s->type),
4277 s->exit_on_failure ? "exiting" : "disabling");
4278
4279 if (s->exit_on_failure)
4280 (void) sd_event_exit(e, r);
4281 }
fd38203a 4282
12179984
LP
4283 if (s->n_ref == 0)
4284 source_free(s);
4285 else if (r < 0)
c3c50474 4286 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
fd38203a
LP
4287 }
4288
4289 return 0;
4290}
4291
6203e07a 4292static int dispatch_exit(sd_event *e) {
da7e457c
LP
4293 sd_event_source *p;
4294 int r;
4295
4296 assert(e);
4297
6203e07a 4298 p = prioq_peek(e->exit);
19947509
ZJS
4299 assert(!p || p->type == SOURCE_EXIT);
4300
b6d5481b 4301 if (!p || event_source_is_offline(p)) {
da7e457c
LP
4302 e->state = SD_EVENT_FINISHED;
4303 return 0;
4304 }
4305
c8e9d15c 4306 PROTECT_EVENT(e);
da7e457c 4307 e->iteration++;
6203e07a 4308 e->state = SD_EVENT_EXITING;
da7e457c 4309 r = source_dispatch(p);
2b0c9ef7 4310 e->state = SD_EVENT_INITIAL;
da7e457c
LP
4311 return r;
4312}
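Exit sources are the counterpart to sd_event_exit(): once an exit has been requested, dispatch_exit() runs them (highest priority first) while the loop is in the SD_EVENT_EXITING state, before sd_event_loop() returns. An illustrative sketch of installing one (hypothetical names):

/* Illustrative sketch only: run cleanup when the loop is asked to exit. */
#include <stdio.h>
#include <systemd/sd-event.h>

static int on_loop_exit(sd_event_source *s, void *userdata) {
        printf("event loop is exiting\n");
        return 0;
}

static int install_exit_handler(sd_event *e) {
        sd_event_source *s = NULL;
        return sd_event_add_exit(e, &s, on_loop_exit, NULL);
}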
4313
c2ba3ad6
LP
4314static sd_event_source* event_next_pending(sd_event *e) {
4315 sd_event_source *p;
4316
da7e457c
LP
4317 assert(e);
4318
c2ba3ad6
LP
4319 p = prioq_peek(e->pending);
4320 if (!p)
4321 return NULL;
4322
b6d5481b 4323 if (event_source_is_offline(p))
c2ba3ad6
LP
4324 return NULL;
4325
4326 return p;
4327}
4328
cde93897
LP
4329static int arm_watchdog(sd_event *e) {
4330 struct itimerspec its = {};
4331 usec_t t;
cde93897
LP
4332
4333 assert(e);
4334 assert(e->watchdog_fd >= 0);
4335
4336 t = sleep_between(e,
a595fb5c
YW
4337 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4338 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
cde93897
LP
4339
4340 timespec_store(&its.it_value, t);
4341
75145780
LP
4342 /* Make sure we never set the watchdog to 0, which tells the
4343 * kernel to disable it. */
4344 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4345 its.it_value.tv_nsec = 1;
4346
7c248223 4347 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
cde93897
LP
4348}
4349
4350static int process_watchdog(sd_event *e) {
4351 assert(e);
4352
4353 if (!e->watchdog)
4354 return 0;
4355
4356 /* Don't notify watchdog too often */
4357 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4358 return 0;
4359
4360 sd_notify(false, "WATCHDOG=1");
4361 e->watchdog_last = e->timestamp.monotonic;
4362
4363 return arm_watchdog(e);
4364}
4365
97ef5391
LP
4366static void event_close_inode_data_fds(sd_event *e) {
4367 struct inode_data *d;
4368
4369 assert(e);
4370
4371 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4372 * filesystems. But we can't close them right away as we need them as long as the user still wants to make
365c2885 4373 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
97ef5391
LP
4374 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4375 * compromise. */
4376
ed828563 4377 while ((d = e->inode_data_to_close_list)) {
97ef5391
LP
4378 assert(d->fd >= 0);
4379 d->fd = safe_close(d->fd);
4380
ed828563 4381 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
97ef5391
LP
4382 }
4383}
4384
158fe190
LP
4385static int event_memory_pressure_write_list(sd_event *e) {
4386 int r;
4387
4388 assert(e);
4389
4390 for (;;) {
4391 sd_event_source *s;
4392
4393 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4394 if (!s)
4395 break;
4396
4397 assert(s->type == SOURCE_MEMORY_PRESSURE);
4398 assert(s->memory_pressure.write_buffer_size > 0);
4399 s->memory_pressure.in_write_list = false;
4400
4401 r = source_memory_pressure_write(s);
4402 if (r < 0)
4403 return r;
4404 }
4405
4406 return 0;
4407}
4408
c45a5a74
TG
4409_public_ int sd_event_prepare(sd_event *e) {
4410 int r;
fd38203a 4411
da7e457c 4412 assert_return(e, -EINVAL);
b937d761 4413 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c
LP
4414 assert_return(!event_pid_changed(e), -ECHILD);
4415 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2b0c9ef7 4416 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
da7e457c 4417
e5446015
LP
4418 /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
4419 * this check here once, since gettid() is typically not cached, and we thus want to minimize
4420 * syscalls */
4421 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4422
f814c871 4423 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
c8e9d15c 4424 PROTECT_EVENT(e);
f814c871 4425
6203e07a 4426 if (e->exit_requested)
c45a5a74 4427 goto pending;
fd38203a
LP
4428
4429 e->iteration++;
4430
0be6c2f6 4431 e->state = SD_EVENT_PREPARING;
fd38203a 4432 r = event_prepare(e);
0be6c2f6 4433 e->state = SD_EVENT_INITIAL;
fd38203a 4434 if (r < 0)
c45a5a74 4435 return r;
fd38203a 4436
158fe190
LP
4437 r = event_memory_pressure_write_list(e);
4438 if (r < 0)
4439 return r;
4440
6a0f1f6d
LP
4441 r = event_arm_timer(e, &e->realtime);
4442 if (r < 0)
c45a5a74 4443 return r;
6a0f1f6d 4444
a8548816
TG
4445 r = event_arm_timer(e, &e->boottime);
4446 if (r < 0)
c45a5a74 4447 return r;
a8548816 4448
6a0f1f6d
LP
4449 r = event_arm_timer(e, &e->monotonic);
4450 if (r < 0)
c45a5a74 4451 return r;
6a0f1f6d
LP
4452
4453 r = event_arm_timer(e, &e->realtime_alarm);
1b5995b0 4454 if (r < 0)
c45a5a74 4455 return r;
fd38203a 4456
6a0f1f6d 4457 r = event_arm_timer(e, &e->boottime_alarm);
1b5995b0 4458 if (r < 0)
c45a5a74 4459 return r;
fd38203a 4460
97ef5391
LP
4461 event_close_inode_data_fds(e);
4462
0601b958 4463 if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
c45a5a74
TG
4464 goto pending;
4465
2b0c9ef7 4466 e->state = SD_EVENT_ARMED;
c45a5a74
TG
4467
4468 return 0;
4469
4470pending:
2b0c9ef7 4471 e->state = SD_EVENT_ARMED;
6d148a84
TG
4472 r = sd_event_wait(e, 0);
4473 if (r == 0)
2b0c9ef7 4474 e->state = SD_EVENT_ARMED;
6d148a84
TG
4475
4476 return r;
c45a5a74
TG
4477}
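sd_event_prepare(), sd_event_wait() and sd_event_dispatch() are the split-out phases that sd_event_run() normally glues together; driving them by hand is useful when the loop is embedded into a foreign poll loop via sd_event_get_fd(). A minimal illustrative driver (hypothetical name):

/* Illustrative sketch only: one manual iteration of the prepare/wait/dispatch cycle. */
#include <stdint.h>
#include <systemd/sd-event.h>

static int run_one_iteration(sd_event *e) {
        int r;

        r = sd_event_prepare(e);                  /* > 0: something is already pending */
        if (r == 0)
                r = sd_event_wait(e, UINT64_MAX); /* block until an event arrives */
        if (r > 0)
                r = sd_event_dispatch(e);         /* dispatch exactly one pending source */

        return r;
}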
4478
798445ab
LP
4479static int epoll_wait_usec(
4480 int fd,
4481 struct epoll_event *events,
4482 int maxevents,
4483 usec_t timeout) {
4484
7c248223 4485 int msec;
0c14c45e
LP
4486 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4487
4488#if HAVE_EPOLL_PWAIT2
39f756d3 4489 static bool epoll_pwait2_absent = false;
52bb308c 4490 int r;
798445ab 4491
0c14c45e
LP
4492 /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4493 * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4494 * is not that obvious to implement given the libc and kernel definitions differ in the last
4495 * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
4496 * biggie), let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
4497 * missing. */
798445ab
LP
4498
4499 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
798445ab
LP
4500 r = epoll_pwait2(fd,
4501 events,
4502 maxevents,
52bb308c 4503 TIMESPEC_STORE(timeout),
798445ab
LP
4504 NULL);
4505 if (r >= 0)
4506 return r;
7cb45dbf 4507 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
798445ab
LP
4508 return -errno; /* Only fall back to the old epoll_wait() if the syscall is masked or not
4509 * supported. */
4510
4511 epoll_pwait2_absent = true;
4512 }
39f756d3 4513#endif
798445ab
LP
4514
4515 if (timeout == USEC_INFINITY)
4516 msec = -1;
4517 else {
4518 usec_t k;
4519
4520 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4521 if (k >= INT_MAX)
4522 msec = INT_MAX; /* Saturate */
4523 else
4524 msec = (int) k;
4525 }
4526
7c248223 4527 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
798445ab
LP
4528}
4529
efd3be9d 4530static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
319a4f4b 4531 size_t n_event_queue, m, n_event_max;
efd3be9d
YW
4532 int64_t min_priority = threshold;
4533 bool something_new = false;
798445ab 4534 int r;
c45a5a74 4535
efd3be9d
YW
4536 assert(e);
4537 assert(ret_min_priority);
6a0f1f6d 4538
8b9708d1 4539 n_event_queue = MAX(e->n_sources, 1u);
319a4f4b 4540 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
5cddd924 4541 return -ENOMEM;
fd38203a 4542
319a4f4b
LP
4543 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4544
97ef5391 4545 /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
0601b958 4546 if (e->buffered_inotify_data_list)
798445ab 4547 timeout = 0;
97ef5391 4548
8b9708d1 4549 for (;;) {
319a4f4b
LP
4550 r = epoll_wait_usec(
4551 e->epoll_fd,
4552 e->event_queue,
4553 n_event_max,
4554 timeout);
798445ab 4555 if (r < 0)
efd3be9d 4556 return r;
c45a5a74 4557
8b9708d1
YW
4558 m = (size_t) r;
4559
319a4f4b 4560 if (m < n_event_max)
8b9708d1
YW
4561 break;
4562
319a4f4b 4563 if (n_event_max >= n_event_queue * 10)
8b9708d1
YW
4564 break;
4565
319a4f4b 4566 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
8b9708d1
YW
4567 return -ENOMEM;
4568
319a4f4b 4569 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
798445ab 4570 timeout = 0;
da7e457c 4571 }
fd38203a 4572
efd3be9d
YW
4573 /* Set the timestamp only when this is called for the first time. */
4574 if (threshold == INT64_MAX)
4575 triple_timestamp_get(&e->timestamp);
fd38203a 4576
8b9708d1 4577 for (size_t i = 0; i < m; i++) {
fd38203a 4578
5cddd924
LP
4579 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4580 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
9da4cb2b 4581 else {
5cddd924 4582 WakeupType *t = e->event_queue[i].data.ptr;
9da4cb2b
LP
4583
4584 switch (*t) {
4585
f8f3f926 4586 case WAKEUP_EVENT_SOURCE: {
5cddd924 4587 sd_event_source *s = e->event_queue[i].data.ptr;
f8f3f926
LP
4588
4589 assert(s);
4590
efd3be9d
YW
4591 if (s->priority > threshold)
4592 continue;
4593
4594 min_priority = MIN(min_priority, s->priority);
4595
f8f3f926
LP
4596 switch (s->type) {
4597
4598 case SOURCE_IO:
5cddd924 4599 r = process_io(e, s, e->event_queue[i].events);
f8f3f926
LP
4600 break;
4601
4602 case SOURCE_CHILD:
5cddd924 4603 r = process_pidfd(e, s, e->event_queue[i].events);
f8f3f926
LP
4604 break;
4605
158fe190
LP
4606 case SOURCE_MEMORY_PRESSURE:
4607 r = process_memory_pressure(s, e->event_queue[i].events);
4608 break;
4609
f8f3f926 4610 default:
04499a70 4611 assert_not_reached();
f8f3f926
LP
4612 }
4613
9da4cb2b 4614 break;
f8f3f926 4615 }
fd38203a 4616
9da4cb2b 4617 case WAKEUP_CLOCK_DATA: {
5cddd924 4618 struct clock_data *d = e->event_queue[i].data.ptr;
f8f3f926
LP
4619
4620 assert(d);
4621
5cddd924 4622 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
9da4cb2b
LP
4623 break;
4624 }
4625
4626 case WAKEUP_SIGNAL_DATA:
efd3be9d 4627 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
9da4cb2b
LP
4628 break;
4629
97ef5391 4630 case WAKEUP_INOTIFY_DATA:
efd3be9d 4631 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
97ef5391
LP
4632 break;
4633
9da4cb2b 4634 default:
04499a70 4635 assert_not_reached();
9da4cb2b
LP
4636 }
4637 }
efd3be9d
YW
4638 if (r < 0)
4639 return r;
4640 if (r > 0)
4641 something_new = true;
4642 }
4643
4644 *ret_min_priority = min_priority;
4645 return something_new;
4646}
4647
4648_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4649 int r;
4650
4651 assert_return(e, -EINVAL);
4652 assert_return(e = event_resolve(e), -ENOPKG);
4653 assert_return(!event_pid_changed(e), -ECHILD);
4654 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4655 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4656
4657 if (e->exit_requested) {
4658 e->state = SD_EVENT_PENDING;
4659 return 1;
4660 }
4661
4662 for (int64_t threshold = INT64_MAX; ; threshold--) {
4663 int64_t epoll_min_priority, child_min_priority;
4664
4665 /* There may be a possibility that new epoll (especially IO) and child events are
4666 * triggered just after the process_epoll() call but before process_child(), and the new IO
4667 * events may have higher priority than the child events. To salvage these events,
4668 * let's call epoll_wait() again, but accept only events with higher priority than the
4669 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
4670 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4671 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4672
4673 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4674 if (r == -EINTR) {
4675 e->state = SD_EVENT_PENDING;
4676 return 1;
4677 }
4678 if (r < 0)
4679 goto finish;
4680 if (r == 0 && threshold < INT64_MAX)
4681 /* No new epoll event. */
4682 break;
4683
4684 r = process_child(e, threshold, &child_min_priority);
fd38203a 4685 if (r < 0)
da7e457c 4686 goto finish;
efd3be9d
YW
4687 if (r == 0)
4688 /* No new child event. */
4689 break;
4690
4691 threshold = MIN(epoll_min_priority, child_min_priority);
4692 if (threshold == INT64_MIN)
4693 break;
4694
4695 timeout = 0;
fd38203a
LP
4696 }
4697
cde93897
LP
4698 r = process_watchdog(e);
4699 if (r < 0)
4700 goto finish;
4701
fd69f224 4702 r = process_inotify(e);
6a0f1f6d
LP
4703 if (r < 0)
4704 goto finish;
4705
fd69f224 4706 r = process_timer(e, e->timestamp.realtime, &e->realtime);
a8548816
TG
4707 if (r < 0)
4708 goto finish;
4709
fd69f224 4710 r = process_timer(e, e->timestamp.boottime, &e->boottime);
6a0f1f6d
LP
4711 if (r < 0)
4712 goto finish;
4713
4714 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
fd38203a 4715 if (r < 0)
da7e457c 4716 goto finish;
fd38203a 4717
e475d10c 4718 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
fd38203a 4719 if (r < 0)
da7e457c 4720 goto finish;
fd38203a 4721
fd69f224 4722 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
97ef5391
LP
4723 if (r < 0)
4724 goto finish;
fd69f224
MS
4725 else if (r == 1) {
4726 /* The ratelimit expiry callback was called. Let's postpone processing pending sources and
4727 * put the loop in the initial state, in order to also evaluate (in the next iteration) sources
4728 * that were potentially re-enabled by the callback.
4729 *
4730 * Wondering why we treat only this invocation of process_timer() differently? Once an event
4731 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4732 * ratelimit expiry callback is never called for any other timer type. */
4733 r = 0;
4734 goto finish;
4735 }
97ef5391 4736
c45a5a74
TG
4737 if (event_next_pending(e)) {
4738 e->state = SD_EVENT_PENDING;
c45a5a74 4739 return 1;
da7e457c
LP
4740 }
4741
c45a5a74 4742 r = 0;
fd38203a 4743
da7e457c 4744finish:
2b0c9ef7 4745 e->state = SD_EVENT_INITIAL;
da7e457c
LP
4746
4747 return r;
fd38203a
LP
4748}
4749
c45a5a74
TG
4750_public_ int sd_event_dispatch(sd_event *e) {
4751 sd_event_source *p;
4752 int r;
4753
4754 assert_return(e, -EINVAL);
b937d761 4755 assert_return(e = event_resolve(e), -ENOPKG);
c45a5a74
TG
4756 assert_return(!event_pid_changed(e), -ECHILD);
4757 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4758 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4759
4760 if (e->exit_requested)
4761 return dispatch_exit(e);
4762
4763 p = event_next_pending(e);
4764 if (p) {
c8e9d15c 4765 PROTECT_EVENT(e);
c45a5a74
TG
4766
4767 e->state = SD_EVENT_RUNNING;
4768 r = source_dispatch(p);
2b0c9ef7 4769 e->state = SD_EVENT_INITIAL;
c45a5a74
TG
4770 return r;
4771 }
4772
2b0c9ef7 4773 e->state = SD_EVENT_INITIAL;
c45a5a74
TG
4774
4775 return 1;
4776}
4777
34b87517 4778static void event_log_delays(sd_event *e) {
442ac269
YW
4779 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4780 size_t l, i;
34b87517 4781
442ac269
YW
4782 p = b;
4783 l = sizeof(b);
4784 for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4785 l = strpcpyf(&p, l, "%u ", e->delays[i]);
34b87517
VC
4786 e->delays[i] = 0;
4787 }
442ac269 4788 log_debug("Event loop iterations: %s", b);
34b87517
VC
4789}
4790
c45a5a74
TG
4791_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4792 int r;
4793
4794 assert_return(e, -EINVAL);
b937d761 4795 assert_return(e = event_resolve(e), -ENOPKG);
c45a5a74
TG
4796 assert_return(!event_pid_changed(e), -ECHILD);
4797 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2b0c9ef7 4798 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
c45a5a74 4799
e6a7bee5 4800 if (e->profile_delays && e->last_run_usec != 0) {
34b87517
VC
4801 usec_t this_run;
4802 unsigned l;
4803
4804 this_run = now(CLOCK_MONOTONIC);
4805
58c34be8 4806 l = log2u64(this_run - e->last_run_usec);
cb9d621e 4807 assert(l < ELEMENTSOF(e->delays));
34b87517
VC
4808 e->delays[l]++;
4809
e6a7bee5 4810 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
34b87517 4811 event_log_delays(e);
e6a7bee5 4812 e->last_log_usec = this_run;
34b87517
VC
4813 }
4814 }
4815
f814c871 4816 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
c8e9d15c 4817 PROTECT_EVENT(e);
f814c871 4818
c45a5a74 4819 r = sd_event_prepare(e);
53bac4e0
LP
4820 if (r == 0)
4821 /* There was nothing? Then wait... */
4822 r = sd_event_wait(e, timeout);
c45a5a74 4823
34b87517 4824 if (e->profile_delays)
e6a7bee5 4825 e->last_run_usec = now(CLOCK_MONOTONIC);
34b87517 4826
02d30981 4827 if (r > 0) {
53bac4e0 4828 /* There's something now, then let's dispatch it */
02d30981
TG
4829 r = sd_event_dispatch(e);
4830 if (r < 0)
4831 return r;
53bac4e0
LP
4832
4833 return 1;
4834 }
4835
4836 return r;
c45a5a74
TG
4837}
4838
f7262a9f 4839_public_ int sd_event_loop(sd_event *e) {
fd38203a
LP
4840 int r;
4841
da7e457c 4842 assert_return(e, -EINVAL);
b937d761 4843 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c 4844 assert_return(!event_pid_changed(e), -ECHILD);
2b0c9ef7 4845 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
da7e457c 4846
c8e9d15c 4847 PROTECT_EVENT(e);
fd38203a 4848
da7e457c 4849 while (e->state != SD_EVENT_FINISHED) {
f5fbe71d 4850 r = sd_event_run(e, UINT64_MAX);
fd38203a 4851 if (r < 0)
30dd293c 4852 return r;
fd38203a
LP
4853 }
4854
30dd293c 4855 return e->exit_code;
fd38203a
LP
4856}
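Putting the pieces together, the typical whole-program shape around sd_event_loop() is roughly the following illustrative sketch (the defer callback simply requests exit, so the loop returns immediately with the code passed to sd_event_exit()):

/* Illustrative sketch only: allocate a loop, add one source, run, clean up. */
#include <systemd/sd-event.h>

static int on_first_iteration(sd_event_source *s, void *userdata) {
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

int main(void) {
        sd_event *e = NULL;
        sd_event_source *s = NULL;
        int r;

        r = sd_event_default(&e);
        if (r < 0)
                return 1;

        r = sd_event_add_defer(e, &s, on_first_iteration, NULL);
        if (r >= 0)
                r = sd_event_loop(e); /* returns the exit code passed to sd_event_exit() */

        sd_event_source_unref(s);
        sd_event_unref(e);
        return r < 0 ? 1 : 0;
}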
4857
9b364545 4858_public_ int sd_event_get_fd(sd_event *e) {
9b364545 4859 assert_return(e, -EINVAL);
b937d761 4860 assert_return(e = event_resolve(e), -ENOPKG);
9b364545
TG
4861 assert_return(!event_pid_changed(e), -ECHILD);
4862
4863 return e->epoll_fd;
4864}
4865
f7262a9f 4866_public_ int sd_event_get_state(sd_event *e) {
da7e457c 4867 assert_return(e, -EINVAL);
b937d761 4868 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c
LP
4869 assert_return(!event_pid_changed(e), -ECHILD);
4870
4871 return e->state;
4872}
4873
6203e07a 4874_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
da7e457c 4875 assert_return(e, -EINVAL);
b937d761 4876 assert_return(e = event_resolve(e), -ENOPKG);
6203e07a 4877 assert_return(code, -EINVAL);
da7e457c 4878 assert_return(!event_pid_changed(e), -ECHILD);
fd38203a 4879
6203e07a
LP
4880 if (!e->exit_requested)
4881 return -ENODATA;
4882
4883 *code = e->exit_code;
4884 return 0;
fd38203a
LP
4885}
4886
6203e07a 4887_public_ int sd_event_exit(sd_event *e, int code) {
da7e457c 4888 assert_return(e, -EINVAL);
b937d761 4889 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c
LP
4890 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4891 assert_return(!event_pid_changed(e), -ECHILD);
fd38203a 4892
6203e07a
LP
4893 e->exit_requested = true;
4894 e->exit_code = code;
4895
fd38203a
LP
4896 return 0;
4897}
46e8c825 4898
6a0f1f6d 4899_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
46e8c825 4900 assert_return(e, -EINVAL);
b937d761 4901 assert_return(e = event_resolve(e), -ENOPKG);
46e8c825 4902 assert_return(usec, -EINVAL);
46e8c825
LP
4903 assert_return(!event_pid_changed(e), -ECHILD);
4904
e475d10c
LP
4905 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4906 return -EOPNOTSUPP;
4907
e475d10c 4908 if (!triple_timestamp_is_set(&e->timestamp)) {
15c689d7 4909 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
38a03f06
LP
4910 *usec = now(clock);
4911 return 1;
4912 }
46e8c825 4913
e475d10c 4914 *usec = triple_timestamp_by_clock(&e->timestamp, clock);
46e8c825
LP
4915 return 0;
4916}
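sd_event_now() is mostly useful for arming timers relative to the loop's cached timestamp, so that everything scheduled within one iteration agrees on what "now" means. An illustrative sketch (hypothetical helper; passing NULL as the source pointer is assumed to create a floating source, as in current systemd):

/* Illustrative sketch only: fire a callback five seconds from the loop's "now". */
#include <time.h>
#include <systemd/sd-event.h>

static int on_timeout(sd_event_source *s, uint64_t usec, void *userdata) {
        return 0;
}

static int arm_in_five_seconds(sd_event *e) {
        uint64_t now_usec;
        int r;

        r = sd_event_now(e, CLOCK_MONOTONIC, &now_usec); /* 0: cached timestamp, 1: fresh now() */
        if (r < 0)
                return r;

        return sd_event_add_time(e, NULL, CLOCK_MONOTONIC,
                                 now_usec + 5 * 1000000ULL, /* absolute: now + 5 s */
                                 0,                         /* default accuracy */
                                 on_timeout, NULL);
}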
afc6adb5
LP
4917
4918_public_ int sd_event_default(sd_event **ret) {
39883f62 4919 sd_event *e = NULL;
afc6adb5
LP
4920 int r;
4921
4922 if (!ret)
4923 return !!default_event;
4924
4925 if (default_event) {
4926 *ret = sd_event_ref(default_event);
4927 return 0;
4928 }
4929
4930 r = sd_event_new(&e);
4931 if (r < 0)
4932 return r;
4933
4934 e->default_event_ptr = &default_event;
4935 e->tid = gettid();
4936 default_event = e;
4937
4938 *ret = e;
4939 return 1;
4940}
4941
4942_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
4943 assert_return(e, -EINVAL);
b937d761 4944 assert_return(e = event_resolve(e), -ENOPKG);
afc6adb5 4945 assert_return(tid, -EINVAL);
76b54375 4946 assert_return(!event_pid_changed(e), -ECHILD);
afc6adb5 4947
76b54375
LP
4948 if (e->tid != 0) {
4949 *tid = e->tid;
4950 return 0;
4951 }
4952
4953 return -ENXIO;
afc6adb5 4954}
cde93897
LP
4955
4956_public_ int sd_event_set_watchdog(sd_event *e, int b) {
4957 int r;
4958
4959 assert_return(e, -EINVAL);
b937d761 4960 assert_return(e = event_resolve(e), -ENOPKG);
8f726607 4961 assert_return(!event_pid_changed(e), -ECHILD);
cde93897
LP
4962
4963 if (e->watchdog == !!b)
4964 return e->watchdog;
4965
4966 if (b) {
09812eb7
LP
4967 r = sd_watchdog_enabled(false, &e->watchdog_period);
4968 if (r <= 0)
cde93897 4969 return r;
cde93897
LP
4970
4971 /* Issue first ping immediately */
4972 sd_notify(false, "WATCHDOG=1");
4973 e->watchdog_last = now(CLOCK_MONOTONIC);
4974
4975 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
4976 if (e->watchdog_fd < 0)
4977 return -errno;
4978
4979 r = arm_watchdog(e);
4980 if (r < 0)
4981 goto fail;
4982
1eac7948 4983 struct epoll_event ev = {
a82f89aa
LP
4984 .events = EPOLLIN,
4985 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
4986 };
cde93897 4987
15c689d7 4988 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
cde93897
LP
4989 r = -errno;
4990 goto fail;
4991 }
4992
4993 } else {
4994 if (e->watchdog_fd >= 0) {
5a795bff 4995 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
03e334a1 4996 e->watchdog_fd = safe_close(e->watchdog_fd);
cde93897
LP
4997 }
4998 }
4999
5000 e->watchdog = !!b;
5001 return e->watchdog;
5002
5003fail:
03e334a1 5004 e->watchdog_fd = safe_close(e->watchdog_fd);
cde93897
LP
5005 return r;
5006}
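A service opts into this with a single call; the watchdog period itself comes from the WATCHDOG_USEC environment variable that the service manager sets for units with WatchdogSec=, queried via sd_watchdog_enabled(). An illustrative sketch (hypothetical name):

/* Illustrative sketch only: let the loop send WATCHDOG=1 keep-alives automatically. */
#include <systemd/sd-event.h>

static int enable_watchdog(sd_event *e) {
        int r = sd_event_set_watchdog(e, 1);
        if (r < 0)
                return r;
        /* r == 0: the manager did not request a watchdog, nothing to do.
         * r > 0: the loop now pings between 1/2 and 3/4 of the period, as arm_watchdog()
         *        above computes. */
        return 0;
}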
8f726607
LP
5007
5008_public_ int sd_event_get_watchdog(sd_event *e) {
5009 assert_return(e, -EINVAL);
b937d761 5010 assert_return(e = event_resolve(e), -ENOPKG);
8f726607
LP
5011 assert_return(!event_pid_changed(e), -ECHILD);
5012
5013 return e->watchdog;
5014}
60a3b1e1
LP
5015
5016_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5017 assert_return(e, -EINVAL);
b937d761 5018 assert_return(e = event_resolve(e), -ENOPKG);
60a3b1e1
LP
5019 assert_return(!event_pid_changed(e), -ECHILD);
5020
5021 *ret = e->iteration;
5022 return 0;
5023}
15723a1d
LP
5024
5025_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5026 assert_return(s, -EINVAL);
5027
5028 s->destroy_callback = callback;
5029 return 0;
5030}
5031
5032_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5033 assert_return(s, -EINVAL);
5034
5035 if (ret)
5036 *ret = s->destroy_callback;
5037
5038 return !!s->destroy_callback;
5039}
2382c936
YW
5040
5041_public_ int sd_event_source_get_floating(sd_event_source *s) {
5042 assert_return(s, -EINVAL);
5043
5044 return s->floating;
5045}
5046
5047_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5048 assert_return(s, -EINVAL);
5049
5050 if (s->floating == !!b)
5051 return 0;
5052
5053 if (!s->event) /* Already disconnected */
5054 return -ESTALE;
5055
5056 s->floating = b;
5057
5058 if (b) {
5059 sd_event_source_ref(s);
5060 sd_event_unref(s->event);
5061 } else {
5062 sd_event_ref(s->event);
5063 sd_event_source_unref(s);
5064 }
5065
5066 return 1;
5067}
b778cba4
LP
5068
5069_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5070 assert_return(s, -EINVAL);
5071 assert_return(s->type != SOURCE_EXIT, -EDOM);
5072
5073 return s->exit_on_failure;
5074}
5075
5076_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5077 assert_return(s, -EINVAL);
5078 assert_return(s->type != SOURCE_EXIT, -EDOM);
5079
5080 if (s->exit_on_failure == !!b)
5081 return 0;
5082
5083 s->exit_on_failure = b;
5084 return 1;
5085}
b6d5481b
LP
5086
5087_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5088 int r;
5089
5090 assert_return(s, -EINVAL);
5091
5092 /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
5093 * so is a programming error. */
5094 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5095
5096 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5097 * non-ratelimited. */
fd69f224 5098 r = event_source_leave_ratelimit(s, /* run_callback */ false);
b6d5481b
LP
5099 if (r < 0)
5100 return r;
5101
5102 s->rate_limit = (RateLimit) { interval, burst };
5103 return 0;
fd69f224
MS
5104}
5105
5106_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5107 assert_return(s, -EINVAL);
5108
5109 s->ratelimit_expire_callback = callback;
5110 return 0;
b6d5481b
LP
5111}
5112
5113_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5114 assert_return(s, -EINVAL);
5115
6dd3b818
YW
5116 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5117 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
b6d5481b
LP
5118 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5119 return -EDOM;
5120
5121 if (!ratelimit_configured(&s->rate_limit))
5122 return -ENOEXEC;
5123
5124 if (ret_interval)
5125 *ret_interval = s->rate_limit.interval;
5126 if (ret_burst)
5127 *ret_burst = s->rate_limit.burst;
5128
5129 return 0;
5130}
5131
5132_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5133 assert_return(s, -EINVAL);
5134
5135 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5136 return false;
5137
5138 if (!ratelimit_configured(&s->rate_limit))
5139 return false;
5140
5141 return s->ratelimited;
5142}
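A typical use of the ratelimit API defined above is to cap a busy source and be told when the window ends; the expiry callback is invoked from process_timer(), as seen earlier. An illustrative sketch with hypothetical values:

/* Illustrative sketch only: allow at most 10 dispatches per second for this source. */
#include <systemd/sd-event.h>

static int on_ratelimit_expired(sd_event_source *s, void *userdata) {
        /* The ratelimit window ended and the source is online again. */
        return 0;
}

static int throttle_source(sd_event_source *s) {
        int r = sd_event_source_set_ratelimit(s, 1000000 /* 1 s interval */, 10 /* burst */);
        if (r < 0)
                return r;
        return sd_event_source_set_ratelimit_expire_callback(s, on_ratelimit_expired);
}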
baf3fdec
LP
5143
5144_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5145 bool change = false;
5146 int r;
5147
5148 assert_return(e, -EINVAL);
5149
5150 if (b) {
5151 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5152 * to. But we also don't want them to pin the event loop itself. Hence we mark them as
5153 * floating after creation (and undo this before deleting them again). */
5154
5155 if (!e->sigint_event_source) {
5156 r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5157 if (r < 0)
5158 return r;
5159
5160 assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5161 change = true;
5162 }
5163
5164 if (!e->sigterm_event_source) {
5165 r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5166 if (r < 0) {
5167 if (change) {
5168 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5169 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5170 }
5171
5172 return r;
5173 }
5174
5175 assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5176 change = true;
5177 }
5178
5179 } else {
5180 if (e->sigint_event_source) {
5181 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5182 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5183 change = true;
5184 }
5185
5186 if (e->sigterm_event_source) {
5187 assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5188 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5189 change = true;
5190 }
5191 }
5192
5193 return change;
5194}
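This is a convenience for the common "exit cleanly on SIGINT/SIGTERM" pattern: the two floating signal sources request loop exit when either signal arrives. An illustrative sketch (assuming a systemd version that ships sd_event_set_signal_exit()):

/* Illustrative sketch only: terminate the loop on Ctrl-C or SIGTERM. */
#include <systemd/sd-event.h>

static int setup_and_run(sd_event *e) {
        int r = sd_event_set_signal_exit(e, 1); /* installs floating SIGINT/SIGTERM sources */
        if (r < 0)
                return r;
        return sd_event_loop(e); /* returns once either signal has been received */
}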
158fe190
LP
5195
5196_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5197 _cleanup_free_ char *b = NULL;
5198 _cleanup_free_ void *w = NULL;
5199
5200 assert_return(s, -EINVAL);
5201 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5202 assert_return(ty, -EINVAL);
5203
5204 if (!STR_IN_SET(ty, "some", "full"))
5205 return -EINVAL;
5206
5207 if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5208 return -EBUSY;
5209
5210 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5211 if (!space)
5212 return -EINVAL;
5213
5214 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5215 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5216 if (!b)
5217 return -ENOMEM;
5218 if (!STR_IN_SET(b, "some", "full"))
5219 return -EINVAL;
5220
5221 if (streq(b, ty))
5222 return 0;
5223
5224 size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5225 w = new(char, nl);
5226 if (!w)
5227 return -ENOMEM;
5228
5229 memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5230
5231 free_and_replace(s->memory_pressure.write_buffer, w);
5232 s->memory_pressure.write_buffer_size = nl;
5233 s->memory_pressure.locked = false;
5234
5235 return 1;
5236}
5237
5238_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5239 _cleanup_free_ char *b = NULL;
5240 _cleanup_free_ void *w = NULL;
5241
5242 assert_return(s, -EINVAL);
5243 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5244
5245 if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5246 return -ERANGE;
5247 if (window_usec <= 0 || window_usec >= UINT64_MAX)
5248 return -ERANGE;
5249 if (threshold_usec > window_usec)
5250 return -EINVAL;
5251
5252 if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5253 return -EBUSY;
5254
5255 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5256 if (!space)
5257 return -EINVAL;
5258
5259 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5260 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5261 if (!b)
5262 return -ENOMEM;
5263 if (!STR_IN_SET(b, "some", "full"))
5264 return -EINVAL;
5265
5266 if (asprintf((char**) &w,
5267 "%s " USEC_FMT " " USEC_FMT "",
5268 b,
5269 threshold_usec,
5270 window_usec) < 0)
5271 return -EINVAL;
5272
5273 l = strlen(w) + 1;
5274 if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5275 return 0;
5276
5277 free_and_replace(s->memory_pressure.write_buffer, w);
5278 s->memory_pressure.write_buffer_size = l;
5279 s->memory_pressure.locked = false;
5280
5281 return 1;
5282}
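The two setters above adjust the PSI trigger string before the loop starts writing it (they refuse with -EBUSY once the buffer is locked). An illustrative end-to-end sketch; sd_event_add_memory_pressure() with this signature is assumed to match the systemd version that provides these setters:

/* Illustrative sketch only: a tuned memory pressure source. */
#include <systemd/sd-event.h>

static int on_pressure(sd_event_source *s, void *userdata) {
        return 0;
}

static int add_tuned_pressure_source(sd_event *e) {
        sd_event_source *s = NULL;
        int r;

        r = sd_event_add_memory_pressure(e, &s, on_pressure, NULL);
        if (r < 0)
                return r;

        /* Switch stall tracking to "full"... */
        r = sd_event_source_set_memory_pressure_type(s, "full");
        if (r < 0)
                return r;

        /* ...and require 100 ms of stall within a 1 s window before dispatching. */
        return sd_event_source_set_memory_pressure_period(s, 100000, 1000000);
}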