/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sys/wait.h>

#include "sd-daemon.h"
#include "sd-event.h"
#include "sd-id128.h"
#include "sd-messages.h"

#include "alloc-util.h"
#include "env-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "fs-util.h"
#include "glyph-util.h"
#include "hashmap.h"
#include "hexdecoct.h"
#include "list.h"
#include "logarithm.h"
#include "macro.h"
#include "mallinfo-util.h"
#include "memory-util.h"
#include "missing_magic.h"
#include "missing_syscall.h"
#include "missing_threads.h"
#include "path-util.h"
#include "prioq.h"
#include "process-util.h"
#include "psi-util.h"
#include "set.h"
#include "signal-util.h"
#include "socket-util.h"
#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strxcpyx.h"
#include "time-util.h"

#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)

static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
        /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
        return s &&
                s->type == SOURCE_CHILD &&
                s->child.pidfd >= 0 &&
                s->child.options == WEXITED;
}

static bool event_source_is_online(sd_event_source *s) {
        assert(s);
        return s->enabled != SD_EVENT_OFF && !s->ratelimited;
}

static bool event_source_is_offline(sd_event_source *s) {
        assert(s);
        return s->enabled == SD_EVENT_OFF || s->ratelimited;
}

static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
        [SOURCE_IO]                  = "io",
        [SOURCE_TIME_REALTIME]       = "realtime",
        [SOURCE_TIME_BOOTTIME]       = "bootime",
        [SOURCE_TIME_MONOTONIC]      = "monotonic",
        [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
        [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
        [SOURCE_SIGNAL]              = "signal",
        [SOURCE_CHILD]               = "child",
        [SOURCE_DEFER]               = "defer",
        [SOURCE_POST]                = "post",
        [SOURCE_EXIT]                = "exit",
        [SOURCE_WATCHDOG]            = "watchdog",
        [SOURCE_INOTIFY]             = "inotify",
        [SOURCE_MEMORY_PRESSURE]     = "memory-pressure",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);

#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)

#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_IO,                       \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_SIGNAL,                   \
               SOURCE_DEFER,                    \
               SOURCE_INOTIFY,                  \
               SOURCE_MEMORY_PRESSURE)

/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)

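/* The event loop object itself: wraps an epoll fd plus the per-clock timerfds, the various priority
 * queues of event sources, and the bookkeeping needed to dispatch them. */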
struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;

        Prioq *pending;
        Prioq *prepare;

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources;

        Set *post_sources;

        Prioq *exit;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close_list);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, buffered_inotify_data_list);

        /* A list of memory pressure event sources that still need their subscription string written */
        LIST_HEAD(sd_event_source, memory_pressure_write_list);

        pid_t original_pid;

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        sd_event_source *sigint_event_source, *sigterm_event_source;

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8];
};

static thread_local sd_event *default_event = NULL;

static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);

static sd_event *event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}

static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->pending);
        assert(y->pending);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Lower priority values first */
        r = CMP(x->priority, y->priority);
        if (r != 0)
                return r;

        /* Older entries first */
        return CMP(x->pending_iteration, y->pending_iteration);
}

static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->prepare);
        assert(y->prepare);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Move most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        r = CMP(x->prepare_iteration, y->prepare_iteration);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static usec_t time_event_source_next(const sd_event_source *s) {
        assert(s);

        /* We have two kinds of event sources that have elapsation times associated with them: the actual
         * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
         * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
         * looking at here. */

        if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Otherwise this must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return s->time.next;

        return USEC_INFINITY;
}

static usec_t time_event_source_latest(const sd_event_source *s) {
        assert(s);

        if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
                               * same, as we should avoid adding additional inaccuracy on an inaccuracy time
                               * window */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return usec_add(s->time.next, s->time.accuracy);

        return USEC_INFINITY;
}

static bool event_source_timer_candidate(const sd_event_source *s) {
        assert(s);

        /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
         * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
        return !s->pending || s->ratelimited;
}

static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
        r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
        if (r != 0)
                return r;

        /* Order by time */
        return CMP(time_func(x), time_func(y));
}

static int earliest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_next);
}

static int latest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_latest);
}

static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}

static sd_event *event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->inotify_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);

        free(e->event_queue);

        return mfree(e);
}

_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new(sd_event, 1);
        if (!e)
                return -ENOMEM;

        *e = (sd_event) {
                .n_ref = 1,
                .epoll_fd = -EBADF,
                .watchdog_fd = -EBADF,
                .realtime.wakeup = WAKEUP_CLOCK_DATA,
                .realtime.fd = -EBADF,
                .realtime.next = USEC_INFINITY,
                .boottime.wakeup = WAKEUP_CLOCK_DATA,
                .boottime.fd = -EBADF,
                .boottime.next = USEC_INFINITY,
                .monotonic.wakeup = WAKEUP_CLOCK_DATA,
                .monotonic.fd = -EBADF,
                .monotonic.next = USEC_INFINITY,
                .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .realtime_alarm.fd = -EBADF,
                .realtime_alarm.next = USEC_INFINITY,
                .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .boottime_alarm.fd = -EBADF,
                .boottime_alarm.next = USEC_INFINITY,
                .perturb = USEC_INFINITY,
                .original_pid = getpid_cached(),
        };

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
                          special_glyph(SPECIAL_GLYPH_ELLIPSIS));
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}

DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event, sd_event, event_free);
#define PROTECT_EVENT(e)                                                \
        _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);

_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
        if (s)
                (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
        return sd_event_source_unref(s);
}

static bool event_pid_changed(sd_event *e) {
        assert(e);

        /* We don't support people creating an event loop and keeping
         * it around over a fork(). Let's complain. */

        return e->original_pid != getpid_cached();
}

static void source_io_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}

static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->io.fd, &ev) < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}

static void source_child_pidfd_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (event_pid_changed(s->event))
                return;

        if (!s->child.registered)
                return;

        if (EVENT_SOURCE_WATCH_PIDFD(s))
                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

        s->child.registered = false;
}

static int source_child_pidfd_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_CHILD);
        assert(enabled != SD_EVENT_OFF);

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                struct epoll_event ev = {
                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                        .data.ptr = s,
                };

                if (epoll_ctl(s->event->epoll_fd,
                              s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                              s->child.pidfd, &ev) < 0)
                        return -errno;
        }

        s->child.registered = true;
        return 0;
}

static void source_memory_pressure_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (event_pid_changed(s->event))
                return;

        if (!s->memory_pressure.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->memory_pressure.registered = false;
}

static int source_memory_pressure_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
                          (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->memory_pressure.fd, &ev) < 0)
                return -errno;

        s->memory_pressure.registered = true;
        return 0;
}

static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (s->memory_pressure.in_write_list)
                return;

        LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = true;
}

static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (!s->memory_pressure.in_write_list)
                return;

        LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = false;
}

static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}

static EventSourceType clock_to_event_source_type(clockid_t clock) {

        switch (clock) {

        case CLOCK_REALTIME:
                return SOURCE_TIME_REALTIME;

        case CLOCK_BOOTTIME:
                return SOURCE_TIME_BOOTTIME;

        case CLOCK_MONOTONIC:
                return SOURCE_TIME_MONOTONIC;

        case CLOCK_REALTIME_ALARM:
                return SOURCE_TIME_REALTIME_ALARM;

        case CLOCK_BOOTTIME_ALARM:
                return SOURCE_TIME_BOOTTIME_ALARM;

        default:
                return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        }
}

static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        assert(e);

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}

static void event_free_signal_data(sd_event *e, struct signal_data *d) {
        assert(e);

        if (!d)
                return;

        hashmap_remove(e->signal_data, &d->priority);
        safe_close(d->fd);
        free(d);
}

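/* Looks up (or allocates) the signal_data object for the priority the signal is watched at, adds the
 * signal to its signalfd mask and makes sure the signalfd is registered in the epoll. */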
static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                d = new(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                *d = (struct signal_data) {
                        .wakeup = WAKEUP_SIGNAL_DATA,
                        .fd = -EBADF,
                        .priority = priority,
                };

                r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd >= 0 ? d->fd : -1,  /* the first arg must be -1 or a valid signalfd */
                     &ss_copy,
                     SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        if (added)
                event_free_signal_data(e, d);

        return r;
}

static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way removes it. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {
                /* If all the mask is all-zero we can get rid of the structure */
                event_free_signal_data(e, d);
                return;
        }

        if (event_pid_changed(e))
                return;

        assert(d->fd >= 0);

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}

static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
         * and possibly drop the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_online_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            event_source_is_online(e->signal_sources[sig]))
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}

static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
        assert(s);

        /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
         * they are enabled/disabled or marked pending and such. */

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
}

static void event_source_time_prioq_reshuffle(sd_event_source *s) {
        struct clock_data *d;

        assert(s);

        /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
         * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
         * properly again. */

        if (s->ratelimited)
                d = &s->event->monotonic;
        else if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(d = event_get_clock_data(s->event, s->type));
        else
                return; /* no-op for an event source which is neither a timer nor ratelimited. */

        prioq_reshuffle(d->earliest, s, &s->earliest_index);
        prioq_reshuffle(d->latest, s, &s->latest_index);
        d->needs_rearm = true;
}

static void event_source_time_prioq_remove(
                sd_event_source *s,
                struct clock_data *d) {

        assert(s);
        assert(d);

        prioq_remove(d->earliest, s, &s->earliest_index);
        prioq_remove(d->latest, s, &s->latest_index);
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        d->needs_rearm = true;
}

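/* Detaches an event source from its event loop: deregisters it from the epoll/signalfd, removes it from
 * all priority queues and from the loop's bookkeeping, but does not free it. */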
static void source_disconnect(sd_event_source *s) {
        sd_event *event;
        int r;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from the time event source here if it is not ratelimited. If
                 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                        if (s->signal.unblock) {
                                sigset_t new_ss;

                                if (sigemptyset(&new_ss) < 0)
                                        log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
                                else if (sigaddset(&new_ss, s->signal.sig) < 0)
                                        log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
                                else {
                                        r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
                                        if (r != 0)
                                                log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
                                }
                        }
                }

                break;

        case SOURCE_CHILD:
                if (event_pid_changed(s->event))
                        s->child.process_owned = false;

                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
                         * continues to be watched. That's because inotify doesn't really have an API for that: we
                         * can only change watch masks with access to the original inode either by fd or by path. But
                         * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
                         * continuously and keeping the mount busy which we can't really do. We could reconstruct the
                         * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
                         * there), but given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way, leave the
                         * mask set, even if it is not minimized now, and ignore all events we aren't interested in
                         * anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_remove_from_write_list(s);
                source_memory_pressure_unregister(s);
                break;

        default:
                assert_not_reached();
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        if (!s->floating)
                sd_event_unref(event);
}

static sd_event_source* source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                s->io.fd = safe_close(s->io.fd);

        if (s->type == SOURCE_CHILD) {
                /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */

                if (s->child.process_owned) {

                        if (!s->child.exited) {
                                bool sent = false;

                                if (s->child.pidfd >= 0) {
                                        if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
                                                if (errno == ESRCH) /* Already dead */
                                                        sent = true;
                                                else if (!ERRNO_IS_NOT_SUPPORTED(errno))
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
                                                                        s->child.pid);
                                        } else
                                                sent = true;
                                }

                                if (!sent)
                                        if (kill(s->child.pid, SIGKILL) < 0)
                                                if (errno != ESRCH) /* Already dead */
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
                                                                        s->child.pid);
                        }

                        if (!s->child.waited) {
                                siginfo_t si = {};

                                /* Reap the child if we can */
                                (void) waitid(P_PID, s->child.pid, &si, WEXITED);
                        }
                }

                if (s->child.pidfd_owned)
                        s->child.pidfd = safe_close(s->child.pidfd);
        }

        if (s->type == SOURCE_MEMORY_PRESSURE) {
                s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
                s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
        }

        if (s->destroy_callback)
                s->destroy_callback(s->userdata);

        free(s->description);
        return mfree(s);
}
DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);

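/* Marks an event source as pending (or not), keeping the pending priority queue and the per-type
 * bookkeeping (time prioqs, signal "current" pointer, inotify pending counters) in sync. Returns > 0
 * if the pending state actually changed, 0 if it was already in the requested state. */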
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_reshuffle(s);

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        if (s->type == SOURCE_INOTIFY) {

                assert(s->inotify.inode_data);
                assert(s->inotify.inode_data->inotify_data);

                if (b)
                        s->inotify.inode_data->inotify_data->n_pending ++;
                else {
                        assert(s->inotify.inode_data->inotify_data->n_pending > 0);
                        s->inotify.inode_data->inotify_data->n_pending --;
                }
        }

        return 1;
}

static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {

        /* Let's allocate exactly what we need. Note that the difference of the smallest event source
         * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
         * lines. */
        static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
                [SOURCE_IO]                  = endoffsetof_field(sd_event_source, io),
                [SOURCE_TIME_REALTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_MONOTONIC]      = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_SIGNAL]              = endoffsetof_field(sd_event_source, signal),
                [SOURCE_CHILD]               = endoffsetof_field(sd_event_source, child),
                [SOURCE_DEFER]               = endoffsetof_field(sd_event_source, defer),
                [SOURCE_POST]                = endoffsetof_field(sd_event_source, post),
                [SOURCE_EXIT]                = endoffsetof_field(sd_event_source, exit),
                [SOURCE_INOTIFY]             = endoffsetof_field(sd_event_source, inotify),
                [SOURCE_MEMORY_PRESSURE]     = endoffsetof_field(sd_event_source, memory_pressure),
        };

        sd_event_source *s;

        assert(e);
        assert(type >= 0);
        assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
        assert(size_table[type] > 0);

        /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
         * size, even if we only allocate the initial part we need. */
        s = expand_to_usable(malloc0(size_table[type]), sizeof(sd_event_source));
        if (!s)
                return NULL;

        /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
         * than what we allocated here. */
        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = PRIOQ_IDX_NULL;
        s->prepare_index = PRIOQ_IDX_NULL;

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}

static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

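/* Typical usage (illustrative sketch only, error handling abbreviated; handler and userdata names are
 * placeholders):
 *
 *     sd_event_source *s = NULL;
 *     r = sd_event_add_io(e, &s, fd, EPOLLIN, my_handler, userdata);
 *     if (r < 0)
 *             return r;
 *
 * The handler is invoked with the EPOLL* events that triggered once the fd becomes ready. Passing a NULL
 * handler installs io_exit_callback() above, which simply terminates the event loop. */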
_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = io_exit_callback;

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

static void initialize_perturb(sd_event *e) {
        sd_id128_t id = {};

        /* When we sleep for longer, we try to realign the wakeup to the same time within each
         * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
         * wakeup. However, let's take some system-specific randomness for this value, so that in a network
         * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
         * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
                e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
        else
                e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
}

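/* Lazily allocates the timerfd for the given clock and adds it to the epoll; a clock whose fd is already
 * set up is left untouched. */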
static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        _cleanup_close_ int fd = -EBADF;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
                return -errno;

        d->fd = TAKE_FD(fd);
        return 0;
}

static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
        int r;

        assert(d);

        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        return 0;
}

static int event_source_time_prioq_put(
                sd_event_source *s,
                struct clock_data *d) {

        int r;

        assert(s);
        assert(d);
        assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

        r = prioq_put(d->earliest, s, &s->earliest_index);
        if (r < 0)
                return r;

        r = prioq_put(d->latest, s, &s->latest_index);
        if (r < 0) {
                assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
                s->earliest_index = PRIOQ_IDX_NULL;
                return r;
        }

        d->needs_rearm = true;
        return 0;
}

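/* Illustrative sketch only: arm a one-shot CLOCK_MONOTONIC timer 5s from now with default accuracy
 * (handler and userdata names are placeholders):
 *
 *     r = sd_event_add_time_relative(e, &s, CLOCK_MONOTONIC, 5 * USEC_PER_SEC, 0, on_timer, userdata);
 *
 * A zero accuracy is replaced by DEFAULT_ACCURACY_USEC, and the new source starts out SD_EVENT_ONESHOT. */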
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != UINT64_MAX, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        assert_se(d = event_get_clock_data(e, type));

        r = setup_clock_data(e, d, clock);
        if (r < 0)
                return r;

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = event_source_time_prioq_put(s, d);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_time_relative(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        usec_t t;
        int r;

        /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
         * checks for overflow. */

        r = sd_event_now(e, clock, &t);
        if (r < 0)
                return r;

        if (usec >= USEC_INFINITY - t)
                return -EOVERFLOW;

        return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
}

static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

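/* Illustrative sketch only: watch SIGTERM, letting sd-event manage the process signal mask via the
 * SD_EVENT_SIGNAL_PROCMASK flag (handler name is a placeholder):
 *
 *     r = sd_event_add_signal(e, &s, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, on_sigterm, NULL);
 *
 * Without the flag the caller must have blocked the signal already, otherwise -EBUSY is returned. */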
_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        sigset_t new_ss;
        bool block_it;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        /* Let's make sure our special flag stays outside of the valid signal range */
        assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);

        if (sig & SD_EVENT_SIGNAL_PROCMASK) {
                sig &= ~SD_EVENT_SIGNAL_PROCMASK;
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                block_it = true;
        } else {
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                r = signal_is_blocked(sig);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;

                block_it = false;
        }

        if (!callback)
                callback = signal_exit_callback;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        if (block_it) {
                sigset_t old_ss;

                if (sigemptyset(&new_ss) < 0)
                        return -errno;

                if (sigaddset(&new_ss, sig) < 0)
                        return -errno;

                r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
                if (r != 0)
                        return -r;

                r = sigismember(&old_ss, sig);
                if (r < 0)
                        return -errno;

                s->signal.unblock = !r;
        } else
                s->signal.unblock = false;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                if (s->signal.unblock)
                        (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);

                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static bool shall_use_pidfd(void) {
        /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
        return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
}

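/* Watches a child process for state changes (WEXITED/WSTOPPED/WCONTINUED). SIGCHLD must already be
 * blocked by the caller; where possible a pidfd is taken to pin the PID and make waitid() handling
 * race-free (see the comments below). */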
_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd and because we don't want to reap the child processes
                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
                 * take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
         * pin the PID, and make regular waitid() handling race-free. */

        if (shall_use_pidfd()) {
                s->child.pidfd = pidfd_open(pid, 0);
                if (s->child.pidfd < 0) {
                        /* Propagate errors unless the syscall is not supported or blocked */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
        } else
                s->child.pidfd = -EBADF;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We have a pidfd and we only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We have no pidfd or we shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. */
        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

_public_ int sd_event_add_child_pidfd(
                sd_event *e,
                sd_event_source **ret,
                int pidfd,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        pid_t pid;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pidfd >= 0, -EBADF);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        r = pidfd_get_pid(pidfd, &pid);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.pidfd = pidfd;
        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for WEXITED */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

static int generic_exit_callback(sd_event_source *s, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_defer(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_DEFER);
        if (!s)
                return -ENOMEM;

        s->defer.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = source_set_pending(s, true);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_post(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_POST);
        if (!s)
                return -ENOMEM;

        s->post.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = set_ensure_put(&e->post_sources, NULL, s);
        if (r < 0)
                return r;
        assert(r > 0);

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

6203e07a 1818_public_ int sd_event_add_exit(
305f78bf 1819 sd_event *e,
151b9b96 1820 sd_event_source **ret,
718db961 1821 sd_event_handler_t callback,
151b9b96 1822 void *userdata) {
305f78bf 1823
ec766a51 1824 _cleanup_(source_freep) sd_event_source *s = NULL;
da7e457c
LP
1825 int r;
1826
1827 assert_return(e, -EINVAL);
b937d761 1828 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c
LP
1829 assert_return(callback, -EINVAL);
1830 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1831 assert_return(!event_pid_changed(e), -ECHILD);
1832
c983e776
EV
1833 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1834 if (r < 0)
1835 return r;
da7e457c 1836
a71fe8b8 1837 s = source_new(e, !ret, SOURCE_EXIT);
fd38203a 1838 if (!s)
da7e457c 1839 return -ENOMEM;
fd38203a 1840
6203e07a 1841 s->exit.callback = callback;
da7e457c 1842 s->userdata = userdata;
6203e07a 1843 s->exit.prioq_index = PRIOQ_IDX_NULL;
baf76283 1844 s->enabled = SD_EVENT_ONESHOT;
da7e457c 1845
6203e07a 1846 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
ec766a51 1847 if (r < 0)
da7e457c 1848 return r;
da7e457c 1849
a71fe8b8
LP
1850 if (ret)
1851 *ret = s;
ec766a51 1852 TAKE_PTR(s);
a71fe8b8 1853
da7e457c
LP
1854 return 0;
1855}
1856
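/* Illustrative usage sketch (editor's addition): exit sources are dispatched only after
 * sd_event_exit() has been called, which makes them a natural place for teardown work. "event" and
 * on_exit_cleanup() are placeholders.
 *
 *     static int on_exit_cleanup(sd_event_source *s, void *userdata) {
 *             log_debug("Event loop is exiting, running cleanup.");
 *             return 0;
 *     }
 *
 *     r = sd_event_add_exit(event, NULL, on_exit_cleanup, NULL);
 *     if (r < 0)
 *             return r;
 */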
158fe190
LP
1857int sd_event_trim_memory(void) {
1858 int r;
1859
1860 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1861 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1862 * NULL callback parameter. */
1863
1864 log_debug("Memory pressure event, trimming malloc() memory.");
1865
1866#if HAVE_GENERIC_MALLINFO
1867 generic_mallinfo before_mallinfo = generic_mallinfo_get();
1868#endif
1869
1870 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1871 hashmap_trim_pools();
1872 r = malloc_trim(0);
1873 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1874
1875 if (r > 0)
1876 log_debug("Successfully trimmed some memory.");
1877 else
1878 log_debug("Couldn't trim any memory.");
1879
1880 usec_t period = after_timestamp - before_timestamp;
1881
1882#if HAVE_GENERIC_MALLINFO
1883 generic_mallinfo after_mallinfo = generic_mallinfo_get();
1884 size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1885 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1886 log_struct(LOG_DEBUG,
1887 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1888 FORMAT_TIMESPAN(period, 0),
1889 FORMAT_BYTES(l)),
1890 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1891 "TRIMMED_BYTES=%zu", l,
1892 "TRIMMED_USEC=" USEC_FMT, period);
1893#else
1894 log_struct(LOG_DEBUG,
1895 LOG_MESSAGE("Memory trimming took %s.",
1896 FORMAT_TIMESPAN(period, 0)),
1897 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1898 "TRIMMED_USEC=" USEC_FMT, period);
1899#endif
1900
1901 return 0;
1902}
1903
1904static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1905 assert(s);
1906
1907 sd_event_trim_memory();
1908 return 0;
1909}
1910
1911_public_ int sd_event_add_memory_pressure(
1912 sd_event *e,
1913 sd_event_source **ret,
1914 sd_event_handler_t callback,
1915 void *userdata) {
1916
1917 _cleanup_free_ char *w = NULL;
1918 _cleanup_(source_freep) sd_event_source *s = NULL;
92651a7a 1919 _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
158fe190 1920 _cleanup_free_ void *write_buffer = NULL;
40c5d5d2 1921 const char *watch, *watch_fallback = NULL, *env;
158fe190
LP
1922 size_t write_buffer_size = 0;
1923 struct stat st;
1924 uint32_t events;
1925 bool locked;
1926 int r;
1927
1928 assert_return(e, -EINVAL);
1929 assert_return(e = event_resolve(e), -ENOPKG);
1930 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1931 assert_return(!event_pid_changed(e), -ECHILD);
1932
1933 if (!callback)
1934 callback = memory_pressure_callback;
1935
1936 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1937 if (!s)
1938 return -ENOMEM;
1939
1940 s->wakeup = WAKEUP_EVENT_SOURCE;
1941 s->memory_pressure.callback = callback;
1942 s->userdata = userdata;
1943 s->enabled = SD_EVENT_ON;
1944 s->memory_pressure.fd = -EBADF;
1945
1946 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1947 if (env) {
1948 if (isempty(env) || path_equal(env, "/dev/null"))
1949 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1950 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1951
1952 if (!path_is_absolute(env) || !path_is_normalized(env))
1953 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1954 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1955
1956 watch = env;
1957
1958 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1959 if (env) {
1960 r = unbase64mem(env, SIZE_MAX, &write_buffer, &write_buffer_size);
1961 if (r < 0)
1962 return r;
1963 }
1964
1965 locked = true;
1966 } else {
1967
1968 r = is_pressure_supported();
1969 if (r < 0)
1970 return r;
1971 if (r == 0)
1972 return -EOPNOTSUPP;
1973
1974 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1975 * the system-wide pressure if for some reason we cannot (which could be: memory controller
1976 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1977 * only use the system-wide logic. */
1978 r = cg_all_unified();
1979 if (r < 0)
1980 return r;
1981 if (r == 0)
1982 watch = "/proc/pressure/memory";
1983 else {
1984 _cleanup_free_ char *cg = NULL;
1985
1986 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
1987 if (r < 0)
1988 return r;
1989
1990 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
1991 if (!w)
1992 return -ENOMEM;
1993
1994 watch = w;
1995 watch_fallback = "/proc/pressure/memory";
1996 }
1997
1998 /* Android uses three levels in its userspace low memory killer logic:
1999 * some 70000 1000000
2000 * some 100000 1000000
2001 * full 70000 1000000
2002 *
2003 * GNOME's low memory monitor uses:
2004 * some 70000 1000000
2005 * some 100000 1000000
2006 * full 100000 1000000
2007 *
2008 * We'll default to the middle level that both agree on. */
2009 if (asprintf((char**) &write_buffer,
2010 "%s " USEC_FMT " " USEC_FMT,
2011 MEMORY_PRESSURE_DEFAULT_TYPE,
2012 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2013 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2014 return -ENOMEM;
2015
2016 write_buffer_size = strlen(write_buffer) + 1;
2017 locked = false;
2018 }
2019
2020 path_fd = open(watch, O_PATH|O_CLOEXEC);
2021 if (path_fd < 0) {
2022 if (errno != ENOENT)
2023 return -errno;
2024
2025 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2026 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2027 * the PSI service apparently is not supported) */
2028 if (!watch_fallback)
2029 return locked ? -ENOENT : -EOPNOTSUPP;
2030
2031 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
63b1e67e
YW
2032 if (path_fd < 0) {
2033 if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2034 return -EOPNOTSUPP;
158fe190 2035 return -errno;
63b1e67e 2036 }
158fe190
LP
2037 }
2038
2039 if (fstat(path_fd, &st) < 0)
2040 return -errno;
2041
2042 if (S_ISSOCK(st.st_mode)) {
2043 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2044 if (fd < 0)
2045 return -errno;
2046
2047 r = connect_unix_path(fd, path_fd, NULL);
2048 if (r < 0)
2049 return r;
2050
2051 events = EPOLLIN;
2052
2053 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2054 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2055 if (fd < 0)
2056 return fd;
2057
2058 if (S_ISREG(st.st_mode)) {
2059 struct statfs sfs;
2060
2061 /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
2062
2063 if (fstatfs(fd, &sfs) < 0)
2064 return -errno;
2065
2066 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2067 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2068 return -ENOTTY;
2069
2070 events = EPOLLPRI;
2071 } else
2072 /* For fifos and char devices just watch for EPOLLIN */
2073 events = EPOLLIN;
2074
2075 } else if (S_ISDIR(st.st_mode))
2076 return -EISDIR;
2077 else
2078 return -EBADF;
2079
2080 s->memory_pressure.fd = TAKE_FD(fd);
2081 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2082 s->memory_pressure.write_buffer_size = write_buffer_size;
2083 s->memory_pressure.events = events;
2084 s->memory_pressure.locked = locked;
2085
2086 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2087 * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2088 * fd with the epoll right away. Instead, we just add the event source to a list of memory pressure
2089 * event sources on which writes must be executed before the first event loop iteration is
2090 * executed. (We could also write the data here, right away, but we want to give the caller the
2091 * freedom to call sd_event_source_set_memory_pressure_type() and
2092 * sd_event_source_set_memory_pressure_rate() before we write it.) */
2093
2094 if (s->memory_pressure.write_buffer_size > 0)
2095 source_memory_pressure_add_to_write_list(s);
2096 else {
2097 r = source_memory_pressure_register(s, s->enabled);
2098 if (r < 0)
2099 return r;
2100 }
2101
2102 if (ret)
2103 *ret = s;
2104 TAKE_PTR(s);
2105
2106 return 0;
2107}
2108
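/* Illustrative usage sketch (editor's addition): passing a NULL callback selects
 * memory_pressure_callback() above, i.e. sd_event_trim_memory(). A caller that treats missing PSI
 * support or explicit disabling as non-fatal might do the following; "event" is a placeholder.
 *
 *     r = sd_event_add_memory_pressure(event, NULL, NULL, NULL);
 *     if (r < 0 && !IN_SET(r, -EOPNOTSUPP, -EHOSTDOWN))
 *             return r;
 */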
97ef5391
LP
2109static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2110 assert(e);
2111
2112 if (!d)
2113 return;
2114
2115 assert(hashmap_isempty(d->inodes));
2116 assert(hashmap_isempty(d->wd));
2117
2118 if (d->buffer_filled > 0)
0601b958 2119 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
2120
2121 hashmap_free(d->inodes);
2122 hashmap_free(d->wd);
2123
2124 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2125
2126 if (d->fd >= 0) {
fbae5090
YW
2127 if (!event_pid_changed(e) &&
2128 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
97ef5391
LP
2129 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2130
2131 safe_close(d->fd);
2132 }
2133 free(d);
2134}
2135
2136static int event_make_inotify_data(
2137 sd_event *e,
2138 int64_t priority,
2139 struct inotify_data **ret) {
2140
254d1313 2141 _cleanup_close_ int fd = -EBADF;
97ef5391 2142 struct inotify_data *d;
97ef5391
LP
2143 int r;
2144
2145 assert(e);
2146
2147 d = hashmap_get(e->inotify_data, &priority);
2148 if (d) {
2149 if (ret)
2150 *ret = d;
2151 return 0;
2152 }
2153
2154 fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2155 if (fd < 0)
2156 return -errno;
2157
2158 fd = fd_move_above_stdio(fd);
2159
97ef5391
LP
2160 d = new(struct inotify_data, 1);
2161 if (!d)
2162 return -ENOMEM;
2163
2164 *d = (struct inotify_data) {
2165 .wakeup = WAKEUP_INOTIFY_DATA,
2166 .fd = TAKE_FD(fd),
2167 .priority = priority,
2168 };
2169
c2484a75 2170 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
97ef5391
LP
2171 if (r < 0) {
2172 d->fd = safe_close(d->fd);
2173 free(d);
2174 return r;
2175 }
2176
1eac7948 2177 struct epoll_event ev = {
97ef5391
LP
2178 .events = EPOLLIN,
2179 .data.ptr = d,
2180 };
2181
2182 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2183 r = -errno;
2184 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2185 * remove the fd from the epoll first, which we don't want as we couldn't
2186 * add it in the first place. */
2187 event_free_inotify_data(e, d);
2188 return r;
2189 }
2190
2191 if (ret)
2192 *ret = d;
2193
2194 return 1;
2195}
2196
7a08d314 2197static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
90c88092 2198 int r;
97ef5391
LP
2199
2200 assert(x);
2201 assert(y);
2202
90c88092
YW
2203 r = CMP(x->dev, y->dev);
2204 if (r != 0)
2205 return r;
97ef5391 2206
6dd91b36 2207 return CMP(x->ino, y->ino);
97ef5391
LP
2208}
2209
7a08d314
YW
2210static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2211 assert(d);
97ef5391
LP
2212
2213 siphash24_compress(&d->dev, sizeof(d->dev), state);
2214 siphash24_compress(&d->ino, sizeof(d->ino), state);
2215}
2216
7a08d314 2217DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
97ef5391
LP
2218
2219static void event_free_inode_data(
2220 sd_event *e,
2221 struct inode_data *d) {
2222
2223 assert(e);
2224
2225 if (!d)
2226 return;
2227
64903d18 2228 assert(!d->event_sources);
97ef5391
LP
2229
2230 if (d->fd >= 0) {
ed828563 2231 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
97ef5391
LP
2232 safe_close(d->fd);
2233 }
2234
2235 if (d->inotify_data) {
2236
2237 if (d->wd >= 0) {
fbae5090 2238 if (d->inotify_data->fd >= 0 && !event_pid_changed(e)) {
97ef5391
LP
2239 /* So here's a problem. At the time this runs the watch descriptor might already be
2240 * invalidated, because an IN_IGNORED event might be queued right at the moment we enter
2241 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
2242 * likely case to happen. */
2243
2244 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2245 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2246 }
2247
2248 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2249 }
2250
2251 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2252 }
2253
2254 free(d);
2255}
2256
53baf2ef
LP
2257static void event_gc_inotify_data(
2258 sd_event *e,
2259 struct inotify_data *d) {
2260
2261 assert(e);
2262
2263 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2264 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2265 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2266 * (under the expectation that the GC is called again once the counter is decremented). */
2267
2268 if (!d)
2269 return;
2270
2271 if (!hashmap_isempty(d->inodes))
2272 return;
2273
2274 if (d->n_busy > 0)
2275 return;
2276
2277 event_free_inotify_data(e, d);
2278}
2279
97ef5391
LP
2280static void event_gc_inode_data(
2281 sd_event *e,
2282 struct inode_data *d) {
2283
2284 struct inotify_data *inotify_data;
2285
2286 assert(e);
2287
2288 if (!d)
2289 return;
2290
64903d18 2291 if (d->event_sources)
97ef5391
LP
2292 return;
2293
2294 inotify_data = d->inotify_data;
2295 event_free_inode_data(e, d);
2296
53baf2ef 2297 event_gc_inotify_data(e, inotify_data);
97ef5391
LP
2298}
2299
2300static int event_make_inode_data(
2301 sd_event *e,
2302 struct inotify_data *inotify_data,
2303 dev_t dev,
2304 ino_t ino,
2305 struct inode_data **ret) {
2306
2307 struct inode_data *d, key;
2308 int r;
2309
2310 assert(e);
2311 assert(inotify_data);
2312
2313 key = (struct inode_data) {
2314 .ino = ino,
2315 .dev = dev,
2316 };
2317
2318 d = hashmap_get(inotify_data->inodes, &key);
2319 if (d) {
2320 if (ret)
2321 *ret = d;
2322
2323 return 0;
2324 }
2325
2326 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2327 if (r < 0)
2328 return r;
2329
2330 d = new(struct inode_data, 1);
2331 if (!d)
2332 return -ENOMEM;
2333
2334 *d = (struct inode_data) {
2335 .dev = dev,
2336 .ino = ino,
2337 .wd = -1,
254d1313 2338 .fd = -EBADF,
97ef5391
LP
2339 .inotify_data = inotify_data,
2340 };
2341
2342 r = hashmap_put(inotify_data->inodes, d, d);
2343 if (r < 0) {
2344 free(d);
2345 return r;
2346 }
2347
2348 if (ret)
2349 *ret = d;
2350
2351 return 1;
2352}
2353
2354static uint32_t inode_data_determine_mask(struct inode_data *d) {
2355 bool excl_unlink = true;
2356 uint32_t combined = 0;
97ef5391
LP
2357
2358 assert(d);
2359
2360 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2361 * the IN_EXCL_UNLINK flag is ANDed instead.
2362 *
2363 * Note that we add all sources to the mask here, regardless of whether enabled, disabled or oneshot. That's
2364 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
f21f31b2 2365 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
97ef5391
LP
2366 * events we don't care for client-side. */
2367
2368 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2369
2370 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2371 excl_unlink = false;
2372
2373 combined |= s->inotify.mask;
2374 }
2375
2376 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2377}
2378
2379static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2380 uint32_t combined_mask;
2381 int wd, r;
2382
2383 assert(d);
2384 assert(d->fd >= 0);
2385
2386 combined_mask = inode_data_determine_mask(d);
2387
2388 if (d->wd >= 0 && combined_mask == d->combined_mask)
2389 return 0;
2390
2391 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2392 if (r < 0)
2393 return r;
2394
2395 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2396 if (wd < 0)
2397 return -errno;
2398
2399 if (d->wd < 0) {
2400 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2401 if (r < 0) {
2402 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2403 return r;
2404 }
2405
2406 d->wd = wd;
2407
2408 } else if (d->wd != wd) {
2409
2410 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2411 (void) inotify_rm_watch(d->inotify_data->fd, wd); /* the watch lives on the inotify fd, not the inode's O_PATH fd */
2412 return -EINVAL;
2413 }
2414
2415 d->combined_mask = combined_mask;
2416 return 1;
2417}
2418
b9350e70
LP
2419static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2420 assert(s);
2421
2422 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2423}
2424
e67d738a 2425static int event_add_inotify_fd_internal(
97ef5391
LP
2426 sd_event *e,
2427 sd_event_source **ret,
e67d738a
LP
2428 int fd,
2429 bool donate,
97ef5391
LP
2430 uint32_t mask,
2431 sd_event_inotify_handler_t callback,
2432 void *userdata) {
2433
5bb1d7fb 2434 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
e67d738a 2435 _cleanup_(source_freep) sd_event_source *s = NULL;
97ef5391
LP
2436 struct inotify_data *inotify_data = NULL;
2437 struct inode_data *inode_data = NULL;
97ef5391
LP
2438 struct stat st;
2439 int r;
2440
2441 assert_return(e, -EINVAL);
2442 assert_return(e = event_resolve(e), -ENOPKG);
e67d738a 2443 assert_return(fd >= 0, -EBADF);
97ef5391
LP
2444 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2445 assert_return(!event_pid_changed(e), -ECHILD);
2446
b9350e70
LP
2447 if (!callback)
2448 callback = inotify_exit_callback;
2449
97ef5391
LP
2450 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2451 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
2452 * the user can't use them for us. */
2453 if (mask & IN_MASK_ADD)
2454 return -EINVAL;
2455
97ef5391
LP
2456 if (fstat(fd, &st) < 0)
2457 return -errno;
2458
2459 s = source_new(e, !ret, SOURCE_INOTIFY);
2460 if (!s)
2461 return -ENOMEM;
2462
2463 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2464 s->inotify.mask = mask;
2465 s->inotify.callback = callback;
2466 s->userdata = userdata;
2467
2468 /* Allocate an inotify object for this priority, and an inode object within it */
2469 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2470 if (r < 0)
8c75fe17 2471 return r;
97ef5391
LP
2472
2473 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
8c75fe17 2474 if (r < 0) {
e67d738a 2475 event_gc_inotify_data(e, inotify_data);
8c75fe17
ZJS
2476 return r;
2477 }
97ef5391
LP
2478
2479 /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2480 * the event source until then; for that we need the original inode. */
2481 if (inode_data->fd < 0) {
e67d738a
LP
2482 if (donated_fd >= 0)
2483 inode_data->fd = TAKE_FD(donated_fd);
2484 else {
2485 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2486 if (inode_data->fd < 0) {
2487 r = -errno;
2488 event_gc_inode_data(e, inode_data);
2489 return r;
2490 }
2491 }
2492
ed828563 2493 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
97ef5391
LP
2494 }
2495
2496 /* Link our event source to the inode data object */
2497 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2498 s->inotify.inode_data = inode_data;
2499
97ef5391
LP
2500 /* Actually realize the watch now */
2501 r = inode_data_realize_watch(e, inode_data);
2502 if (r < 0)
8c75fe17 2503 return r;
97ef5391 2504
97ef5391
LP
2505 if (ret)
2506 *ret = s;
8c75fe17 2507 TAKE_PTR(s);
97ef5391
LP
2508
2509 return 0;
97ef5391
LP
2510}
2511
e67d738a
LP
2512_public_ int sd_event_add_inotify_fd(
2513 sd_event *e,
2514 sd_event_source **ret,
2515 int fd,
2516 uint32_t mask,
2517 sd_event_inotify_handler_t callback,
2518 void *userdata) {
2519
2520 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2521}
2522
2523_public_ int sd_event_add_inotify(
2524 sd_event *e,
2525 sd_event_source **ret,
2526 const char *path,
2527 uint32_t mask,
2528 sd_event_inotify_handler_t callback,
2529 void *userdata) {
2530
2091c779 2531 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
e67d738a
LP
2532 int fd, r;
2533
2534 assert_return(path, -EINVAL);
2535
586c8cee
ZJS
2536 fd = open(path, O_PATH | O_CLOEXEC |
2537 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2538 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
e67d738a
LP
2539 if (fd < 0)
2540 return -errno;
2541
2542 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2543 if (r < 0)
2544 return r;
2545
2546 (void) sd_event_source_set_description(s, path);
2547
2548 if (ret)
2549 *ret = s;
2550
2551 return r;
2552}
2553
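/* Illustrative usage sketch (editor's addition): watch a directory for new entries. The path
 * "/run/example" and on_inotify() are placeholders; the mask handling (IN_MASK_ADD refused,
 * IN_ONESHOT mapped to SD_EVENT_ONESHOT) is as implemented above.
 *
 *     static int on_inotify(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
 *             log_debug("inotify event, mask=%" PRIx32 " name=%s", ev->mask, ev->len > 0 ? ev->name : "n/a");
 *             return 0;
 *     }
 *
 *     r = sd_event_add_inotify(event, NULL, "/run/example", IN_CREATE|IN_MOVED_TO, on_inotify, NULL);
 *     if (r < 0)
 *             return r;
 */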
8301aa0b 2554static sd_event_source* event_source_free(sd_event_source *s) {
6680dd6b
LP
2555 if (!s)
2556 return NULL;
da7e457c 2557
8301aa0b
YW
2558 /* Here's a special hack: when we are called from a
2559 * dispatch handler we won't free the event source
2560 * immediately, but we will detach the fd from the
2561 * epoll. This way it is safe for the caller to unref
2562 * the event source and immediately close the fd, but
2563 * we still retain a valid event source object after
2564 * the callback. */
fd38203a 2565
76d04c3a 2566 if (s->dispatching)
8301aa0b 2567 source_disconnect(s);
76d04c3a 2568 else
8301aa0b 2569 source_free(s);
fd38203a
LP
2570
2571 return NULL;
2572}
2573
8301aa0b
YW
2574DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2575
356779df 2576_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
f7f53e9e 2577 assert_return(s, -EINVAL);
f4b2933e 2578 assert_return(!event_pid_changed(s->event), -ECHILD);
f7f53e9e 2579
356779df 2580 return free_and_strdup(&s->description, description);
f7f53e9e
TG
2581}
2582
356779df 2583_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
f7f53e9e 2584 assert_return(s, -EINVAL);
356779df 2585 assert_return(description, -EINVAL);
f4b2933e 2586 assert_return(!event_pid_changed(s->event), -ECHILD);
f7f53e9e 2587
7d92a1a4
ZJS
2588 if (!s->description)
2589 return -ENXIO;
2590
356779df 2591 *description = s->description;
f7f53e9e
TG
2592 return 0;
2593}
2594
adcc4ca3 2595_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
305f78bf 2596 assert_return(s, NULL);
eaa3cbef
LP
2597
2598 return s->event;
2599}
2600
f7262a9f 2601_public_ int sd_event_source_get_pending(sd_event_source *s) {
305f78bf 2602 assert_return(s, -EINVAL);
6203e07a 2603 assert_return(s->type != SOURCE_EXIT, -EDOM);
da7e457c 2604 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2605 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2606
2607 return s->pending;
2608}
2609
f7262a9f 2610_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
305f78bf
LP
2611 assert_return(s, -EINVAL);
2612 assert_return(s->type == SOURCE_IO, -EDOM);
2613 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2614
2615 return s->io.fd;
2616}
2617
30caf8f3
LP
2618_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2619 int r;
2620
2621 assert_return(s, -EINVAL);
8ac43fee 2622 assert_return(fd >= 0, -EBADF);
30caf8f3
LP
2623 assert_return(s->type == SOURCE_IO, -EDOM);
2624 assert_return(!event_pid_changed(s->event), -ECHILD);
2625
2626 if (s->io.fd == fd)
2627 return 0;
2628
b6d5481b 2629 if (event_source_is_offline(s)) {
30caf8f3
LP
2630 s->io.fd = fd;
2631 s->io.registered = false;
2632 } else {
2633 int saved_fd;
2634
2635 saved_fd = s->io.fd;
2636 assert(s->io.registered);
2637
2638 s->io.fd = fd;
2639 s->io.registered = false;
2640
2641 r = source_io_register(s, s->enabled, s->io.events);
2642 if (r < 0) {
2643 s->io.fd = saved_fd;
2644 s->io.registered = true;
2645 return r;
2646 }
2647
5a795bff 2648 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
30caf8f3
LP
2649 }
2650
2651 return 0;
2652}
2653
ab93297c
NM
2654_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2655 assert_return(s, -EINVAL);
2656 assert_return(s->type == SOURCE_IO, -EDOM);
2657
2658 return s->io.owned;
2659}
2660
2661_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2662 assert_return(s, -EINVAL);
2663 assert_return(s->type == SOURCE_IO, -EDOM);
2664
2665 s->io.owned = own;
2666 return 0;
2667}
2668
f7262a9f 2669_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
305f78bf
LP
2670 assert_return(s, -EINVAL);
2671 assert_return(events, -EINVAL);
2672 assert_return(s->type == SOURCE_IO, -EDOM);
2673 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2674
2675 *events = s->io.events;
2676 return 0;
2677}
2678
f7262a9f 2679_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
fd38203a
LP
2680 int r;
2681
305f78bf
LP
2682 assert_return(s, -EINVAL);
2683 assert_return(s->type == SOURCE_IO, -EDOM);
2a16a986 2684 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
da7e457c 2685 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2686 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2687
b63c8d4f
DH
2688 /* edge-triggered updates are never skipped, so we can reset edges */
2689 if (s->io.events == events && !(events & EPOLLET))
fd38203a
LP
2690 return 0;
2691
2a0dc6cd
LP
2692 r = source_set_pending(s, false);
2693 if (r < 0)
2694 return r;
2695
b6d5481b 2696 if (event_source_is_online(s)) {
e4715127 2697 r = source_io_register(s, s->enabled, events);
fd38203a
LP
2698 if (r < 0)
2699 return r;
2700 }
2701
2702 s->io.events = events;
2703
2704 return 0;
2705}
2706
f7262a9f 2707_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
305f78bf
LP
2708 assert_return(s, -EINVAL);
2709 assert_return(revents, -EINVAL);
2710 assert_return(s->type == SOURCE_IO, -EDOM);
2711 assert_return(s->pending, -ENODATA);
2712 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2713
2714 *revents = s->io.revents;
2715 return 0;
2716}
2717
f7262a9f 2718_public_ int sd_event_source_get_signal(sd_event_source *s) {
305f78bf
LP
2719 assert_return(s, -EINVAL);
2720 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2721 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2722
2723 return s->signal.sig;
2724}
2725
31927c16 2726_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
305f78bf
LP
2727 assert_return(s, -EINVAL);
2728 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2729
6680b8d1
ME
2730 *priority = s->priority;
2731 return 0;
fd38203a
LP
2732}
2733
31927c16 2734_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
97ef5391
LP
2735 bool rm_inotify = false, rm_inode = false;
2736 struct inotify_data *new_inotify_data = NULL;
2737 struct inode_data *new_inode_data = NULL;
9da4cb2b
LP
2738 int r;
2739
305f78bf 2740 assert_return(s, -EINVAL);
da7e457c 2741 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 2742 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
2743
2744 if (s->priority == priority)
2745 return 0;
2746
97ef5391
LP
2747 if (s->type == SOURCE_INOTIFY) {
2748 struct inode_data *old_inode_data;
2749
2750 assert(s->inotify.inode_data);
2751 old_inode_data = s->inotify.inode_data;
2752
2753 /* We need the original fd to change the priority. If we don't have it we can't change the priority
2754 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2755 * events we allow priority changes only until the first following iteration. */
2756 if (old_inode_data->fd < 0)
2757 return -EOPNOTSUPP;
2758
2759 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2760 if (r < 0)
2761 return r;
2762 rm_inotify = r > 0;
2763
2764 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2765 if (r < 0)
2766 goto fail;
2767 rm_inode = r > 0;
2768
2769 if (new_inode_data->fd < 0) {
2770 /* Duplicate the fd for the new inode object if we don't have any yet */
2771 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2772 if (new_inode_data->fd < 0) {
2773 r = -errno;
2774 goto fail;
2775 }
2776
ed828563 2777 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
97ef5391
LP
2778 }
2779
2780 /* Move the event source to the new inode data structure */
2781 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2782 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2783 s->inotify.inode_data = new_inode_data;
2784
2785 /* Now create the new watch */
2786 r = inode_data_realize_watch(s->event, new_inode_data);
2787 if (r < 0) {
2788 /* Move it back */
2789 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2790 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2791 s->inotify.inode_data = old_inode_data;
2792 goto fail;
2793 }
2794
2795 s->priority = priority;
2796
2797 event_gc_inode_data(s->event, old_inode_data);
2798
b6d5481b 2799 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
9da4cb2b
LP
2800 struct signal_data *old, *d;
2801
2802 /* Move us from the signalfd belonging to the old
2803 * priority to the signalfd of the new priority */
2804
2805 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2806
2807 s->priority = priority;
2808
2809 r = event_make_signal_data(s->event, s->signal.sig, &d);
2810 if (r < 0) {
2811 s->priority = old->priority;
2812 return r;
2813 }
2814
2815 event_unmask_signal_data(s->event, old, s->signal.sig);
2816 } else
2817 s->priority = priority;
fd38203a 2818
e1951c16 2819 event_source_pp_prioq_reshuffle(s);
fd38203a 2820
6203e07a
LP
2821 if (s->type == SOURCE_EXIT)
2822 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
305f78bf 2823
fd38203a 2824 return 0;
97ef5391
LP
2825
2826fail:
2827 if (rm_inode)
2828 event_free_inode_data(s->event, new_inode_data);
2829
2830 if (rm_inotify)
2831 event_free_inotify_data(s->event, new_inotify_data);
2832
2833 return r;
fd38203a
LP
2834}
2835
cad143a8 2836_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
71193c0b
ZJS
2837 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2838 if (!s && !ret)
2839 return false;
2840
305f78bf 2841 assert_return(s, -EINVAL);
305f78bf 2842 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 2843
cad143a8
LP
2844 if (ret)
2845 *ret = s->enabled;
2846
08c1eb0e 2847 return s->enabled != SD_EVENT_OFF;
fd38203a
LP
2848}
2849
b6d5481b
LP
2850static int event_source_offline(
2851 sd_event_source *s,
2852 int enabled,
2853 bool ratelimited) {
2854
2855 bool was_offline;
fd38203a
LP
2856 int r;
2857
ddfde737 2858 assert(s);
b6d5481b 2859 assert(enabled == SD_EVENT_OFF || ratelimited);
fd38203a 2860
ddfde737 2861 /* Unset the pending flag when this event source is disabled */
b6d5481b
LP
2862 if (s->enabled != SD_EVENT_OFF &&
2863 enabled == SD_EVENT_OFF &&
2864 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
ddfde737
LP
2865 r = source_set_pending(s, false);
2866 if (r < 0)
2867 return r;
2868 }
cc567911 2869
b6d5481b
LP
2870 was_offline = event_source_is_offline(s);
2871 s->enabled = enabled;
2872 s->ratelimited = ratelimited;
fd38203a 2873
ddfde737 2874 switch (s->type) {
fd38203a 2875
ddfde737
LP
2876 case SOURCE_IO:
2877 source_io_unregister(s);
2878 break;
ac989a78 2879
ddfde737
LP
2880 case SOURCE_SIGNAL:
2881 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2882 break;
fd38203a 2883
ddfde737 2884 case SOURCE_CHILD:
b6d5481b
LP
2885 if (!was_offline) {
2886 assert(s->event->n_online_child_sources > 0);
2887 s->event->n_online_child_sources--;
2888 }
fd38203a 2889
ddfde737
LP
2890 if (EVENT_SOURCE_WATCH_PIDFD(s))
2891 source_child_pidfd_unregister(s);
2892 else
2893 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2894 break;
4807d2d0 2895
ddfde737
LP
2896 case SOURCE_EXIT:
2897 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2898 break;
fd38203a 2899
158fe190
LP
2900 case SOURCE_MEMORY_PRESSURE:
2901 source_memory_pressure_unregister(s);
2902 break;
2903
2115b9b6
YW
2904 case SOURCE_TIME_REALTIME:
2905 case SOURCE_TIME_BOOTTIME:
2906 case SOURCE_TIME_MONOTONIC:
2907 case SOURCE_TIME_REALTIME_ALARM:
2908 case SOURCE_TIME_BOOTTIME_ALARM:
ddfde737
LP
2909 case SOURCE_DEFER:
2910 case SOURCE_POST:
2911 case SOURCE_INOTIFY:
2912 break;
fd38203a 2913
ddfde737 2914 default:
04499a70 2915 assert_not_reached();
ddfde737 2916 }
fd38203a 2917
2115b9b6
YW
2918 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2919 event_source_time_prioq_reshuffle(s);
2920
b6d5481b 2921 return 1;
ddfde737 2922}
f8f3f926 2923
b6d5481b
LP
2924static int event_source_online(
2925 sd_event_source *s,
2926 int enabled,
2927 bool ratelimited) {
2928
2929 bool was_online;
ddfde737 2930 int r;
fd38203a 2931
ddfde737 2932 assert(s);
b6d5481b 2933 assert(enabled != SD_EVENT_OFF || !ratelimited);
305f78bf 2934
ddfde737 2935 /* Unset the pending flag when this event source is enabled */
b6d5481b
LP
2936 if (s->enabled == SD_EVENT_OFF &&
2937 enabled != SD_EVENT_OFF &&
2938 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
ddfde737
LP
2939 r = source_set_pending(s, false);
2940 if (r < 0)
2941 return r;
2942 }
9d3e3aa5 2943
b6d5481b
LP
2944 /* Are we really ready for onlining? */
2945 if (enabled == SD_EVENT_OFF || ratelimited) {
2946 /* Nope, we are not ready for onlining, then just update the precise state and exit */
2947 s->enabled = enabled;
2948 s->ratelimited = ratelimited;
2949 return 0;
2950 }
2951
2952 was_online = event_source_is_online(s);
2953
ddfde737 2954 switch (s->type) {
ddfde737 2955 case SOURCE_IO:
b6d5481b 2956 r = source_io_register(s, enabled, s->io.events);
d2eafe61 2957 if (r < 0)
ddfde737 2958 return r;
ddfde737 2959 break;
fd38203a 2960
ddfde737
LP
2961 case SOURCE_SIGNAL:
2962 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2963 if (r < 0) {
ddfde737
LP
2964 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2965 return r;
2966 }
fd38203a 2967
ddfde737 2968 break;
fd38203a 2969
ddfde737 2970 case SOURCE_CHILD:
ddfde737
LP
2971 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2972 /* yes, we have pidfd */
9da4cb2b 2973
b6d5481b 2974 r = source_child_pidfd_register(s, enabled);
ac9f2640 2975 if (r < 0)
9da4cb2b 2976 return r;
ddfde737
LP
2977 } else {
2978 /* no pidfd, or something other to watch for than WEXITED */
9da4cb2b 2979
ddfde737
LP
2980 r = event_make_signal_data(s->event, SIGCHLD, NULL);
2981 if (r < 0) {
ddfde737
LP
2982 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2983 return r;
2984 }
2985 }
fd38203a 2986
b6d5481b
LP
2987 if (!was_online)
2988 s->event->n_online_child_sources++;
ddfde737 2989 break;
4807d2d0 2990
158fe190
LP
2991 case SOURCE_MEMORY_PRESSURE:
2992 r = source_memory_pressure_register(s, enabled);
2993 if (r < 0)
2994 return r;
2995
2996 break;
2997
d2eafe61
ZJS
2998 case SOURCE_TIME_REALTIME:
2999 case SOURCE_TIME_BOOTTIME:
3000 case SOURCE_TIME_MONOTONIC:
3001 case SOURCE_TIME_REALTIME_ALARM:
3002 case SOURCE_TIME_BOOTTIME_ALARM:
ddfde737 3003 case SOURCE_EXIT:
ddfde737
LP
3004 case SOURCE_DEFER:
3005 case SOURCE_POST:
3006 case SOURCE_INOTIFY:
3007 break;
9da4cb2b 3008
ddfde737 3009 default:
04499a70 3010 assert_not_reached();
ddfde737 3011 }
f8f3f926 3012
b6d5481b
LP
3013 s->enabled = enabled;
3014 s->ratelimited = ratelimited;
d2eafe61
ZJS
3015
3016 /* Non-failing operations below */
2115b9b6 3017 if (s->type == SOURCE_EXIT)
d2eafe61 3018 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
d2eafe61 3019
2115b9b6
YW
3020 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3021 event_source_time_prioq_reshuffle(s);
d2eafe61 3022
b6d5481b 3023 return 1;
ddfde737
LP
3024}
3025
3026_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3027 int r;
9da4cb2b 3028
ddfde737 3029 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
7e922b05
ZJS
3030
3031 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3032 if (m == SD_EVENT_OFF && !s)
3033 return 0;
3034
3035 assert_return(s, -EINVAL);
ddfde737 3036 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 3037
ddfde737
LP
3038 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3039 if (s->event->state == SD_EVENT_FINISHED)
3040 return m == SD_EVENT_OFF ? 0 : -ESTALE;
305f78bf 3041
ddfde737
LP
3042 if (s->enabled == m) /* No change? */
3043 return 0;
9d3e3aa5 3044
ddfde737 3045 if (m == SD_EVENT_OFF)
b6d5481b 3046 r = event_source_offline(s, m, s->ratelimited);
ddfde737
LP
3047 else {
3048 if (s->enabled != SD_EVENT_OFF) {
3049 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3050 * event source is already enabled after all. */
3051 s->enabled = m;
3052 return 0;
fd38203a 3053 }
ddfde737 3054
b6d5481b 3055 r = event_source_online(s, m, s->ratelimited);
fd38203a 3056 }
ddfde737
LP
3057 if (r < 0)
3058 return r;
fd38203a 3059
e1951c16 3060 event_source_pp_prioq_reshuffle(s);
fd38203a
LP
3061 return 0;
3062}
3063
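/* Illustrative usage sketch (editor's addition): re-arm a oneshot I/O source from its own callback.
 * Note the shortcut above: flipping between SD_EVENT_ON and SD_EVENT_ONESHOT just updates the field,
 * while OFF<->ON transitions go through event_source_offline()/event_source_online(). drain_fd() is
 * a hypothetical helper, not an API of this file.
 *
 *     static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             (void) drain_fd(fd);                                      // hypothetical helper
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);  // fire once more
 *     }
 */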
f7262a9f 3064_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
305f78bf
LP
3065 assert_return(s, -EINVAL);
3066 assert_return(usec, -EINVAL);
6a0f1f6d 3067 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
305f78bf 3068 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
3069
3070 *usec = s->time.next;
3071 return 0;
3072}
3073
f7262a9f 3074_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
2a0dc6cd 3075 int r;
6a0f1f6d 3076
305f78bf 3077 assert_return(s, -EINVAL);
6a0f1f6d 3078 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 3079 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 3080 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a 3081
2a0dc6cd
LP
3082 r = source_set_pending(s, false);
3083 if (r < 0)
3084 return r;
2576a19e 3085
2a0dc6cd 3086 s->time.next = usec;
fd38203a 3087
e1951c16 3088 event_source_time_prioq_reshuffle(s);
fd38203a
LP
3089 return 0;
3090}
3091
d6a83dc4
LP
3092_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3093 usec_t t;
3094 int r;
3095
3096 assert_return(s, -EINVAL);
3097 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3098
ef859195
LP
3099 if (usec == USEC_INFINITY)
3100 return sd_event_source_set_time(s, USEC_INFINITY);
3101
d6a83dc4
LP
3102 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3103 if (r < 0)
3104 return r;
3105
496db330
YW
3106 usec = usec_add(t, usec);
3107 if (usec == USEC_INFINITY)
d6a83dc4
LP
3108 return -EOVERFLOW;
3109
496db330 3110 return sd_event_source_set_time(s, usec);
d6a83dc4
LP
3111}
3112
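/* Illustrative usage sketch (editor's addition): a simple periodic timer re-armed from its own
 * callback via the relative setter above. on_timer() is a placeholder; the 5s interval is arbitrary.
 *
 *     static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *             int r = sd_event_source_set_time_relative(s, 5 * USEC_PER_SEC);
 *             if (r < 0)
 *                     return r;
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);  // oneshot sources need re-enabling
 *     }
 */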
f7262a9f 3113_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
305f78bf
LP
3114 assert_return(s, -EINVAL);
3115 assert_return(usec, -EINVAL);
6a0f1f6d 3116 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
305f78bf
LP
3117 assert_return(!event_pid_changed(s->event), -ECHILD);
3118
3119 *usec = s->time.accuracy;
3120 return 0;
3121}
3122
f7262a9f 3123_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
2a0dc6cd 3124 int r;
6a0f1f6d 3125
305f78bf 3126 assert_return(s, -EINVAL);
f5fbe71d 3127 assert_return(usec != UINT64_MAX, -EINVAL);
6a0f1f6d 3128 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 3129 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
305f78bf 3130 assert_return(!event_pid_changed(s->event), -ECHILD);
eaa3cbef 3131
2a0dc6cd
LP
3132 r = source_set_pending(s, false);
3133 if (r < 0)
3134 return r;
3135
eaa3cbef
LP
3136 if (usec == 0)
3137 usec = DEFAULT_ACCURACY_USEC;
3138
eaa3cbef
LP
3139 s->time.accuracy = usec;
3140
e1951c16 3141 event_source_time_prioq_reshuffle(s);
6a0f1f6d
LP
3142 return 0;
3143}
3144
3145_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3146 assert_return(s, -EINVAL);
3147 assert_return(clock, -EINVAL);
3148 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3149 assert_return(!event_pid_changed(s->event), -ECHILD);
eaa3cbef 3150
6a0f1f6d 3151 *clock = event_source_type_to_clock(s->type);
eaa3cbef
LP
3152 return 0;
3153}
3154
f7262a9f 3155_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
4bee8012
LP
3156 assert_return(s, -EINVAL);
3157 assert_return(pid, -EINVAL);
3158 assert_return(s->type == SOURCE_CHILD, -EDOM);
3159 assert_return(!event_pid_changed(s->event), -ECHILD);
3160
3161 *pid = s->child.pid;
3162 return 0;
3163}
3164
f8f3f926
LP
3165_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3166 assert_return(s, -EINVAL);
3167 assert_return(s->type == SOURCE_CHILD, -EDOM);
3168 assert_return(!event_pid_changed(s->event), -ECHILD);
3169
3170 if (s->child.pidfd < 0)
3171 return -EOPNOTSUPP;
3172
3173 return s->child.pidfd;
3174}
3175
3176_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3177 assert_return(s, -EINVAL);
3178 assert_return(s->type == SOURCE_CHILD, -EDOM);
3179 assert_return(!event_pid_changed(s->event), -ECHILD);
3180 assert_return(SIGNAL_VALID(sig), -EINVAL);
3181
3182 /* If we already have seen indication the process exited refuse sending a signal early. This way we
3183 * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3184 * available. */
3185 if (s->child.exited)
3186 return -ESRCH;
3187
3188 if (s->child.pidfd >= 0) {
3189 siginfo_t copy;
3190
3191 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
3192 * structure here */
3193 if (si)
3194 copy = *si;
3195
3196 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3197 /* Let's propagate the error only if the system call is not implemented or prohibited */
3198 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3199 return -errno;
3200 } else
3201 return 0;
3202 }
3203
3204 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3205 * this here. */
3206 if (flags != 0)
3207 return -EOPNOTSUPP;
3208
3209 if (si) {
3210 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3211 siginfo_t copy = *si;
3212
3213 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3214 return -errno;
3215 } else if (kill(s->child.pid, sig) < 0)
3216 return -errno;
3217
3218 return 0;
3219}
3220
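/* Illustrative usage sketch (editor's addition): politely terminate a watched child. The call
 * prefers pidfd_send_signal() and falls back to rt_sigqueueinfo()/kill() as implemented above, and
 * refuses with -ESRCH once the child is known to have exited. "child_source" is a placeholder.
 *
 *     r = sd_event_source_send_child_signal(child_source, SIGTERM, NULL, 0);
 *     if (r == -ESRCH)
 *             log_debug("Child already exited, nothing to signal.");
 *     else if (r < 0)
 *             return r;
 */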
3221_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3222 assert_return(s, -EINVAL);
3223 assert_return(s->type == SOURCE_CHILD, -EDOM);
3224
3225 if (s->child.pidfd < 0)
3226 return -EOPNOTSUPP;
3227
3228 return s->child.pidfd_owned;
3229}
3230
3231_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3232 assert_return(s, -EINVAL);
3233 assert_return(s->type == SOURCE_CHILD, -EDOM);
3234
3235 if (s->child.pidfd < 0)
3236 return -EOPNOTSUPP;
3237
3238 s->child.pidfd_owned = own;
3239 return 0;
3240}
3241
3242_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3243 assert_return(s, -EINVAL);
3244 assert_return(s->type == SOURCE_CHILD, -EDOM);
3245
3246 return s->child.process_owned;
3247}
3248
3249_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3250 assert_return(s, -EINVAL);
3251 assert_return(s->type == SOURCE_CHILD, -EDOM);
3252
3253 s->child.process_owned = own;
3254 return 0;
3255}
3256
97ef5391
LP
3257_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
3258 assert_return(s, -EINVAL);
3259 assert_return(mask, -EINVAL);
3260 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3261 assert_return(!event_pid_changed(s->event), -ECHILD);
3262
3263 *mask = s->inotify.mask;
3264 return 0;
3265}
3266
718db961 3267_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
fd38203a
LP
3268 int r;
3269
da7e457c 3270 assert_return(s, -EINVAL);
6203e07a 3271 assert_return(s->type != SOURCE_EXIT, -EDOM);
da7e457c
LP
3272 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3273 assert_return(!event_pid_changed(s->event), -ECHILD);
fd38203a
LP
3274
3275 if (s->prepare == callback)
3276 return 0;
3277
3278 if (callback && s->prepare) {
3279 s->prepare = callback;
3280 return 0;
3281 }
3282
3283 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3284 if (r < 0)
3285 return r;
3286
3287 s->prepare = callback;
3288
3289 if (callback) {
3290 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3291 if (r < 0)
3292 return r;
3293 } else
3294 prioq_remove(s->event->prepare, s, &s->prepare_index);
3295
3296 return 0;
3297}
3298
f7262a9f 3299_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
da7e457c 3300 assert_return(s, NULL);
fd38203a
LP
3301
3302 return s->userdata;
3303}
3304
8f726607
LP
3305_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3306 void *ret;
3307
3308 assert_return(s, NULL);
3309
3310 ret = s->userdata;
3311 s->userdata = userdata;
3312
3313 return ret;
3314}
3315
b6d5481b
LP
3316static int event_source_enter_ratelimited(sd_event_source *s) {
3317 int r;
3318
3319 assert(s);
3320
3321 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
3322 * the end of the rate limit time window, much as if it was a timer event source. */
3323
3324 if (s->ratelimited)
3325 return 0; /* Already ratelimited, this is a NOP hence */
3326
3327 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3328 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3329 if (r < 0)
3330 return r;
3331
3332 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
3333 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
3334 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
3335 if (EVENT_SOURCE_IS_TIME(s->type))
3336 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3337
3338 /* Now, let's add the event source to the monotonic clock instead */
3339 r = event_source_time_prioq_put(s, &s->event->monotonic);
3340 if (r < 0)
3341 goto fail;
3342
3343 /* And let's take the event source officially offline */
3344 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3345 if (r < 0) {
3346 event_source_time_prioq_remove(s, &s->event->monotonic);
3347 goto fail;
3348 }
3349
3350 event_source_pp_prioq_reshuffle(s);
3351
3352 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3353 return 0;
3354
3355fail:
3356 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
3357 * space for it should already be allocated. */
3358 if (EVENT_SOURCE_IS_TIME(s->type))
3359 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3360
3361 return r;
3362}
3363
fd69f224 3364static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
b6d5481b
LP
3365 int r;
3366
3367 assert(s);
3368
3369 if (!s->ratelimited)
3370 return 0;
3371
3372 /* Let's take the event source out of the monotonic prioq first. */
3373 event_source_time_prioq_remove(s, &s->event->monotonic);
3374
3375 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3376 if (EVENT_SOURCE_IS_TIME(s->type)) {
3377 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3378 if (r < 0)
3379 goto fail;
3380 }
3381
3382 /* Let's try to take it online again. */
3383 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3384 if (r < 0) {
3385 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3386 if (EVENT_SOURCE_IS_TIME(s->type))
3387 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3388
3389 goto fail;
3390 }
3391
3392 event_source_pp_prioq_reshuffle(s);
3393 ratelimit_reset(&s->rate_limit);
3394
3395 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
fd69f224
MS
3396
3397 if (run_callback && s->ratelimit_expire_callback) {
3398 s->dispatching = true;
3399 r = s->ratelimit_expire_callback(s, s->userdata);
3400 s->dispatching = false;
3401
3402 if (r < 0) {
3403 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3404 strna(s->description),
3405 event_source_type_to_string(s->type),
3406 s->exit_on_failure ? "exiting" : "disabling");
3407
3408 if (s->exit_on_failure)
3409 (void) sd_event_exit(s->event, r);
3410 }
3411
3412 if (s->n_ref == 0)
3413 source_free(s);
3414 else if (r < 0)
0a040e64 3415 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
fd69f224
MS
3416
3417 return 1;
3418 }
3419
b6d5481b
LP
3420 return 0;
3421
3422fail:
3423 /* Do something somewhat reasonable when we cannot move an event sources out of ratelimited mode:
3424 * simply put it back in it, maybe we can then process it more successfully next iteration. */
3425 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3426
3427 return r;
3428}
3429
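/* Illustrative usage sketch (editor's addition): the rate limit machinery above is driven by the
 * public setters sd_event_source_set_ratelimit() and sd_event_source_set_ratelimit_expire_callback()
 * (defined elsewhere in this file). "io_source" is a placeholder.
 *
 *     r = sd_event_source_set_ratelimit(io_source, 1 * USEC_PER_SEC, 10);   // at most 10 dispatches per second
 *     if (r < 0)
 *             return r;
 */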
c2ba3ad6
LP
3430static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3431 usec_t c;
3432 assert(e);
3433 assert(a <= b);
3434
3435 if (a <= 0)
3436 return 0;
393003e1
LP
3437 if (a >= USEC_INFINITY)
3438 return USEC_INFINITY;
c2ba3ad6
LP
3439
3440 if (b <= a + 1)
3441 return a;
3442
52444dc4
LP
3443 initialize_perturb(e);
3444
c2ba3ad6
LP
3445 /*
3446 Find a good time to wake up again between times a and b. We
3447 have two goals here:
3448
3449 a) We want to wake up as seldom as possible, hence prefer
3450 later times over earlier times.
3451
3452 b) But if we have to wake up, then let's make sure to
3453 dispatch as much as possible on the entire system.
3454
3455 We implement this by waking up everywhere at the same time
850516e0 3456 within any given minute if we can, synchronised via the
c2ba3ad6 3457 perturbation value determined from the boot ID. If we can't,
ba276c81
LP
3458 then we try to find the same spot in every 10s, then 1s and
3459 then 250ms steps. Otherwise, we pick the last possible time
3460 to wake up.
c2ba3ad6
LP
3461 */
3462
850516e0
LP
3463 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3464 if (c >= b) {
3465 if (_unlikely_(c < USEC_PER_MINUTE))
3466 return b;
3467
3468 c -= USEC_PER_MINUTE;
3469 }
3470
ba276c81
LP
3471 if (c >= a)
3472 return c;
3473
3474 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3475 if (c >= b) {
3476 if (_unlikely_(c < USEC_PER_SEC*10))
3477 return b;
3478
3479 c -= USEC_PER_SEC*10;
3480 }
3481
850516e0
LP
3482 if (c >= a)
3483 return c;
3484
3485 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
c2ba3ad6
LP
3486 if (c >= b) {
3487 if (_unlikely_(c < USEC_PER_SEC))
3488 return b;
3489
3490 c -= USEC_PER_SEC;
3491 }
3492
3493 if (c >= a)
3494 return c;
3495
3496 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3497 if (c >= b) {
3498 if (_unlikely_(c < USEC_PER_MSEC*250))
3499 return b;
3500
3501 c -= USEC_PER_MSEC*250;
3502 }
3503
3504 if (c >= a)
3505 return c;
3506
3507 return b;
3508}
3509
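/* Worked example (editor's addition): assume the per-boot perturbation is 7.3s and a timer may fire
 * anywhere between a = 12:00:03.0 and b = 12:00:55.0. The first candidate is the start of b's minute
 * plus the perturbation, c = 12:00:07.3; since a <= c < b it is returned, so every loop on this
 * machine converges on the same wakeup point within that minute. Had c been >= b it would first be
 * pulled back by one minute, and only when the adjusted candidate is still earlier than a are the
 * 10s, 1s and 250ms grids tried, with b itself as the last resort. */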
fd38203a
LP
3510static int event_arm_timer(
3511 sd_event *e,
6a0f1f6d 3512 struct clock_data *d) {
fd38203a
LP
3513
3514 struct itimerspec its = {};
c2ba3ad6
LP
3515 sd_event_source *a, *b;
3516 usec_t t;
fd38203a 3517
cde93897 3518 assert(e);
6a0f1f6d 3519 assert(d);
fd38203a 3520
d06441da 3521 if (!d->needs_rearm)
212bbb17 3522 return 0;
7e2bf71c
YW
3523
3524 d->needs_rearm = false;
212bbb17 3525
6a0f1f6d 3526 a = prioq_peek(d->earliest);
19947509 3527 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
b6d5481b 3528 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
72aedc1e 3529
6a0f1f6d 3530 if (d->fd < 0)
c57b5ca3
LP
3531 return 0;
3532
3a43da28 3533 if (d->next == USEC_INFINITY)
72aedc1e
LP
3534 return 0;
3535
3536 /* disarm */
15c689d7
LP
3537 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3538 return -errno;
72aedc1e 3539
3a43da28 3540 d->next = USEC_INFINITY;
fd38203a 3541 return 0;
72aedc1e 3542 }
fd38203a 3543
6a0f1f6d 3544 b = prioq_peek(d->latest);
19947509
ZJS
3545 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3546 assert(b && b->enabled != SD_EVENT_OFF);
c2ba3ad6 3547
b6d5481b 3548 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
6a0f1f6d 3549 if (d->next == t)
fd38203a
LP
3550 return 0;
3551
6a0f1f6d 3552 assert_se(d->fd >= 0);
fd38203a 3553
c2ba3ad6 3554 if (t == 0) {
1751bdde 3555 /* We don't want to disarm here, just mean some time looooong ago. */
fd38203a
LP
3556 its.it_value.tv_sec = 0;
3557 its.it_value.tv_nsec = 1;
3558 } else
c2ba3ad6 3559 timespec_store(&its.it_value, t);
fd38203a 3560
15c689d7 3561 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
cde93897 3562 return -errno;
fd38203a 3563
6a0f1f6d 3564 d->next = t;
fd38203a
LP
3565 return 0;
3566}
3567
9a800b56 3568static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
fd38203a
LP
3569 assert(e);
3570 assert(s);
3571 assert(s->type == SOURCE_IO);
3572
9a800b56
LP
3573 /* If the event source was already pending, we just OR in the
3574 * new revents, otherwise we reset the value. The ORing is
3575 * necessary to handle EPOLLONESHOT events properly where
3576 * readability might happen independently of writability, and
3577 * we need to keep track of both */
3578
3579 if (s->pending)
3580 s->io.revents |= revents;
3581 else
3582 s->io.revents = revents;
fd38203a 3583
fd38203a
LP
3584 return source_set_pending(s, true);
3585}
3586
72aedc1e 3587static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
fd38203a
LP
3588 uint64_t x;
3589 ssize_t ss;
3590
3591 assert(e);
da7e457c 3592 assert(fd >= 0);
72aedc1e 3593
305f78bf 3594 assert_return(events == EPOLLIN, -EIO);
fd38203a
LP
3595
3596 ss = read(fd, &x, sizeof(x));
3597 if (ss < 0) {
8add30a0 3598 if (ERRNO_IS_TRANSIENT(errno))
fd38203a
LP
3599 return 0;
3600
3601 return -errno;
3602 }
3603
8d35dae7 3604 if (_unlikely_(ss != sizeof(x)))
fd38203a
LP
3605 return -EIO;
3606
cde93897 3607 if (next)
3a43da28 3608 *next = USEC_INFINITY;
72aedc1e 3609
fd38203a
LP
3610 return 0;
3611}
3612
305f78bf
LP
3613static int process_timer(
3614 sd_event *e,
3615 usec_t n,
6a0f1f6d 3616 struct clock_data *d) {
305f78bf 3617
fd38203a 3618 sd_event_source *s;
fd69f224 3619 bool callback_invoked = false;
fd38203a
LP
3620 int r;
3621
3622 assert(e);
6a0f1f6d 3623 assert(d);
fd38203a
LP
3624
3625 for (;;) {
6a0f1f6d 3626 s = prioq_peek(d->earliest);
19947509
ZJS
3627 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3628
b6d5481b
LP
3629 if (!s || time_event_source_next(s) > n)
3630 break;
3631
3632 if (s->ratelimited) {
3633 /* This is an event source whose ratelimit window has ended. Let's turn it on
3634 * again. */
3635 assert(s->ratelimited);
3636
fd69f224 3637 r = event_source_leave_ratelimit(s, /* run_callback */ true);
b6d5481b
LP
3638 if (r < 0)
3639 return r;
fd69f224
MS
3640 else if (r == 1)
3641 callback_invoked = true;
b6d5481b
LP
3642
3643 continue;
3644 }
3645
3646 if (s->enabled == SD_EVENT_OFF || s->pending)
fd38203a
LP
3647 break;
3648
3649 r = source_set_pending(s, true);
3650 if (r < 0)
3651 return r;
3652
e1951c16 3653 event_source_time_prioq_reshuffle(s);
fd38203a
LP
3654 }
3655
fd69f224 3656 return callback_invoked;
fd38203a
LP
3657}
3658
efd3be9d
YW
3659static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3660 int64_t min_priority = threshold;
3661 bool something_new = false;
fd38203a 3662 sd_event_source *s;
fd38203a
LP
3663 int r;
3664
3665 assert(e);
efd3be9d
YW
3666 assert(ret_min_priority);
3667
3668 if (!e->need_process_child) {
3669 *ret_min_priority = min_priority;
3670 return 0;
3671 }
fd38203a 3672
c2ba3ad6
LP
3673 e->need_process_child = false;
3674
91c70071
YW
3675 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3676 * for, instead of using P_ALL. This is because we only want to get child information of very
3677 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3678 * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3679 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3680 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3681 * to handle SIGCHLD yourself.
3682 *
3683 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3684 * source is dispatched so that the callback still sees the process as a zombie. */
fd38203a 3685
90e74a66 3686 HASHMAP_FOREACH(s, e->child_sources) {
fd38203a
LP
3687 assert(s->type == SOURCE_CHILD);
3688
efd3be9d
YW
3689 if (s->priority > threshold)
3690 continue;
3691
fd38203a
LP
3692 if (s->pending)
3693 continue;
3694
b6d5481b 3695 if (event_source_is_offline(s))
fd38203a
LP
3696 continue;
3697
f8f3f926
LP
3698 if (s->child.exited)
3699 continue;
3700
91c70071
YW
3701 if (EVENT_SOURCE_WATCH_PIDFD(s))
3702 /* There's a usable pidfd known for this event source? Then don't waitid() for
3703 * it here */
f8f3f926
LP
3704 continue;
3705
fd38203a 3706 zero(s->child.siginfo);
15c689d7
LP
3707 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3708 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
bfd9bfcc 3709 return negative_errno();
fd38203a
LP
3710
3711 if (s->child.siginfo.si_pid != 0) {
945c2931 3712 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
08cd1552 3713
f8f3f926
LP
3714 if (zombie)
3715 s->child.exited = true;
3716
08cd1552 3717 if (!zombie && (s->child.options & WEXITED)) {
91c70071
YW
3718 /* If the child isn't dead then let's immediately remove the state
3719 * change from the queue, since there's no benefit in leaving it
3720 * queued. */
08cd1552
LP
3721
3722 assert(s->child.options & (WSTOPPED|WCONTINUED));
a5d27871 3723 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
08cd1552
LP
3724 }
3725
fd38203a
LP
3726 r = source_set_pending(s, true);
3727 if (r < 0)
3728 return r;
efd3be9d
YW
3729 if (r > 0) {
3730 something_new = true;
3731 min_priority = MIN(min_priority, s->priority);
3732 }
fd38203a
LP
3733 }
3734 }
3735
efd3be9d
YW
3736 *ret_min_priority = min_priority;
3737 return something_new;
fd38203a
LP
3738}
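/* Hedged usage sketch (not part of this file): because the loop queries children with WNOWAIT
 * above, a child handler still observes the process as a zombie; the actual reaping happens in
 * source_dispatch() after the callback returned. The handler name and registration below are
 * hypothetical:
 *
 *     static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *             if (si->si_code == CLD_EXITED)
 *                     log_debug("Child exited with status %i", si->si_status);
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     // registration, e.g. after fork(): assumes 'e' is an sd_event* and 'pid' the child
 *     (void) sd_event_add_child(e, NULL, pid, WEXITED, on_child, NULL);
 */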
3739
f8f3f926
LP
3740static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3741 assert(e);
3742 assert(s);
3743 assert(s->type == SOURCE_CHILD);
3744
3745 if (s->pending)
3746 return 0;
3747
b6d5481b 3748 if (event_source_is_offline(s))
f8f3f926
LP
3749 return 0;
3750
3751 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3752 return 0;
3753
3754 zero(s->child.siginfo);
3755 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3756 return -errno;
3757
3758 if (s->child.siginfo.si_pid == 0)
3759 return 0;
3760
3761 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3762 s->child.exited = true;
3763
3764 return source_set_pending(s, true);
3765}
3766
efd3be9d 3767static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
fd38203a
LP
3768 int r;
3769
da7e457c 3770 assert(e);
97ef5391 3771 assert(d);
305f78bf 3772 assert_return(events == EPOLLIN, -EIO);
efd3be9d 3773 assert(min_priority);
fd38203a 3774
91c70071
YW
3775 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3776 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3777 * per priority, and if we dequeue one, SIGCHLD might be enqueued later and we wouldn't know,
3778 * but we might have higher priority children we care about, hence we need to check that
3779 * explicitly. */
9da4cb2b
LP
3780
3781 if (sigismember(&d->sigset, SIGCHLD))
3782 e->need_process_child = true;
3783
91c70071 3784 /* If there's already an event source pending for this priority we don't read another */
9da4cb2b
LP
3785 if (d->current)
3786 return 0;
3787
fd38203a 3788 for (;;) {
0eb2e0e3 3789 struct signalfd_siginfo si;
7057bd99 3790 ssize_t n;
92daebc0 3791 sd_event_source *s = NULL;
fd38203a 3792
9da4cb2b 3793 n = read(d->fd, &si, sizeof(si));
7057bd99 3794 if (n < 0) {
8add30a0 3795 if (ERRNO_IS_TRANSIENT(errno))
efd3be9d 3796 return 0;
fd38203a
LP
3797
3798 return -errno;
3799 }
3800
7057bd99 3801 if (_unlikely_(n != sizeof(si)))
fd38203a
LP
3802 return -EIO;
3803
6eb7c172 3804 assert(SIGNAL_VALID(si.ssi_signo));
7057bd99 3805
92daebc0
LP
3806 if (e->signal_sources)
3807 s = e->signal_sources[si.ssi_signo];
92daebc0
LP
3808 if (!s)
3809 continue;
9da4cb2b
LP
3810 if (s->pending)
3811 continue;
fd38203a
LP
3812
3813 s->signal.siginfo = si;
9da4cb2b
LP
3814 d->current = s;
3815
fd38203a
LP
3816 r = source_set_pending(s, true);
3817 if (r < 0)
3818 return r;
efd3be9d
YW
3819 if (r > 0 && *min_priority >= s->priority) {
3820 *min_priority = s->priority;
3821 return 1; /* an event source with smaller priority is queued. */
3822 }
9da4cb2b 3823
efd3be9d 3824 return 0;
fd38203a 3825 }
fd38203a
LP
3826}
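/* Note: at most one signal is dequeued per wakeup for this signal_data object; d->current
 * stays set until the pending source's pending flag is cleared again when it is dispatched,
 * which blocks further reads above and thereby preserves signal ordering within one priority
 * bucket. */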
3827
efd3be9d 3828static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
97ef5391
LP
3829 ssize_t n;
3830
3831 assert(e);
3832 assert(d);
3833
3834 assert_return(revents == EPOLLIN, -EIO);
3835
3836 /* If there's already an event source pending for this priority, don't read another */
3837 if (d->n_pending > 0)
3838 return 0;
3839
3840 /* Is the read buffer non-empty? If so, let's not read more */
3841 if (d->buffer_filled > 0)
3842 return 0;
3843
efd3be9d
YW
3844 if (d->priority > threshold)
3845 return 0;
3846
97ef5391
LP
3847 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3848 if (n < 0) {
8add30a0 3849 if (ERRNO_IS_TRANSIENT(errno))
97ef5391
LP
3850 return 0;
3851
3852 return -errno;
3853 }
3854
3855 assert(n > 0);
3856 d->buffer_filled = (size_t) n;
0601b958 3857 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
3858
3859 return 1;
3860}
3861
3862static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3863 assert(e);
3864 assert(d);
3865 assert(sz <= d->buffer_filled);
3866
3867 if (sz == 0)
3868 return;
3869
3870 /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3871 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3872 d->buffer_filled -= sz;
3873
3874 if (d->buffer_filled == 0)
0601b958 3875 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
3876}
3877
3878static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3879 int r;
3880
3881 assert(e);
3882 assert(d);
3883
3884 /* If there's already an event source pending for this priority, don't read another */
3885 if (d->n_pending > 0)
3886 return 0;
3887
3888 while (d->buffer_filled > 0) {
3889 size_t sz;
3890
3891 /* Let's validate that the event structures are complete */
3892 if (d->buffer_filled < offsetof(struct inotify_event, name))
3893 return -EIO;
3894
3895 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3896 if (d->buffer_filled < sz)
3897 return -EIO;
3898
3899 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3900 struct inode_data *inode_data;
97ef5391
LP
3901
3902 /* The queue overran, let's pass this event to all event sources connected to this inotify
3903 * object */
3904
03677889 3905 HASHMAP_FOREACH(inode_data, d->inodes)
97ef5391
LP
3906 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3907
b6d5481b 3908 if (event_source_is_offline(s))
97ef5391
LP
3909 continue;
3910
3911 r = source_set_pending(s, true);
3912 if (r < 0)
3913 return r;
3914 }
97ef5391
LP
3915 } else {
3916 struct inode_data *inode_data;
97ef5391
LP
3917
3918 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3919 * our watch descriptor table. */
3920 if (d->buffer.ev.mask & IN_IGNORED) {
3921
3922 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3923 if (!inode_data) {
3924 event_inotify_data_drop(e, d, sz);
3925 continue;
3926 }
3927
3928 /* The watch descriptor was removed by the kernel, let's drop it here too */
3929 inode_data->wd = -1;
3930 } else {
3931 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3932 if (!inode_data) {
3933 event_inotify_data_drop(e, d, sz);
3934 continue;
3935 }
3936 }
3937
3938 /* Trigger all event sources that are interested in these events. Also trigger all event
3939 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3940 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3941
b6d5481b 3942 if (event_source_is_offline(s))
97ef5391
LP
3943 continue;
3944
3945 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3946 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3947 continue;
3948
3949 r = source_set_pending(s, true);
3950 if (r < 0)
3951 return r;
3952 }
3953 }
3954
3955 /* Something pending now? If so, let's finish, otherwise let's read more. */
3956 if (d->n_pending > 0)
3957 return 1;
3958 }
3959
3960 return 0;
3961}
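/* Worked example for the record size computed above: struct inotify_event has a 16 byte fixed
 * header (offsetof(struct inotify_event, name) == 16 on Linux), and 'len' counts the NUL-padded
 * name that follows. An event for a short name like "foo" typically carries len == 16, so the
 * full record occupies 16 + 16 = 32 bytes of the read buffer, and that is the amount
 * event_inotify_data_drop() removes once the event has been dispatched. */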
3962
3963static int process_inotify(sd_event *e) {
97ef5391
LP
3964 int r, done = 0;
3965
3966 assert(e);
3967
0601b958 3968 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
97ef5391
LP
3969 r = event_inotify_data_process(e, d);
3970 if (r < 0)
3971 return r;
3972 if (r > 0)
3973 done ++;
3974 }
3975
3976 return done;
3977}
3978
158fe190
LP
3979static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
3980 assert(s);
3981 assert(s->type == SOURCE_MEMORY_PRESSURE);
3982
3983 if (s->pending)
3984 s->memory_pressure.revents |= revents;
3985 else
3986 s->memory_pressure.revents = revents;
3987
3988 return source_set_pending(s, true);
3989}
3990
3991static int source_memory_pressure_write(sd_event_source *s) {
3992 ssize_t n;
3993 int r;
3994
3995 assert(s);
3996 assert(s->type == SOURCE_MEMORY_PRESSURE);
3997
3998 /* once we start writing, the buffer is locked, we allow no further changes. */
3999 s->memory_pressure.locked = true;
4000
4001 if (s->memory_pressure.write_buffer_size > 0) {
4002 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4003 if (n < 0) {
9897f5dd
LP
4004 if (!ERRNO_IS_TRANSIENT(errno)) {
4005 /* If the kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
4006 * files, but then generates EOPNOTSUPP on read() and write() (instead of on
4007 * open()!). This sucks hard, since we can only detect this kind of failure
4008 * so late. Let's make the best of it, and turn off the event source like we
4009 * do for failed event source handlers. */
4010
4011 log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
4012 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4013 return 0;
4014 }
158fe190
LP
4015
4016 n = 0;
4017 }
4018 } else
4019 n = 0;
4020
4021 assert(n >= 0);
4022
4023 if ((size_t) n == s->memory_pressure.write_buffer_size) {
4024 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4025
4026 if (n > 0) {
4027 s->memory_pressure.write_buffer_size = 0;
4028
4029 /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4030 r = source_memory_pressure_register(s, s->enabled);
4031 if (r < 0)
4032 return r;
4033 }
4034 } else if (n > 0) {
4035 _cleanup_free_ void *c = NULL;
4036
4037 assert((size_t) n < s->memory_pressure.write_buffer_size);
4038
4039 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4040 if (!c)
4041 return -ENOMEM;
4042
4043 free_and_replace(s->memory_pressure.write_buffer, c);
4044 s->memory_pressure.write_buffer_size -= n;
4045 return 1;
4046 }
4047
4048 return 0;
4049}
4050
4051static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4052 int r;
4053
4054 assert(s);
4055 assert(s->type == SOURCE_MEMORY_PRESSURE);
4056
4057 r = source_memory_pressure_write(s);
4058 if (r < 0)
4059 return r;
4060 if (r > 0)
4061 return 1; /* if we wrote something, then don't continue with dispatching the user callback.
4062 * Instead, shortcut it so that we wait for the next EPOLLOUT immediately. */
4063
4064 /* No pending incoming IO? Then let's not continue further */
4065 if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4066
4067 /* Treat IO errors on the notifier the same way as errors returned from a callback */
4068 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4069 return -EIO;
4070
4071 return 1; /* leave dispatch, we already processed everything */
4072 }
4073
4074 if (s->memory_pressure.revents & EPOLLIN) {
4075 uint8_t pipe_buf[PIPE_BUF];
4076 ssize_t n;
4077
4078 /* If the fd is readable, then flush out anything that might be queued */
4079
4080 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4081 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4082 return -errno;
4083 }
4084
4085 return 0; /* go on, dispatch to user callback */
4086}
4087
fd38203a 4088static int source_dispatch(sd_event_source *s) {
8f5c235d 4089 EventSourceType saved_type;
c8e9d15c 4090 sd_event *saved_event;
fe8245eb 4091 int r = 0;
fd38203a
LP
4092
4093 assert(s);
6203e07a 4094 assert(s->pending || s->type == SOURCE_EXIT);
fd38203a 4095
b778cba4
LP
4096 /* Save the event source type here, so that we still know it after the event callback, which might
4097 * invalidate the event. */
8f5c235d
LP
4098 saved_type = s->type;
4099
de02634c 4100 /* Similarly, store a reference to the event loop object, so that we can still access it after the
b778cba4 4101 * callback might have invalidated/disconnected the event source. */
c8e9d15c
YW
4102 saved_event = s->event;
4103 PROTECT_EVENT(saved_event);
b778cba4 4104
de02634c 4105 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
b6d5481b
LP
4106 assert(!s->ratelimited);
4107 if (!ratelimit_below(&s->rate_limit)) {
4108 r = event_source_enter_ratelimited(s);
4109 if (r < 0)
4110 return r;
4111
4112 return 1;
4113 }
4114
945c2931 4115 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
da7e457c
LP
4116 r = source_set_pending(s, false);
4117 if (r < 0)
4118 return r;
4119 }
fd38203a 4120
6e9feda3
LP
4121 if (s->type != SOURCE_POST) {
4122 sd_event_source *z;
6e9feda3 4123
de02634c 4124 /* If we execute a non-post source, let's mark all post sources as pending. */
6e9feda3 4125
90e74a66 4126 SET_FOREACH(z, s->event->post_sources) {
b6d5481b 4127 if (event_source_is_offline(z))
6e9feda3
LP
4128 continue;
4129
4130 r = source_set_pending(z, true);
4131 if (r < 0)
4132 return r;
4133 }
4134 }
4135
158fe190
LP
4136 if (s->type == SOURCE_MEMORY_PRESSURE) {
4137 r = source_memory_pressure_initiate_dispatch(s);
4138 if (r == -EIO) /* handle EIO errors similar to callback errors */
4139 goto finish;
4140 if (r < 0)
4141 return r;
4142 if (r > 0) /* already handled */
4143 return 1;
4144 }
4145
baf76283
LP
4146 if (s->enabled == SD_EVENT_ONESHOT) {
4147 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
fd38203a
LP
4148 if (r < 0)
4149 return r;
4150 }
4151
12179984 4152 s->dispatching = true;
b7484e2a 4153
fd38203a
LP
4154 switch (s->type) {
4155
4156 case SOURCE_IO:
4157 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4158 break;
4159
6a0f1f6d 4160 case SOURCE_TIME_REALTIME:
a8548816 4161 case SOURCE_TIME_BOOTTIME:
6a0f1f6d
LP
4162 case SOURCE_TIME_MONOTONIC:
4163 case SOURCE_TIME_REALTIME_ALARM:
4164 case SOURCE_TIME_BOOTTIME_ALARM:
fd38203a
LP
4165 r = s->time.callback(s, s->time.next, s->userdata);
4166 break;
4167
4168 case SOURCE_SIGNAL:
4169 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4170 break;
4171
08cd1552
LP
4172 case SOURCE_CHILD: {
4173 bool zombie;
4174
945c2931 4175 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
08cd1552 4176
fd38203a 4177 r = s->child.callback(s, &s->child.siginfo, s->userdata);
08cd1552
LP
4178
4179 /* Now, reap the PID for good. */
f8f3f926 4180 if (zombie) {
cc59d290 4181 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
f8f3f926
LP
4182 s->child.waited = true;
4183 }
08cd1552 4184
fd38203a 4185 break;
08cd1552 4186 }
fd38203a
LP
4187
4188 case SOURCE_DEFER:
4189 r = s->defer.callback(s, s->userdata);
4190 break;
da7e457c 4191
6e9feda3
LP
4192 case SOURCE_POST:
4193 r = s->post.callback(s, s->userdata);
4194 break;
4195
6203e07a
LP
4196 case SOURCE_EXIT:
4197 r = s->exit.callback(s, s->userdata);
da7e457c 4198 break;
9d3e3aa5 4199
97ef5391
LP
4200 case SOURCE_INOTIFY: {
4201 struct sd_event *e = s->event;
4202 struct inotify_data *d;
4203 size_t sz;
4204
4205 assert(s->inotify.inode_data);
4206 assert_se(d = s->inotify.inode_data->inotify_data);
4207
4208 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4209 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4210 assert(d->buffer_filled >= sz);
4211
53baf2ef
LP
4212 /* If the inotify callback destroys the event source then this likely means we don't need to
4213 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4214 * free it immediately, then we couldn't drop the event from the inotify event queue without
4215 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
4216 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4217 * explicitly GC it after we are done dropping the inotify event from the buffer. */
4218 d->n_busy++;
97ef5391 4219 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
53baf2ef 4220 d->n_busy--;
97ef5391 4221
53baf2ef
LP
4222 /* When no event is pending anymore on this inotify object, then let's drop the event from
4223 * the inotify event queue buffer. */
97ef5391
LP
4224 if (d->n_pending == 0)
4225 event_inotify_data_drop(e, d, sz);
4226
53baf2ef
LP
4227 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4228 event_gc_inotify_data(e, d);
97ef5391
LP
4229 break;
4230 }
4231
158fe190
LP
4232 case SOURCE_MEMORY_PRESSURE:
4233 r = s->memory_pressure.callback(s, s->userdata);
4234 break;
4235
9d3e3aa5 4236 case SOURCE_WATCHDOG:
a71fe8b8 4237 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
9f2a50a3 4238 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
04499a70 4239 assert_not_reached();
fd38203a
LP
4240 }
4241
12179984
LP
4242 s->dispatching = false;
4243
158fe190 4244finish:
b778cba4
LP
4245 if (r < 0) {
4246 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4247 strna(s->description),
4248 event_source_type_to_string(saved_type),
4249 s->exit_on_failure ? "exiting" : "disabling");
4250
4251 if (s->exit_on_failure)
4252 (void) sd_event_exit(saved_event, r);
4253 }
12179984
LP
4254
4255 if (s->n_ref == 0)
4256 source_free(s);
4257 else if (r < 0)
c3c50474 4258 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
b7484e2a 4259
6203e07a 4260 return 1;
fd38203a
LP
4261}
4262
4263static int event_prepare(sd_event *e) {
4264 int r;
4265
4266 assert(e);
4267
4268 for (;;) {
4269 sd_event_source *s;
4270
4271 s = prioq_peek(e->prepare);
b6d5481b 4272 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
fd38203a
LP
4273 break;
4274
4275 s->prepare_iteration = e->iteration;
8656f4a6 4276 prioq_reshuffle(e->prepare, s, &s->prepare_index);
fd38203a
LP
4277
4278 assert(s->prepare);
12179984 4279 s->dispatching = true;
fd38203a 4280 r = s->prepare(s, s->userdata);
12179984
LP
4281 s->dispatching = false;
4282
b778cba4
LP
4283 if (r < 0) {
4284 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4285 strna(s->description),
4286 event_source_type_to_string(s->type),
4287 s->exit_on_failure ? "exiting" : "disabling");
4288
4289 if (s->exit_on_failure)
4290 (void) sd_event_exit(e, r);
4291 }
fd38203a 4292
12179984
LP
4293 if (s->n_ref == 0)
4294 source_free(s);
4295 else if (r < 0)
c3c50474 4296 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
fd38203a
LP
4297 }
4298
4299 return 0;
4300}
4301
6203e07a 4302static int dispatch_exit(sd_event *e) {
da7e457c
LP
4303 sd_event_source *p;
4304 int r;
4305
4306 assert(e);
4307
6203e07a 4308 p = prioq_peek(e->exit);
19947509
ZJS
4309 assert(!p || p->type == SOURCE_EXIT);
4310
b6d5481b 4311 if (!p || event_source_is_offline(p)) {
da7e457c
LP
4312 e->state = SD_EVENT_FINISHED;
4313 return 0;
4314 }
4315
c8e9d15c 4316 PROTECT_EVENT(e);
da7e457c 4317 e->iteration++;
6203e07a 4318 e->state = SD_EVENT_EXITING;
da7e457c 4319 r = source_dispatch(p);
2b0c9ef7 4320 e->state = SD_EVENT_INITIAL;
da7e457c
LP
4321 return r;
4322}
4323
c2ba3ad6
LP
4324static sd_event_source* event_next_pending(sd_event *e) {
4325 sd_event_source *p;
4326
da7e457c
LP
4327 assert(e);
4328
c2ba3ad6
LP
4329 p = prioq_peek(e->pending);
4330 if (!p)
4331 return NULL;
4332
b6d5481b 4333 if (event_source_is_offline(p))
c2ba3ad6
LP
4334 return NULL;
4335
4336 return p;
4337}
4338
cde93897
LP
4339static int arm_watchdog(sd_event *e) {
4340 struct itimerspec its = {};
4341 usec_t t;
cde93897
LP
4342
4343 assert(e);
4344 assert(e->watchdog_fd >= 0);
4345
4346 t = sleep_between(e,
a595fb5c
YW
4347 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4348 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
cde93897
LP
4349
4350 timespec_store(&its.it_value, t);
4351
75145780
LP
4352 /* Make sure we never set the watchdog to 0, which tells the
4353 * kernel to disable it. */
4354 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4355 its.it_value.tv_nsec = 1;
4356
7c248223 4357 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
cde93897
LP
4358}
4359
4360static int process_watchdog(sd_event *e) {
4361 assert(e);
4362
4363 if (!e->watchdog)
4364 return 0;
4365
4366 /* Don't notify watchdog too often */
4367 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4368 return 0;
4369
4370 sd_notify(false, "WATCHDOG=1");
4371 e->watchdog_last = e->timestamp.monotonic;
4372
4373 return arm_watchdog(e);
4374}
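/* Worked example for the two watchdog functions above, assuming the service manager configured
 * WatchdogSec=10s (i.e. watchdog_period == 10 s): process_watchdog() sends "WATCHDOG=1" at most
 * every 2.5 s (period / 4), and arm_watchdog() programs the timerfd to wake the loop somewhere
 * between 5 s and 7.5 s after the last ping (period / 2 .. period * 3/4), with the exact point
 * chosen by sleep_between() so it can coalesce with other timer wakeups. */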
4375
97ef5391
LP
4376static void event_close_inode_data_fds(sd_event *e) {
4377 struct inode_data *d;
4378
4379 assert(e);
4380
4381 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4382 * filesystems. But we can't close them right away as we need them as long as the user still wants to make
365c2885 4383 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
97ef5391
LP
4384 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4385 * compromise. */
4386
ed828563 4387 while ((d = e->inode_data_to_close_list)) {
97ef5391
LP
4388 assert(d->fd >= 0);
4389 d->fd = safe_close(d->fd);
4390
ed828563 4391 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
97ef5391
LP
4392 }
4393}
4394
158fe190
LP
4395static int event_memory_pressure_write_list(sd_event *e) {
4396 int r;
4397
4398 assert(e);
4399
4400 for (;;) {
4401 sd_event_source *s;
4402
4403 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4404 if (!s)
4405 break;
4406
4407 assert(s->type == SOURCE_MEMORY_PRESSURE);
4408 assert(s->memory_pressure.write_buffer_size > 0);
4409 s->memory_pressure.in_write_list = false;
4410
4411 r = source_memory_pressure_write(s);
4412 if (r < 0)
4413 return r;
4414 }
4415
4416 return 0;
4417}
4418
c45a5a74
TG
4419_public_ int sd_event_prepare(sd_event *e) {
4420 int r;
fd38203a 4421
da7e457c 4422 assert_return(e, -EINVAL);
b937d761 4423 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c
LP
4424 assert_return(!event_pid_changed(e), -ECHILD);
4425 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2b0c9ef7 4426 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
da7e457c 4427
e5446015
LP
4428 /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
4429 * this check here once, since gettid() is typically not cached, and we thus want to minimize
4430 * syscalls */
4431 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4432
f814c871 4433 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
c8e9d15c 4434 PROTECT_EVENT(e);
f814c871 4435
6203e07a 4436 if (e->exit_requested)
c45a5a74 4437 goto pending;
fd38203a
LP
4438
4439 e->iteration++;
4440
0be6c2f6 4441 e->state = SD_EVENT_PREPARING;
fd38203a 4442 r = event_prepare(e);
0be6c2f6 4443 e->state = SD_EVENT_INITIAL;
fd38203a 4444 if (r < 0)
c45a5a74 4445 return r;
fd38203a 4446
158fe190
LP
4447 r = event_memory_pressure_write_list(e);
4448 if (r < 0)
4449 return r;
4450
6a0f1f6d
LP
4451 r = event_arm_timer(e, &e->realtime);
4452 if (r < 0)
c45a5a74 4453 return r;
6a0f1f6d 4454
a8548816
TG
4455 r = event_arm_timer(e, &e->boottime);
4456 if (r < 0)
c45a5a74 4457 return r;
a8548816 4458
6a0f1f6d
LP
4459 r = event_arm_timer(e, &e->monotonic);
4460 if (r < 0)
c45a5a74 4461 return r;
6a0f1f6d
LP
4462
4463 r = event_arm_timer(e, &e->realtime_alarm);
1b5995b0 4464 if (r < 0)
c45a5a74 4465 return r;
fd38203a 4466
6a0f1f6d 4467 r = event_arm_timer(e, &e->boottime_alarm);
1b5995b0 4468 if (r < 0)
c45a5a74 4469 return r;
fd38203a 4470
97ef5391
LP
4471 event_close_inode_data_fds(e);
4472
0601b958 4473 if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
c45a5a74
TG
4474 goto pending;
4475
2b0c9ef7 4476 e->state = SD_EVENT_ARMED;
c45a5a74
TG
4477
4478 return 0;
4479
4480pending:
2b0c9ef7 4481 e->state = SD_EVENT_ARMED;
6d148a84
TG
4482 r = sd_event_wait(e, 0);
4483 if (r == 0)
2b0c9ef7 4484 e->state = SD_EVENT_ARMED;
6d148a84
TG
4485
4486 return r;
c45a5a74
TG
4487}
4488
798445ab
LP
4489static int epoll_wait_usec(
4490 int fd,
4491 struct epoll_event *events,
4492 int maxevents,
4493 usec_t timeout) {
4494
7c248223 4495 int msec;
0c14c45e
LP
4496 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4497
4498#if HAVE_EPOLL_PWAIT2
39f756d3 4499 static bool epoll_pwait2_absent = false;
52bb308c 4500 int r;
798445ab 4501
0c14c45e
LP
4502 /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4503 * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4504 * is not that obvious to implement given the libc and kernel definitions differ in the last
4505 * argument. Moreover, the only reason to use it is the more accurate timeouts (which is not a
4506 * biggie), so let's rely on glibc's definitions, and fall back to epoll_wait() when epoll_pwait2()
4507 * is missing. */
798445ab
LP
4508
4509 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
798445ab
LP
4510 r = epoll_pwait2(fd,
4511 events,
4512 maxevents,
52bb308c 4513 TIMESPEC_STORE(timeout),
798445ab
LP
4514 NULL);
4515 if (r >= 0)
4516 return r;
7cb45dbf 4517 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
798445ab
LP
4518 return -errno; /* Only fallback to old epoll_wait() if the syscall is masked or not
4519 * supported. */
4520
4521 epoll_pwait2_absent = true;
4522 }
39f756d3 4523#endif
798445ab
LP
4524
4525 if (timeout == USEC_INFINITY)
4526 msec = -1;
4527 else {
4528 usec_t k;
4529
4530 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4531 if (k >= INT_MAX)
4532 msec = INT_MAX; /* Saturate */
4533 else
4534 msec = (int) k;
4535 }
4536
7c248223 4537 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
798445ab
LP
4538}
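/* Hedged example of the timeout handling above: a 2500 us timeout is passed to epoll_pwait2()
 * exactly (TIMESPEC_STORE(2500) yields { .tv_sec = 0, .tv_nsec = 2500000 }), while the plain
 * epoll_wait() fallback only has millisecond resolution, so DIV_ROUND_UP() rounds it up to
 * 3 ms; rounding up ensures the fallback never returns earlier than requested. */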
4539
efd3be9d 4540static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
319a4f4b 4541 size_t n_event_queue, m, n_event_max;
efd3be9d
YW
4542 int64_t min_priority = threshold;
4543 bool something_new = false;
798445ab 4544 int r;
c45a5a74 4545
efd3be9d
YW
4546 assert(e);
4547 assert(ret_min_priority);
6a0f1f6d 4548
8b9708d1 4549 n_event_queue = MAX(e->n_sources, 1u);
319a4f4b 4550 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
5cddd924 4551 return -ENOMEM;
fd38203a 4552
319a4f4b
LP
4553 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4554
97ef5391 4555 /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
0601b958 4556 if (e->buffered_inotify_data_list)
798445ab 4557 timeout = 0;
97ef5391 4558
8b9708d1 4559 for (;;) {
319a4f4b
LP
4560 r = epoll_wait_usec(
4561 e->epoll_fd,
4562 e->event_queue,
4563 n_event_max,
4564 timeout);
798445ab 4565 if (r < 0)
efd3be9d 4566 return r;
c45a5a74 4567
8b9708d1
YW
4568 m = (size_t) r;
4569
319a4f4b 4570 if (m < n_event_max)
8b9708d1
YW
4571 break;
4572
319a4f4b 4573 if (n_event_max >= n_event_queue * 10)
8b9708d1
YW
4574 break;
4575
319a4f4b 4576 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
8b9708d1
YW
4577 return -ENOMEM;
4578
319a4f4b 4579 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
798445ab 4580 timeout = 0;
da7e457c 4581 }
fd38203a 4582
efd3be9d
YW
4583 /* Set the timestamp only when this is called for the first time. */
4584 if (threshold == INT64_MAX)
4585 triple_timestamp_get(&e->timestamp);
fd38203a 4586
8b9708d1 4587 for (size_t i = 0; i < m; i++) {
fd38203a 4588
5cddd924
LP
4589 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4590 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
9da4cb2b 4591 else {
5cddd924 4592 WakeupType *t = e->event_queue[i].data.ptr;
9da4cb2b
LP
4593
4594 switch (*t) {
4595
f8f3f926 4596 case WAKEUP_EVENT_SOURCE: {
5cddd924 4597 sd_event_source *s = e->event_queue[i].data.ptr;
f8f3f926
LP
4598
4599 assert(s);
4600
efd3be9d
YW
4601 if (s->priority > threshold)
4602 continue;
4603
4604 min_priority = MIN(min_priority, s->priority);
4605
f8f3f926
LP
4606 switch (s->type) {
4607
4608 case SOURCE_IO:
5cddd924 4609 r = process_io(e, s, e->event_queue[i].events);
f8f3f926
LP
4610 break;
4611
4612 case SOURCE_CHILD:
5cddd924 4613 r = process_pidfd(e, s, e->event_queue[i].events);
f8f3f926
LP
4614 break;
4615
158fe190
LP
4616 case SOURCE_MEMORY_PRESSURE:
4617 r = process_memory_pressure(s, e->event_queue[i].events);
4618 break;
4619
f8f3f926 4620 default:
04499a70 4621 assert_not_reached();
f8f3f926
LP
4622 }
4623
9da4cb2b 4624 break;
f8f3f926 4625 }
fd38203a 4626
9da4cb2b 4627 case WAKEUP_CLOCK_DATA: {
5cddd924 4628 struct clock_data *d = e->event_queue[i].data.ptr;
f8f3f926
LP
4629
4630 assert(d);
4631
5cddd924 4632 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
9da4cb2b
LP
4633 break;
4634 }
4635
4636 case WAKEUP_SIGNAL_DATA:
efd3be9d 4637 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
9da4cb2b
LP
4638 break;
4639
97ef5391 4640 case WAKEUP_INOTIFY_DATA:
efd3be9d 4641 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
97ef5391
LP
4642 break;
4643
9da4cb2b 4644 default:
04499a70 4645 assert_not_reached();
9da4cb2b
LP
4646 }
4647 }
efd3be9d
YW
4648 if (r < 0)
4649 return r;
4650 if (r > 0)
4651 something_new = true;
4652 }
4653
4654 *ret_min_priority = min_priority;
4655 return something_new;
4656}
4657
4658_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4659 int r;
4660
4661 assert_return(e, -EINVAL);
4662 assert_return(e = event_resolve(e), -ENOPKG);
4663 assert_return(!event_pid_changed(e), -ECHILD);
4664 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4665 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4666
4667 if (e->exit_requested) {
4668 e->state = SD_EVENT_PENDING;
4669 return 1;
4670 }
4671
4672 for (int64_t threshold = INT64_MAX; ; threshold--) {
4673 int64_t epoll_min_priority, child_min_priority;
4674
4675 /* There is a possibility that new epoll (especially IO) and child events are
4676 * triggered just after the process_epoll() call but before process_child(), and the new IO
4677 * events may have higher priority than the child events. To salvage these events,
4678 * let's call epoll_wait() again, but accept only events with higher priority than the
4679 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
4680 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4681 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4682
4683 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4684 if (r == -EINTR) {
4685 e->state = SD_EVENT_PENDING;
4686 return 1;
4687 }
4688 if (r < 0)
4689 goto finish;
4690 if (r == 0 && threshold < INT64_MAX)
4691 /* No new epoll event. */
4692 break;
4693
4694 r = process_child(e, threshold, &child_min_priority);
fd38203a 4695 if (r < 0)
da7e457c 4696 goto finish;
efd3be9d
YW
4697 if (r == 0)
4698 /* No new child event. */
4699 break;
4700
4701 threshold = MIN(epoll_min_priority, child_min_priority);
4702 if (threshold == INT64_MIN)
4703 break;
4704
4705 timeout = 0;
fd38203a
LP
4706 }
4707
cde93897
LP
4708 r = process_watchdog(e);
4709 if (r < 0)
4710 goto finish;
4711
fd69f224 4712 r = process_inotify(e);
6a0f1f6d
LP
4713 if (r < 0)
4714 goto finish;
4715
fd69f224 4716 r = process_timer(e, e->timestamp.realtime, &e->realtime);
a8548816
TG
4717 if (r < 0)
4718 goto finish;
4719
fd69f224 4720 r = process_timer(e, e->timestamp.boottime, &e->boottime);
6a0f1f6d
LP
4721 if (r < 0)
4722 goto finish;
4723
4724 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
fd38203a 4725 if (r < 0)
da7e457c 4726 goto finish;
fd38203a 4727
e475d10c 4728 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
fd38203a 4729 if (r < 0)
da7e457c 4730 goto finish;
fd38203a 4731
fd69f224 4732 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
97ef5391
LP
4733 if (r < 0)
4734 goto finish;
fd69f224
MS
4735 else if (r == 1) {
4736 /* Ratelimit expiry callback was called. Let's postpone processing pending sources and
4737 * put the loop in the initial state in order to also evaluate (in the next iteration) sources
4738 * that were potentially re-enabled by the callback.
4739 *
4740 * Wondering why we treat only this invocation of process_timer() differently? Once an event
4741 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4742 * ratelimit expiry callback is never called for any other timer type. */
4743 r = 0;
4744 goto finish;
4745 }
97ef5391 4746
c45a5a74
TG
4747 if (event_next_pending(e)) {
4748 e->state = SD_EVENT_PENDING;
c45a5a74 4749 return 1;
da7e457c
LP
4750 }
4751
c45a5a74 4752 r = 0;
fd38203a 4753
da7e457c 4754finish:
2b0c9ef7 4755 e->state = SD_EVENT_INITIAL;
da7e457c
LP
4756
4757 return r;
fd38203a
LP
4758}
4759
c45a5a74
TG
4760_public_ int sd_event_dispatch(sd_event *e) {
4761 sd_event_source *p;
4762 int r;
4763
4764 assert_return(e, -EINVAL);
b937d761 4765 assert_return(e = event_resolve(e), -ENOPKG);
c45a5a74
TG
4766 assert_return(!event_pid_changed(e), -ECHILD);
4767 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4768 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4769
4770 if (e->exit_requested)
4771 return dispatch_exit(e);
4772
4773 p = event_next_pending(e);
4774 if (p) {
c8e9d15c 4775 PROTECT_EVENT(e);
c45a5a74
TG
4776
4777 e->state = SD_EVENT_RUNNING;
4778 r = source_dispatch(p);
2b0c9ef7 4779 e->state = SD_EVENT_INITIAL;
c45a5a74
TG
4780 return r;
4781 }
4782
2b0c9ef7 4783 e->state = SD_EVENT_INITIAL;
c45a5a74
TG
4784
4785 return 1;
4786}
4787
34b87517 4788static void event_log_delays(sd_event *e) {
442ac269
YW
4789 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4790 size_t l, i;
34b87517 4791
442ac269
YW
4792 p = b;
4793 l = sizeof(b);
4794 for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4795 l = strpcpyf(&p, l, "%u ", e->delays[i]);
34b87517
VC
4796 e->delays[i] = 0;
4797 }
442ac269 4798 log_debug("Event loop iterations: %s", b);
34b87517
VC
4799}
4800
c45a5a74
TG
4801_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4802 int r;
4803
4804 assert_return(e, -EINVAL);
b937d761 4805 assert_return(e = event_resolve(e), -ENOPKG);
c45a5a74
TG
4806 assert_return(!event_pid_changed(e), -ECHILD);
4807 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2b0c9ef7 4808 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
c45a5a74 4809
e6a7bee5 4810 if (e->profile_delays && e->last_run_usec != 0) {
34b87517
VC
4811 usec_t this_run;
4812 unsigned l;
4813
4814 this_run = now(CLOCK_MONOTONIC);
4815
58c34be8 4816 l = log2u64(this_run - e->last_run_usec);
cb9d621e 4817 assert(l < ELEMENTSOF(e->delays));
34b87517
VC
4818 e->delays[l]++;
4819
e6a7bee5 4820 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
34b87517 4821 event_log_delays(e);
e6a7bee5 4822 e->last_log_usec = this_run;
34b87517
VC
4823 }
4824 }
4825
f814c871 4826 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
c8e9d15c 4827 PROTECT_EVENT(e);
f814c871 4828
c45a5a74 4829 r = sd_event_prepare(e);
53bac4e0
LP
4830 if (r == 0)
4831 /* There was nothing? Then wait... */
4832 r = sd_event_wait(e, timeout);
c45a5a74 4833
34b87517 4834 if (e->profile_delays)
e6a7bee5 4835 e->last_run_usec = now(CLOCK_MONOTONIC);
34b87517 4836
02d30981 4837 if (r > 0) {
53bac4e0 4838 /* There's something now, then let's dispatch it */
02d30981
TG
4839 r = sd_event_dispatch(e);
4840 if (r < 0)
4841 return r;
53bac4e0
LP
4842
4843 return 1;
4844 }
4845
4846 return r;
c45a5a74
TG
4847}
4848
f7262a9f 4849_public_ int sd_event_loop(sd_event *e) {
fd38203a
LP
4850 int r;
4851
da7e457c 4852 assert_return(e, -EINVAL);
b937d761 4853 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c 4854 assert_return(!event_pid_changed(e), -ECHILD);
2b0c9ef7 4855 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
da7e457c 4856
c8e9d15c 4857 PROTECT_EVENT(e);
fd38203a 4858
da7e457c 4859 while (e->state != SD_EVENT_FINISHED) {
f5fbe71d 4860 r = sd_event_run(e, UINT64_MAX);
fd38203a 4861 if (r < 0)
30dd293c 4862 return r;
fd38203a
LP
4863 }
4864
30dd293c 4865 return e->exit_code;
fd38203a
LP
4866}
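/* Hedged usage sketch (not part of this file): how a client typically drives the entry points
 * above; run_loop_example() is a hypothetical caller:
 *
 *     static int run_loop_example(void) {
 *             _cleanup_(sd_event_unrefp) sd_event *e = NULL;
 *             int r;
 *
 *             r = sd_event_default(&e);
 *             if (r < 0)
 *                     return r;
 *
 *             (void) sd_event_set_signal_exit(e, true);  // exit cleanly on SIGINT/SIGTERM
 *
 *             return sd_event_loop(e);  // sd_event_run() == prepare + wait + dispatch, repeated
 *     }
 */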
4867
9b364545 4868_public_ int sd_event_get_fd(sd_event *e) {
9b364545 4869 assert_return(e, -EINVAL);
b937d761 4870 assert_return(e = event_resolve(e), -ENOPKG);
9b364545
TG
4871 assert_return(!event_pid_changed(e), -ECHILD);
4872
4873 return e->epoll_fd;
4874}
4875
f7262a9f 4876_public_ int sd_event_get_state(sd_event *e) {
da7e457c 4877 assert_return(e, -EINVAL);
b937d761 4878 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c
LP
4879 assert_return(!event_pid_changed(e), -ECHILD);
4880
4881 return e->state;
4882}
4883
6203e07a 4884_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
da7e457c 4885 assert_return(e, -EINVAL);
b937d761 4886 assert_return(e = event_resolve(e), -ENOPKG);
6203e07a 4887 assert_return(code, -EINVAL);
da7e457c 4888 assert_return(!event_pid_changed(e), -ECHILD);
fd38203a 4889
6203e07a
LP
4890 if (!e->exit_requested)
4891 return -ENODATA;
4892
4893 *code = e->exit_code;
4894 return 0;
fd38203a
LP
4895}
4896
6203e07a 4897_public_ int sd_event_exit(sd_event *e, int code) {
da7e457c 4898 assert_return(e, -EINVAL);
b937d761 4899 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c
LP
4900 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4901 assert_return(!event_pid_changed(e), -ECHILD);
fd38203a 4902
6203e07a
LP
4903 e->exit_requested = true;
4904 e->exit_code = code;
4905
fd38203a
LP
4906 return 0;
4907}
46e8c825 4908
6a0f1f6d 4909_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
46e8c825 4910 assert_return(e, -EINVAL);
b937d761 4911 assert_return(e = event_resolve(e), -ENOPKG);
46e8c825 4912 assert_return(usec, -EINVAL);
46e8c825
LP
4913 assert_return(!event_pid_changed(e), -ECHILD);
4914
e475d10c
LP
4915 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4916 return -EOPNOTSUPP;
4917
e475d10c 4918 if (!triple_timestamp_is_set(&e->timestamp)) {
15c689d7 4919 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
38a03f06
LP
4920 *usec = now(clock);
4921 return 1;
4922 }
46e8c825 4923
e475d10c 4924 *usec = triple_timestamp_by_clock(&e->timestamp, clock);
46e8c825
LP
4925 return 0;
4926}
afc6adb5
LP
4927
4928_public_ int sd_event_default(sd_event **ret) {
39883f62 4929 sd_event *e = NULL;
afc6adb5
LP
4930 int r;
4931
4932 if (!ret)
4933 return !!default_event;
4934
4935 if (default_event) {
4936 *ret = sd_event_ref(default_event);
4937 return 0;
4938 }
4939
4940 r = sd_event_new(&e);
4941 if (r < 0)
4942 return r;
4943
4944 e->default_event_ptr = &default_event;
4945 e->tid = gettid();
4946 default_event = e;
4947
4948 *ret = e;
4949 return 1;
4950}
4951
4952_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
4953 assert_return(e, -EINVAL);
b937d761 4954 assert_return(e = event_resolve(e), -ENOPKG);
afc6adb5 4955 assert_return(tid, -EINVAL);
76b54375 4956 assert_return(!event_pid_changed(e), -ECHILD);
afc6adb5 4957
76b54375
LP
4958 if (e->tid != 0) {
4959 *tid = e->tid;
4960 return 0;
4961 }
4962
4963 return -ENXIO;
afc6adb5 4964}
cde93897
LP
4965
4966_public_ int sd_event_set_watchdog(sd_event *e, int b) {
4967 int r;
4968
4969 assert_return(e, -EINVAL);
b937d761 4970 assert_return(e = event_resolve(e), -ENOPKG);
8f726607 4971 assert_return(!event_pid_changed(e), -ECHILD);
cde93897
LP
4972
4973 if (e->watchdog == !!b)
4974 return e->watchdog;
4975
4976 if (b) {
09812eb7
LP
4977 r = sd_watchdog_enabled(false, &e->watchdog_period);
4978 if (r <= 0)
cde93897 4979 return r;
cde93897
LP
4980
4981 /* Issue first ping immediately */
4982 sd_notify(false, "WATCHDOG=1");
4983 e->watchdog_last = now(CLOCK_MONOTONIC);
4984
4985 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
4986 if (e->watchdog_fd < 0)
4987 return -errno;
4988
4989 r = arm_watchdog(e);
4990 if (r < 0)
4991 goto fail;
4992
1eac7948 4993 struct epoll_event ev = {
a82f89aa
LP
4994 .events = EPOLLIN,
4995 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
4996 };
cde93897 4997
15c689d7 4998 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
cde93897
LP
4999 r = -errno;
5000 goto fail;
5001 }
5002
5003 } else {
5004 if (e->watchdog_fd >= 0) {
5a795bff 5005 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
03e334a1 5006 e->watchdog_fd = safe_close(e->watchdog_fd);
cde93897
LP
5007 }
5008 }
5009
5010 e->watchdog = !!b;
5011 return e->watchdog;
5012
5013fail:
03e334a1 5014 e->watchdog_fd = safe_close(e->watchdog_fd);
cde93897
LP
5015 return r;
5016}
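/* Note: sd_watchdog_enabled() above reads the WATCHDOG_USEC (and WATCHDOG_PID) environment
 * variables set by the service manager; the 'false' argument means the variables are not unset
 * from the environment. If no watchdog was requested it returns 0, which the call above simply
 * propagates, leaving the feature off. */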
8f726607
LP
5017
5018_public_ int sd_event_get_watchdog(sd_event *e) {
5019 assert_return(e, -EINVAL);
b937d761 5020 assert_return(e = event_resolve(e), -ENOPKG);
8f726607
LP
5021 assert_return(!event_pid_changed(e), -ECHILD);
5022
5023 return e->watchdog;
5024}
60a3b1e1
LP
5025
5026_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5027 assert_return(e, -EINVAL);
b937d761 5028 assert_return(e = event_resolve(e), -ENOPKG);
60a3b1e1
LP
5029 assert_return(!event_pid_changed(e), -ECHILD);
5030
5031 *ret = e->iteration;
5032 return 0;
5033}
15723a1d
LP
5034
5035_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5036 assert_return(s, -EINVAL);
5037
5038 s->destroy_callback = callback;
5039 return 0;
5040}
5041
5042_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5043 assert_return(s, -EINVAL);
5044
5045 if (ret)
5046 *ret = s->destroy_callback;
5047
5048 return !!s->destroy_callback;
5049}
2382c936
YW
5050
5051_public_ int sd_event_source_get_floating(sd_event_source *s) {
5052 assert_return(s, -EINVAL);
5053
5054 return s->floating;
5055}
5056
5057_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5058 assert_return(s, -EINVAL);
5059
5060 if (s->floating == !!b)
5061 return 0;
5062
5063 if (!s->event) /* Already disconnected */
5064 return -ESTALE;
5065
5066 s->floating = b;
5067
5068 if (b) {
5069 sd_event_source_ref(s);
5070 sd_event_unref(s->event);
5071 } else {
5072 sd_event_ref(s->event);
5073 sd_event_source_unref(s);
5074 }
5075
5076 return 1;
5077}
b778cba4
LP
5078
5079_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5080 assert_return(s, -EINVAL);
5081 assert_return(s->type != SOURCE_EXIT, -EDOM);
5082
5083 return s->exit_on_failure;
5084}
5085
5086_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5087 assert_return(s, -EINVAL);
5088 assert_return(s->type != SOURCE_EXIT, -EDOM);
5089
5090 if (s->exit_on_failure == !!b)
5091 return 0;
5092
5093 s->exit_on_failure = b;
5094 return 1;
5095}
b6d5481b
LP
5096
5097_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5098 int r;
5099
5100 assert_return(s, -EINVAL);
5101
5102 /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
5103 * so is a programming error. */
5104 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5105
5106 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5107 * non-ratelimited. */
fd69f224 5108 r = event_source_leave_ratelimit(s, /* run_callback */ false);
b6d5481b
LP
5109 if (r < 0)
5110 return r;
5111
5112 s->rate_limit = (RateLimit) { interval, burst };
5113 return 0;
fd69f224
MS
5114}
5115
5116_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5117 assert_return(s, -EINVAL);
5118
5119 s->ratelimit_expire_callback = callback;
5120 return 0;
b6d5481b
LP
5121}
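/* Hedged usage sketch: limit an (e.g. IO) event source to at most 10 dispatches per second.
 * When the burst is exceeded the source is taken offline until the interval elapses; the
 * optional expiry callback below fires when it comes back online. 'on_ratelimit_expire' is a
 * hypothetical sd_event_handler_t supplied by the caller:
 *
 *     (void) sd_event_source_set_ratelimit(s, USEC_PER_SEC, 10);
 *     (void) sd_event_source_set_ratelimit_expire_callback(s, on_ratelimit_expire);
 */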
5122
5123_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5124 assert_return(s, -EINVAL);
5125
6dd3b818
YW
5126 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5127 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
b6d5481b
LP
5128 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5129 return -EDOM;
5130
5131 if (!ratelimit_configured(&s->rate_limit))
5132 return -ENOEXEC;
5133
5134 if (ret_interval)
5135 *ret_interval = s->rate_limit.interval;
5136 if (ret_burst)
5137 *ret_burst = s->rate_limit.burst;
5138
5139 return 0;
5140}
5141
5142_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5143 assert_return(s, -EINVAL);
5144
5145 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5146 return false;
5147
5148 if (!ratelimit_configured(&s->rate_limit))
5149 return false;
5150
5151 return s->ratelimited;
5152}
baf3fdec
LP
5153
5154_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5155 bool change = false;
5156 int r;
5157
5158 assert_return(e, -EINVAL);
5159
5160 if (b) {
5161 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5162 * so. But we also don't want them to pin the event loop itself. Hence we mark them as
5163 * floating after creation (and undo this before deleting them again). */
5164
5165 if (!e->sigint_event_source) {
5166 r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5167 if (r < 0)
5168 return r;
5169
5170 assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5171 change = true;
5172 }
5173
5174 if (!e->sigterm_event_source) {
5175 r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5176 if (r < 0) {
5177 if (change) {
5178 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5179 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5180 }
5181
5182 return r;
5183 }
5184
5185 assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5186 change = true;
5187 }
5188
5189 } else {
5190 if (e->sigint_event_source) {
5191 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5192 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5193 change = true;
5194 }
5195
5196 if (e->sigterm_event_source) {
5197 assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5198 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5199 change = true;
5200 }
5201 }
5202
5203 return change;
5204}
158fe190
LP
5205
5206_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5207 _cleanup_free_ char *b = NULL;
5208 _cleanup_free_ void *w = NULL;
5209
5210 assert_return(s, -EINVAL);
5211 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5212 assert_return(ty, -EINVAL);
5213
5214 if (!STR_IN_SET(ty, "some", "full"))
5215 return -EINVAL;
5216
5217 if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5218 return -EBUSY;
5219
5220 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5221 if (!space)
5222 return -EINVAL;
5223
5224 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5225 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5226 if (!b)
5227 return -ENOMEM;
5228 if (!STR_IN_SET(b, "some", "full"))
5229 return -EINVAL;
5230
5231 if (streq(b, ty))
5232 return 0;
5233
5234 size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5235 w = new(char, nl);
5236 if (!w)
5237 return -ENOMEM;
5238
5239 memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5240
5241 free_and_replace(s->memory_pressure.write_buffer, w);
5242 s->memory_pressure.write_buffer_size = nl;
5243 s->memory_pressure.locked = false;
5244
5245 return 1;
5246}
5247
5248_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5249 _cleanup_free_ char *b = NULL;
5250 _cleanup_free_ void *w = NULL;
5251
5252 assert_return(s, -EINVAL);
5253 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5254
5255 if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5256 return -ERANGE;
5257 if (window_usec <= 0 || window_usec >= UINT64_MAX)
5258 return -ERANGE;
5259 if (threshold_usec > window_usec)
5260 return -EINVAL;
5261
5262 if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5263 return -EBUSY;
5264
5265 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5266 if (!space)
5267 return -EINVAL;
5268
5269 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5270 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5271 if (!b)
5272 return -ENOMEM;
5273 if (!STR_IN_SET(b, "some", "full"))
5274 return -EINVAL;
5275
5276 if (asprintf((char**) &w,
5277 "%s " USEC_FMT " " USEC_FMT "",
5278 b,
5279 threshold_usec,
5280 window_usec) < 0)
5281 return -EINVAL;
5282
5283 l = strlen(w) + 1;
5284 if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5285 return 0;
5286
5287 free_and_replace(s->memory_pressure.write_buffer, w);
5288 s->memory_pressure.write_buffer_size = l;
5289 s->memory_pressure.locked = false;
5290
5291 return 1;
5292}
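/* Hedged illustration of the write buffer manipulated by the two setters above: it holds a PSI
 * trigger string of the form "<some|full> <threshold_usec> <window_usec>", as consumed by
 * /proc/pressure/memory or a cgroup's memory.pressure file. The (hypothetical) call
 *
 *     (void) sd_event_source_set_memory_pressure_period(s, 150 * USEC_PER_MSEC, USEC_PER_SEC);
 *
 * would thus rewrite a buffer starting with "some" into "some 150000 1000000", i.e. 150 ms of
 * memory stall within a 1 s window. */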