/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <malloc.h>
#include <stdlib.h>
#include <sys/timerfd.h>
#include <sys/wait.h>
#include <threads.h>
#include <unistd.h>

#include "sd-daemon.h"
#include "sd-event.h"
#include "sd-id128.h"
#include "sd-messages.h"

#include "alloc-util.h"
#include "errno-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "format-util.h"
#include "glyph-util.h"
#include "hashmap.h"
#include "hexdecoct.h"
#include "list.h"
#include "log.h"
#include "logarithm.h"
#include "memory-util.h"
#include "missing_magic.h"
#include "missing_wait.h"
#include "origin-id.h"
#include "path-util.h"
#include "pidfd-util.h"
#include "prioq.h"
#include "process-util.h"
#include "psi-util.h"
#include "set.h"
#include "signal-util.h"
#include "siphash24.h"
#include "socket-util.h"
#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
#include "strxcpyx.h"
#include "time-util.h"

#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)

static bool EVENT_SOURCE_WATCH_PIDFD(const sd_event_source *s) {
        /* Returns true if this is a PID event source that can be implemented by watching EPOLLIN */
        return s &&
                s->type == SOURCE_CHILD &&
                s->child.options == WEXITED;
}
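
/*
 * Illustrative sketch (not part of the original source): a pidfd polls as
 * readable (EPOLLIN) once the process it refers to exits, which is why a
 * WEXITED-only child source can be served by a plain epoll watch:
 *
 *     int pidfd = pidfd_open(pid, 0);                  // pin the process
 *     struct epoll_event ev = { .events = EPOLLIN };
 *     epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pidfd, &ev);  // wakes up on exit
 *
 * WSTOPPED/WCONTINUED have no pidfd-based notification, hence the SIGCHLD
 * fallback implemented elsewhere in this file.
 */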

static bool event_source_is_online(sd_event_source *s) {
        assert(s);
        return s->enabled != SD_EVENT_OFF && !s->ratelimited;
}

static bool event_source_is_offline(sd_event_source *s) {
        assert(s);
        return s->enabled == SD_EVENT_OFF || s->ratelimited;
}

static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
        [SOURCE_IO]                  = "io",
        [SOURCE_TIME_REALTIME]       = "realtime",
        [SOURCE_TIME_BOOTTIME]       = "boottime",
        [SOURCE_TIME_MONOTONIC]      = "monotonic",
        [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
        [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
        [SOURCE_SIGNAL]              = "signal",
        [SOURCE_CHILD]               = "child",
        [SOURCE_DEFER]               = "defer",
        [SOURCE_POST]                = "post",
        [SOURCE_EXIT]                = "exit",
        [SOURCE_WATCHDOG]            = "watchdog",
        [SOURCE_INOTIFY]             = "inotify",
        [SOURCE_MEMORY_PRESSURE]     = "memory-pressure",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);

#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)

#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_IO,                       \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_SIGNAL,                   \
               SOURCE_DEFER,                    \
               SOURCE_INOTIFY,                  \
               SOURCE_MEMORY_PRESSURE)

/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)

struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;

        Prioq *pending;
        Prioq *prepare;

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources;

        Set *post_sources;

        Prioq *exit;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close_list);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, buffered_inotify_data_list);

        /* A list of memory pressure event sources that still need their subscription string written */
        LIST_HEAD(sd_event_source, memory_pressure_write_list);

        uint64_t origin_id;

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        sd_event_source *sigint_event_source, *sigterm_event_source;

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8];
};

DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);

static thread_local sd_event *default_event = NULL;

static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);

static sd_event* event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}

static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->pending);
        assert(y->pending);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Lower priority values first */
        r = CMP(x->priority, y->priority);
        if (r != 0)
                return r;

        /* Older entries first */
        return CMP(x->pending_iteration, y->pending_iteration);
}

static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->prepare);
        assert(y->prepare);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Move most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        r = CMP(x->prepare_iteration, y->prepare_iteration);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static usec_t time_event_source_next(const sd_event_source *s) {
        assert(s);

        /* We have two kinds of event sources that have elapsation times associated with them: the actual
         * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
         * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
         * looking at here. */

        if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Otherwise this must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return s->time.next;

        return USEC_INFINITY;
}

static usec_t time_event_source_latest(const sd_event_source *s) {
        assert(s);

        if (s->ratelimited) { /* For ratelimited sources the earliest and the latest time shall be the
                               * same, as we should avoid stacking additional inaccuracy on top of the
                               * ratelimit window, which is itself a form of inaccuracy */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return usec_add(s->time.next, s->time.accuracy);

        return USEC_INFINITY;
}

static bool event_source_timer_candidate(const sd_event_source *s) {
        assert(s);

        /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
         * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
        return !s->pending || s->ratelimited;
}

static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
        r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
        if (r != 0)
                return r;

        /* Order by time */
        return CMP(time_func(x), time_func(y));
}

static int earliest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_next);
}

static int latest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_latest);
}

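/*
 * Worked example (illustrative, not part of the original source): a timer
 * armed with next=1000ms and accuracy=250ms may be dispatched anywhere in
 * [1000ms, 1250ms]. The "earliest" prioq is ordered by next, the "latest"
 * prioq by next+accuracy, so the loop can pick a single wakeup time that
 * falls inside the windows of as many timers as possible and coalesce their
 * dispatch into one CPU wakeup.
 */
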
static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}

static sd_event* event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->inotify_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);

        free(e->event_queue);

        return mfree(e);
}

_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new(sd_event, 1);
        if (!e)
                return -ENOMEM;

        *e = (sd_event) {
                .n_ref = 1,
                .epoll_fd = -EBADF,
                .watchdog_fd = -EBADF,
                .realtime.wakeup = WAKEUP_CLOCK_DATA,
                .realtime.fd = -EBADF,
                .realtime.next = USEC_INFINITY,
                .boottime.wakeup = WAKEUP_CLOCK_DATA,
                .boottime.fd = -EBADF,
                .boottime.next = USEC_INFINITY,
                .monotonic.wakeup = WAKEUP_CLOCK_DATA,
                .monotonic.fd = -EBADF,
                .monotonic.next = USEC_INFINITY,
                .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .realtime_alarm.fd = -EBADF,
                .realtime_alarm.next = USEC_INFINITY,
                .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .boottime_alarm.fd = -EBADF,
                .boottime_alarm.next = USEC_INFINITY,
                .perturb = USEC_INFINITY,
                .origin_id = origin_id_query(),
        };

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
                          glyph(GLYPH_ELLIPSIS));
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}

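/*
 * Illustrative usage sketch (not part of the original source): allocate a
 * loop, attach a source, dispatch until something calls sd_event_exit().
 * "my_handler" is a hypothetical sd_event_handler_t; error cleanup shortened.
 *
 *     sd_event *e = NULL;
 *     int r = sd_event_new(&e);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_add_defer(e, NULL, my_handler, NULL);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_loop(e);  // my_handler should eventually call sd_event_exit()
 *     sd_event_unref(e);
 *     return r;
 */
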
/* Define manually so we can add the origin check */
_public_ sd_event* sd_event_ref(sd_event *e) {
        if (!e)
                return NULL;
        if (event_origin_changed(e))
                return NULL;

        e->n_ref++;

        return e;
}

_public_ sd_event* sd_event_unref(sd_event *e) {
        if (!e)
                return NULL;
        if (event_origin_changed(e))
                return NULL;

        assert(e->n_ref > 0);
        if (--e->n_ref > 0)
                return NULL;

        return event_free(e);
}

#define PROTECT_EVENT(e)                                                \
        _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);

_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
        int r;

        r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
        if (r < 0)
                log_debug_errno(r, "Failed to disable event source %p (%s): %m",
                                s, strna(s->description));

        return sd_event_source_unref(s);
}

static void source_io_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_origin_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}

static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->io.fd, &ev) < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}

static void source_child_pidfd_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (event_origin_changed(s->event))
                return;

        if (!s->child.registered)
                return;

        if (EVENT_SOURCE_WATCH_PIDFD(s))
                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

        s->child.registered = false;
}

static int source_child_pidfd_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_CHILD);
        assert(enabled != SD_EVENT_OFF);

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                struct epoll_event ev = {
                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                        .data.ptr = s,
                };

                if (epoll_ctl(s->event->epoll_fd,
                              s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                              s->child.pidfd, &ev) < 0)
                        return -errno;
        }

        s->child.registered = true;
        return 0;
}

static void source_memory_pressure_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (event_origin_changed(s->event))
                return;

        if (!s->memory_pressure.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->memory_pressure.registered = false;
}

static int source_memory_pressure_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
                          (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->memory_pressure.fd, &ev) < 0)
                return -errno;

        s->memory_pressure.registered = true;
        return 0;
}

static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (s->memory_pressure.in_write_list)
                return;

        LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = true;
}

static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (!s->memory_pressure.in_write_list)
                return;

        LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = false;
}

static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}

static EventSourceType clock_to_event_source_type(clockid_t clock) {

        switch (clock) {

        case CLOCK_REALTIME:
                return SOURCE_TIME_REALTIME;

        case CLOCK_BOOTTIME:
                return SOURCE_TIME_BOOTTIME;

        case CLOCK_MONOTONIC:
                return SOURCE_TIME_MONOTONIC;

        case CLOCK_REALTIME_ALARM:
                return SOURCE_TIME_REALTIME_ALARM;

        case CLOCK_BOOTTIME_ALARM:
                return SOURCE_TIME_BOOTTIME_ALARM;

        default:
                return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        }
}

static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        assert(e);

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}

static void event_free_signal_data(sd_event *e, struct signal_data *d) {
        assert(e);

        if (!d)
                return;

        hashmap_remove(e->signal_data, &d->priority);
        safe_close(d->fd);
        free(d);
}

static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_origin_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                d = new(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                *d = (struct signal_data) {
                        .wakeup = WAKEUP_SIGNAL_DATA,
                        .fd = -EBADF,
                        .priority = priority,
                };

                r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
                     &ss_copy,
                     SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        if (added)
                event_free_signal_data(e, d);

        return r;
}

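/*
 * Illustrative note (not part of the original source): signal sources of the
 * same priority share one signalfd, so their signals arrive via a single
 * epoll wakeup, e.g.:
 *
 *     priority -10 -> signal_data A, signalfd mask { SIGUSR1 }
 *     priority   0 -> signal_data B, signalfd mask { SIGTERM, SIGINT }
 *
 * Re-prioritizing a signal source therefore means migrating its signal from
 * one signal_data object (and signalfd mask) to another.
 */
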
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty
         * that way, the object is removed entirely. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {
                /* If the mask is all-zero we can get rid of the structure */
                event_free_signal_data(e, d);
                return;
        }

        if (event_origin_changed(e))
                return;

        assert(d->fd >= 0);

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}

static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
         * and possibly drop the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_online_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            event_source_is_online(e->signal_sources[sig]))
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}

static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
        assert(s);

        /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
         * they are enabled/disabled or marked pending and such. */

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
}

static void event_source_time_prioq_reshuffle(sd_event_source *s) {
        struct clock_data *d;

        assert(s);

        /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
         * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
         * properly again. */

        if (s->ratelimited)
                d = &s->event->monotonic;
        else if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(d = event_get_clock_data(s->event, s->type));
        else
                return; /* no-op for an event source which is neither a timer nor ratelimited. */

        prioq_reshuffle(d->earliest, s, &s->earliest_index);
        prioq_reshuffle(d->latest, s, &s->latest_index);
        d->needs_rearm = true;
}

static void event_source_time_prioq_remove(
                sd_event_source *s,
                struct clock_data *d) {

        assert(s);
        assert(d);

        prioq_remove(d->earliest, s, &s->earliest_index);
        prioq_remove(d->latest, s, &s->latest_index);
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        d->needs_rearm = true;
}

static void source_disconnect(sd_event_source *s) {
        sd_event *event;
        int r;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from its clock's time prioqs here if it is not ratelimited.
                 * If it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                        if (s->signal.unblock) {
                                sigset_t new_ss;

                                if (sigemptyset(&new_ss) < 0)
                                        log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
                                else if (sigaddset(&new_ss, s->signal.sig) < 0)
                                        log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
                                else {
                                        r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
                                        if (r != 0)
                                                log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
                                }
                        }
                }

                break;

        case SOURCE_CHILD:
                if (event_origin_changed(s->event))
                        s->child.process_owned = false;

                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        assert_se(hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid)));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the
                         * inode continues to be watched. That's because inotify doesn't really have an API
                         * for that: we can only change watch masks with access to the original inode either
                         * by fd or by path. But paths aren't stable, and keeping an O_PATH fd open all the
                         * time would mean wasting an fd continuously and keeping the mount busy which we
                         * can't really do. We could reconstruct the original inode from
                         * /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed there), but
                         * given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way,
                         * leave the mask set, even if it is not minimized now, and ignore all events we
                         * aren't interested in anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_remove_from_write_list(s);
                source_memory_pressure_unregister(s);
                break;

        default:
                assert_not_reached();
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        if (!s->floating)
                sd_event_unref(event);
}

static sd_event_source* source_free(sd_event_source *s) {
        int r;

        assert(s);

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                s->io.fd = safe_close(s->io.fd);

        if (s->type == SOURCE_CHILD) {
                /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */

                if (s->child.process_owned) {
                        assert(s->child.pid > 0);
                        assert(s->child.pidfd >= 0);

                        if (!s->child.exited) {
                                r = RET_NERRNO(pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0));
                                if (r < 0 && r != -ESRCH)
                                        log_debug_errno(r, "Failed to kill process " PID_FMT ", ignoring: %m",
                                                        s->child.pid);
                        }

                        if (!s->child.waited) {
                                siginfo_t si = {};

                                /* Reap the child if we can */
                                (void) waitid(P_PIDFD, s->child.pidfd, &si, WEXITED);
                        }
                }

                if (s->child.pidfd_owned)
                        s->child.pidfd = safe_close(s->child.pidfd);
        }

        if (s->type == SOURCE_MEMORY_PRESSURE) {
                s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
                s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
        }

        if (s->destroy_callback)
                s->destroy_callback(s->userdata);

        free(s->description);
        return mfree(s);
}
DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);

static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_reshuffle(s);

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        if (s->type == SOURCE_INOTIFY) {

                assert(s->inotify.inode_data);
                assert(s->inotify.inode_data->inotify_data);

                if (b)
                        s->inotify.inode_data->inotify_data->n_pending++;
                else {
                        assert(s->inotify.inode_data->inotify_data->n_pending > 0);
                        s->inotify.inode_data->inotify_data->n_pending--;
                }
        }

        return 1;
}

static sd_event_source* source_new(sd_event *e, bool floating, EventSourceType type) {

        /* Let's allocate exactly what we need. Note that the difference between the smallest event source
         * structure and the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
         * lines. */
        static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
                [SOURCE_IO]                  = endoffsetof_field(sd_event_source, io),
                [SOURCE_TIME_REALTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_MONOTONIC]      = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_SIGNAL]              = endoffsetof_field(sd_event_source, signal),
                [SOURCE_CHILD]               = endoffsetof_field(sd_event_source, child),
                [SOURCE_DEFER]               = endoffsetof_field(sd_event_source, defer),
                [SOURCE_POST]                = endoffsetof_field(sd_event_source, post),
                [SOURCE_EXIT]                = endoffsetof_field(sd_event_source, exit),
                [SOURCE_INOTIFY]             = endoffsetof_field(sd_event_source, inotify),
                [SOURCE_MEMORY_PRESSURE]     = endoffsetof_field(sd_event_source, memory_pressure),
        };

        sd_event_source *s;

        assert(e);
        assert(type >= 0);
        assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
        assert(size_table[type] > 0);

        s = malloc0(size_table[type]);
        if (!s)
                return NULL;
        /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
         * size, even if we only allocate the initial part we need. */
        s = expand_to_usable(s, sizeof(sd_event_source));

        /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
         * than what we allocated here. */
        s->n_ref = 1;
        s->event = e;
        s->floating = floating;
        s->type = type;
        s->pending_index = PRIOQ_IDX_NULL;
        s->prepare_index = PRIOQ_IDX_NULL;

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}

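/*
 * Illustrative sketch (not part of the original source) of the truncated
 * allocation trick used above: the per-type payload sits at the end of the
 * struct, so allocating only up to the end of the field actually in use is
 * safe as long as nothing past it is ever touched:
 *
 *     struct example {
 *             int common;
 *             union {
 *                     struct { int fd; } io;
 *                     struct { uint64_t next, accuracy; } time;  // larger
 *             };
 *     };
 *
 *     // endoffsetof_field(struct example, io) is the offset of "io" plus its
 *     // size, i.e. enough space for an IO-style object and less than
 *     // sizeof(struct example).
 */
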
static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = io_exit_callback;

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

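/*
 * Illustrative usage sketch (not part of the original source): watch a socket
 * fd for readability; "on_readable" is a hypothetical callback matching
 * sd_event_io_handler_t.
 *
 *     sd_event_source *src = NULL;
 *     r = sd_event_add_io(e, &src, sock_fd, EPOLLIN, on_readable, NULL);
 *     if (r < 0)
 *             return r;
 *     // Optionally let the loop close the fd when the source is freed:
 *     (void) sd_event_source_set_io_fd_own(src, true);
 */
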
static void initialize_perturb(sd_event *e) {
        sd_id128_t id = {};

        /* When we sleep for longer, we try to realign the wakeup to the same time within each
         * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
         * wakeup. However, let's take some system-specific randomness for this value, so that in a network
         * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
         * perturbation usec offset from the boot ID (or the machine ID if that fails, e.g. because /proc is
         * not mounted). */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
                e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
        else
                e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
}

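/*
 * Worked example (illustrative, not part of the original source): suppose the
 * boot-ID hash yields perturb = 137ms. Wakeups that get aligned to second or
 * minute granularity are then shifted by that offset, e.g. :00.137, :01.137,
 * …, so all loops on this machine tend to wake together, while machines with
 * different boot IDs use different offsets and don't stampede in unison.
 */
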
static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        _cleanup_close_ int fd = -EBADF;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
                return -errno;

        d->fd = TAKE_FD(fd);
        return 0;
}

static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
        int r;

        assert(d);

        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        return 0;
}

static int event_source_time_prioq_put(
                sd_event_source *s,
                struct clock_data *d) {

        int r;

        assert(s);
        assert(d);
        assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

        r = prioq_put(d->earliest, s, &s->earliest_index);
        if (r < 0)
                return r;

        r = prioq_put(d->latest, s, &s->latest_index);
        if (r < 0) {
                assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
                s->earliest_index = PRIOQ_IDX_NULL;
                return r;
        }

        d->needs_rearm = true;
        return 0;
}

_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != UINT64_MAX, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        assert_se(d = event_get_clock_data(e, type));

        r = setup_clock_data(e, d, clock);
        if (r < 0)
                return r;

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = event_source_time_prioq_put(s, d);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_time_relative(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        usec_t t;
        int r;

        /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
         * checks for overflow. */

        r = sd_event_now(e, clock, &t);
        if (r < 0)
                return r;

        if (usec >= USEC_INFINITY - t)
                return -EOVERFLOW;

        return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
}

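/*
 * Illustrative usage sketch (not part of the original source): fire once,
 * five seconds from now, with the default 250ms accuracy window;
 * "on_timeout" is a hypothetical sd_event_time_handler_t.
 *
 *     r = sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC,
 *                                    5 * USEC_PER_SEC, 0,
 *                                    on_timeout, NULL);
 *     if (r < 0)
 *             return r;
 */
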
static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        sigset_t new_ss;
        bool block_it;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        /* Let's make sure our special flag stays outside of the valid signal range */
        assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);

        if (sig & SD_EVENT_SIGNAL_PROCMASK) {
                sig &= ~SD_EVENT_SIGNAL_PROCMASK;
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                block_it = true;
        } else {
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                r = signal_is_blocked(sig);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;

                block_it = false;
        }

        if (!callback)
                callback = signal_exit_callback;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        if (block_it) {
                sigset_t old_ss;

                if (sigemptyset(&new_ss) < 0)
                        return -errno;

                if (sigaddset(&new_ss, sig) < 0)
                        return -errno;

                r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
                if (r != 0)
                        return -r;

                r = sigismember(&old_ss, sig);
                if (r < 0)
                        return -errno;

                s->signal.unblock = !r;
        } else
                s->signal.unblock = false;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                if (s->signal.unblock)
                        (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);

                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

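/*
 * Illustrative usage sketch (not part of the original source): handle SIGTERM
 * without blocking it manually first, by letting sd-event adjust the process
 * signal mask via the SD_EVENT_SIGNAL_PROCMASK flag; with a NULL callback,
 * delivery of the signal terminates the loop via sd_event_exit().
 *
 *     r = sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK,
 *                             NULL, NULL);
 *     if (r < 0)
 *             return r;
 */
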
static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd systems and because we don't want to reap the child
                 * processes ourselves, i.e. call waitid(), and don't want Linux' default internal logic for
                 * that to take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        /* We always take a pidfd here if we can, even if we wait for anything other than WEXITED, so that
         * we pin the PID, and make regular waitid() handling race-free. */

        s->child.pidfd = pidfd_open(pid, 0);
        if (s->child.pidfd < 0)
                return -errno;

        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We shall wait for events other than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. */
        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

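/*
 * Illustrative usage sketch (not part of the original source): watching a
 * forked child. SIGCHLD must be blocked (in all threads) before the first
 * child source is added, or this returns -EBUSY; "on_child_exit" and
 * "do_child_work" are hypothetical.
 *
 *     sigset_t ss;
 *     sigemptyset(&ss);
 *     sigaddset(&ss, SIGCHLD);
 *     pthread_sigmask(SIG_BLOCK, &ss, NULL);
 *
 *     pid_t pid = fork();
 *     if (pid == 0)
 *             _exit(do_child_work());
 *
 *     r = sd_event_add_child(e, NULL, pid, WEXITED, on_child_exit, NULL);
 *     if (r < 0)
 *             return r;
 */
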
_public_ int sd_event_add_child_pidfd(
                sd_event *e,
                sd_event_source **ret,
                int pidfd,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        pid_t pid;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pidfd >= 0, -EBADF);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        r = pidfd_get_pid(pidfd, &pid);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.pidfd = pidfd;
        s->child.options = options;
        s->child.callback = callback;
        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for WEXITED */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;
        } else {
                /* We shall wait for events other than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

static int generic_exit_callback(sd_event_source *s, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_defer(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_DEFER);
        if (!s)
                return -ENOMEM;

        s->defer.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = source_set_pending(s, true);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

_public_ int sd_event_add_post(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_POST);
        if (!s)
                return -ENOMEM;

        s->post.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = set_ensure_put(&e->post_sources, NULL, s);
        if (r < 0)
                return r;
        assert(r > 0);

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

6203e07a 1824_public_ int sd_event_add_exit(
305f78bf 1825 sd_event *e,
151b9b96 1826 sd_event_source **ret,
718db961 1827 sd_event_handler_t callback,
151b9b96 1828 void *userdata) {
305f78bf 1829
ec766a51 1830 _cleanup_(source_freep) sd_event_source *s = NULL;
da7e457c
LP
1831 int r;
1832
1833 assert_return(e, -EINVAL);
b937d761 1834 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c
LP
1835 assert_return(callback, -EINVAL);
1836 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 1837 assert_return(!event_origin_changed(e), -ECHILD);
da7e457c 1838
c983e776
EV
1839 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1840 if (r < 0)
1841 return r;
da7e457c 1842
a71fe8b8 1843 s = source_new(e, !ret, SOURCE_EXIT);
fd38203a 1844 if (!s)
da7e457c 1845 return -ENOMEM;
fd38203a 1846
6203e07a 1847 s->exit.callback = callback;
da7e457c 1848 s->userdata = userdata;
6203e07a 1849 s->exit.prioq_index = PRIOQ_IDX_NULL;
baf76283 1850 s->enabled = SD_EVENT_ONESHOT;
da7e457c 1851
6203e07a 1852 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
ec766a51 1853 if (r < 0)
da7e457c 1854 return r;
da7e457c 1855
a71fe8b8
LP
1856 if (ret)
1857 *ret = s;
ec766a51 1858 TAKE_PTR(s);
a71fe8b8 1859
da7e457c
LP
1860 return 0;
1861}
1862
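/* An illustrative sketch (editor's addition, not in the original file; names are hypothetical):
 * exit sources are dispatched in priority order once sd_event_exit() has been called, giving each
 * component a chance to clean up before the loop returns. */

static int example_cleanup(sd_event_source *s, void *userdata) {
        /* runs during loop shutdown */
        return 0;
}

static int example_install_cleanup(sd_event *e) {
        return sd_event_add_exit(e, NULL, example_cleanup, NULL);
}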
9857de4f 1863_public_ int sd_event_trim_memory(void) {
158fe190
LP
1864 int r;
1865
1866 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1867 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1868 * NULL callback parameter. */
1869
1870 log_debug("Memory pressure event, trimming malloc() memory.");
1871
abb99d31 1872 struct mallinfo2 before_mallinfo = mallinfo2();
158fe190
LP
1873
1874 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1875 hashmap_trim_pools();
1876 r = malloc_trim(0);
1877 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1878
1879 if (r > 0)
1880 log_debug("Successfully trimmed some memory.");
1881 else
1882 log_debug("Couldn't trim any memory.");
1883
1884 usec_t period = after_timestamp - before_timestamp;
1885
abb99d31
YW
1886 struct mallinfo2 after_mallinfo = mallinfo2();
1887 size_t l = LESS_BY(before_mallinfo.hblkhd, after_mallinfo.hblkhd) +
1888 LESS_BY(before_mallinfo.arena, after_mallinfo.arena);
158fe190
LP
1889 log_struct(LOG_DEBUG,
1890 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1891 FORMAT_TIMESPAN(period, 0),
1892 FORMAT_BYTES(l)),
3cf6a3a3
YW
1893 LOG_MESSAGE_ID(SD_MESSAGE_MEMORY_TRIM_STR),
1894 LOG_ITEM("TRIMMED_BYTES=%zu", l),
1895 LOG_ITEM("TRIMMED_USEC=" USEC_FMT, period));
158fe190
LP
1896
1897 return 0;
1898}
1899
1900static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1901 assert(s);
1902
1903 sd_event_trim_memory();
1904 return 0;
1905}
1906
1907_public_ int sd_event_add_memory_pressure(
1908 sd_event *e,
1909 sd_event_source **ret,
1910 sd_event_handler_t callback,
1911 void *userdata) {
1912
1913 _cleanup_free_ char *w = NULL;
1914 _cleanup_(source_freep) sd_event_source *s = NULL;
92651a7a 1915 _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
158fe190 1916 _cleanup_free_ void *write_buffer = NULL;
40c5d5d2 1917 const char *watch, *watch_fallback = NULL, *env;
158fe190
LP
1918 size_t write_buffer_size = 0;
1919 struct stat st;
1920 uint32_t events;
1921 bool locked;
1922 int r;
1923
1924 assert_return(e, -EINVAL);
1925 assert_return(e = event_resolve(e), -ENOPKG);
1926 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 1927 assert_return(!event_origin_changed(e), -ECHILD);
158fe190
LP
1928
1929 if (!callback)
1930 callback = memory_pressure_callback;
1931
1932 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1933 if (!s)
1934 return -ENOMEM;
1935
1936 s->wakeup = WAKEUP_EVENT_SOURCE;
1937 s->memory_pressure.callback = callback;
1938 s->userdata = userdata;
1939 s->enabled = SD_EVENT_ON;
1940 s->memory_pressure.fd = -EBADF;
1941
1942 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1943 if (env) {
1944 if (isempty(env) || path_equal(env, "/dev/null"))
1945 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1946 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1947
1948 if (!path_is_absolute(env) || !path_is_normalized(env))
1949 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1950 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1951
1952 watch = env;
1953
1954 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1955 if (env) {
bdd2036e 1956 r = unbase64mem(env, &write_buffer, &write_buffer_size);
158fe190
LP
1957 if (r < 0)
1958 return r;
1959 }
1960
1961 locked = true;
1962 } else {
1963
1964 r = is_pressure_supported();
1965 if (r < 0)
1966 return r;
1967 if (r == 0)
1968 return -EOPNOTSUPP;
1969
1970 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1971 * the system wide pressure if for some reason we cannot (which could be: memory controller
1972 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1973 * only use the system-wide logic. */
1974 r = cg_all_unified();
1975 if (r < 0)
1976 return r;
1977 if (r == 0)
1978 watch = "/proc/pressure/memory";
1979 else {
1980 _cleanup_free_ char *cg = NULL;
1981
1982 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
1983 if (r < 0)
1984 return r;
1985
1986 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
1987 if (!w)
1988 return -ENOMEM;
1989
1990 watch = w;
1991 watch_fallback = "/proc/pressure/memory";
1992 }
1993
1994 /* Android uses three levels in its userspace low memory killer logic:
1995 * some 70000 1000000
1996 * some 100000 1000000
1997 * full 70000 1000000
1998 *
1999 * GNOME's low memory monitor uses:
2000 * some 70000 1000000
2001 * some 100000 1000000
2002 * full 100000 1000000
2003 *
a6170074
LP
2004 * We'll default to the middle level that both agree on. Except we do it on a 2s window
2005 * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration the
2006 * kernel allows unprivileged processes to use, now and in the future. */
158fe190
LP
2007 if (asprintf((char**) &write_buffer,
2008 "%s " USEC_FMT " " USEC_FMT,
2009 MEMORY_PRESSURE_DEFAULT_TYPE,
2010 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2011 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2012 return -ENOMEM;
2013
2014 write_buffer_size = strlen(write_buffer) + 1;
2015 locked = false;
2016 }
2017
2018 path_fd = open(watch, O_PATH|O_CLOEXEC);
2019 if (path_fd < 0) {
2020 if (errno != ENOENT)
2021 return -errno;
2022
2023 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2024 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2025 * the PSI service apparently is not supported) */
2026 if (!watch_fallback)
2027 return locked ? -ENOENT : -EOPNOTSUPP;
2028
2029 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
63b1e67e
YW
2030 if (path_fd < 0) {
2031 if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2032 return -EOPNOTSUPP;
158fe190 2033 return -errno;
63b1e67e 2034 }
158fe190
LP
2035 }
2036
2037 if (fstat(path_fd, &st) < 0)
2038 return -errno;
2039
2040 if (S_ISSOCK(st.st_mode)) {
2041 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2042 if (fd < 0)
2043 return -errno;
2044
2045 r = connect_unix_path(fd, path_fd, NULL);
2046 if (r < 0)
2047 return r;
2048
2049 events = EPOLLIN;
2050
2051 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2052 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY)|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2053 if (fd < 0)
2054 return fd;
2055
2056 if (S_ISREG(st.st_mode)) {
2057 struct statfs sfs;
2058
2059 /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
2060
2061 if (fstatfs(fd, &sfs) < 0)
2062 return -errno;
2063
2064 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2065 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2066 return -ENOTTY;
2067
2068 events = EPOLLPRI;
2069 } else
2070 /* For fifos and char devices just watch for EPOLLIN */
2071 events = EPOLLIN;
2072
2073 } else if (S_ISDIR(st.st_mode))
2074 return -EISDIR;
2075 else
2076 return -EBADF;
2077
2078 s->memory_pressure.fd = TAKE_FD(fd);
2079 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2080 s->memory_pressure.write_buffer_size = write_buffer_size;
2081 s->memory_pressure.events = events;
2082 s->memory_pressure.locked = locked;
2083
2084 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2085 * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2086 * fd with the epoll right away. Instead, we just add the event source to a list of memory pressure
2087 * event sources on which writes must be executed before the first event loop iteration is
2088 * executed. (We could also write the data here, right away, but we want to give the caller the
2089 * freedom to call sd_event_source_set_memory_pressure_type() and
2090 * sd_event_source_set_memory_pressure_rate() before we write it.) */
2091
2092 if (s->memory_pressure.write_buffer_size > 0)
2093 source_memory_pressure_add_to_write_list(s);
2094 else {
2095 r = source_memory_pressure_register(s, s->enabled);
2096 if (r < 0)
2097 return r;
2098 }
2099
2100 if (ret)
2101 *ret = s;
2102 TAKE_PTR(s);
2103
2104 return 0;
2105}
2106
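/* An illustrative sketch (editor's addition, not in the original file): with a NULL callback the
 * source falls back to sd_event_trim_memory() above, and per the defaults above subscribes to PSI
 * with "some 200000 2000000" (200ms stall per 2s window), unless $MEMORY_PRESSURE_WATCH and
 * $MEMORY_PRESSURE_WRITE override the watch path and the (base64) write payload. */

static int example_enable_pressure_handling(sd_event *e) {
        int r = sd_event_add_memory_pressure(e, NULL, NULL, NULL);
        if (r == -EOPNOTSUPP) /* no PSI in the kernel, or no usable pressure file */
                return 0;     /* plausibly fine to treat as non-fatal */
        return r;
}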
97ef5391
LP
2107static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2108 assert(e);
2109
2110 if (!d)
2111 return;
2112
2113 assert(hashmap_isempty(d->inodes));
2114 assert(hashmap_isempty(d->wd));
2115
2116 if (d->buffer_filled > 0)
0601b958 2117 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
2118
2119 hashmap_free(d->inodes);
2120 hashmap_free(d->wd);
2121
2122 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2123
2124 if (d->fd >= 0) {
2eeff0f4 2125 if (!event_origin_changed(e) &&
fbae5090 2126 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
97ef5391
LP
2127 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2128
2129 safe_close(d->fd);
2130 }
2131 free(d);
2132}
2133
2134static int event_make_inotify_data(
2135 sd_event *e,
2136 int64_t priority,
2137 struct inotify_data **ret) {
2138
254d1313 2139 _cleanup_close_ int fd = -EBADF;
97ef5391 2140 struct inotify_data *d;
97ef5391
LP
2141 int r;
2142
2143 assert(e);
2144
2145 d = hashmap_get(e->inotify_data, &priority);
2146 if (d) {
2147 if (ret)
2148 *ret = d;
2149 return 0;
2150 }
2151
2152 fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2153 if (fd < 0)
2154 return -errno;
2155
2156 fd = fd_move_above_stdio(fd);
2157
97ef5391
LP
2158 d = new(struct inotify_data, 1);
2159 if (!d)
2160 return -ENOMEM;
2161
2162 *d = (struct inotify_data) {
2163 .wakeup = WAKEUP_INOTIFY_DATA,
2164 .fd = TAKE_FD(fd),
2165 .priority = priority,
2166 };
2167
c2484a75 2168 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
97ef5391
LP
2169 if (r < 0) {
2170 d->fd = safe_close(d->fd);
2171 free(d);
2172 return r;
2173 }
2174
1eac7948 2175 struct epoll_event ev = {
97ef5391
LP
2176 .events = EPOLLIN,
2177 .data.ptr = d,
2178 };
2179
2180 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2181 r = -errno;
2182 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2183 * remove the fd from the epoll first, which we don't want as we couldn't
2184 * add it in the first place. */
2185 event_free_inotify_data(e, d);
2186 return r;
2187 }
2188
2189 if (ret)
2190 *ret = d;
2191
2192 return 1;
2193}
2194
7a08d314 2195static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
90c88092 2196 int r;
97ef5391
LP
2197
2198 assert(x);
2199 assert(y);
2200
90c88092
YW
2201 r = CMP(x->dev, y->dev);
2202 if (r != 0)
2203 return r;
97ef5391 2204
6dd91b36 2205 return CMP(x->ino, y->ino);
97ef5391
LP
2206}
2207
7a08d314
YW
2208static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2209 assert(d);
97ef5391 2210
c01a5c05
YW
2211 siphash24_compress_typesafe(d->dev, state);
2212 siphash24_compress_typesafe(d->ino, state);
97ef5391
LP
2213}
2214
7a08d314 2215DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
97ef5391
LP
2216
2217static void event_free_inode_data(
2218 sd_event *e,
2219 struct inode_data *d) {
2220
2221 assert(e);
2222
2223 if (!d)
2224 return;
2225
64903d18 2226 assert(!d->event_sources);
97ef5391
LP
2227
2228 if (d->fd >= 0) {
ed828563 2229 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
97ef5391
LP
2230 safe_close(d->fd);
2231 }
2232
2233 if (d->inotify_data) {
2234
2235 if (d->wd >= 0) {
2eeff0f4 2236 if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
97ef5391
LP
2237 /* So here's a problem. At the time this runs the watch descriptor might already be
2238 * invalidated, because an IN_IGNORED event might be queued right at the moment we enter
2239 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since this is quite
2240 * likely to happen. */
2241
2242 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2243 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2244 }
2245
2246 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2247 }
2248
2249 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2250 }
2251
4a7cd0ca 2252 free(d->path);
97ef5391
LP
2253 free(d);
2254}
2255
53baf2ef
LP
2256static void event_gc_inotify_data(
2257 sd_event *e,
2258 struct inotify_data *d) {
2259
2260 assert(e);
2261
2262 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2263 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2264 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2265 * (under the expectation that the GC is called again once the counter is decremented). */
2266
2267 if (!d)
2268 return;
2269
2270 if (!hashmap_isempty(d->inodes))
2271 return;
2272
2273 if (d->n_busy > 0)
2274 return;
2275
2276 event_free_inotify_data(e, d);
2277}
2278
97ef5391
LP
2279static void event_gc_inode_data(
2280 sd_event *e,
2281 struct inode_data *d) {
2282
2283 struct inotify_data *inotify_data;
2284
2285 assert(e);
2286
2287 if (!d)
2288 return;
2289
64903d18 2290 if (d->event_sources)
97ef5391
LP
2291 return;
2292
2293 inotify_data = d->inotify_data;
2294 event_free_inode_data(e, d);
2295
53baf2ef 2296 event_gc_inotify_data(e, inotify_data);
97ef5391
LP
2297}
2298
2299static int event_make_inode_data(
2300 sd_event *e,
2301 struct inotify_data *inotify_data,
2302 dev_t dev,
2303 ino_t ino,
2304 struct inode_data **ret) {
2305
2306 struct inode_data *d, key;
2307 int r;
2308
2309 assert(e);
2310 assert(inotify_data);
2311
2312 key = (struct inode_data) {
2313 .ino = ino,
2314 .dev = dev,
2315 };
2316
2317 d = hashmap_get(inotify_data->inodes, &key);
2318 if (d) {
2319 if (ret)
2320 *ret = d;
2321
2322 return 0;
2323 }
2324
2325 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2326 if (r < 0)
2327 return r;
2328
2329 d = new(struct inode_data, 1);
2330 if (!d)
2331 return -ENOMEM;
2332
2333 *d = (struct inode_data) {
2334 .dev = dev,
2335 .ino = ino,
2336 .wd = -1,
254d1313 2337 .fd = -EBADF,
97ef5391
LP
2338 .inotify_data = inotify_data,
2339 };
2340
2341 r = hashmap_put(inotify_data->inodes, d, d);
2342 if (r < 0) {
2343 free(d);
2344 return r;
2345 }
2346
2347 if (ret)
2348 *ret = d;
2349
2350 return 1;
2351}
2352
2353static uint32_t inode_data_determine_mask(struct inode_data *d) {
2354 bool excl_unlink = true;
2355 uint32_t combined = 0;
97ef5391
LP
2356
2357 assert(d);
2358
2359 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2360 * the IN_EXCL_UNLINK flag is ANDed instead.
2361 *
2362 * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or oneshot. That's
2363 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
f21f31b2 2364 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
97ef5391
LP
2365 * events we don't care for client-side. */
2366
2367 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2368
2369 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2370 excl_unlink = false;
2371
2372 combined |= s->inotify.mask;
2373 }
2374
2375 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2376}
2377
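/* A worked example (editor's addition, not in the original file): given two sources on the same
 * inode with masks IN_CREATE|IN_EXCL_UNLINK and IN_DELETE, the events are ORed, but
 * IN_EXCL_UNLINK is dropped because not *all* sources requested it, so the kernel-side mask
 * becomes IN_CREATE|IN_DELETE. */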
2378static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2379 uint32_t combined_mask;
2380 int wd, r;
2381
2382 assert(d);
2383 assert(d->fd >= 0);
2384
2385 combined_mask = inode_data_determine_mask(d);
2386
2387 if (d->wd >= 0 && combined_mask == d->combined_mask)
2388 return 0;
2389
2390 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2391 if (r < 0)
2392 return r;
2393
2394 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2395 if (wd < 0)
d5f24a0e 2396 return wd;
97ef5391
LP
2397
2398 if (d->wd < 0) {
2399 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2400 if (r < 0) {
2401 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2402 return r;
2403 }
2404
2405 d->wd = wd;
2406
2407 } else if (d->wd != wd) {
2408
2409 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2410 (void) inotify_rm_watch(d->inotify_data->fd, wd); /* removal must go to the inotify fd, not the inode's O_PATH fd */
2411 return -EINVAL;
2412 }
2413
2414 d->combined_mask = combined_mask;
2415 return 1;
2416}
2417
b9350e70
LP
2418static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2419 assert(s);
2420
2421 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2422}
2423
e67d738a 2424static int event_add_inotify_fd_internal(
97ef5391
LP
2425 sd_event *e,
2426 sd_event_source **ret,
e67d738a
LP
2427 int fd,
2428 bool donate,
97ef5391
LP
2429 uint32_t mask,
2430 sd_event_inotify_handler_t callback,
2431 void *userdata) {
2432
5bb1d7fb 2433 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
e67d738a 2434 _cleanup_(source_freep) sd_event_source *s = NULL;
97ef5391
LP
2435 struct inotify_data *inotify_data = NULL;
2436 struct inode_data *inode_data = NULL;
97ef5391
LP
2437 struct stat st;
2438 int r;
2439
2440 assert_return(e, -EINVAL);
2441 assert_return(e = event_resolve(e), -ENOPKG);
e67d738a 2442 assert_return(fd >= 0, -EBADF);
97ef5391 2443 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 2444 assert_return(!event_origin_changed(e), -ECHILD);
97ef5391 2445
b9350e70
LP
2446 if (!callback)
2447 callback = inotify_exit_callback;
2448
97ef5391
LP
2449 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2450 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
2451 * the flag can't also be used by the caller. */
2452 if (mask & IN_MASK_ADD)
2453 return -EINVAL;
2454
97ef5391
LP
2455 if (fstat(fd, &st) < 0)
2456 return -errno;
2457
2458 s = source_new(e, !ret, SOURCE_INOTIFY);
2459 if (!s)
2460 return -ENOMEM;
2461
2462 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2463 s->inotify.mask = mask;
2464 s->inotify.callback = callback;
2465 s->userdata = userdata;
2466
2467 /* Allocate an inotify object for this priority, and an inode object within it */
2468 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2469 if (r < 0)
8c75fe17 2470 return r;
97ef5391
LP
2471
2472 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
8c75fe17 2473 if (r < 0) {
e67d738a 2474 event_gc_inotify_data(e, inotify_data);
8c75fe17
ZJS
2475 return r;
2476 }
97ef5391
LP
2477
2478 /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2479 * the event source, until then, for which we need the original inode. */
2480 if (inode_data->fd < 0) {
e67d738a
LP
2481 if (donated_fd >= 0)
2482 inode_data->fd = TAKE_FD(donated_fd);
2483 else {
2484 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2485 if (inode_data->fd < 0) {
2486 r = -errno;
2487 event_gc_inode_data(e, inode_data);
2488 return r;
2489 }
2490 }
2491
ed828563 2492 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
4a7cd0ca
YW
2493
2494 _cleanup_free_ char *path = NULL;
2495 r = fd_get_path(inode_data->fd, &path);
2496 if (r < 0 && r != -ENOSYS) { /* The path is optional, hence ignore -ENOSYS. */
2497 event_gc_inode_data(e, inode_data);
2498 return r;
2499 }
2500
2501 free_and_replace(inode_data->path, path);
97ef5391
LP
2502 }
2503
2504 /* Link our event source to the inode data object */
2505 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2506 s->inotify.inode_data = inode_data;
2507
97ef5391
LP
2508 /* Actually realize the watch now */
2509 r = inode_data_realize_watch(e, inode_data);
2510 if (r < 0)
8c75fe17 2511 return r;
97ef5391 2512
97ef5391
LP
2513 if (ret)
2514 *ret = s;
8c75fe17 2515 TAKE_PTR(s);
97ef5391
LP
2516
2517 return 0;
97ef5391
LP
2518}
2519
e67d738a
LP
2520_public_ int sd_event_add_inotify_fd(
2521 sd_event *e,
2522 sd_event_source **ret,
2523 int fd,
2524 uint32_t mask,
2525 sd_event_inotify_handler_t callback,
2526 void *userdata) {
2527
2528 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2529}
2530
2531_public_ int sd_event_add_inotify(
2532 sd_event *e,
2533 sd_event_source **ret,
2534 const char *path,
2535 uint32_t mask,
2536 sd_event_inotify_handler_t callback,
2537 void *userdata) {
2538
2091c779 2539 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
e67d738a
LP
2540 int fd, r;
2541
2542 assert_return(path, -EINVAL);
2543
586c8cee
ZJS
2544 fd = open(path, O_PATH | O_CLOEXEC |
2545 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2546 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
e67d738a
LP
2547 if (fd < 0)
2548 return -errno;
2549
2550 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2551 if (r < 0)
2552 return r;
2553
2554 (void) sd_event_source_set_description(s, path);
2555
2556 if (ret)
2557 *ret = s;
2558
2559 return r;
2560}
2561
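/* An illustrative sketch (editor's addition, not in the original file; names and path are
 * hypothetical): watching a directory for new entries. IN_ONLYDIR makes the open() above use
 * O_DIRECTORY, so passing a non-directory fails early. */

static int example_on_inotify(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
        /* ev->mask says what happened; ev->name (if ev->len > 0) names the directory entry */
        return 0;
}

static int example_watch_directory(sd_event *e) {
        return sd_event_add_inotify(e, NULL, "/tmp/watched", IN_CREATE|IN_MOVED_TO|IN_ONLYDIR,
                                    example_on_inotify, NULL);
}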
8301aa0b 2562static sd_event_source* event_source_free(sd_event_source *s) {
6680dd6b
LP
2563 if (!s)
2564 return NULL;
da7e457c 2565
8301aa0b
YW
2566 /* Here's a special hack: when we are called from a
2567 * dispatch handler we won't free the event source
2568 * immediately, but we will detach the fd from the
2569 * epoll. This way it is safe for the caller to unref
2570 * the event source and immediately close the fd, but
2571 * we still retain a valid event source object after
2572 * the callback. */
fd38203a 2573
76d04c3a 2574 if (s->dispatching)
8301aa0b 2575 source_disconnect(s);
76d04c3a 2576 else
8301aa0b 2577 source_free(s);
fd38203a
LP
2578
2579 return NULL;
2580}
2581
8301aa0b
YW
2582DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2583
356779df 2584_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
f7f53e9e 2585 assert_return(s, -EINVAL);
2eeff0f4 2586 assert_return(!event_origin_changed(s->event), -ECHILD);
f7f53e9e 2587
356779df 2588 return free_and_strdup(&s->description, description);
f7f53e9e
TG
2589}
2590
ff5ba2d6 2591_public_ int sd_event_source_get_description(sd_event_source *s, const char **ret) {
f7f53e9e 2592 assert_return(s, -EINVAL);
ff5ba2d6 2593 assert_return(ret, -EINVAL);
f7f53e9e 2594
7d92a1a4
ZJS
2595 if (!s->description)
2596 return -ENXIO;
2597
ff5ba2d6 2598 *ret = s->description;
f7f53e9e
TG
2599 return 0;
2600}
2601
415bf4e0 2602_public_ sd_event* sd_event_source_get_event(sd_event_source *s) {
305f78bf 2603 assert_return(s, NULL);
2eeff0f4 2604 assert_return(!event_origin_changed(s->event), NULL);
eaa3cbef
LP
2605
2606 return s->event;
2607}
2608
f7262a9f 2609_public_ int sd_event_source_get_pending(sd_event_source *s) {
305f78bf 2610 assert_return(s, -EINVAL);
6203e07a 2611 assert_return(s->type != SOURCE_EXIT, -EDOM);
da7e457c 2612 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 2613 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a
LP
2614
2615 return s->pending;
2616}
2617
f7262a9f 2618_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
305f78bf
LP
2619 assert_return(s, -EINVAL);
2620 assert_return(s->type == SOURCE_IO, -EDOM);
2eeff0f4 2621 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a
LP
2622
2623 return s->io.fd;
2624}
2625
30caf8f3 2626_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2fa48059 2627 int saved_fd, r;
30caf8f3
LP
2628
2629 assert_return(s, -EINVAL);
8ac43fee 2630 assert_return(fd >= 0, -EBADF);
30caf8f3 2631 assert_return(s->type == SOURCE_IO, -EDOM);
2eeff0f4 2632 assert_return(!event_origin_changed(s->event), -ECHILD);
30caf8f3
LP
2633
2634 if (s->io.fd == fd)
2635 return 0;
2636
2fa48059
YW
2637 saved_fd = s->io.fd;
2638 s->io.fd = fd;
30caf8f3 2639
2fa48059 2640 assert(event_source_is_offline(s) == !s->io.registered);
30caf8f3 2641
2fa48059 2642 if (s->io.registered) {
30caf8f3
LP
2643 s->io.registered = false;
2644
2645 r = source_io_register(s, s->enabled, s->io.events);
2646 if (r < 0) {
2647 s->io.fd = saved_fd;
2648 s->io.registered = true;
2649 return r;
2650 }
2651
5a795bff 2652 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
30caf8f3
LP
2653 }
2654
2fa48059
YW
2655 if (s->io.owned)
2656 safe_close(saved_fd);
2657
30caf8f3
LP
2658 return 0;
2659}
2660
ab93297c
NM
2661_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2662 assert_return(s, -EINVAL);
2663 assert_return(s->type == SOURCE_IO, -EDOM);
2eeff0f4 2664 assert_return(!event_origin_changed(s->event), -ECHILD);
ab93297c
NM
2665
2666 return s->io.owned;
2667}
2668
2669_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2670 assert_return(s, -EINVAL);
2671 assert_return(s->type == SOURCE_IO, -EDOM);
2eeff0f4 2672 assert_return(!event_origin_changed(s->event), -ECHILD);
ab93297c
NM
2673
2674 s->io.owned = own;
2675 return 0;
2676}
2677
ff5ba2d6 2678_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t *ret) {
305f78bf 2679 assert_return(s, -EINVAL);
ff5ba2d6 2680 assert_return(ret, -EINVAL);
305f78bf 2681 assert_return(s->type == SOURCE_IO, -EDOM);
2eeff0f4 2682 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 2683
ff5ba2d6 2684 *ret = s->io.events;
fd38203a
LP
2685 return 0;
2686}
2687
f7262a9f 2688_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
fd38203a
LP
2689 int r;
2690
305f78bf
LP
2691 assert_return(s, -EINVAL);
2692 assert_return(s->type == SOURCE_IO, -EDOM);
2a16a986 2693 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
da7e457c 2694 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 2695 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 2696
b63c8d4f
DH
2697 /* edge-triggered updates are never skipped, so we can reset edges */
2698 if (s->io.events == events && !(events & EPOLLET))
fd38203a
LP
2699 return 0;
2700
2a0dc6cd
LP
2701 r = source_set_pending(s, false);
2702 if (r < 0)
2703 return r;
2704
b6d5481b 2705 if (event_source_is_online(s)) {
e4715127 2706 r = source_io_register(s, s->enabled, events);
fd38203a
LP
2707 if (r < 0)
2708 return r;
2709 }
2710
2711 s->io.events = events;
2712
2713 return 0;
2714}
2715
ff5ba2d6 2716_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t *ret) {
305f78bf 2717 assert_return(s, -EINVAL);
ff5ba2d6 2718 assert_return(ret, -EINVAL);
305f78bf 2719 assert_return(s->type == SOURCE_IO, -EDOM);
2eeff0f4 2720 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 2721
b49ba390
LP
2722 if (!s->pending)
2723 return -ENODATA;
2724
ff5ba2d6 2725 *ret = s->io.revents;
fd38203a
LP
2726 return 0;
2727}
2728
f7262a9f 2729_public_ int sd_event_source_get_signal(sd_event_source *s) {
305f78bf
LP
2730 assert_return(s, -EINVAL);
2731 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2eeff0f4 2732 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a
LP
2733
2734 return s->signal.sig;
2735}
2736
ff5ba2d6 2737_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *ret) {
305f78bf 2738 assert_return(s, -EINVAL);
ff5ba2d6 2739 assert_return(ret, -EINVAL);
2eeff0f4 2740 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 2741
ff5ba2d6 2742 *ret = s->priority;
6680b8d1 2743 return 0;
fd38203a
LP
2744}
2745
31927c16 2746_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
97ef5391
LP
2747 bool rm_inotify = false, rm_inode = false;
2748 struct inotify_data *new_inotify_data = NULL;
2749 struct inode_data *new_inode_data = NULL;
9da4cb2b
LP
2750 int r;
2751
305f78bf 2752 assert_return(s, -EINVAL);
da7e457c 2753 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 2754 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a
LP
2755
2756 if (s->priority == priority)
2757 return 0;
2758
97ef5391
LP
2759 if (s->type == SOURCE_INOTIFY) {
2760 struct inode_data *old_inode_data;
2761
2762 assert(s->inotify.inode_data);
2763 old_inode_data = s->inotify.inode_data;
2764
2765 /* We need the original fd to change the priority. If we don't have it, we can't change the priority
2766 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2767 * events we allow priority changes only until the first following iteration. */
2768 if (old_inode_data->fd < 0)
2769 return -EOPNOTSUPP;
2770
2771 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2772 if (r < 0)
2773 return r;
2774 rm_inotify = r > 0;
2775
2776 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2777 if (r < 0)
2778 goto fail;
2779 rm_inode = r > 0;
2780
2781 if (new_inode_data->fd < 0) {
2782 /* Duplicate the fd for the new inode object if we don't have any yet */
2783 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2784 if (new_inode_data->fd < 0) {
2785 r = -errno;
2786 goto fail;
2787 }
2788
ed828563 2789 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
4a7cd0ca
YW
2790
2791 _cleanup_free_ char *path = NULL;
2792 r = fd_get_path(new_inode_data->fd, &path);
2793 if (r < 0 && r != -ENOSYS)
2794 goto fail;
2795
2796 free_and_replace(new_inode_data->path, path);
97ef5391
LP
2797 }
2798
2799 /* Move the event source to the new inode data structure */
2800 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2801 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2802 s->inotify.inode_data = new_inode_data;
2803
2804 /* Now create the new watch */
2805 r = inode_data_realize_watch(s->event, new_inode_data);
2806 if (r < 0) {
2807 /* Move it back */
2808 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2809 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2810 s->inotify.inode_data = old_inode_data;
2811 goto fail;
2812 }
2813
2814 s->priority = priority;
2815
2816 event_gc_inode_data(s->event, old_inode_data);
2817
b6d5481b 2818 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
9da4cb2b
LP
2819 struct signal_data *old, *d;
2820
2821 /* Move us from the signalfd belonging to the old
2822 * priority to the signalfd of the new priority */
2823
2824 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2825
2826 s->priority = priority;
2827
2828 r = event_make_signal_data(s->event, s->signal.sig, &d);
2829 if (r < 0) {
2830 s->priority = old->priority;
2831 return r;
2832 }
2833
2834 event_unmask_signal_data(s->event, old, s->signal.sig);
2835 } else
2836 s->priority = priority;
fd38203a 2837
e1951c16 2838 event_source_pp_prioq_reshuffle(s);
fd38203a 2839
6203e07a
LP
2840 if (s->type == SOURCE_EXIT)
2841 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
305f78bf 2842
fd38203a 2843 return 0;
97ef5391
LP
2844
2845fail:
2846 if (rm_inode)
2847 event_free_inode_data(s->event, new_inode_data);
2848
2849 if (rm_inotify)
2850 event_free_inotify_data(s->event, new_inotify_data);
2851
2852 return r;
fd38203a
LP
2853}
2854
cad143a8 2855_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
71193c0b
ZJS
2856 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2857 if (!s && !ret)
2858 return false;
2859
305f78bf 2860 assert_return(s, -EINVAL);
2eeff0f4 2861 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 2862
cad143a8
LP
2863 if (ret)
2864 *ret = s->enabled;
2865
08c1eb0e 2866 return s->enabled != SD_EVENT_OFF;
fd38203a
LP
2867}
2868
b6d5481b
LP
2869static int event_source_offline(
2870 sd_event_source *s,
2871 int enabled,
2872 bool ratelimited) {
2873
2874 bool was_offline;
fd38203a
LP
2875 int r;
2876
ddfde737 2877 assert(s);
b6d5481b 2878 assert(enabled == SD_EVENT_OFF || ratelimited);
fd38203a 2879
ddfde737 2880 /* Unset the pending flag when this event source is disabled */
b6d5481b
LP
2881 if (s->enabled != SD_EVENT_OFF &&
2882 enabled == SD_EVENT_OFF &&
2883 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
ddfde737
LP
2884 r = source_set_pending(s, false);
2885 if (r < 0)
2886 return r;
2887 }
cc567911 2888
b6d5481b
LP
2889 was_offline = event_source_is_offline(s);
2890 s->enabled = enabled;
2891 s->ratelimited = ratelimited;
fd38203a 2892
ddfde737 2893 switch (s->type) {
fd38203a 2894
ddfde737
LP
2895 case SOURCE_IO:
2896 source_io_unregister(s);
2897 break;
ac989a78 2898
ddfde737
LP
2899 case SOURCE_SIGNAL:
2900 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2901 break;
fd38203a 2902
ddfde737 2903 case SOURCE_CHILD:
b6d5481b
LP
2904 if (!was_offline) {
2905 assert(s->event->n_online_child_sources > 0);
2906 s->event->n_online_child_sources--;
2907 }
fd38203a 2908
ddfde737
LP
2909 if (EVENT_SOURCE_WATCH_PIDFD(s))
2910 source_child_pidfd_unregister(s);
2911 else
2912 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2913 break;
4807d2d0 2914
ddfde737
LP
2915 case SOURCE_EXIT:
2916 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2917 break;
fd38203a 2918
158fe190
LP
2919 case SOURCE_MEMORY_PRESSURE:
2920 source_memory_pressure_unregister(s);
2921 break;
2922
2115b9b6
YW
2923 case SOURCE_TIME_REALTIME:
2924 case SOURCE_TIME_BOOTTIME:
2925 case SOURCE_TIME_MONOTONIC:
2926 case SOURCE_TIME_REALTIME_ALARM:
2927 case SOURCE_TIME_BOOTTIME_ALARM:
ddfde737
LP
2928 case SOURCE_DEFER:
2929 case SOURCE_POST:
2930 case SOURCE_INOTIFY:
2931 break;
fd38203a 2932
ddfde737 2933 default:
04499a70 2934 assert_not_reached();
ddfde737 2935 }
fd38203a 2936
2115b9b6
YW
2937 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2938 event_source_time_prioq_reshuffle(s);
2939
b6d5481b 2940 return 1;
ddfde737 2941}
f8f3f926 2942
b6d5481b
LP
2943static int event_source_online(
2944 sd_event_source *s,
2945 int enabled,
2946 bool ratelimited) {
2947
2948 bool was_online;
ddfde737 2949 int r;
fd38203a 2950
ddfde737 2951 assert(s);
b6d5481b 2952 assert(enabled != SD_EVENT_OFF || !ratelimited);
305f78bf 2953
ddfde737 2954 /* Unset the pending flag when this event source is enabled */
b6d5481b
LP
2955 if (s->enabled == SD_EVENT_OFF &&
2956 enabled != SD_EVENT_OFF &&
2957 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
ddfde737
LP
2958 r = source_set_pending(s, false);
2959 if (r < 0)
2960 return r;
2961 }
9d3e3aa5 2962
b6d5481b
LP
2963 /* Are we really ready for onlining? */
2964 if (enabled == SD_EVENT_OFF || ratelimited) {
2965 /* Nope, we are not ready for onlining, then just update the precise state and exit */
2966 s->enabled = enabled;
2967 s->ratelimited = ratelimited;
2968 return 0;
2969 }
2970
2971 was_online = event_source_is_online(s);
2972
ddfde737 2973 switch (s->type) {
ddfde737 2974 case SOURCE_IO:
b6d5481b 2975 r = source_io_register(s, enabled, s->io.events);
d2eafe61 2976 if (r < 0)
ddfde737 2977 return r;
ddfde737 2978 break;
fd38203a 2979
ddfde737
LP
2980 case SOURCE_SIGNAL:
2981 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2982 if (r < 0) {
ddfde737
LP
2983 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2984 return r;
2985 }
fd38203a 2986
ddfde737 2987 break;
fd38203a 2988
ddfde737 2989 case SOURCE_CHILD:
ddfde737 2990 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
c6cc7efc 2991 /* yes, we can rely on pidfd */
9da4cb2b 2992
b6d5481b 2993 r = source_child_pidfd_register(s, enabled);
ac9f2640 2994 if (r < 0)
9da4cb2b 2995 return r;
ddfde737 2996 } else {
c6cc7efc 2997 /* watching for something other than WEXITED */
9da4cb2b 2998
ddfde737
LP
2999 r = event_make_signal_data(s->event, SIGCHLD, NULL);
3000 if (r < 0) {
ddfde737
LP
3001 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
3002 return r;
3003 }
3004 }
fd38203a 3005
b6d5481b
LP
3006 if (!was_online)
3007 s->event->n_online_child_sources++;
ddfde737 3008 break;
4807d2d0 3009
158fe190
LP
3010 case SOURCE_MEMORY_PRESSURE:
3011 r = source_memory_pressure_register(s, enabled);
3012 if (r < 0)
3013 return r;
3014
3015 break;
3016
d2eafe61
ZJS
3017 case SOURCE_TIME_REALTIME:
3018 case SOURCE_TIME_BOOTTIME:
3019 case SOURCE_TIME_MONOTONIC:
3020 case SOURCE_TIME_REALTIME_ALARM:
3021 case SOURCE_TIME_BOOTTIME_ALARM:
ddfde737 3022 case SOURCE_EXIT:
ddfde737
LP
3023 case SOURCE_DEFER:
3024 case SOURCE_POST:
3025 case SOURCE_INOTIFY:
3026 break;
9da4cb2b 3027
ddfde737 3028 default:
04499a70 3029 assert_not_reached();
ddfde737 3030 }
f8f3f926 3031
b6d5481b
LP
3032 s->enabled = enabled;
3033 s->ratelimited = ratelimited;
d2eafe61
ZJS
3034
3035 /* Non-failing operations below */
2115b9b6 3036 if (s->type == SOURCE_EXIT)
d2eafe61 3037 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
d2eafe61 3038
2115b9b6
YW
3039 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3040 event_source_time_prioq_reshuffle(s);
d2eafe61 3041
b6d5481b 3042 return 1;
ddfde737
LP
3043}
3044
3045_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3046 int r;
9da4cb2b 3047
ddfde737 3048 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
7e922b05
ZJS
3049
3050 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3051 if (m == SD_EVENT_OFF && !s)
3052 return 0;
3053
3054 assert_return(s, -EINVAL);
2eeff0f4 3055 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 3056
ddfde737
LP
3057 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3058 if (s->event->state == SD_EVENT_FINISHED)
3059 return m == SD_EVENT_OFF ? 0 : -ESTALE;
305f78bf 3060
ddfde737
LP
3061 if (s->enabled == m) /* No change? */
3062 return 0;
9d3e3aa5 3063
ddfde737 3064 if (m == SD_EVENT_OFF)
b6d5481b 3065 r = event_source_offline(s, m, s->ratelimited);
ddfde737
LP
3066 else {
3067 if (s->enabled != SD_EVENT_OFF) {
3068 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3069 * event source is already enabled after all. */
3070 s->enabled = m;
3071 return 0;
fd38203a 3072 }
ddfde737 3073
b6d5481b 3074 r = event_source_online(s, m, s->ratelimited);
fd38203a 3075 }
ddfde737
LP
3076 if (r < 0)
3077 return r;
fd38203a 3078
e1951c16 3079 event_source_pp_prioq_reshuffle(s);
fd38203a
LP
3080 return 0;
3081}
3082
ff5ba2d6 3083_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *ret) {
305f78bf 3084 assert_return(s, -EINVAL);
ff5ba2d6 3085 assert_return(ret, -EINVAL);
6a0f1f6d 3086 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2eeff0f4 3087 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 3088
ff5ba2d6 3089 *ret = s->time.next;
fd38203a
LP
3090 return 0;
3091}
3092
f7262a9f 3093_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
2a0dc6cd 3094 int r;
6a0f1f6d 3095
305f78bf 3096 assert_return(s, -EINVAL);
6a0f1f6d 3097 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 3098 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 3099 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a 3100
2a0dc6cd
LP
3101 r = source_set_pending(s, false);
3102 if (r < 0)
3103 return r;
2576a19e 3104
2a0dc6cd 3105 s->time.next = usec;
fd38203a 3106
e1951c16 3107 event_source_time_prioq_reshuffle(s);
fd38203a
LP
3108 return 0;
3109}
3110
d6a83dc4
LP
3111_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3112 usec_t t;
3113 int r;
3114
3115 assert_return(s, -EINVAL);
3116 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2eeff0f4 3117 assert_return(!event_origin_changed(s->event), -ECHILD);
d6a83dc4 3118
ef859195
LP
3119 if (usec == USEC_INFINITY)
3120 return sd_event_source_set_time(s, USEC_INFINITY);
3121
d6a83dc4
LP
3122 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3123 if (r < 0)
3124 return r;
3125
496db330
YW
3126 usec = usec_add(t, usec);
3127 if (usec == USEC_INFINITY)
d6a83dc4
LP
3128 return -EOVERFLOW;
3129
496db330 3130 return sd_event_source_set_time(s, usec);
d6a83dc4
LP
3131}
3132
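/* An illustrative sketch (editor's addition, not in the original file; the source is assumed to
 * be an existing timer event source): re-arm it to fire five seconds from now using the
 * overflow-safe relative setter above. */

static int example_rearm_in_5s(sd_event_source *timer) {
        int r = sd_event_source_set_time_relative(timer, 5 * USEC_PER_SEC);
        if (r < 0)
                return r;
        return sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
}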
ff5ba2d6 3133_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *ret) {
305f78bf 3134 assert_return(s, -EINVAL);
ff5ba2d6 3135 assert_return(ret, -EINVAL);
6a0f1f6d 3136 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2eeff0f4 3137 assert_return(!event_origin_changed(s->event), -ECHILD);
305f78bf 3138
ff5ba2d6 3139 *ret = s->time.accuracy;
305f78bf
LP
3140 return 0;
3141}
3142
f7262a9f 3143_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
2a0dc6cd 3144 int r;
6a0f1f6d 3145
305f78bf 3146 assert_return(s, -EINVAL);
f5fbe71d 3147 assert_return(usec != UINT64_MAX, -EINVAL);
6a0f1f6d 3148 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
da7e457c 3149 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 3150 assert_return(!event_origin_changed(s->event), -ECHILD);
eaa3cbef 3151
2a0dc6cd
LP
3152 r = source_set_pending(s, false);
3153 if (r < 0)
3154 return r;
3155
eaa3cbef
LP
3156 if (usec == 0)
3157 usec = DEFAULT_ACCURACY_USEC;
3158
eaa3cbef
LP
3159 s->time.accuracy = usec;
3160
e1951c16 3161 event_source_time_prioq_reshuffle(s);
6a0f1f6d
LP
3162 return 0;
3163}
3164
ff5ba2d6 3165_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *ret) {
6a0f1f6d 3166 assert_return(s, -EINVAL);
ff5ba2d6 3167 assert_return(ret, -EINVAL);
6a0f1f6d 3168 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2eeff0f4 3169 assert_return(!event_origin_changed(s->event), -ECHILD);
eaa3cbef 3170
ff5ba2d6 3171 *ret = event_source_type_to_clock(s->type);
eaa3cbef
LP
3172 return 0;
3173}
3174
ff5ba2d6 3175_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *ret) {
4bee8012 3176 assert_return(s, -EINVAL);
ff5ba2d6 3177 assert_return(ret, -EINVAL);
4bee8012 3178 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3179 assert_return(!event_origin_changed(s->event), -ECHILD);
4bee8012 3180
ff5ba2d6 3181 *ret = s->child.pid;
4bee8012
LP
3182 return 0;
3183}
3184
f8f3f926
LP
3185_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3186 assert_return(s, -EINVAL);
3187 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3188 assert_return(!event_origin_changed(s->event), -ECHILD);
f8f3f926 3189
f8f3f926
LP
3190 return s->child.pidfd;
3191}
3192
3193_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3194 assert_return(s, -EINVAL);
3195 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3196 assert_return(!event_origin_changed(s->event), -ECHILD);
f8f3f926 3197 assert_return(SIGNAL_VALID(sig), -EINVAL);
19e1a908 3198 assert(s->child.pidfd >= 0);
f8f3f926 3199
19e1a908 3200 /* If we have already seen an indication that the process exited, refuse sending a signal early. */
f8f3f926
LP
3201 if (s->child.exited)
3202 return -ESRCH;
19e1a908 3203 assert(!s->child.waited);
f8f3f926 3204
19e1a908
MY
3205 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the structure here. */
3206 siginfo_t copy;
3207 if (si)
3208 copy = *si;
f8f3f926 3209
19e1a908 3210 return RET_NERRNO(pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, flags));
f8f3f926
LP
3211}
3212
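/* An illustrative sketch (editor's addition, not in the original file): politely asking a watched
 * child to terminate; returns -ESRCH if the child is already known to have exited. */

static int example_terminate_child(sd_event_source *child_source) {
        return sd_event_source_send_child_signal(child_source, SIGTERM, NULL, 0);
}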
3213_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3214 assert_return(s, -EINVAL);
3215 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3216 assert_return(!event_origin_changed(s->event), -ECHILD);
c6cc7efc 3217 assert(s->child.pidfd >= 0);
f8f3f926
LP
3218
3219 return s->child.pidfd_owned;
3220}
3221
3222_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3223 assert_return(s, -EINVAL);
3224 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3225 assert_return(!event_origin_changed(s->event), -ECHILD);
c6cc7efc 3226 assert(s->child.pidfd >= 0);
f8f3f926
LP
3227
3228 s->child.pidfd_owned = own;
3229 return 0;
3230}
3231
3232_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3233 assert_return(s, -EINVAL);
3234 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3235 assert_return(!event_origin_changed(s->event), -ECHILD);
f8f3f926
LP
3236
3237 return s->child.process_owned;
3238}
3239
3240_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3241 assert_return(s, -EINVAL);
3242 assert_return(s->type == SOURCE_CHILD, -EDOM);
2eeff0f4 3243 assert_return(!event_origin_changed(s->event), -ECHILD);
f8f3f926
LP
3244
3245 s->child.process_owned = own;
3246 return 0;
3247}
3248
c1ab4458 3249_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret) {
97ef5391 3250 assert_return(s, -EINVAL);
c1ab4458 3251 assert_return(ret, -EINVAL);
97ef5391 3252 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
2eeff0f4 3253 assert_return(!event_origin_changed(s->event), -ECHILD);
97ef5391 3254
c1ab4458 3255 *ret = s->inotify.mask;
97ef5391
LP
3256 return 0;
3257}
3258
4a7cd0ca 3259_public_ int sd_event_source_get_inotify_path(sd_event_source *s, const char **ret) {
74c4231c
YW
3260 assert_return(s, -EINVAL);
3261 assert_return(ret, -EINVAL);
3262 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3263 assert_return(!event_origin_changed(s->event), -ECHILD);
3264
4a7cd0ca
YW
3265 if (!s->inotify.inode_data)
3266 return -ESTALE; /* already disconnected. */
74c4231c 3267
4a7cd0ca
YW
3268 if (!s->inotify.inode_data->path)
3269 return -ENOSYS; /* /proc was not mounted? */
3270
3271 *ret = s->inotify.inode_data->path;
3272 return 0;
74c4231c
YW
3273}
3274
718db961 3275_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
fd38203a
LP
3276 int r;
3277
da7e457c 3278 assert_return(s, -EINVAL);
6203e07a 3279 assert_return(s->type != SOURCE_EXIT, -EDOM);
da7e457c 3280 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 3281 assert_return(!event_origin_changed(s->event), -ECHILD);
fd38203a
LP
3282
3283 if (s->prepare == callback)
3284 return 0;
3285
3286 if (callback && s->prepare) {
3287 s->prepare = callback;
3288 return 0;
3289 }
3290
3291 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3292 if (r < 0)
3293 return r;
3294
3295 s->prepare = callback;
3296
3297 if (callback) {
3298 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3299 if (r < 0)
3300 return r;
3301 } else
3302 prioq_remove(s->event->prepare, s, &s->prepare_index);
3303
3304 return 0;
3305}
3306
f7262a9f 3307_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
da7e457c 3308 assert_return(s, NULL);
2eeff0f4 3309 assert_return(!event_origin_changed(s->event), NULL);
fd38203a
LP
3310
3311 return s->userdata;
3312}
3313
8f726607
LP
3314_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3315 void *ret;
3316
3317 assert_return(s, NULL);
2eeff0f4 3318 assert_return(!event_origin_changed(s->event), NULL);
8f726607
LP
3319
3320 ret = s->userdata;
3321 s->userdata = userdata;
3322
3323 return ret;
3324}
3325
b6d5481b
LP
3326static int event_source_enter_ratelimited(sd_event_source *s) {
3327 int r;
3328
3329 assert(s);
3330
3331 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
3332 * the end of the rate limit time window, much as if it was a timer event source. */
3333
3334 if (s->ratelimited)
3335 return 0; /* Already ratelimited, this is a NOP hence */
3336
3337 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3338 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3339 if (r < 0)
3340 return r;
3341
3342 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
3343 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
3344 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
3345 if (EVENT_SOURCE_IS_TIME(s->type))
3346 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3347
3348 /* Now, let's add the event source to the monotonic clock instead */
3349 r = event_source_time_prioq_put(s, &s->event->monotonic);
3350 if (r < 0)
3351 goto fail;
3352
3353 /* And let's take the event source officially offline */
3354 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3355 if (r < 0) {
3356 event_source_time_prioq_remove(s, &s->event->monotonic);
3357 goto fail;
3358 }
3359
3360 event_source_pp_prioq_reshuffle(s);
3361
3362 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3363 return 0;
3364
3365fail:
3366 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
3367 * space for it should already be allocated. */
3368 if (EVENT_SOURCE_IS_TIME(s->type))
3369 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3370
3371 return r;
3372}
3373
fd69f224 3374static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
b6d5481b
LP
3375 int r;
3376
3377 assert(s);
3378
3379 if (!s->ratelimited)
3380 return 0;
3381
3382 /* Let's take the event source out of the monotonic prioq first. */
3383 event_source_time_prioq_remove(s, &s->event->monotonic);
3384
3385 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3386 if (EVENT_SOURCE_IS_TIME(s->type)) {
3387 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3388 if (r < 0)
3389 goto fail;
3390 }
3391
3392 /* Let's try to take it online again. */
3393 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3394 if (r < 0) {
3395 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3396 if (EVENT_SOURCE_IS_TIME(s->type))
3397 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3398
3399 goto fail;
3400 }
3401
3402 event_source_pp_prioq_reshuffle(s);
3403 ratelimit_reset(&s->rate_limit);
3404
3405 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
fd69f224
MS
3406
3407 if (run_callback && s->ratelimit_expire_callback) {
3408 s->dispatching = true;
3409 r = s->ratelimit_expire_callback(s, s->userdata);
3410 s->dispatching = false;
3411
3412 if (r < 0) {
3413 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3414 strna(s->description),
3415 event_source_type_to_string(s->type),
3416 s->exit_on_failure ? "exiting" : "disabling");
3417
3418 if (s->exit_on_failure)
3419 (void) sd_event_exit(s->event, r);
3420 }
3421
3422 if (s->n_ref == 0)
3423 source_free(s);
3424 else if (r < 0)
0a040e64 3425 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
fd69f224
MS
3426
3427 return 1;
3428 }
3429
b6d5481b
LP
3430 return 0;
3431
3432fail:
3433 /* Do something somewhat reasonable when we cannot move an event sources out of ratelimited mode:
3434 * simply put it back in it, maybe we can then process it more successfully next iteration. */
3435 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3436
3437 return r;
3438}
3439
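/* An illustrative sketch (editor's addition, not in the original file): the enter/leave helpers
 * above back the public sd_event_source_set_ratelimit() API. A caller permitting at most 10
 * dispatches per second, after which the source is taken offline until the window ends, would
 * plausibly do: */

static int example_ratelimit(sd_event_source *busy_source) {
        return sd_event_source_set_ratelimit(busy_source, USEC_PER_SEC, 10);
}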
c2ba3ad6
LP
3440static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3441 usec_t c;
3442 assert(e);
3443 assert(a <= b);
3444
3445 if (a <= 0)
3446 return 0;
393003e1
LP
3447 if (a >= USEC_INFINITY)
3448 return USEC_INFINITY;
c2ba3ad6
LP
3449
3450 if (b <= a + 1)
3451 return a;
3452
52444dc4
LP
3453 initialize_perturb(e);
3454
c2ba3ad6
LP
3455 /*
3456 Find a good time to wake up again between times a and b. We
3457 have two goals here:
3458
3459 a) We want to wake up as seldom as possible, hence prefer
3460 later times over earlier times.
3461
3462 b) But if we have to wake up, then let's make sure to
3463 dispatch as much as possible on the entire system.
3464
3465 We implement this by waking up everywhere at the same time
850516e0 3466 within any given minute if we can, synchronised via the
c2ba3ad6 3467 perturbation value determined from the boot ID. If we can't,
ba276c81
LP
3468 then we try to find the same spot in every 10s, then 1s and
3469 then 250ms step. Otherwise, we pick the last possible time
3470 to wake up.
c2ba3ad6
LP
3471 */
3472
850516e0
LP
3473 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3474 if (c >= b) {
3475 if (_unlikely_(c < USEC_PER_MINUTE))
3476 return b;
3477
3478 c -= USEC_PER_MINUTE;
3479 }
3480
ba276c81
LP
3481 if (c >= a)
3482 return c;
3483
3484 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3485 if (c >= b) {
3486 if (_unlikely_(c < USEC_PER_SEC*10))
3487 return b;
3488
3489 c -= USEC_PER_SEC*10;
3490 }
3491
850516e0
LP
3492 if (c >= a)
3493 return c;
3494
3495 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
c2ba3ad6
LP
3496 if (c >= b) {
3497 if (_unlikely_(c < USEC_PER_SEC))
3498 return b;
3499
3500 c -= USEC_PER_SEC;
3501 }
3502
3503 if (c >= a)
3504 return c;
3505
3506 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3507 if (c >= b) {
3508 if (_unlikely_(c < USEC_PER_MSEC*250))
3509 return b;
3510
3511 c -= USEC_PER_MSEC*250;
3512 }
3513
3514 if (c >= a)
3515 return c;
3516
3517 return b;
3518}
3519
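/* A worked example (editor's addition, not in the original file): say a = 12s, b = 70s and the
 * per-boot perturbation is 5s. The minute step computes c = (70s/60s)*60s + 5s = 65s; since
 * 65s < b and 65s >= a, 65s is returned. Every loop on this boot that can do so thus wakes at
 * second 5 of the minute, coalescing wakeups system-wide. */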
fd38203a
LP
3520static int event_arm_timer(
3521 sd_event *e,
6a0f1f6d 3522 struct clock_data *d) {
fd38203a
LP
3523
3524 struct itimerspec its = {};
c2ba3ad6
LP
3525 sd_event_source *a, *b;
3526 usec_t t;
fd38203a 3527
cde93897 3528 assert(e);
6a0f1f6d 3529 assert(d);
fd38203a 3530
d06441da 3531 if (!d->needs_rearm)
212bbb17 3532 return 0;
7e2bf71c
YW
3533
3534 d->needs_rearm = false;
212bbb17 3535
6a0f1f6d 3536 a = prioq_peek(d->earliest);
19947509 3537 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
b6d5481b 3538 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
72aedc1e 3539
6a0f1f6d 3540 if (d->fd < 0)
c57b5ca3
LP
3541 return 0;
3542
3a43da28 3543 if (d->next == USEC_INFINITY)
72aedc1e
LP
3544 return 0;
3545
3546 /* disarm */
15c689d7
LP
3547 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3548 return -errno;
72aedc1e 3549
3a43da28 3550 d->next = USEC_INFINITY;
fd38203a 3551 return 0;
72aedc1e 3552 }
fd38203a 3553
6a0f1f6d 3554 b = prioq_peek(d->latest);
19947509
ZJS
3555 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3556 assert(b && b->enabled != SD_EVENT_OFF);
c2ba3ad6 3557
b6d5481b 3558 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
6a0f1f6d 3559 if (d->next == t)
fd38203a
LP
3560 return 0;
3561
6a0f1f6d 3562 assert_se(d->fd >= 0);
fd38203a 3563
c2ba3ad6 3564 if (t == 0) {
1751bdde 3565 /* We don't want to disarm here, just mean some time looooong ago. */
fd38203a
LP
3566 its.it_value.tv_sec = 0;
3567 its.it_value.tv_nsec = 1;
3568 } else
c2ba3ad6 3569 timespec_store(&its.it_value, t);
fd38203a 3570
15c689d7 3571 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
cde93897 3572 return -errno;
fd38203a 3573
6a0f1f6d 3574 d->next = t;
fd38203a
LP
3575 return 0;
3576}
3577
9a800b56 3578static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
fd38203a
LP
3579 assert(e);
3580 assert(s);
3581 assert(s->type == SOURCE_IO);
3582
9a800b56
LP
3583 /* If the event source was already pending, we just OR in the
3584 * new revents, otherwise we reset the value. The ORing is
3585 * necessary to handle EPOLLONESHOT events properly where
3586 * readability might happen independently of writability, and
3587 * we need to keep track of both */
3588
3589 if (s->pending)
3590 s->io.revents |= revents;
3591 else
3592 s->io.revents = revents;
fd38203a 3593
fd38203a
LP
3594 return source_set_pending(s, true);
3595}
3596
72aedc1e 3597static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
fd38203a
LP
3598 uint64_t x;
3599 ssize_t ss;
3600
3601 assert(e);
da7e457c 3602 assert(fd >= 0);
72aedc1e 3603
305f78bf 3604 assert_return(events == EPOLLIN, -EIO);
fd38203a
LP
3605
3606 ss = read(fd, &x, sizeof(x));
3607 if (ss < 0) {
8add30a0 3608 if (ERRNO_IS_TRANSIENT(errno))
fd38203a
LP
3609 return 0;
3610
3611 return -errno;
3612 }
3613
8d35dae7 3614 if (_unlikely_(ss != sizeof(x)))
fd38203a
LP
3615 return -EIO;
3616
cde93897 3617 if (next)
3a43da28 3618 *next = USEC_INFINITY;
72aedc1e 3619
fd38203a
LP
3620 return 0;
3621}
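/* Note (added for clarity): per timerfd_create(2), a read() from a timerfd yields an 8-byte
 * native-endian counter of expirations since the last read. We only read here to clear the fd's
 * readiness; the counter value itself is discarded, and the cached next-elapse time (*next) is
 * reset so that a subsequent event_arm_timer() call will program the timerfd afresh. */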
3622
305f78bf
LP
3623static int process_timer(
3624 sd_event *e,
3625 usec_t n,
6a0f1f6d 3626 struct clock_data *d) {
305f78bf 3627
fd38203a 3628 sd_event_source *s;
fd69f224 3629 bool callback_invoked = false;
fd38203a
LP
3630 int r;
3631
3632 assert(e);
6a0f1f6d 3633 assert(d);
fd38203a
LP
3634
3635 for (;;) {
6a0f1f6d 3636 s = prioq_peek(d->earliest);
19947509
ZJS
3637 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3638
b6d5481b
LP
3639 if (!s || time_event_source_next(s) > n)
3640 break;
3641
3642 if (s->ratelimited) {
3643 /* This is an event source whose ratelimit window has ended. Let's turn it on
3644 * again. */
3645 assert(s->ratelimited);
3646
fd69f224 3647 r = event_source_leave_ratelimit(s, /* run_callback */ true);
b6d5481b
LP
3648 if (r < 0)
3649 return r;
fd69f224
MS
3650 else if (r == 1)
3651 callback_invoked = true;
b6d5481b
LP
3652
3653 continue;
3654 }
3655
3656 if (s->enabled == SD_EVENT_OFF || s->pending)
fd38203a
LP
3657 break;
3658
3659 r = source_set_pending(s, true);
3660 if (r < 0)
3661 return r;
3662
e1951c16 3663 event_source_time_prioq_reshuffle(s);
fd38203a
LP
3664 }
3665
fd69f224 3666 return callback_invoked;
fd38203a
LP
3667}
3668
efd3be9d
YW
3669static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3670 int64_t min_priority = threshold;
3671 bool something_new = false;
fd38203a 3672 sd_event_source *s;
fd38203a
LP
3673 int r;
3674
3675 assert(e);
efd3be9d
YW
3676 assert(ret_min_priority);
3677
3678 if (!e->need_process_child) {
3679 *ret_min_priority = min_priority;
3680 return 0;
3681 }
fd38203a 3682
c2ba3ad6
LP
3683 e->need_process_child = false;
3684
c6cc7efc
MY
3685 /* So, this is ugly. We iteratively invoke waitid() + WNOHANG with each child process we shall wait for,
3686 * instead of using P_ALL. This is because we only want to get child information of very specific
3687 * child processes, and not all of them. We might not have processed the SIGCHLD event
91c70071
YW
3688 * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3689 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3690 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3691 * to handle SIGCHLD yourself.
3692 *
3693 * We do not reap the children here (by using WNOWAIT); this is only done after the event
3694 * source is dispatched so that the callback still sees the process as a zombie. */
fd38203a 3695
90e74a66 3696 HASHMAP_FOREACH(s, e->child_sources) {
fd38203a 3697 assert(s->type == SOURCE_CHILD);
c6cc7efc 3698 assert(s->child.pidfd >= 0);
fd38203a 3699
efd3be9d
YW
3700 if (s->priority > threshold)
3701 continue;
3702
fd38203a
LP
3703 if (s->pending)
3704 continue;
3705
b6d5481b 3706 if (event_source_is_offline(s))
fd38203a
LP
3707 continue;
3708
f8f3f926
LP
3709 if (s->child.exited)
3710 continue;
3711
91c70071
YW
3712 if (EVENT_SOURCE_WATCH_PIDFD(s))
3713 /* There's a usable pidfd known for this event source? Then don't waitid() for
3714 * it here */
f8f3f926
LP
3715 continue;
3716
fd38203a 3717 zero(s->child.siginfo);
c6cc7efc 3718 if (waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo,
15c689d7 3719 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
bfd9bfcc 3720 return negative_errno();
fd38203a
LP
3721
3722 if (s->child.siginfo.si_pid != 0) {
37149e69 3723 bool zombie = SIGINFO_CODE_IS_DEAD(s->child.siginfo.si_code);
08cd1552 3724
f8f3f926
LP
3725 if (zombie)
3726 s->child.exited = true;
c6cc7efc
MY
3727 else if (s->child.options & WEXITED) {
3728 /* If the child isn't dead then let's immediately remove the state change
3729 * from the queue, since there's no benefit in leaving it queued. */
08cd1552
LP
3730
3731 assert(s->child.options & (WSTOPPED|WCONTINUED));
c6cc7efc 3732 (void) waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
08cd1552
LP
3733 }
3734
fd38203a
LP
3735 r = source_set_pending(s, true);
3736 if (r < 0)
3737 return r;
efd3be9d
YW
3738 if (r > 0) {
3739 something_new = true;
3740 min_priority = MIN(min_priority, s->priority);
3741 }
fd38203a
LP
3742 }
3743 }
3744
efd3be9d
YW
3745 *ret_min_priority = min_priority;
3746 return something_new;
fd38203a
LP
3747}
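/* A minimal sketch (added; illustrative only, 'pidfd' and 'handle_exit' are hypothetical) of the
 * peek-then-reap pattern used above and completed later in source_dispatch():
 *
 *     siginfo_t si = {};
 *     // Peek: WNOWAIT keeps the child a zombie, so the callback may still inspect it.
 *     if (waitid(P_PIDFD, pidfd, &si, WEXITED|WNOHANG|WNOWAIT) >= 0 && si.si_pid != 0)
 *             handle_exit(&si);
 *     // Reap: a second waitid() without WNOWAIT finally releases the zombie.
 *     (void) waitid(P_PIDFD, pidfd, &si, WEXITED|WNOHANG);
 */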
3748
f8f3f926
LP
3749static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3750 assert(e);
3751 assert(s);
3752 assert(s->type == SOURCE_CHILD);
c6cc7efc 3753 assert(s->child.pidfd >= 0);
f8f3f926
LP
3754
3755 if (s->pending)
3756 return 0;
3757
b6d5481b 3758 if (event_source_is_offline(s))
f8f3f926
LP
3759 return 0;
3760
3761 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3762 return 0;
3763
c6cc7efc
MY
3764 /* Note that pidfd would also generate EPOLLHUP when the process gets reaped. But at this point we
3765 * only permit EPOLLIN, under the assumption that upon EPOLLHUP the child source should already
3766 * be set to pending, and we would have returned early above. */
3767 assert(!s->child.exited);
3768
f8f3f926 3769 zero(s->child.siginfo);
c6cc7efc 3770 if (waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
f8f3f926
LP
3771 return -errno;
3772
3773 if (s->child.siginfo.si_pid == 0)
3774 return 0;
3775
37149e69 3776 if (SIGINFO_CODE_IS_DEAD(s->child.siginfo.si_code))
f8f3f926
LP
3777 s->child.exited = true;
3778
3779 return source_set_pending(s, true);
3780}
3781
efd3be9d 3782static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
fd38203a
LP
3783 int r;
3784
da7e457c 3785 assert(e);
97ef5391 3786 assert(d);
305f78bf 3787 assert_return(events == EPOLLIN, -EIO);
efd3be9d 3788 assert(min_priority);
fd38203a 3789
91c70071
YW
3790 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3791 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3792 * per priority, and if we dequeue one and SIGCHLD is enqueued later we wouldn't know about it,
3793 * but we might have higher priority children we care about, hence we need to check that
3794 * explicitly. */
9da4cb2b
LP
3795
3796 if (sigismember(&d->sigset, SIGCHLD))
3797 e->need_process_child = true;
3798
91c70071 3799 /* If there's already an event source pending for this priority we don't read another */
9da4cb2b
LP
3800 if (d->current)
3801 return 0;
3802
fd38203a 3803 for (;;) {
0eb2e0e3 3804 struct signalfd_siginfo si;
7057bd99 3805 ssize_t n;
92daebc0 3806 sd_event_source *s = NULL;
fd38203a 3807
9da4cb2b 3808 n = read(d->fd, &si, sizeof(si));
7057bd99 3809 if (n < 0) {
8add30a0 3810 if (ERRNO_IS_TRANSIENT(errno))
efd3be9d 3811 return 0;
fd38203a
LP
3812
3813 return -errno;
3814 }
3815
7057bd99 3816 if (_unlikely_(n != sizeof(si)))
fd38203a
LP
3817 return -EIO;
3818
7a64c5f2 3819 if (_unlikely_(!SIGNAL_VALID(si.ssi_signo)))
ab9af70e 3820 return -EIO;
7057bd99 3821
92daebc0
LP
3822 if (e->signal_sources)
3823 s = e->signal_sources[si.ssi_signo];
92daebc0
LP
3824 if (!s)
3825 continue;
9da4cb2b
LP
3826 if (s->pending)
3827 continue;
fd38203a
LP
3828
3829 s->signal.siginfo = si;
9da4cb2b
LP
3830 d->current = s;
3831
fd38203a
LP
3832 r = source_set_pending(s, true);
3833 if (r < 0)
3834 return r;
efd3be9d
YW
3835 if (r > 0 && *min_priority >= s->priority) {
3836 *min_priority = s->priority;
3837 return 1; /* an event source with smaller priority is queued. */
3838 }
9da4cb2b 3839
efd3be9d 3840 return 0;
fd38203a 3841 }
fd38203a
LP
3842}
3843
efd3be9d 3844static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
97ef5391
LP
3845 ssize_t n;
3846
3847 assert(e);
3848 assert(d);
3849
3850 assert_return(revents == EPOLLIN, -EIO);
3851
3852 /* If there's already an event source pending for this priority, don't read another */
3853 if (d->n_pending > 0)
3854 return 0;
3855
3856 /* Is the read buffer non-empty? If so, let's not read more */
3857 if (d->buffer_filled > 0)
3858 return 0;
3859
efd3be9d
YW
3860 if (d->priority > threshold)
3861 return 0;
3862
97ef5391
LP
3863 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3864 if (n < 0) {
8add30a0 3865 if (ERRNO_IS_TRANSIENT(errno))
97ef5391
LP
3866 return 0;
3867
3868 return -errno;
3869 }
3870
3871 assert(n > 0);
3872 d->buffer_filled = (size_t) n;
0601b958 3873 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
3874
3875 return 1;
3876}
3877
3878static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3879 assert(e);
3880 assert(d);
3881 assert(sz <= d->buffer_filled);
3882
3883 if (sz == 0)
3884 return;
3885
3886 /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3887 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3888 d->buffer_filled -= sz;
3889
3890 if (d->buffer_filled == 0)
0601b958 3891 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
97ef5391
LP
3892}
3893
3894static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3895 int r;
3896
3897 assert(e);
3898 assert(d);
3899
3900 /* If there's already an event source pending for this priority, don't read another */
3901 if (d->n_pending > 0)
3902 return 0;
3903
3904 while (d->buffer_filled > 0) {
3905 size_t sz;
3906
3907 /* Let's validate that the event structures are complete */
3908 if (d->buffer_filled < offsetof(struct inotify_event, name))
3909 return -EIO;
3910
3911 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3912 if (d->buffer_filled < sz)
3913 return -EIO;
3914
3915 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3916 struct inode_data *inode_data;
97ef5391
LP
3917
3918 /* The queue overran, let's pass this event to all event sources connected to this inotify
3919 * object */
3920
03677889 3921 HASHMAP_FOREACH(inode_data, d->inodes)
97ef5391
LP
3922 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3923
b6d5481b 3924 if (event_source_is_offline(s))
97ef5391
LP
3925 continue;
3926
3927 r = source_set_pending(s, true);
3928 if (r < 0)
3929 return r;
3930 }
97ef5391
LP
3931 } else {
3932 struct inode_data *inode_data;
97ef5391
LP
3933
3934 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3935 * our watch descriptor table. */
3936 if (d->buffer.ev.mask & IN_IGNORED) {
3937
3938 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3939 if (!inode_data) {
3940 event_inotify_data_drop(e, d, sz);
3941 continue;
3942 }
3943
3944 /* The watch descriptor was removed by the kernel, let's drop it here too */
3945 inode_data->wd = -1;
3946 } else {
3947 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3948 if (!inode_data) {
3949 event_inotify_data_drop(e, d, sz);
3950 continue;
3951 }
3952 }
3953
3954 /* Trigger all event sources that are interested in these events. Also trigger all event
3955 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3956 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3957
b6d5481b 3958 if (event_source_is_offline(s))
97ef5391
LP
3959 continue;
3960
3961 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3962 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3963 continue;
3964
3965 r = source_set_pending(s, true);
3966 if (r < 0)
3967 return r;
3968 }
3969 }
3970
3971 /* Something pending now? If so, let's finish, otherwise let's read more. */
3972 if (d->n_pending > 0)
3973 return 1;
3974 }
3975
3976 return 0;
3977}
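/* For reference (added note): the inotify read buffer is a sequence of variable-length records,
 * hence the record size computed above is sz = offsetof(struct inotify_event, name) + ev.len,
 * where ev.len is the padded length of the optional name field. For example, an event for a file
 * "foo" in a watched directory typically carries ev.len == 16 (name plus NUL padding), for a
 * total record size of 16 + 16 == 32 bytes on common ABIs. */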
3978
3979static int process_inotify(sd_event *e) {
97ef5391
LP
3980 int r, done = 0;
3981
3982 assert(e);
3983
0601b958 3984 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
97ef5391
LP
3985 r = event_inotify_data_process(e, d);
3986 if (r < 0)
3987 return r;
3988 if (r > 0)
b3a9d980 3989 done++;
97ef5391
LP
3990 }
3991
3992 return done;
3993}
3994
158fe190
LP
3995static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
3996 assert(s);
3997 assert(s->type == SOURCE_MEMORY_PRESSURE);
3998
3999 if (s->pending)
4000 s->memory_pressure.revents |= revents;
4001 else
4002 s->memory_pressure.revents = revents;
4003
4004 return source_set_pending(s, true);
4005}
4006
4007static int source_memory_pressure_write(sd_event_source *s) {
4008 ssize_t n;
4009 int r;
4010
4011 assert(s);
4012 assert(s->type == SOURCE_MEMORY_PRESSURE);
4013
4014 /* Once we start writing, the buffer is locked; we allow no further changes. */
4015 s->memory_pressure.locked = true;
4016
4017 if (s->memory_pressure.write_buffer_size > 0) {
4018 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4019 if (n < 0) {
9897f5dd
LP
4020 if (!ERRNO_IS_TRANSIENT(errno)) {
4021 /* If the kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
4022 * files, but then generates EOPNOTSUPP on read() and write() (instead of on
4023 * open()!). This sucks hard, since we can only detect this kind of failure
4024 * so late. Let's make the best of it, and turn off the event source like we
4025 * do for failed event source handlers. */
4026
4027 log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
4028 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4029 return 0;
4030 }
158fe190
LP
4031
4032 n = 0;
4033 }
4034 } else
4035 n = 0;
4036
4037 assert(n >= 0);
4038
4039 if ((size_t) n == s->memory_pressure.write_buffer_size) {
4040 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4041
4042 if (n > 0) {
4043 s->memory_pressure.write_buffer_size = 0;
4044
4045 /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4046 r = source_memory_pressure_register(s, s->enabled);
4047 if (r < 0)
4048 return r;
4049 }
4050 } else if (n > 0) {
4051 _cleanup_free_ void *c = NULL;
4052
4053 assert((size_t) n < s->memory_pressure.write_buffer_size);
4054
4055 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4056 if (!c)
4057 return -ENOMEM;
4058
4059 free_and_replace(s->memory_pressure.write_buffer, c);
4060 s->memory_pressure.write_buffer_size -= n;
4061 return 1;
4062 }
4063
4064 return 0;
4065}
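/* Background (added note): the write buffer holds a kernel PSI trigger in the textual format
 * documented in the kernel's Documentation/accounting/psi.rst, i.e.
 *
 *     "<some|full> <stall threshold in usec> <window in usec>"
 *
 * For example "some 150000 1000000" requests a notification whenever at least 150ms of stall
 * time is observed within any 1s window. This is also the string that
 * sd_event_source_set_memory_pressure_type() and sd_event_source_set_memory_pressure_period()
 * below edit in place. */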
4066
4067static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4068 int r;
4069
4070 assert(s);
4071 assert(s->type == SOURCE_MEMORY_PRESSURE);
4072
4073 r = source_memory_pressure_write(s);
4074 if (r < 0)
4075 return r;
4076 if (r > 0)
4077 return 1; /* if we wrote something, then don't continue with dispatching the user
4078 * callback. Instead, shortcut it so that we wait for the next EPOLLOUT immediately. */
4079
4080 /* No pending incoming IO? Then let's not continue further */
4081 if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4082
4083 /* Treat IO errors on the notifier the same way as errors returned from a callback */
4084 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4085 return -EIO;
4086
4087 return 1; /* leave dispatch, we already processed everything */
4088 }
4089
4090 if (s->memory_pressure.revents & EPOLLIN) {
4091 uint8_t pipe_buf[PIPE_BUF];
4092 ssize_t n;
4093
4094 /* If the fd is readable, then flush out anything that might be queued */
4095
4096 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4097 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4098 return -errno;
4099 }
4100
4101 return 0; /* go on, dispatch to user callback */
4102}
4103
fd38203a 4104static int source_dispatch(sd_event_source *s) {
8f5c235d 4105 EventSourceType saved_type;
c8e9d15c 4106 sd_event *saved_event;
fe8245eb 4107 int r = 0;
fd38203a
LP
4108
4109 assert(s);
6203e07a 4110 assert(s->pending || s->type == SOURCE_EXIT);
fd38203a 4111
b778cba4
LP
4112 /* Save the event source type, here, so that we still know it after the event callback which might
4113 * invalidate the event. */
8f5c235d
LP
4114 saved_type = s->type;
4115
de02634c 4116 /* Similarly, store a reference to the event loop object, so that we can still access it after the
b778cba4 4117 * callback might have invalidated/disconnected the event source. */
c8e9d15c
YW
4118 saved_event = s->event;
4119 PROTECT_EVENT(saved_event);
b778cba4 4120
de02634c 4121 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
b6d5481b
LP
4122 assert(!s->ratelimited);
4123 if (!ratelimit_below(&s->rate_limit)) {
4124 r = event_source_enter_ratelimited(s);
4125 if (r < 0)
4126 return r;
4127
4128 return 1;
4129 }
4130
945c2931 4131 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
da7e457c
LP
4132 r = source_set_pending(s, false);
4133 if (r < 0)
4134 return r;
4135 }
fd38203a 4136
6e9feda3
LP
4137 if (s->type != SOURCE_POST) {
4138 sd_event_source *z;
6e9feda3 4139
de02634c 4140 /* If we execute a non-post source, let's mark all post sources as pending. */
6e9feda3 4141
90e74a66 4142 SET_FOREACH(z, s->event->post_sources) {
b6d5481b 4143 if (event_source_is_offline(z))
6e9feda3
LP
4144 continue;
4145
4146 r = source_set_pending(z, true);
4147 if (r < 0)
4148 return r;
4149 }
4150 }
4151
158fe190
LP
4152 if (s->type == SOURCE_MEMORY_PRESSURE) {
4153 r = source_memory_pressure_initiate_dispatch(s);
4154 if (r == -EIO) /* handle EIO errors similar to callback errors */
4155 goto finish;
4156 if (r < 0)
4157 return r;
4158 if (r > 0) /* already handled */
4159 return 1;
4160 }
4161
baf76283
LP
4162 if (s->enabled == SD_EVENT_ONESHOT) {
4163 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
fd38203a
LP
4164 if (r < 0)
4165 return r;
4166 }
4167
12179984 4168 s->dispatching = true;
b7484e2a 4169
fd38203a
LP
4170 switch (s->type) {
4171
4172 case SOURCE_IO:
4173 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4174 break;
4175
6a0f1f6d 4176 case SOURCE_TIME_REALTIME:
a8548816 4177 case SOURCE_TIME_BOOTTIME:
6a0f1f6d
LP
4178 case SOURCE_TIME_MONOTONIC:
4179 case SOURCE_TIME_REALTIME_ALARM:
4180 case SOURCE_TIME_BOOTTIME_ALARM:
fd38203a
LP
4181 r = s->time.callback(s, s->time.next, s->userdata);
4182 break;
4183
4184 case SOURCE_SIGNAL:
4185 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4186 break;
4187
08cd1552 4188 case SOURCE_CHILD: {
37149e69 4189 bool zombie = SIGINFO_CODE_IS_DEAD(s->child.siginfo.si_code);
08cd1552 4190
fd38203a 4191 r = s->child.callback(s, &s->child.siginfo, s->userdata);
08cd1552
LP
4192
4193 /* Now, reap the PID for good. */
f8f3f926 4194 if (zombie) {
c6cc7efc 4195 (void) waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG|WEXITED);
f8f3f926
LP
4196 s->child.waited = true;
4197 }
08cd1552 4198
fd38203a 4199 break;
08cd1552 4200 }
fd38203a
LP
4201
4202 case SOURCE_DEFER:
4203 r = s->defer.callback(s, s->userdata);
4204 break;
da7e457c 4205
6e9feda3
LP
4206 case SOURCE_POST:
4207 r = s->post.callback(s, s->userdata);
4208 break;
4209
6203e07a
LP
4210 case SOURCE_EXIT:
4211 r = s->exit.callback(s, s->userdata);
da7e457c 4212 break;
9d3e3aa5 4213
97ef5391
LP
4214 case SOURCE_INOTIFY: {
4215 struct sd_event *e = s->event;
4216 struct inotify_data *d;
4217 size_t sz;
4218
4219 assert(s->inotify.inode_data);
4220 assert_se(d = s->inotify.inode_data->inotify_data);
4221
4222 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4223 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4224 assert(d->buffer_filled >= sz);
4225
53baf2ef
LP
4226 /* If the inotify callback destroys the event source then this likely means we don't need to
4227 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4228 * free it immediately, then we couldn't drop the event from the inotify event queue without
4229 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
4230 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4231 * explicitly GC it after we are done dropping the inotify event from the buffer. */
4232 d->n_busy++;
97ef5391 4233 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
53baf2ef 4234 d->n_busy--;
97ef5391 4235
53baf2ef
LP
4236 /* When no event is pending anymore on this inotify object, then let's drop the event from
4237 * the inotify event queue buffer. */
97ef5391
LP
4238 if (d->n_pending == 0)
4239 event_inotify_data_drop(e, d, sz);
4240
53baf2ef
LP
4241 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4242 event_gc_inotify_data(e, d);
97ef5391
LP
4243 break;
4244 }
4245
158fe190
LP
4246 case SOURCE_MEMORY_PRESSURE:
4247 r = s->memory_pressure.callback(s, s->userdata);
4248 break;
4249
9d3e3aa5 4250 case SOURCE_WATCHDOG:
a71fe8b8 4251 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
9f2a50a3 4252 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
04499a70 4253 assert_not_reached();
fd38203a
LP
4254 }
4255
12179984
LP
4256 s->dispatching = false;
4257
158fe190 4258finish:
b778cba4
LP
4259 if (r < 0) {
4260 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4261 strna(s->description),
4262 event_source_type_to_string(saved_type),
4263 s->exit_on_failure ? "exiting" : "disabling");
4264
4265 if (s->exit_on_failure)
4266 (void) sd_event_exit(saved_event, r);
4267 }
12179984
LP
4268
4269 if (s->n_ref == 0)
4270 source_free(s);
4271 else if (r < 0)
c3c50474 4272 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
b7484e2a 4273
6203e07a 4274 return 1;
fd38203a
LP
4275}
4276
4277static int event_prepare(sd_event *e) {
4278 int r;
4279
4280 assert(e);
4281
4282 for (;;) {
4283 sd_event_source *s;
4284
4285 s = prioq_peek(e->prepare);
b6d5481b 4286 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
fd38203a
LP
4287 break;
4288
4289 s->prepare_iteration = e->iteration;
8656f4a6 4290 prioq_reshuffle(e->prepare, s, &s->prepare_index);
fd38203a
LP
4291
4292 assert(s->prepare);
12179984 4293 s->dispatching = true;
fd38203a 4294 r = s->prepare(s, s->userdata);
12179984
LP
4295 s->dispatching = false;
4296
b778cba4
LP
4297 if (r < 0) {
4298 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4299 strna(s->description),
4300 event_source_type_to_string(s->type),
4301 s->exit_on_failure ? "exiting" : "disabling");
4302
4303 if (s->exit_on_failure)
4304 (void) sd_event_exit(e, r);
4305 }
fd38203a 4306
12179984
LP
4307 if (s->n_ref == 0)
4308 source_free(s);
4309 else if (r < 0)
c3c50474 4310 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
fd38203a
LP
4311 }
4312
4313 return 0;
4314}
4315
6203e07a 4316static int dispatch_exit(sd_event *e) {
da7e457c
LP
4317 sd_event_source *p;
4318 int r;
4319
4320 assert(e);
4321
6203e07a 4322 p = prioq_peek(e->exit);
19947509
ZJS
4323 assert(!p || p->type == SOURCE_EXIT);
4324
b6d5481b 4325 if (!p || event_source_is_offline(p)) {
da7e457c
LP
4326 e->state = SD_EVENT_FINISHED;
4327 return 0;
4328 }
4329
c8e9d15c 4330 PROTECT_EVENT(e);
da7e457c 4331 e->iteration++;
6203e07a 4332 e->state = SD_EVENT_EXITING;
da7e457c 4333 r = source_dispatch(p);
2b0c9ef7 4334 e->state = SD_EVENT_INITIAL;
da7e457c
LP
4335 return r;
4336}
4337
c2ba3ad6
LP
4338static sd_event_source* event_next_pending(sd_event *e) {
4339 sd_event_source *p;
4340
da7e457c
LP
4341 assert(e);
4342
c2ba3ad6
LP
4343 p = prioq_peek(e->pending);
4344 if (!p)
4345 return NULL;
4346
b6d5481b 4347 if (event_source_is_offline(p))
c2ba3ad6
LP
4348 return NULL;
4349
4350 return p;
4351}
4352
cde93897
LP
4353static int arm_watchdog(sd_event *e) {
4354 struct itimerspec its = {};
4355 usec_t t;
cde93897
LP
4356
4357 assert(e);
4358 assert(e->watchdog_fd >= 0);
4359
4360 t = sleep_between(e,
a595fb5c
YW
4361 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4362 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
cde93897
LP
4363
4364 timespec_store(&its.it_value, t);
4365
75145780
LP
4366 /* Make sure we never set the watchdog to 0, which tells the
4367 * kernel to disable it. */
4368 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4369 its.it_value.tv_nsec = 1;
4370
7c248223 4371 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
cde93897
LP
4372}
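/* Illustrative example (added; numbers made up): with WatchdogSec=20s the service manager hands
 * us e->watchdog_period == 20s via sd_watchdog_enabled(), so the next ping is scheduled into
 * [last + 10s, last + 15s], i.e. between one half and three quarters of the period. Within that
 * window sleep_between() picks the boot-ID-perturbed spot, so many services' watchdog pings can
 * be coalesced into shared wakeups. */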
4373
4374static int process_watchdog(sd_event *e) {
4375 assert(e);
4376
4377 if (!e->watchdog)
4378 return 0;
4379
4380 /* Don't notify watchdog too often */
4381 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4382 return 0;
4383
4384 sd_notify(false, "WATCHDOG=1");
4385 e->watchdog_last = e->timestamp.monotonic;
4386
4387 return arm_watchdog(e);
4388}
4389
97ef5391
LP
4390static void event_close_inode_data_fds(sd_event *e) {
4391 struct inode_data *d;
4392
4393 assert(e);
4394
4395 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4396 * filesystems. But we can't close them right away as we need them as long as the user still wants to make
365c2885 4397 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
97ef5391
LP
4398 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4399 * compromise. */
4400
ed828563 4401 while ((d = e->inode_data_to_close_list)) {
97ef5391
LP
4402 assert(d->fd >= 0);
4403 d->fd = safe_close(d->fd);
4404
ed828563 4405 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
97ef5391
LP
4406 }
4407}
4408
158fe190
LP
4409static int event_memory_pressure_write_list(sd_event *e) {
4410 int r;
4411
4412 assert(e);
4413
4414 for (;;) {
4415 sd_event_source *s;
4416
4417 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4418 if (!s)
4419 break;
4420
4421 assert(s->type == SOURCE_MEMORY_PRESSURE);
4422 assert(s->memory_pressure.write_buffer_size > 0);
4423 s->memory_pressure.in_write_list = false;
4424
4425 r = source_memory_pressure_write(s);
4426 if (r < 0)
4427 return r;
4428 }
4429
4430 return 0;
4431}
4432
c45a5a74
TG
4433_public_ int sd_event_prepare(sd_event *e) {
4434 int r;
fd38203a 4435
da7e457c 4436 assert_return(e, -EINVAL);
b937d761 4437 assert_return(e = event_resolve(e), -ENOPKG);
2eeff0f4 4438 assert_return(!event_origin_changed(e), -ECHILD);
da7e457c 4439 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2b0c9ef7 4440 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
da7e457c 4441
e5446015
LP
4442 /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
4443 * this check here once, since gettid() is typically not cached, and thus want to minimize
4444 * syscalls */
4445 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4446
f814c871 4447 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
c8e9d15c 4448 PROTECT_EVENT(e);
f814c871 4449
6203e07a 4450 if (e->exit_requested)
c45a5a74 4451 goto pending;
fd38203a
LP
4452
4453 e->iteration++;
4454
0be6c2f6 4455 e->state = SD_EVENT_PREPARING;
fd38203a 4456 r = event_prepare(e);
0be6c2f6 4457 e->state = SD_EVENT_INITIAL;
fd38203a 4458 if (r < 0)
c45a5a74 4459 return r;
fd38203a 4460
158fe190
LP
4461 r = event_memory_pressure_write_list(e);
4462 if (r < 0)
4463 return r;
4464
6a0f1f6d
LP
4465 r = event_arm_timer(e, &e->realtime);
4466 if (r < 0)
c45a5a74 4467 return r;
6a0f1f6d 4468
a8548816
TG
4469 r = event_arm_timer(e, &e->boottime);
4470 if (r < 0)
c45a5a74 4471 return r;
a8548816 4472
6a0f1f6d
LP
4473 r = event_arm_timer(e, &e->monotonic);
4474 if (r < 0)
c45a5a74 4475 return r;
6a0f1f6d
LP
4476
4477 r = event_arm_timer(e, &e->realtime_alarm);
1b5995b0 4478 if (r < 0)
c45a5a74 4479 return r;
fd38203a 4480
6a0f1f6d 4481 r = event_arm_timer(e, &e->boottime_alarm);
1b5995b0 4482 if (r < 0)
c45a5a74 4483 return r;
fd38203a 4484
97ef5391
LP
4485 event_close_inode_data_fds(e);
4486
0601b958 4487 if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
c45a5a74
TG
4488 goto pending;
4489
2b0c9ef7 4490 e->state = SD_EVENT_ARMED;
c45a5a74
TG
4491
4492 return 0;
4493
4494pending:
2b0c9ef7 4495 e->state = SD_EVENT_ARMED;
6d148a84
TG
4496 r = sd_event_wait(e, 0);
4497 if (r == 0)
2b0c9ef7 4498 e->state = SD_EVENT_ARMED;
6d148a84
TG
4499
4500 return r;
c45a5a74
TG
4501}
4502
798445ab
LP
4503static int epoll_wait_usec(
4504 int fd,
4505 struct epoll_event *events,
4506 int maxevents,
4507 usec_t timeout) {
4508
7c248223 4509 int msec;
0c14c45e
LP
4510 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4511
4512#if HAVE_EPOLL_PWAIT2
39f756d3 4513 static bool epoll_pwait2_absent = false;
52bb308c 4514 int r;
798445ab 4515
0c14c45e
LP
4516 /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4517 * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4518 * is not that obvious to implement given the libc and kernel definitions differ in the last
2257be13 4519 * argument. Moreover, the only reason to use it is the more accurate timeouts (which is not a
0c14c45e
LP
4520 * biggie), let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
4521 * missing. */
798445ab
LP
4522
4523 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
798445ab
LP
4524 r = epoll_pwait2(fd,
4525 events,
4526 maxevents,
52bb308c 4527 TIMESPEC_STORE(timeout),
798445ab
LP
4528 NULL);
4529 if (r >= 0)
4530 return r;
7cb45dbf 4531 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
798445ab
LP
4532 return -errno; /* Only fall back to old epoll_wait() if the syscall is masked or not
4533 * supported. */
4534
4535 epoll_pwait2_absent = true;
4536 }
39f756d3 4537#endif
798445ab
LP
4538
4539 if (timeout == USEC_INFINITY)
4540 msec = -1;
4541 else {
4542 usec_t k;
4543
4544 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4545 if (k >= INT_MAX)
4546 msec = INT_MAX; /* Saturate */
4547 else
4548 msec = (int) k;
4549 }
4550
7c248223 4551 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
798445ab
LP
4552}
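/* Note (added for clarity): on the epoll_wait() fallback path the microsecond timeout is rounded
 * *up* to milliseconds via DIV_ROUND_UP(), e.g. 1500us becomes 2ms and 500us becomes 1ms rather
 * than 0ms. Rounding down could wake us before the intended deadline (or busy-poll for sub-ms
 * timeouts); waking slightly late is acceptable since timer accuracy is handled separately. */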
4553
efd3be9d 4554static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
319a4f4b 4555 size_t n_event_queue, m, n_event_max;
efd3be9d
YW
4556 int64_t min_priority = threshold;
4557 bool something_new = false;
798445ab 4558 int r;
c45a5a74 4559
efd3be9d
YW
4560 assert(e);
4561 assert(ret_min_priority);
6a0f1f6d 4562
8b9708d1 4563 n_event_queue = MAX(e->n_sources, 1u);
319a4f4b 4564 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
5cddd924 4565 return -ENOMEM;
fd38203a 4566
319a4f4b
LP
4567 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4568
97ef5391 4569 /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
0601b958 4570 if (e->buffered_inotify_data_list)
798445ab 4571 timeout = 0;
97ef5391 4572
8b9708d1 4573 for (;;) {
319a4f4b
LP
4574 r = epoll_wait_usec(
4575 e->epoll_fd,
4576 e->event_queue,
4577 n_event_max,
4578 timeout);
798445ab 4579 if (r < 0)
efd3be9d 4580 return r;
c45a5a74 4581
8b9708d1
YW
4582 m = (size_t) r;
4583
319a4f4b 4584 if (m < n_event_max)
8b9708d1
YW
4585 break;
4586
319a4f4b 4587 if (n_event_max >= n_event_queue * 10)
8b9708d1
YW
4588 break;
4589
319a4f4b 4590 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
8b9708d1
YW
4591 return -ENOMEM;
4592
319a4f4b 4593 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
798445ab 4594 timeout = 0;
da7e457c 4595 }
fd38203a 4596
efd3be9d
YW
4597 /* Set the timestamp only when this is called for the first time. */
4598 if (threshold == INT64_MAX)
fa5a0251 4599 triple_timestamp_now(&e->timestamp);
fd38203a 4600
8b9708d1 4601 for (size_t i = 0; i < m; i++) {
fd38203a 4602
5cddd924
LP
4603 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4604 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
9da4cb2b 4605 else {
5cddd924 4606 WakeupType *t = e->event_queue[i].data.ptr;
9da4cb2b
LP
4607
4608 switch (*t) {
4609
f8f3f926 4610 case WAKEUP_EVENT_SOURCE: {
5cddd924 4611 sd_event_source *s = e->event_queue[i].data.ptr;
f8f3f926
LP
4612
4613 assert(s);
4614
efd3be9d
YW
4615 if (s->priority > threshold)
4616 continue;
4617
4618 min_priority = MIN(min_priority, s->priority);
4619
f8f3f926
LP
4620 switch (s->type) {
4621
4622 case SOURCE_IO:
5cddd924 4623 r = process_io(e, s, e->event_queue[i].events);
f8f3f926
LP
4624 break;
4625
4626 case SOURCE_CHILD:
5cddd924 4627 r = process_pidfd(e, s, e->event_queue[i].events);
f8f3f926
LP
4628 break;
4629
158fe190
LP
4630 case SOURCE_MEMORY_PRESSURE:
4631 r = process_memory_pressure(s, e->event_queue[i].events);
4632 break;
4633
f8f3f926 4634 default:
04499a70 4635 assert_not_reached();
f8f3f926
LP
4636 }
4637
9da4cb2b 4638 break;
f8f3f926 4639 }
fd38203a 4640
9da4cb2b 4641 case WAKEUP_CLOCK_DATA: {
5cddd924 4642 struct clock_data *d = e->event_queue[i].data.ptr;
f8f3f926
LP
4643
4644 assert(d);
4645
5cddd924 4646 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
9da4cb2b
LP
4647 break;
4648 }
4649
4650 case WAKEUP_SIGNAL_DATA:
efd3be9d 4651 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
9da4cb2b
LP
4652 break;
4653
97ef5391 4654 case WAKEUP_INOTIFY_DATA:
efd3be9d 4655 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
97ef5391
LP
4656 break;
4657
9da4cb2b 4658 default:
04499a70 4659 assert_not_reached();
9da4cb2b
LP
4660 }
4661 }
efd3be9d
YW
4662 if (r < 0)
4663 return r;
4664 if (r > 0)
4665 something_new = true;
4666 }
4667
4668 *ret_min_priority = min_priority;
4669 return something_new;
4670}
4671
4672_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4673 int r;
4674
4675 assert_return(e, -EINVAL);
4676 assert_return(e = event_resolve(e), -ENOPKG);
2eeff0f4 4677 assert_return(!event_origin_changed(e), -ECHILD);
efd3be9d
YW
4678 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4679 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4680
4681 if (e->exit_requested) {
4682 e->state = SD_EVENT_PENDING;
4683 return 1;
4684 }
4685
4686 for (int64_t threshold = INT64_MAX; ; threshold--) {
4687 int64_t epoll_min_priority, child_min_priority;
4688
4689 /* It is possible that new epoll (especially IO) and child events are
4690 * triggered just after the process_epoll() call but before process_child(), and the new IO
4691 * events may have higher priority than the child events. To salvage these events,
4692 * let's call epoll_wait() again, but accept only events with higher priority than the
4693 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
4694 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4695 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
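/* Worked example (added; the priorities are hypothetical): suppose the first pass
 * (threshold == INT64_MAX) sees no epoll events, but process_child() queues a child source with
 * priority 10, while an IO event for a priority-5 source arrives just after process_epoll()
 * returned. The threshold is then set to 10 and decremented to 9 for the next pass, so the
 * second process_epoll() call (now with timeout == 0) accepts only sources with priority <= 9
 * and salvages the priority-5 IO event before anything is dispatched. */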
4696
4697 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4698 if (r == -EINTR) {
4699 e->state = SD_EVENT_PENDING;
4700 return 1;
4701 }
4702 if (r < 0)
4703 goto finish;
4704 if (r == 0 && threshold < INT64_MAX)
4705 /* No new epoll event. */
4706 break;
4707
4708 r = process_child(e, threshold, &child_min_priority);
fd38203a 4709 if (r < 0)
da7e457c 4710 goto finish;
efd3be9d
YW
4711 if (r == 0)
4712 /* No new child event. */
4713 break;
4714
4715 threshold = MIN(epoll_min_priority, child_min_priority);
4716 if (threshold == INT64_MIN)
4717 break;
4718
4719 timeout = 0;
fd38203a
LP
4720 }
4721
cde93897
LP
4722 r = process_watchdog(e);
4723 if (r < 0)
4724 goto finish;
4725
fd69f224 4726 r = process_inotify(e);
6a0f1f6d
LP
4727 if (r < 0)
4728 goto finish;
4729
fd69f224 4730 r = process_timer(e, e->timestamp.realtime, &e->realtime);
a8548816
TG
4731 if (r < 0)
4732 goto finish;
4733
fd69f224 4734 r = process_timer(e, e->timestamp.boottime, &e->boottime);
6a0f1f6d
LP
4735 if (r < 0)
4736 goto finish;
4737
4738 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
fd38203a 4739 if (r < 0)
da7e457c 4740 goto finish;
fd38203a 4741
e475d10c 4742 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
fd38203a 4743 if (r < 0)
da7e457c 4744 goto finish;
fd38203a 4745
fd69f224 4746 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
97ef5391
LP
4747 if (r < 0)
4748 goto finish;
fd69f224
MS
4749 else if (r == 1) {
4750 /* Ratelimit expiry callback was called. Let's postpone processing pending sources and
4751 * put the loop in the initial state in order to evaluate (in the next iteration) also sources
4752 * that were potentially re-enabled by the callback.
4753 *
4754 * Wondering why we treat only this invocation of process_timer() differently? Once an event
4755 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4756 * ratelimit expiry callback is never called for any other timer type. */
4757 r = 0;
4758 goto finish;
4759 }
97ef5391 4760
c45a5a74
TG
4761 if (event_next_pending(e)) {
4762 e->state = SD_EVENT_PENDING;
c45a5a74 4763 return 1;
da7e457c
LP
4764 }
4765
c45a5a74 4766 r = 0;
fd38203a 4767
da7e457c 4768finish:
2b0c9ef7 4769 e->state = SD_EVENT_INITIAL;
da7e457c
LP
4770
4771 return r;
fd38203a
LP
4772}
4773
c45a5a74
TG
4774_public_ int sd_event_dispatch(sd_event *e) {
4775 sd_event_source *p;
4776 int r;
4777
4778 assert_return(e, -EINVAL);
b937d761 4779 assert_return(e = event_resolve(e), -ENOPKG);
2eeff0f4 4780 assert_return(!event_origin_changed(e), -ECHILD);
c45a5a74
TG
4781 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4782 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4783
4784 if (e->exit_requested)
4785 return dispatch_exit(e);
4786
4787 p = event_next_pending(e);
4788 if (p) {
c8e9d15c 4789 PROTECT_EVENT(e);
c45a5a74
TG
4790
4791 e->state = SD_EVENT_RUNNING;
4792 r = source_dispatch(p);
2b0c9ef7 4793 e->state = SD_EVENT_INITIAL;
c45a5a74
TG
4794 return r;
4795 }
4796
2b0c9ef7 4797 e->state = SD_EVENT_INITIAL;
c45a5a74
TG
4798
4799 return 1;
4800}
4801
34b87517 4802static void event_log_delays(sd_event *e) {
442ac269 4803 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
ddb8a639 4804 size_t l;
34b87517 4805
442ac269
YW
4806 p = b;
4807 l = sizeof(b);
ddb8a639
I
4808 FOREACH_ELEMENT(delay, e->delays) {
4809 l = strpcpyf(&p, l, "%u ", *delay);
4810 *delay = 0;
34b87517 4811 }
442ac269 4812 log_debug("Event loop iterations: %s", b);
34b87517
VC
4813}
4814
c45a5a74
TG
4815_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4816 int r;
4817
4818 assert_return(e, -EINVAL);
b937d761 4819 assert_return(e = event_resolve(e), -ENOPKG);
2eeff0f4 4820 assert_return(!event_origin_changed(e), -ECHILD);
c45a5a74 4821 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2b0c9ef7 4822 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
c45a5a74 4823
e6a7bee5 4824 if (e->profile_delays && e->last_run_usec != 0) {
34b87517
VC
4825 usec_t this_run;
4826 unsigned l;
4827
4828 this_run = now(CLOCK_MONOTONIC);
4829
58c34be8 4830 l = log2u64(this_run - e->last_run_usec);
cb9d621e 4831 assert(l < ELEMENTSOF(e->delays));
34b87517
VC
4832 e->delays[l]++;
4833
e6a7bee5 4834 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
34b87517 4835 event_log_delays(e);
e6a7bee5 4836 e->last_log_usec = this_run;
34b87517
VC
4837 }
4838 }
4839
f814c871 4840 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
c8e9d15c 4841 PROTECT_EVENT(e);
f814c871 4842
c45a5a74 4843 r = sd_event_prepare(e);
53bac4e0
LP
4844 if (r == 0)
4845 /* There was nothing? Then wait... */
4846 r = sd_event_wait(e, timeout);
c45a5a74 4847
34b87517 4848 if (e->profile_delays)
e6a7bee5 4849 e->last_run_usec = now(CLOCK_MONOTONIC);
34b87517 4850
02d30981 4851 if (r > 0) {
53bac4e0 4852 /* There's something now, then let's dispatch it */
02d30981
TG
4853 r = sd_event_dispatch(e);
4854 if (r < 0)
4855 return r;
53bac4e0
LP
4856
4857 return 1;
4858 }
4859
4860 return r;
c45a5a74
TG
4861}
4862
f7262a9f 4863_public_ int sd_event_loop(sd_event *e) {
fd38203a
LP
4864 int r;
4865
da7e457c 4866 assert_return(e, -EINVAL);
b937d761 4867 assert_return(e = event_resolve(e), -ENOPKG);
2eeff0f4 4868 assert_return(!event_origin_changed(e), -ECHILD);
2b0c9ef7 4869 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
da7e457c 4870
c8e9d15c 4871 PROTECT_EVENT(e);
fd38203a 4872
da7e457c 4873 while (e->state != SD_EVENT_FINISHED) {
f5fbe71d 4874 r = sd_event_run(e, UINT64_MAX);
fd38203a 4875 if (r < 0)
30dd293c 4876 return r;
fd38203a
LP
4877 }
4878
30dd293c 4879 return e->exit_code;
fd38203a
LP
4880}
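/* A minimal usage sketch of the public API implemented above (added; illustrative only, the
 * handler and function names other than sd-event's own are hypothetical):
 *
 *     static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *             // One-shot timer fired: ask the loop to exit with code 0.
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     int run_example(void) {
 *             _cleanup_(sd_event_unrefp) sd_event *e = NULL;
 *             int r;
 *
 *             r = sd_event_default(&e);
 *             if (r < 0)
 *                     return r;
 *
 *             // Fire once, one second from now, with default accuracy (0 == default).
 *             r = sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC, USEC_PER_SEC, 0, on_timer, NULL);
 *             if (r < 0)
 *                     return r;
 *
 *             // sd_event_loop() is equivalent to calling sd_event_run() until SD_EVENT_FINISHED.
 *             return sd_event_loop(e);
 *     }
 */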
4881
9b364545 4882_public_ int sd_event_get_fd(sd_event *e) {
9b364545 4883 assert_return(e, -EINVAL);
b937d761 4884 assert_return(e = event_resolve(e), -ENOPKG);
2eeff0f4 4885 assert_return(!event_origin_changed(e), -ECHILD);
9b364545
TG
4886
4887 return e->epoll_fd;
4888}
4889
f7262a9f 4890_public_ int sd_event_get_state(sd_event *e) {
da7e457c 4891 assert_return(e, -EINVAL);
b937d761 4892 assert_return(e = event_resolve(e), -ENOPKG);
2eeff0f4 4893 assert_return(!event_origin_changed(e), -ECHILD);
da7e457c
LP
4894
4895 return e->state;
4896}
4897
ff5ba2d6 4898_public_ int sd_event_get_exit_code(sd_event *e, int *ret) {
da7e457c 4899 assert_return(e, -EINVAL);
b937d761 4900 assert_return(e = event_resolve(e), -ENOPKG);
2eeff0f4 4901 assert_return(!event_origin_changed(e), -ECHILD);
fd38203a 4902
6203e07a
LP
4903 if (!e->exit_requested)
4904 return -ENODATA;
4905
ff5ba2d6
YW
4906 if (ret)
4907 *ret = e->exit_code;
6203e07a 4908 return 0;
fd38203a
LP
4909}
4910
6203e07a 4911_public_ int sd_event_exit(sd_event *e, int code) {
da7e457c 4912 assert_return(e, -EINVAL);
b937d761 4913 assert_return(e = event_resolve(e), -ENOPKG);
da7e457c 4914 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2eeff0f4 4915 assert_return(!event_origin_changed(e), -ECHILD);
fd38203a 4916
6203e07a
LP
4917 e->exit_requested = true;
4918 e->exit_code = code;
4919
fd38203a
LP
4920 return 0;
4921}
46e8c825 4922
ff5ba2d6 4923_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *ret) {
46e8c825 4924 assert_return(e, -EINVAL);
b937d761 4925 assert_return(e = event_resolve(e), -ENOPKG);
ff5ba2d6 4926 assert_return(ret, -EINVAL);
2eeff0f4 4927 assert_return(!event_origin_changed(e), -ECHILD);
46e8c825 4928
e475d10c
LP
4929 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4930 return -EOPNOTSUPP;
4931
e475d10c 4932 if (!triple_timestamp_is_set(&e->timestamp)) {
15c689d7 4933 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
ff5ba2d6 4934 *ret = now(clock);
38a03f06
LP
4935 return 1;
4936 }
46e8c825 4937
ff5ba2d6 4938 *ret = triple_timestamp_by_clock(&e->timestamp, clock);
46e8c825
LP
4939 return 0;
4940}
afc6adb5
LP
4941
4942_public_ int sd_event_default(sd_event **ret) {
39883f62 4943 sd_event *e = NULL;
afc6adb5
LP
4944 int r;
4945
4946 if (!ret)
4947 return !!default_event;
4948
4949 if (default_event) {
4950 *ret = sd_event_ref(default_event);
4951 return 0;
4952 }
4953
4954 r = sd_event_new(&e);
4955 if (r < 0)
4956 return r;
4957
4958 e->default_event_ptr = &default_event;
4959 e->tid = gettid();
4960 default_event = e;
4961
4962 *ret = e;
4963 return 1;
4964}
4965
ff5ba2d6 4966_public_ int sd_event_get_tid(sd_event *e, pid_t *ret) {
afc6adb5 4967 assert_return(e, -EINVAL);
b937d761 4968 assert_return(e = event_resolve(e), -ENOPKG);
ff5ba2d6 4969 assert_return(ret, -EINVAL);
2eeff0f4 4970 assert_return(!event_origin_changed(e), -ECHILD);
afc6adb5 4971
ff5ba2d6
YW
4972 if (e->tid == 0)
4973 return -ENXIO;
76b54375 4974
ff5ba2d6
YW
4975 *ret = e->tid;
4976 return 0;
afc6adb5 4977}
cde93897
LP
4978
4979_public_ int sd_event_set_watchdog(sd_event *e, int b) {
4980 int r;
4981
4982 assert_return(e, -EINVAL);
b937d761 4983 assert_return(e = event_resolve(e), -ENOPKG);
2eeff0f4 4984 assert_return(!event_origin_changed(e), -ECHILD);
cde93897
LP
4985
4986 if (e->watchdog == !!b)
4987 return e->watchdog;
4988
4989 if (b) {
09812eb7
LP
4990 r = sd_watchdog_enabled(false, &e->watchdog_period);
4991 if (r <= 0)
cde93897 4992 return r;
cde93897
LP
4993
4994 /* Issue first ping immediately */
4995 sd_notify(false, "WATCHDOG=1");
4996 e->watchdog_last = now(CLOCK_MONOTONIC);
4997
4998 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
4999 if (e->watchdog_fd < 0)
5000 return -errno;
5001
5002 r = arm_watchdog(e);
5003 if (r < 0)
5004 goto fail;
5005
1eac7948 5006 struct epoll_event ev = {
a82f89aa
LP
5007 .events = EPOLLIN,
5008 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
5009 };
cde93897 5010
15c689d7 5011 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
cde93897
LP
5012 r = -errno;
5013 goto fail;
5014 }
5015
5016 } else {
5017 if (e->watchdog_fd >= 0) {
5a795bff 5018 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
03e334a1 5019 e->watchdog_fd = safe_close(e->watchdog_fd);
cde93897
LP
5020 }
5021 }
5022
0a6a5965 5023 e->watchdog = b;
cde93897
LP
5024 return e->watchdog;
5025
5026fail:
03e334a1 5027 e->watchdog_fd = safe_close(e->watchdog_fd);
cde93897
LP
5028 return r;
5029}
8f726607
LP
5030
5031_public_ int sd_event_get_watchdog(sd_event *e) {
5032 assert_return(e, -EINVAL);
b937d761 5033 assert_return(e = event_resolve(e), -ENOPKG);
2eeff0f4 5034 assert_return(!event_origin_changed(e), -ECHILD);
8f726607
LP
5035
5036 return e->watchdog;
5037}
60a3b1e1
LP
5038
5039_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5040 assert_return(e, -EINVAL);
b937d761 5041 assert_return(e = event_resolve(e), -ENOPKG);
2eeff0f4 5042 assert_return(!event_origin_changed(e), -ECHILD);
60a3b1e1
LP
5043
5044 *ret = e->iteration;
5045 return 0;
5046}
15723a1d
LP
5047
5048_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5049 assert_return(s, -EINVAL);
2eeff0f4
LB
5050 assert_return(s->event, -EINVAL);
5051 assert_return(!event_origin_changed(s->event), -ECHILD);
15723a1d
LP
5052
5053 s->destroy_callback = callback;
5054 return 0;
5055}
5056
5057_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5058 assert_return(s, -EINVAL);
2eeff0f4 5059 assert_return(!event_origin_changed(s->event), -ECHILD);
15723a1d
LP
5060
5061 if (ret)
5062 *ret = s->destroy_callback;
5063
5064 return !!s->destroy_callback;
5065}
2382c936
YW
5066
5067_public_ int sd_event_source_get_floating(sd_event_source *s) {
5068 assert_return(s, -EINVAL);
2eeff0f4 5069 assert_return(!event_origin_changed(s->event), -ECHILD);
2382c936
YW
5070
5071 return s->floating;
5072}
5073
5074_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5075 assert_return(s, -EINVAL);
2eeff0f4 5076 assert_return(!event_origin_changed(s->event), -ECHILD);
2382c936
YW
5077
5078 if (s->floating == !!b)
5079 return 0;
5080
5081 if (!s->event) /* Already disconnected */
5082 return -ESTALE;
5083
5084 s->floating = b;
5085
5086 if (b) {
5087 sd_event_source_ref(s);
5088 sd_event_unref(s->event);
5089 } else {
5090 sd_event_ref(s->event);
5091 sd_event_source_unref(s);
5092 }
5093
5094 return 1;
5095}
b778cba4
LP
5096
5097_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5098 assert_return(s, -EINVAL);
5099 assert_return(s->type != SOURCE_EXIT, -EDOM);
2eeff0f4 5100 assert_return(!event_origin_changed(s->event), -ECHILD);
b778cba4
LP
5101
5102 return s->exit_on_failure;
5103}
5104
5105_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5106 assert_return(s, -EINVAL);
5107 assert_return(s->type != SOURCE_EXIT, -EDOM);
2eeff0f4 5108 assert_return(!event_origin_changed(s->event), -ECHILD);
b778cba4
LP
5109
5110 if (s->exit_on_failure == !!b)
5111 return 0;
5112
5113 s->exit_on_failure = b;
5114 return 1;
5115}
b6d5481b
LP
5116
5117_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5118 int r;
5119
5120 assert_return(s, -EINVAL);
2eeff0f4 5121 assert_return(!event_origin_changed(s->event), -ECHILD);
b6d5481b
LP
5122
5123 /* Turning on ratelimiting on event source types that don't support it, is a loggable offense. Doing
5124 * so is a programming error. */
5125 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5126
5127 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5128 * non-ratelimited. */
fd69f224 5129 r = event_source_leave_ratelimit(s, /* run_callback */ false);
b6d5481b
LP
5130 if (r < 0)
5131 return r;
5132
5133 s->rate_limit = (RateLimit) { interval, burst };
5134 return 0;
fd69f224
MS
5135}
5136
5137_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5138 assert_return(s, -EINVAL);
2eeff0f4 5139 assert_return(!event_origin_changed(s->event), -ECHILD);
fd69f224
MS
5140
5141 s->ratelimit_expire_callback = callback;
5142 return 0;
b6d5481b
LP
5143}
5144
5145_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5146 assert_return(s, -EINVAL);
2eeff0f4 5147 assert_return(!event_origin_changed(s->event), -ECHILD);
b6d5481b 5148
6dd3b818
YW
5149 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5150 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
b6d5481b
LP
5151 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5152 return -EDOM;
5153
5154 if (!ratelimit_configured(&s->rate_limit))
5155 return -ENOEXEC;
5156
5157 if (ret_interval)
5158 *ret_interval = s->rate_limit.interval;
5159 if (ret_burst)
5160 *ret_burst = s->rate_limit.burst;
5161
5162 return 0;
5163}
5164
5165_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5166 assert_return(s, -EINVAL);
2eeff0f4 5167 assert_return(!event_origin_changed(s->event), -ECHILD);
b6d5481b
LP
5168
5169 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5170 return false;
5171
5172 if (!ratelimit_configured(&s->rate_limit))
5173 return false;
5174
5175 return s->ratelimited;
5176}
baf3fdec 5177
2fdc274c
LP
5178_public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
5179 int r;
5180
5181 assert_return(s, -EINVAL);
5182
5183 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5184 return 0;
5185
5186 if (!ratelimit_configured(&s->rate_limit))
5187 return 0;
5188
5189 if (!s->ratelimited)
5190 return 0;
5191
5192 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5193 if (r < 0)
5194 return r;
5195
5196 return 1; /* tell caller that we indeed just left the ratelimit state */
5197}
5198
baf3fdec
LP
5199_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5200 bool change = false;
5201 int r;
5202
5203 assert_return(e, -EINVAL);
5dc06688
YW
5204 assert_return(e = event_resolve(e), -ENOPKG);
5205 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
5206 assert_return(!event_origin_changed(e), -ECHILD);
baf3fdec
LP
5207
5208 if (b) {
5209 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5210 * so. But we also don't want them to pin the event loop itself. Hence we mark them as
5211 * floating after creation (and undo this before deleting them again). */
5212
5213 if (!e->sigint_event_source) {
5214 r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5215 if (r < 0)
5216 return r;
5217
5dc06688 5218 assert_se(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
baf3fdec
LP
5219 change = true;
5220 }
5221
5222 if (!e->sigterm_event_source) {
5223 r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5224 if (r < 0) {
5225 if (change) {
5dc06688 5226 assert_se(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
baf3fdec
LP
5227 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5228 }
5229
5230 return r;
5231 }
5232
5dc06688 5233 assert_se(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
baf3fdec
LP
5234 change = true;
5235 }
5236
5237 } else {
5238 if (e->sigint_event_source) {
5dc06688 5239 assert_se(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
baf3fdec
LP
5240 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5241 change = true;
5242 }
5243
5244 if (e->sigterm_event_source) {
5dc06688 5245 assert_se(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
baf3fdec
LP
5246 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5247 change = true;
5248 }
5249 }
5250
5251 return change;
5252}
158fe190
LP
5253
5254_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5255 _cleanup_free_ char *b = NULL;
5256 _cleanup_free_ void *w = NULL;
5257
5258 assert_return(s, -EINVAL);
5259 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5260 assert_return(ty, -EINVAL);
2eeff0f4 5261 assert_return(!event_origin_changed(s->event), -ECHILD);
158fe190
LP
5262
5263 if (!STR_IN_SET(ty, "some", "full"))
5264 return -EINVAL;
5265
5266 if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5267 return -EBUSY;
5268
5269 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5270 if (!space)
5271 return -EINVAL;
5272
5273 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5274 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5275 if (!b)
5276 return -ENOMEM;
5277 if (!STR_IN_SET(b, "some", "full"))
5278 return -EINVAL;
5279
5280 if (streq(b, ty))
5281 return 0;
5282
5283 size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5284 w = new(char, nl);
5285 if (!w)
5286 return -ENOMEM;
5287
5288 memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5289
5290 free_and_replace(s->memory_pressure.write_buffer, w);
5291 s->memory_pressure.write_buffer_size = nl;
5292 s->memory_pressure.locked = false;
5293
5294 return 1;
5295}
5296
5297_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5298 _cleanup_free_ char *b = NULL;
5299 _cleanup_free_ void *w = NULL;
5300
5301 assert_return(s, -EINVAL);
5302 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
2eeff0f4 5303 assert_return(!event_origin_changed(s->event), -ECHILD);
158fe190
LP
5304
5305 if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5306 return -ERANGE;
5307 if (window_usec <= 0 || window_usec >= UINT64_MAX)
5308 return -ERANGE;
5309 if (threshold_usec > window_usec)
5310 return -EINVAL;
5311
5312 if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5313 return -EBUSY;
5314
5315 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5316 if (!space)
5317 return -EINVAL;
5318
5319 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5320 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5321 if (!b)
5322 return -ENOMEM;
5323 if (!STR_IN_SET(b, "some", "full"))
5324 return -EINVAL;
5325
5326 if (asprintf((char**) &w,
5327 "%s " USEC_FMT " " USEC_FMT "",
5328 b,
5329 threshold_usec,
5330 window_usec) < 0)
5331 return -EINVAL;
5332
5333 l = strlen(w) + 1;
5334 if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5335 return 0;
5336
5337 free_and_replace(s->memory_pressure.write_buffer, w);
5338 s->memory_pressure.write_buffer_size = l;
5339 s->memory_pressure.locked = false;
5340
5341 return 1;
5342}
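/* Usage sketch for the two setters above (added; illustrative, error handling elided):
 *
 *     sd_event_source *s;
 *     assert_se(sd_event_add_memory_pressure(e, &s, callback, NULL) >= 0);
 *     // Watch for "full" stalls of at least 100ms within any 1s window:
 *     assert_se(sd_event_source_set_memory_pressure_type(s, "full") >= 0);
 *     assert_se(sd_event_source_set_memory_pressure_period(s, 100 * USEC_PER_MSEC, USEC_PER_SEC) >= 0);
 */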