1/* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3#include <malloc.h>
4#include <stdlib.h>
5#include <sys/timerfd.h>
6#include <sys/wait.h>
7#include <threads.h>
8#include <unistd.h>
9
10#include "sd-daemon.h"
11#include "sd-event.h"
12#include "sd-id128.h"
13#include "sd-messages.h"
14
15#include "alloc-util.h"
16#include "errno-util.h"
17#include "event-source.h"
18#include "fd-util.h"
19#include "format-util.h"
20#include "glyph-util.h"
21#include "hashmap.h"
22#include "hexdecoct.h"
23#include "list.h"
24#include "log.h"
25#include "logarithm.h"
26#include "memory-util.h"
27#include "missing_magic.h"
28#include "missing_wait.h"
29#include "origin-id.h"
30#include "path-util.h"
31#include "pidfd-util.h"
32#include "prioq.h"
33#include "process-util.h"
34#include "psi-util.h"
35#include "set.h"
36#include "signal-util.h"
37#include "siphash24.h"
38#include "socket-util.h"
39#include "stat-util.h"
40#include "string-table.h"
41#include "string-util.h"
42#include "strv.h"
43#include "strxcpyx.h"
44#include "time-util.h"
45
46#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
47
48static bool EVENT_SOURCE_WATCH_PIDFD(const sd_event_source *s) {
49 /* Returns true if this is a PID event source that can be implemented by watching the pidfd for EPOLLIN */
50 return s &&
51 s->type == SOURCE_CHILD &&
52 s->child.options == WEXITED;
53}
54
55static bool event_source_is_online(sd_event_source *s) {
56 assert(s);
57 return s->enabled != SD_EVENT_OFF && !s->ratelimited;
58}
59
60static bool event_source_is_offline(sd_event_source *s) {
61 assert(s);
62 return s->enabled == SD_EVENT_OFF || s->ratelimited;
63}
64
65static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
66 [SOURCE_IO] = "io",
67 [SOURCE_TIME_REALTIME] = "realtime",
68 [SOURCE_TIME_BOOTTIME] = "boottime",
69 [SOURCE_TIME_MONOTONIC] = "monotonic",
70 [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
71 [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
72 [SOURCE_SIGNAL] = "signal",
73 [SOURCE_CHILD] = "child",
74 [SOURCE_DEFER] = "defer",
75 [SOURCE_POST] = "post",
76 [SOURCE_EXIT] = "exit",
77 [SOURCE_WATCHDOG] = "watchdog",
78 [SOURCE_INOTIFY] = "inotify",
79 [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
80};
81
82DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
83
84#define EVENT_SOURCE_IS_TIME(t) \
85 IN_SET((t), \
86 SOURCE_TIME_REALTIME, \
87 SOURCE_TIME_BOOTTIME, \
88 SOURCE_TIME_MONOTONIC, \
89 SOURCE_TIME_REALTIME_ALARM, \
90 SOURCE_TIME_BOOTTIME_ALARM)
91
92#define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
93 IN_SET((t), \
94 SOURCE_IO, \
95 SOURCE_TIME_REALTIME, \
96 SOURCE_TIME_BOOTTIME, \
97 SOURCE_TIME_MONOTONIC, \
98 SOURCE_TIME_REALTIME_ALARM, \
99 SOURCE_TIME_BOOTTIME_ALARM, \
100 SOURCE_SIGNAL, \
101 SOURCE_DEFER, \
102 SOURCE_INOTIFY, \
103 SOURCE_MEMORY_PRESSURE)
104
105/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
106 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
107 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
108#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
109
110struct sd_event {
111 unsigned n_ref;
112
113 int epoll_fd;
114 int watchdog_fd;
115
116 Prioq *pending;
117 Prioq *prepare;
118
119 /* timerfd_create() only supports these five clocks so far. We
120 * can add support for more clocks when the kernel learns to
121 * deal with them, too. */
122 struct clock_data realtime;
123 struct clock_data boottime;
124 struct clock_data monotonic;
125 struct clock_data realtime_alarm;
126 struct clock_data boottime_alarm;
127
128 usec_t perturb;
129
130 sd_event_source **signal_sources; /* indexed by signal number */
131 Hashmap *signal_data; /* indexed by priority */
132
133 Hashmap *child_sources;
134 unsigned n_online_child_sources;
135
136 Set *post_sources;
137
138 Prioq *exit;
139
140 Hashmap *inotify_data; /* indexed by priority */
141
142 /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
143 LIST_HEAD(struct inode_data, inode_data_to_close_list);
144
145 /* A list of inotify objects that already have events buffered which aren't processed yet */
146 LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
147
148 /* A list of memory pressure event sources that still need their subscription string written */
149 LIST_HEAD(sd_event_source, memory_pressure_write_list);
150
151 uint64_t origin_id;
152
153 uint64_t iteration;
154 triple_timestamp timestamp;
155 int state;
156
157 bool exit_requested:1;
158 bool need_process_child:1;
159 bool watchdog:1;
160 bool profile_delays:1;
161
162 int exit_code;
163
164 pid_t tid;
165 sd_event **default_event_ptr;
166
167 usec_t watchdog_last, watchdog_period;
168
169 unsigned n_sources;
170
171 struct epoll_event *event_queue;
172
173 LIST_HEAD(sd_event_source, sources);
174
175 sd_event_source *sigint_event_source, *sigterm_event_source;
176
177 usec_t last_run_usec, last_log_usec;
178 unsigned delays[sizeof(usec_t) * 8];
179};
180
181DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);
182
183static thread_local sd_event *default_event = NULL;
184
185static void source_disconnect(sd_event_source *s);
186static void event_gc_inode_data(sd_event *e, struct inode_data *d);
187
188static sd_event* event_resolve(sd_event *e) {
189 return e == SD_EVENT_DEFAULT ? default_event : e;
190}
191
192static int pending_prioq_compare(const void *a, const void *b) {
193 const sd_event_source *x = a, *y = b;
194 int r;
195
196 assert(x->pending);
197 assert(y->pending);
198
199 /* Enabled ones first */
200 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
201 if (r != 0)
202 return r;
203
204 /* Non rate-limited ones first. */
205 r = CMP(!!x->ratelimited, !!y->ratelimited);
206 if (r != 0)
207 return r;
208
209 /* Lower priority values first */
210 r = CMP(x->priority, y->priority);
211 if (r != 0)
212 return r;
213
214 /* Older entries first */
215 return CMP(x->pending_iteration, y->pending_iteration);
216}
217
218static int prepare_prioq_compare(const void *a, const void *b) {
219 const sd_event_source *x = a, *y = b;
220 int r;
221
222 assert(x->prepare);
223 assert(y->prepare);
224
225 /* Enabled ones first */
226 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
227 if (r != 0)
228 return r;
229
230 /* Non rate-limited ones first. */
231 r = CMP(!!x->ratelimited, !!y->ratelimited);
232 if (r != 0)
233 return r;
234
235 /* Move most recently prepared ones last, so that we can stop
236 * preparing as soon as we hit one that has already been
237 * prepared in the current iteration */
238 r = CMP(x->prepare_iteration, y->prepare_iteration);
239 if (r != 0)
240 return r;
241
242 /* Lower priority values first */
243 return CMP(x->priority, y->priority);
244}
245
246static usec_t time_event_source_next(const sd_event_source *s) {
247 assert(s);
248
249 /* We have two kinds of event sources that have elapsation times associated with them: the actual
250 * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
251 * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
252 * looking at here. */
253
254 if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
255 assert(s->rate_limit.begin != 0);
256 assert(s->rate_limit.interval != 0);
257 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
258 }
259
260 /* Otherwise this must be a time event source, if not ratelimited */
261 if (EVENT_SOURCE_IS_TIME(s->type))
262 return s->time.next;
263
264 return USEC_INFINITY;
265}
266
267static usec_t time_event_source_latest(const sd_event_source *s) {
268 assert(s);
269
270 if (s->ratelimited) { /* For ratelimited sources the earliest and the latest time shall actually be the
271 * same, as we should avoid adding additional inaccuracy on top of an already
272 * inaccurate time window */
273 assert(s->rate_limit.begin != 0);
274 assert(s->rate_limit.interval != 0);
275 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
276 }
277
278 /* Must be a time event source, if not ratelimited */
279 if (EVENT_SOURCE_IS_TIME(s->type))
280 return usec_add(s->time.next, s->time.accuracy);
281
282 return USEC_INFINITY;
283}
284
285static bool event_source_timer_candidate(const sd_event_source *s) {
286 assert(s);
287
288 /* Returns true for event sources that either are not pending yet (i.e. where it's worth marking them pending)
289 * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
290 return !s->pending || s->ratelimited;
291}
292
293static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
294 const sd_event_source *x = a, *y = b;
295 int r;
296
297 /* Enabled ones first */
298 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
299 if (r != 0)
300 return r;
301
302 /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
303 r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
304 if (r != 0)
305 return r;
306
307 /* Order by time */
308 return CMP(time_func(x), time_func(y));
309}
310
311static int earliest_time_prioq_compare(const void *a, const void *b) {
312 return time_prioq_compare(a, b, time_event_source_next);
313}
314
315static int latest_time_prioq_compare(const void *a, const void *b) {
316 return time_prioq_compare(a, b, time_event_source_latest);
317}
318
319static int exit_prioq_compare(const void *a, const void *b) {
320 const sd_event_source *x = a, *y = b;
321 int r;
322
323 assert(x->type == SOURCE_EXIT);
324 assert(y->type == SOURCE_EXIT);
325
326 /* Enabled ones first */
327 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
328 if (r != 0)
329 return r;
330
331 /* Lower priority values first */
332 return CMP(x->priority, y->priority);
333}
334
335static void free_clock_data(struct clock_data *d) {
336 assert(d);
337 assert(d->wakeup == WAKEUP_CLOCK_DATA);
338
339 safe_close(d->fd);
340 prioq_free(d->earliest);
341 prioq_free(d->latest);
342}
343
344static sd_event* event_free(sd_event *e) {
345 sd_event_source *s;
346
347 assert(e);
348
349 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
350 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
351
352 while ((s = e->sources)) {
353 assert(s->floating);
354 source_disconnect(s);
355 sd_event_source_unref(s);
356 }
357
358 assert(e->n_sources == 0);
359
360 if (e->default_event_ptr)
361 *(e->default_event_ptr) = NULL;
362
363 safe_close(e->epoll_fd);
364 safe_close(e->watchdog_fd);
365
366 free_clock_data(&e->realtime);
367 free_clock_data(&e->boottime);
368 free_clock_data(&e->monotonic);
369 free_clock_data(&e->realtime_alarm);
370 free_clock_data(&e->boottime_alarm);
371
372 prioq_free(e->pending);
373 prioq_free(e->prepare);
374 prioq_free(e->exit);
375
376 free(e->signal_sources);
377 hashmap_free(e->signal_data);
378
379 hashmap_free(e->inotify_data);
380
381 hashmap_free(e->child_sources);
382 set_free(e->post_sources);
383
384 free(e->event_queue);
385
386 return mfree(e);
387}
388
389_public_ int sd_event_new(sd_event** ret) {
390 sd_event *e;
391 int r;
392
393 assert_return(ret, -EINVAL);
394
395 e = new(sd_event, 1);
396 if (!e)
397 return -ENOMEM;
398
399 *e = (sd_event) {
400 .n_ref = 1,
401 .epoll_fd = -EBADF,
402 .watchdog_fd = -EBADF,
403 .realtime.wakeup = WAKEUP_CLOCK_DATA,
404 .realtime.fd = -EBADF,
405 .realtime.next = USEC_INFINITY,
406 .boottime.wakeup = WAKEUP_CLOCK_DATA,
407 .boottime.fd = -EBADF,
408 .boottime.next = USEC_INFINITY,
409 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
410 .monotonic.fd = -EBADF,
411 .monotonic.next = USEC_INFINITY,
412 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
413 .realtime_alarm.fd = -EBADF,
414 .realtime_alarm.next = USEC_INFINITY,
415 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
416 .boottime_alarm.fd = -EBADF,
417 .boottime_alarm.next = USEC_INFINITY,
418 .perturb = USEC_INFINITY,
419 .origin_id = origin_id_query(),
420 };
421
422 r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
423 if (r < 0)
424 goto fail;
425
426 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
427 if (e->epoll_fd < 0) {
428 r = -errno;
429 goto fail;
430 }
431
432 e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
433
434 if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
435 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
436 glyph(GLYPH_ELLIPSIS));
437 e->profile_delays = true;
438 }
439
440 *ret = e;
441 return 0;
442
443fail:
444 event_free(e);
445 return r;
446}
447
448/* Define manually so we can add the origin check */
449_public_ sd_event* sd_event_ref(sd_event *e) {
450 if (!e)
451 return NULL;
452 if (event_origin_changed(e))
453 return NULL;
454
455 e->n_ref++;
456
457 return e;
458}
459
460_public_ sd_event* sd_event_unref(sd_event *e) {
461 if (!e)
462 return NULL;
463 if (event_origin_changed(e))
464 return NULL;
465
466 assert(e->n_ref > 0);
467 if (--e->n_ref > 0)
468 return NULL;
469
470 return event_free(e);
471}
472
473#define PROTECT_EVENT(e) \
474 _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
475
476_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
477 int r;
478
479 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
480 if (r < 0)
481 log_debug_errno(r, "Failed to disable event source %p (%s): %m",
482 s, strna(s->description));
483
484 return sd_event_source_unref(s);
485}
486
487static void source_io_unregister(sd_event_source *s) {
488 assert(s);
489 assert(s->type == SOURCE_IO);
490
491 if (event_origin_changed(s->event))
492 return;
493
494 if (!s->io.registered)
495 return;
496
497 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
498 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
499 strna(s->description), event_source_type_to_string(s->type));
500
501 s->io.registered = false;
502}
503
504static int source_io_register(
505 sd_event_source *s,
506 int enabled,
507 uint32_t events) {
508
509 assert(s);
510 assert(s->type == SOURCE_IO);
511 assert(enabled != SD_EVENT_OFF);
512
513 struct epoll_event ev = {
514 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
515 .data.ptr = s,
516 };
517
518 if (epoll_ctl(s->event->epoll_fd,
519 s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
520 s->io.fd, &ev) < 0)
521 return -errno;
522
523 s->io.registered = true;
524
525 return 0;
526}
527
528static void source_child_pidfd_unregister(sd_event_source *s) {
529 assert(s);
530 assert(s->type == SOURCE_CHILD);
531
532 if (event_origin_changed(s->event))
533 return;
534
535 if (!s->child.registered)
536 return;
537
538 if (EVENT_SOURCE_WATCH_PIDFD(s))
539 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
540 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
541 strna(s->description), event_source_type_to_string(s->type));
542
543 s->child.registered = false;
544}
545
546static int source_child_pidfd_register(sd_event_source *s, int enabled) {
547 assert(s);
548 assert(s->type == SOURCE_CHILD);
549 assert(enabled != SD_EVENT_OFF);
550
551 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
552 struct epoll_event ev = {
553 .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
554 .data.ptr = s,
555 };
556
557 if (epoll_ctl(s->event->epoll_fd,
558 s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
559 s->child.pidfd, &ev) < 0)
560 return -errno;
561 }
562
563 s->child.registered = true;
564 return 0;
565}
566
567static void source_memory_pressure_unregister(sd_event_source *s) {
568 assert(s);
569 assert(s->type == SOURCE_MEMORY_PRESSURE);
570
571 if (event_origin_changed(s->event))
572 return;
573
574 if (!s->memory_pressure.registered)
575 return;
576
577 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
578 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
579 strna(s->description), event_source_type_to_string(s->type));
580
581 s->memory_pressure.registered = false;
582}
583
584static int source_memory_pressure_register(sd_event_source *s, int enabled) {
585 assert(s);
586 assert(s->type == SOURCE_MEMORY_PRESSURE);
587 assert(enabled != SD_EVENT_OFF);
588
589 struct epoll_event ev = {
590 .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
591 (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
592 .data.ptr = s,
593 };
594
595 if (epoll_ctl(s->event->epoll_fd,
596 s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
597 s->memory_pressure.fd, &ev) < 0)
598 return -errno;
599
600 s->memory_pressure.registered = true;
601 return 0;
602}
603
604static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
605 assert(s);
606 assert(s->type == SOURCE_MEMORY_PRESSURE);
607
608 if (s->memory_pressure.in_write_list)
609 return;
610
611 LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
612 s->memory_pressure.in_write_list = true;
613}
614
615static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
616 assert(s);
617 assert(s->type == SOURCE_MEMORY_PRESSURE);
618
619 if (!s->memory_pressure.in_write_list)
620 return;
621
622 LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
623 s->memory_pressure.in_write_list = false;
624}
625
626static clockid_t event_source_type_to_clock(EventSourceType t) {
627
628 switch (t) {
629
630 case SOURCE_TIME_REALTIME:
631 return CLOCK_REALTIME;
632
633 case SOURCE_TIME_BOOTTIME:
634 return CLOCK_BOOTTIME;
635
636 case SOURCE_TIME_MONOTONIC:
637 return CLOCK_MONOTONIC;
638
639 case SOURCE_TIME_REALTIME_ALARM:
640 return CLOCK_REALTIME_ALARM;
641
642 case SOURCE_TIME_BOOTTIME_ALARM:
643 return CLOCK_BOOTTIME_ALARM;
644
645 default:
646 return (clockid_t) -1;
647 }
648}
649
650static EventSourceType clock_to_event_source_type(clockid_t clock) {
651
652 switch (clock) {
653
654 case CLOCK_REALTIME:
655 return SOURCE_TIME_REALTIME;
656
657 case CLOCK_BOOTTIME:
658 return SOURCE_TIME_BOOTTIME;
659
660 case CLOCK_MONOTONIC:
661 return SOURCE_TIME_MONOTONIC;
662
663 case CLOCK_REALTIME_ALARM:
664 return SOURCE_TIME_REALTIME_ALARM;
665
666 case CLOCK_BOOTTIME_ALARM:
667 return SOURCE_TIME_BOOTTIME_ALARM;
668
669 default:
670 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
671 }
672}
673
674static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
675 assert(e);
676
677 switch (t) {
678
679 case SOURCE_TIME_REALTIME:
680 return &e->realtime;
681
682 case SOURCE_TIME_BOOTTIME:
683 return &e->boottime;
684
685 case SOURCE_TIME_MONOTONIC:
686 return &e->monotonic;
687
688 case SOURCE_TIME_REALTIME_ALARM:
689 return &e->realtime_alarm;
690
691 case SOURCE_TIME_BOOTTIME_ALARM:
692 return &e->boottime_alarm;
693
694 default:
695 return NULL;
696 }
697}
698
699static void event_free_signal_data(sd_event *e, struct signal_data *d) {
700 assert(e);
701
702 if (!d)
703 return;
704
705 hashmap_remove(e->signal_data, &d->priority);
706 safe_close(d->fd);
707 free(d);
708}
709
710static int event_make_signal_data(
711 sd_event *e,
712 int sig,
713 struct signal_data **ret) {
714
715 struct signal_data *d;
716 bool added = false;
717 sigset_t ss_copy;
718 int64_t priority;
719 int r;
720
721 assert(e);
722
723 if (event_origin_changed(e))
724 return -ECHILD;
725
726 if (e->signal_sources && e->signal_sources[sig])
727 priority = e->signal_sources[sig]->priority;
728 else
729 priority = SD_EVENT_PRIORITY_NORMAL;
730
731 d = hashmap_get(e->signal_data, &priority);
732 if (d) {
733 if (sigismember(&d->sigset, sig) > 0) {
734 if (ret)
735 *ret = d;
736 return 0;
737 }
738 } else {
739 d = new(struct signal_data, 1);
740 if (!d)
741 return -ENOMEM;
742
743 *d = (struct signal_data) {
744 .wakeup = WAKEUP_SIGNAL_DATA,
745 .fd = -EBADF,
746 .priority = priority,
747 };
748
749 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
750 if (r < 0) {
751 free(d);
752 return r;
753 }
754
755 added = true;
756 }
757
758 ss_copy = d->sigset;
759 assert_se(sigaddset(&ss_copy, sig) >= 0);
760
761 r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
762 &ss_copy,
763 SFD_NONBLOCK|SFD_CLOEXEC);
764 if (r < 0) {
765 r = -errno;
766 goto fail;
767 }
768
769 d->sigset = ss_copy;
770
771 if (d->fd >= 0) {
772 if (ret)
773 *ret = d;
774 return 0;
775 }
776
777 d->fd = fd_move_above_stdio(r);
778
779 struct epoll_event ev = {
780 .events = EPOLLIN,
781 .data.ptr = d,
782 };
783
784 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
785 r = -errno;
786 goto fail;
787 }
788
789 if (ret)
790 *ret = d;
791
792 return 0;
793
794fail:
795 if (added)
796 event_free_signal_data(e, d);
797
798 return r;
799}
800
801static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
802 assert(e);
803 assert(d);
804
805 /* Turns off the specified signal in the signal data
806 * object. If the signal mask of the object becomes empty that
807 * way, the object is removed. */
808
809 if (sigismember(&d->sigset, sig) == 0)
810 return;
811
812 assert_se(sigdelset(&d->sigset, sig) >= 0);
813
814 if (sigisemptyset(&d->sigset)) {
815 /* If the mask is all-zero we can get rid of the structure */
816 event_free_signal_data(e, d);
817 return;
818 }
819
820 if (event_origin_changed(e))
821 return;
822
823 assert(d->fd >= 0);
824
825 if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
826 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
827}
828
829static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
830 struct signal_data *d;
831 static const int64_t zero_priority = 0;
832
833 assert(e);
834
835 /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
836 * and possibly drop the signalfd for it. */
837
838 if (sig == SIGCHLD &&
839 e->n_online_child_sources > 0)
840 return;
841
842 if (e->signal_sources &&
843 e->signal_sources[sig] &&
844 event_source_is_online(e->signal_sources[sig]))
845 return;
846
847 /*
848 * The specified signal might be enabled in three different queues:
849 *
850 * 1) the one that belongs to the priority passed (if it is non-NULL)
851 * 2) the one that belongs to the priority of the event source of the signal (if there is one)
852 * 3) the 0 priority (to cover the SIGCHLD case)
853 *
854 * Hence, let's remove it from all three here.
855 */
856
857 if (priority) {
858 d = hashmap_get(e->signal_data, priority);
859 if (d)
860 event_unmask_signal_data(e, d, sig);
861 }
862
863 if (e->signal_sources && e->signal_sources[sig]) {
864 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
865 if (d)
866 event_unmask_signal_data(e, d, sig);
867 }
868
869 d = hashmap_get(e->signal_data, &zero_priority);
870 if (d)
871 event_unmask_signal_data(e, d, sig);
872}
873
874static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
875 assert(s);
876
877 /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
878 * they are enabled/disabled or marked pending and such. */
879
880 if (s->pending)
881 prioq_reshuffle(s->event->pending, s, &s->pending_index);
882
883 if (s->prepare)
884 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
885}
886
887static void event_source_time_prioq_reshuffle(sd_event_source *s) {
888 struct clock_data *d;
889
890 assert(s);
891
892 /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
893 * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
894 * properly again. */
895
896 if (s->ratelimited)
897 d = &s->event->monotonic;
898 else if (EVENT_SOURCE_IS_TIME(s->type))
899 assert_se(d = event_get_clock_data(s->event, s->type));
900 else
901 return; /* no-op for an event source which is neither a timer nor ratelimited. */
902
903 prioq_reshuffle(d->earliest, s, &s->earliest_index);
904 prioq_reshuffle(d->latest, s, &s->latest_index);
905 d->needs_rearm = true;
906}
907
908static void event_source_time_prioq_remove(
909 sd_event_source *s,
910 struct clock_data *d) {
911
912 assert(s);
913 assert(d);
914
915 prioq_remove(d->earliest, s, &s->earliest_index);
916 prioq_remove(d->latest, s, &s->latest_index);
917 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
918 d->needs_rearm = true;
919}
920
921static void source_disconnect(sd_event_source *s) {
922 sd_event *event;
923 int r;
924
925 assert(s);
926
927 if (!s->event)
928 return;
929
930 assert(s->event->n_sources > 0);
931
932 switch (s->type) {
933
934 case SOURCE_IO:
935 if (s->io.fd >= 0)
936 source_io_unregister(s);
937
938 break;
939
940 case SOURCE_TIME_REALTIME:
941 case SOURCE_TIME_BOOTTIME:
942 case SOURCE_TIME_MONOTONIC:
943 case SOURCE_TIME_REALTIME_ALARM:
944 case SOURCE_TIME_BOOTTIME_ALARM:
945 /* Only remove this event source from the time event source here if it is not ratelimited. If
946 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
947 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
948
949 if (!s->ratelimited) {
950 struct clock_data *d;
951 assert_se(d = event_get_clock_data(s->event, s->type));
952 event_source_time_prioq_remove(s, d);
953 }
954
955 break;
956
957 case SOURCE_SIGNAL:
958 if (s->signal.sig > 0) {
959
960 if (s->event->signal_sources)
961 s->event->signal_sources[s->signal.sig] = NULL;
962
963 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
964
965 if (s->signal.unblock) {
966 sigset_t new_ss;
967
968 if (sigemptyset(&new_ss) < 0)
969 log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
970 else if (sigaddset(&new_ss, s->signal.sig) < 0)
971 log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
972 else {
973 r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
974 if (r != 0)
975 log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
976 }
977 }
978 }
979
980 break;
981
982 case SOURCE_CHILD:
983 if (event_origin_changed(s->event))
984 s->child.process_owned = false;
985
986 if (s->child.pid > 0) {
987 if (event_source_is_online(s)) {
988 assert(s->event->n_online_child_sources > 0);
989 s->event->n_online_child_sources--;
990 }
991
992 assert_se(hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid)));
993 }
994
995 if (EVENT_SOURCE_WATCH_PIDFD(s))
996 source_child_pidfd_unregister(s);
997 else
998 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
999
1000 break;
1001
1002 case SOURCE_DEFER:
1003 /* nothing */
1004 break;
1005
1006 case SOURCE_POST:
1007 set_remove(s->event->post_sources, s);
1008 break;
1009
1010 case SOURCE_EXIT:
1011 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
1012 break;
1013
1014 case SOURCE_INOTIFY: {
1015 struct inode_data *inode_data;
1016
1017 inode_data = s->inotify.inode_data;
1018 if (inode_data) {
1019 struct inotify_data *inotify_data;
1020 assert_se(inotify_data = inode_data->inotify_data);
1021
1022 /* Detach this event source from the inode object */
1023 LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
1024 s->inotify.inode_data = NULL;
1025
1026 if (s->pending) {
1027 assert(inotify_data->n_pending > 0);
1028 inotify_data->n_pending--;
1029 }
1030
1031 /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
1032 * continues to be watched. That's because inotify doesn't really have an API for that: we
1033 * can only change watch masks with access to the original inode either by fd or by path. But
1034 * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
1035 * continuously and keeping the mount busy which we can't really do. We could reconstruct the
1036 * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
1037 * there), but given the need for open_by_handle_at() which is privileged and not universally
1038 * available this would be quite an incomplete solution. Hence we go the other way, leave the
1039 * mask set, even if it is not minimized now, and ignore all events we aren't interested in
1040 * anymore after reception. Yes, this sucks, but … Linux … */
1041
1042 /* Maybe release the inode data (and its inotify) */
1043 event_gc_inode_data(s->event, inode_data);
1044 }
1045
1046 break;
1047 }
1048
1049 case SOURCE_MEMORY_PRESSURE:
1050 source_memory_pressure_remove_from_write_list(s);
1051 source_memory_pressure_unregister(s);
1052 break;
1053
1054 default:
1055 assert_not_reached();
1056 }
1057
1058 if (s->pending)
1059 prioq_remove(s->event->pending, s, &s->pending_index);
1060
1061 if (s->prepare)
1062 prioq_remove(s->event->prepare, s, &s->prepare_index);
1063
1064 if (s->ratelimited)
1065 event_source_time_prioq_remove(s, &s->event->monotonic);
1066
1067 event = TAKE_PTR(s->event);
1068 LIST_REMOVE(sources, event->sources, s);
1069 event->n_sources--;
1070
1071 /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
1072 * pidfd associated with this event source, which we'll do only on source_free(). */
1073
1074 if (!s->floating)
1075 sd_event_unref(event);
1076}
1077
1078static sd_event_source* source_free(sd_event_source *s) {
1079 int r;
1080
1081 assert(s);
1082
1083 source_disconnect(s);
1084
1085 if (s->type == SOURCE_IO && s->io.owned)
1086 s->io.fd = safe_close(s->io.fd);
1087
1088 if (s->type == SOURCE_CHILD) {
1089 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
1090
1091 if (s->child.process_owned) {
1092 assert(s->child.pid > 0);
1093 assert(s->child.pidfd >= 0);
1094
1095 if (!s->child.exited) {
1096 r = RET_NERRNO(pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0));
1097 if (r < 0 && r != -ESRCH)
1098 log_debug_errno(r, "Failed to kill process " PID_FMT ", ignoring: %m",
1099 s->child.pid);
1100 }
1101
1102 if (!s->child.waited) {
1103 siginfo_t si = {};
1104
1105 /* Reap the child if we can */
1106 (void) waitid(P_PIDFD, s->child.pidfd, &si, WEXITED);
1107 }
1108 }
1109
1110 if (s->child.pidfd_owned)
1111 s->child.pidfd = safe_close(s->child.pidfd);
1112 }
1113
1114 if (s->type == SOURCE_MEMORY_PRESSURE) {
1115 s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
1116 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
1117 }
1118
1119 if (s->destroy_callback)
1120 s->destroy_callback(s->userdata);
1121
1122 free(s->description);
1123 return mfree(s);
1124}
1125DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
1126
1127static int source_set_pending(sd_event_source *s, bool b) {
1128 int r;
1129
1130 assert(s);
1131 assert(s->type != SOURCE_EXIT);
1132
1133 if (s->pending == b)
1134 return 0;
1135
1136 s->pending = b;
1137
1138 if (b) {
1139 s->pending_iteration = s->event->iteration;
1140
1141 r = prioq_put(s->event->pending, s, &s->pending_index);
1142 if (r < 0) {
1143 s->pending = false;
1144 return r;
1145 }
1146 } else
1147 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1148
1149 if (EVENT_SOURCE_IS_TIME(s->type))
1150 event_source_time_prioq_reshuffle(s);
1151
1152 if (s->type == SOURCE_SIGNAL && !b) {
1153 struct signal_data *d;
1154
1155 d = hashmap_get(s->event->signal_data, &s->priority);
1156 if (d && d->current == s)
1157 d->current = NULL;
1158 }
1159
1160 if (s->type == SOURCE_INOTIFY) {
1161
1162 assert(s->inotify.inode_data);
1163 assert(s->inotify.inode_data->inotify_data);
1164
1165 if (b)
1166 s->inotify.inode_data->inotify_data->n_pending++;
1167 else {
1168 assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1169 s->inotify.inode_data->inotify_data->n_pending--;
1170 }
1171 }
1172
1173 return 1;
1174}
1175
1176static sd_event_source* source_new(sd_event *e, bool floating, EventSourceType type) {
1177
1178 /* Let's allocate exactly what we need. Note that the difference of the smallest event source
1179 * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
1180 * lines. */
1181 static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
1182 [SOURCE_IO] = endoffsetof_field(sd_event_source, io),
1183 [SOURCE_TIME_REALTIME] = endoffsetof_field(sd_event_source, time),
1184 [SOURCE_TIME_BOOTTIME] = endoffsetof_field(sd_event_source, time),
1185 [SOURCE_TIME_MONOTONIC] = endoffsetof_field(sd_event_source, time),
1186 [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1187 [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1188 [SOURCE_SIGNAL] = endoffsetof_field(sd_event_source, signal),
1189 [SOURCE_CHILD] = endoffsetof_field(sd_event_source, child),
1190 [SOURCE_DEFER] = endoffsetof_field(sd_event_source, defer),
1191 [SOURCE_POST] = endoffsetof_field(sd_event_source, post),
1192 [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit),
1193 [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify),
1194 [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure),
1195 };
1196
1197 sd_event_source *s;
1198
1199 assert(e);
1200 assert(type >= 0);
1201 assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
1202 assert(size_table[type] > 0);
1203
1204 s = malloc0(size_table[type]);
1205 if (!s)
1206 return NULL;
1207 /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
1208 * size, even if we only allocate the initial part we need. */
1209 s = expand_to_usable(s, sizeof(sd_event_source));
1210
1211 /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
1212 * than what we allocated here. */
1213 s->n_ref = 1;
1214 s->event = e;
1215 s->floating = floating;
1216 s->type = type;
1217 s->pending_index = PRIOQ_IDX_NULL;
1218 s->prepare_index = PRIOQ_IDX_NULL;
1219
1220 if (!floating)
1221 sd_event_ref(e);
1222
1223 LIST_PREPEND(sources, e->sources, s);
1224 e->n_sources++;
1225
1226 return s;
1227}
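/* The truncated allocation done in source_new() can be illustrated in isolation: for a struct whose
 * type-specific payload sits at the end, it is enough to allocate memory up to the end of the member
 * that is actually used. A minimal stand-alone sketch with hypothetical types (not the real
 * sd_event_source layout, which relies on the internal endoffsetof_field() and expand_to_usable()
 * helpers seen above):
 *
 *     #include <stddef.h>
 *     #include <stdlib.h>
 *
 *     #define ENDOFFSETOF_FIELD(t, f) (offsetof(t, f) + sizeof(((t*) NULL)->f))
 *
 *     struct example_source {
 *             int type;
 *             union {
 *                     struct { int fd; unsigned events; } io;
 *                     struct { long next; long accuracy; } time;
 *             };
 *     };
 *
 *     static struct example_source* example_new_io(void) {
 *             // Allocate only up to the end of the "io" member; the larger "time" member is never touched.
 *             struct example_source *s = calloc(1, ENDOFFSETOF_FIELD(struct example_source, io));
 *             if (s)
 *                     s->type = 0;
 *             return s;
 *     }
 */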
1228
1229static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1230 assert(s);
1231
1232 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1233}
1234
1235_public_ int sd_event_add_io(
1236 sd_event *e,
1237 sd_event_source **ret,
1238 int fd,
1239 uint32_t events,
1240 sd_event_io_handler_t callback,
1241 void *userdata) {
1242
1243 _cleanup_(source_freep) sd_event_source *s = NULL;
1244 int r;
1245
1246 assert_return(e, -EINVAL);
1247 assert_return(e = event_resolve(e), -ENOPKG);
1248 assert_return(fd >= 0, -EBADF);
1249 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1250 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1251 assert_return(!event_origin_changed(e), -ECHILD);
1252
1253 if (!callback)
1254 callback = io_exit_callback;
1255
1256 s = source_new(e, !ret, SOURCE_IO);
1257 if (!s)
1258 return -ENOMEM;
1259
1260 s->wakeup = WAKEUP_EVENT_SOURCE;
1261 s->io.fd = fd;
1262 s->io.events = events;
1263 s->io.callback = callback;
1264 s->userdata = userdata;
1265 s->enabled = SD_EVENT_ON;
1266
1267 r = source_io_register(s, s->enabled, events);
1268 if (r < 0)
1269 return r;
1270
1271 if (ret)
1272 *ret = s;
1273 TAKE_PTR(s);
1274
1275 return 0;
1276}
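/* Minimal usage sketch for the I/O source API above (illustrative only; assumes a readable descriptor
 * "fd" and the public declarations from "sd-event.h"):
 *
 *     #include <sys/epoll.h>
 *     #include <unistd.h>
 *
 *     #include "sd-event.h"
 *
 *     static int on_readable(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             char buf[256];
 *
 *             // Drain the fd, otherwise a level-triggered EPOLLIN would fire again immediately.
 *             (void) read(fd, buf, sizeof(buf));
 *             return 0;
 *     }
 *
 *     static int watch_fd(sd_event *e, int fd) {
 *             return sd_event_add_io(e, NULL, fd, EPOLLIN, on_readable, NULL);   // ret=NULL: floating source
 *     }
 *
 * Passing ret=NULL (as above) creates a "floating" source owned by the event loop itself; passing a
 * non-NULL ret hands a reference to the caller instead. */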
1277
1278static void initialize_perturb(sd_event *e) {
1279 sd_id128_t id = {};
1280
1281 /* When we sleep for longer, we try to realign the wakeup to the same time within each
1282 * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
1283 * wakeup. However, let's take some system-specific randomness for this value, so that in a network
1284 * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
1285 * perturbation usec offset from the boot ID (or the machine ID if that fails, e.g. because /proc is not mounted). */
1286
1287 if (_likely_(e->perturb != USEC_INFINITY))
1288 return;
1289
1290 if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
1291 e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
1292 else
1293 e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
1294}
1295
1296static int event_setup_timer_fd(
1297 sd_event *e,
1298 struct clock_data *d,
1299 clockid_t clock) {
1300
1301 assert(e);
1302 assert(d);
1303
1304 if (_likely_(d->fd >= 0))
1305 return 0;
1306
1307 _cleanup_close_ int fd = -EBADF;
1308
1309 fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1310 if (fd < 0)
1311 return -errno;
1312
1313 fd = fd_move_above_stdio(fd);
1314
1315 struct epoll_event ev = {
1316 .events = EPOLLIN,
1317 .data.ptr = d,
1318 };
1319
1320 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1321 return -errno;
1322
1323 d->fd = TAKE_FD(fd);
1324 return 0;
1325}
1326
1327static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1328 assert(s);
1329
1330 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1331}
1332
1333static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1334 int r;
1335
1336 assert(d);
1337
1338 if (d->fd < 0) {
1339 r = event_setup_timer_fd(e, d, clock);
1340 if (r < 0)
1341 return r;
1342 }
1343
1344 r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1345 if (r < 0)
1346 return r;
1347
1348 r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1349 if (r < 0)
1350 return r;
1351
1352 return 0;
1353}
1354
1355static int event_source_time_prioq_put(
1356 sd_event_source *s,
1357 struct clock_data *d) {
1358
1359 int r;
1360
1361 assert(s);
1362 assert(d);
1363 assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1364
1365 r = prioq_put(d->earliest, s, &s->earliest_index);
1366 if (r < 0)
1367 return r;
1368
1369 r = prioq_put(d->latest, s, &s->latest_index);
1370 if (r < 0) {
1371 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1372 s->earliest_index = PRIOQ_IDX_NULL;
1373 return r;
1374 }
1375
1376 d->needs_rearm = true;
1377 return 0;
1378}
1379
1380_public_ int sd_event_add_time(
1381 sd_event *e,
1382 sd_event_source **ret,
1383 clockid_t clock,
1384 uint64_t usec,
1385 uint64_t accuracy,
1386 sd_event_time_handler_t callback,
1387 void *userdata) {
1388
1389 EventSourceType type;
1390 _cleanup_(source_freep) sd_event_source *s = NULL;
1391 struct clock_data *d;
1392 int r;
1393
1394 assert_return(e, -EINVAL);
1395 assert_return(e = event_resolve(e), -ENOPKG);
1396 assert_return(accuracy != UINT64_MAX, -EINVAL);
1397 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1398 assert_return(!event_origin_changed(e), -ECHILD);
1399
1400 if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1401 return -EOPNOTSUPP;
1402
1403 type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1404 if (type < 0)
1405 return -EOPNOTSUPP;
1406
1407 if (!callback)
1408 callback = time_exit_callback;
1409
1410 assert_se(d = event_get_clock_data(e, type));
1411
1412 r = setup_clock_data(e, d, clock);
1413 if (r < 0)
1414 return r;
1415
1416 s = source_new(e, !ret, type);
1417 if (!s)
1418 return -ENOMEM;
1419
1420 s->time.next = usec;
1421 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1422 s->time.callback = callback;
1423 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1424 s->userdata = userdata;
1425 s->enabled = SD_EVENT_ONESHOT;
1426
1427 r = event_source_time_prioq_put(s, d);
1428 if (r < 0)
1429 return r;
1430
1431 if (ret)
1432 *ret = s;
1433 TAKE_PTR(s);
1434
1435 return 0;
1436}
1437
1438_public_ int sd_event_add_time_relative(
1439 sd_event *e,
1440 sd_event_source **ret,
1441 clockid_t clock,
1442 uint64_t usec,
1443 uint64_t accuracy,
1444 sd_event_time_handler_t callback,
1445 void *userdata) {
1446
1447 usec_t t;
1448 int r;
1449
1450 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1451 * checks for overflow. */
1452
1453 r = sd_event_now(e, clock, &t);
1454 if (r < 0)
1455 return r;
1456
1457 if (usec >= USEC_INFINITY - t)
1458 return -EOVERFLOW;
1459
1460 return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1461}
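/* Usage sketch for a one-shot relative timer (illustrative; on_timer() is a hypothetical callback):
 *
 *     static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *             // Fires once, roughly 5s after the source was added.
 *             return 0;
 *     }
 *
 *     static int add_five_second_timer(sd_event *e) {
 *             return sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC,
 *                                               5 * USEC_PER_SEC,      // delay, relative to sd_event_now()
 *                                               100 * USEC_PER_MSEC,   // accuracy; 0 would mean DEFAULT_ACCURACY_USEC
 *                                               on_timer, NULL);
 *     }
 */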
1462
1463static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1464 assert(s);
1465
1466 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1467}
1468
1469_public_ int sd_event_add_signal(
1470 sd_event *e,
1471 sd_event_source **ret,
1472 int sig,
1473 sd_event_signal_handler_t callback,
1474 void *userdata) {
1475
1476 _cleanup_(source_freep) sd_event_source *s = NULL;
1477 struct signal_data *d;
1478 sigset_t new_ss;
1479 bool block_it;
1480 int r;
1481
1482 assert_return(e, -EINVAL);
1483 assert_return(e = event_resolve(e), -ENOPKG);
1484 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1485 assert_return(!event_origin_changed(e), -ECHILD);
1486
1487 /* Let's make sure our special flag stays outside of the valid signal range */
1488 assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);
1489
1490 if (sig & SD_EVENT_SIGNAL_PROCMASK) {
1491 sig &= ~SD_EVENT_SIGNAL_PROCMASK;
1492 assert_return(SIGNAL_VALID(sig), -EINVAL);
1493
1494 block_it = true;
1495 } else {
1496 assert_return(SIGNAL_VALID(sig), -EINVAL);
1497
1498 r = signal_is_blocked(sig);
1499 if (r < 0)
1500 return r;
1501 if (r == 0)
1502 return -EBUSY;
1503
1504 block_it = false;
1505 }
1506
1507 if (!callback)
1508 callback = signal_exit_callback;
1509
1510 if (!e->signal_sources) {
1511 e->signal_sources = new0(sd_event_source*, _NSIG);
1512 if (!e->signal_sources)
1513 return -ENOMEM;
1514 } else if (e->signal_sources[sig])
1515 return -EBUSY;
1516
1517 s = source_new(e, !ret, SOURCE_SIGNAL);
1518 if (!s)
1519 return -ENOMEM;
1520
1521 s->signal.sig = sig;
1522 s->signal.callback = callback;
1523 s->userdata = userdata;
1524 s->enabled = SD_EVENT_ON;
1525
1526 e->signal_sources[sig] = s;
1527
1528 if (block_it) {
1529 sigset_t old_ss;
1530
1531 if (sigemptyset(&new_ss) < 0)
1532 return -errno;
1533
1534 if (sigaddset(&new_ss, sig) < 0)
1535 return -errno;
1536
1537 r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
1538 if (r != 0)
1539 return -r;
1540
1541 r = sigismember(&old_ss, sig);
1542 if (r < 0)
1543 return -errno;
1544
1545 s->signal.unblock = !r;
1546 } else
1547 s->signal.unblock = false;
1548
1549 r = event_make_signal_data(e, sig, &d);
1550 if (r < 0) {
1551 if (s->signal.unblock)
1552 (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
1553
1554 return r;
1555 }
1556
1557 /* Use the signal name as description for the event source by default */
1558 (void) sd_event_source_set_description(s, signal_to_string(sig));
1559
1560 if (ret)
1561 *ret = s;
1562 TAKE_PTR(s);
1563
1564 return 0;
1565}
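/* Two ways to set up a signal source (sketch; on_usr1() is a hypothetical callback). Either the caller
 * blocks the signal first, or the SD_EVENT_SIGNAL_PROCMASK flag asks the event loop to do so:
 *
 *     static int add_usr1_source(sd_event *e) {
 *             sigset_t ss;
 *
 *             // Variant A: block SIGUSR1 ourselves, then pass the plain signal number.
 *             sigemptyset(&ss);
 *             sigaddset(&ss, SIGUSR1);
 *             pthread_sigmask(SIG_BLOCK, &ss, NULL);   // otherwise sd_event_add_signal() returns -EBUSY
 *             return sd_event_add_signal(e, NULL, SIGUSR1, on_usr1, NULL);
 *
 *             // Variant B (alternative): let sd-event block and later unblock the signal for us:
 *             //   sd_event_add_signal(e, NULL, SIGUSR1 | SD_EVENT_SIGNAL_PROCMASK, on_usr1, NULL);
 *     }
 */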
1566
1567static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1568 assert(s);
1569
1570 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1571}
1572
1573_public_ int sd_event_add_child(
1574 sd_event *e,
1575 sd_event_source **ret,
1576 pid_t pid,
1577 int options,
1578 sd_event_child_handler_t callback,
1579 void *userdata) {
1580
1581 _cleanup_(source_freep) sd_event_source *s = NULL;
1582 int r;
1583
1584 assert_return(e, -EINVAL);
1585 assert_return(e = event_resolve(e), -ENOPKG);
1586 assert_return(pid > 1, -EINVAL);
1587 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1588 assert_return(options != 0, -EINVAL);
1589 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1590 assert_return(!event_origin_changed(e), -ECHILD);
1591
1592 if (!callback)
1593 callback = child_exit_callback;
1594
1595 if (e->n_online_child_sources == 0) {
1596 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1597 * for compatibility with pre-pidfd and because we don't want to reap the child processes
1598 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1599 * take effect.
1600 *
1601 * (As an optimization we only do this check on the first child event source created.) */
1602 r = signal_is_blocked(SIGCHLD);
1603 if (r < 0)
1604 return r;
1605 if (r == 0)
1606 return -EBUSY;
1607 }
1608
1609 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1610 if (r < 0)
1611 return r;
1612
1613 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1614 return -EBUSY;
1615
1616 s = source_new(e, !ret, SOURCE_CHILD);
1617 if (!s)
1618 return -ENOMEM;
1619
1620 /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
1621 * pin the PID, and make regular waitid() handling race-free. */
1622
1623 s->child.pidfd = pidfd_open(pid, 0);
1624 if (s->child.pidfd < 0)
1625 return -errno;
1626
1627 s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
1628
1629 s->wakeup = WAKEUP_EVENT_SOURCE;
1630 s->child.options = options;
1631 s->child.callback = callback;
1632 s->userdata = userdata;
1633 s->enabled = SD_EVENT_ONESHOT;
1634
1635 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1636 /* We only want to watch for exit */
1637 r = source_child_pidfd_register(s, s->enabled);
1638 if (r < 0)
1639 return r;
1640
1641 } else {
1642 /* We shall wait for some other event than WEXITED */
1643 r = event_make_signal_data(e, SIGCHLD, NULL);
1644 if (r < 0)
1645 return r;
1646
1647 e->need_process_child = true;
1648 }
1649
1650 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1651 if (r < 0)
1652 return r;
1653
1654 /* These must be done after everything succeeds. */
1655 s->child.pid = pid;
1656 e->n_online_child_sources++;
1657
1658 if (ret)
1659 *ret = s;
1660 TAKE_PTR(s);
1661 return 0;
1662}
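/* As enforced above, SIGCHLD must already be blocked in the calling thread before child sources are
 * added, even on pidfd-capable kernels. A sketch (run_child() and on_child_exit() are hypothetical):
 *
 *     static int watch_child(sd_event *e) {
 *             sigset_t ss;
 *             pid_t pid;
 *
 *             sigemptyset(&ss);
 *             sigaddset(&ss, SIGCHLD);
 *             pthread_sigmask(SIG_BLOCK, &ss, NULL);
 *
 *             pid = fork();
 *             if (pid < 0)
 *                     return -errno;
 *             if (pid == 0)
 *                     _exit(run_child());
 *
 *             // With options == WEXITED the source is watched via its pidfd; other options fall back
 *             // to the per-priority SIGCHLD signalfd handling.
 *             return sd_event_add_child(e, NULL, pid, WEXITED, on_child_exit, NULL);
 *     }
 */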
1663
1664_public_ int sd_event_add_child_pidfd(
1665 sd_event *e,
1666 sd_event_source **ret,
1667 int pidfd,
1668 int options,
1669 sd_event_child_handler_t callback,
1670 void *userdata) {
1671
1672 _cleanup_(source_freep) sd_event_source *s = NULL;
1673 pid_t pid;
1674 int r;
1675
1676 assert_return(e, -EINVAL);
1677 assert_return(e = event_resolve(e), -ENOPKG);
1678 assert_return(pidfd >= 0, -EBADF);
1679 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1680 assert_return(options != 0, -EINVAL);
1681 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1682 assert_return(!event_origin_changed(e), -ECHILD);
1683
1684 if (!callback)
1685 callback = child_exit_callback;
1686
1687 if (e->n_online_child_sources == 0) {
1688 r = signal_is_blocked(SIGCHLD);
1689 if (r < 0)
1690 return r;
1691 if (r == 0)
1692 return -EBUSY;
1693 }
1694
1695 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1696 if (r < 0)
1697 return r;
1698
1699 r = pidfd_get_pid(pidfd, &pid);
1700 if (r < 0)
1701 return r;
1702
1703 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1704 return -EBUSY;
1705
1706 s = source_new(e, !ret, SOURCE_CHILD);
1707 if (!s)
1708 return -ENOMEM;
1709
1710 s->wakeup = WAKEUP_EVENT_SOURCE;
1711 s->child.pidfd = pidfd;
1712 s->child.options = options;
1713 s->child.callback = callback;
1714 s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1715 s->userdata = userdata;
1716 s->enabled = SD_EVENT_ONESHOT;
1717
1718 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1719 /* We only want to watch for WEXITED */
1720 r = source_child_pidfd_register(s, s->enabled);
1721 if (r < 0)
1722 return r;
1723 } else {
1724 /* We shall wait for some other event than WEXITED */
1725 r = event_make_signal_data(e, SIGCHLD, NULL);
1726 if (r < 0)
1727 return r;
1728
1729 e->need_process_child = true;
1730 }
1731
1732 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1733 if (r < 0)
1734 return r;
1735
1736 s->child.pid = pid;
1737 e->n_online_child_sources++;
1738
1739 if (ret)
1740 *ret = s;
1741 TAKE_PTR(s);
1742 return 0;
1743}
1744
1745static int generic_exit_callback(sd_event_source *s, void *userdata) {
1746 assert(s);
1747
1748 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1749}
1750
1751_public_ int sd_event_add_defer(
1752 sd_event *e,
1753 sd_event_source **ret,
1754 sd_event_handler_t callback,
1755 void *userdata) {
1756
1757 _cleanup_(source_freep) sd_event_source *s = NULL;
1758 int r;
1759
1760 assert_return(e, -EINVAL);
1761 assert_return(e = event_resolve(e), -ENOPKG);
1762 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1763 assert_return(!event_origin_changed(e), -ECHILD);
1764
1765 if (!callback)
1766 callback = generic_exit_callback;
1767
1768 s = source_new(e, !ret, SOURCE_DEFER);
1769 if (!s)
1770 return -ENOMEM;
1771
1772 s->defer.callback = callback;
1773 s->userdata = userdata;
1774 s->enabled = SD_EVENT_ONESHOT;
1775
1776 r = source_set_pending(s, true);
1777 if (r < 0)
1778 return r;
1779
1780 if (ret)
1781 *ret = s;
1782 TAKE_PTR(s);
1783
1784 return 0;
1785}
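/* A defer source is created already pending with SD_EVENT_ONESHOT, so it dispatches exactly once on
 * the next event loop iteration. Sketch (do_later() is a hypothetical callback):
 *
 *     static int do_later(sd_event_source *s, void *userdata) {
 *             // Runs once, on the next iteration of the event loop.
 *             return 0;
 *     }
 *
 *     static int queue_deferred_work(sd_event *e) {
 *             return sd_event_add_defer(e, NULL, do_later, NULL);
 *     }
 */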
1786
1787_public_ int sd_event_add_post(
1788 sd_event *e,
1789 sd_event_source **ret,
1790 sd_event_handler_t callback,
1791 void *userdata) {
1792
1793 _cleanup_(source_freep) sd_event_source *s = NULL;
1794 int r;
1795
1796 assert_return(e, -EINVAL);
1797 assert_return(e = event_resolve(e), -ENOPKG);
1798 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1799 assert_return(!event_origin_changed(e), -ECHILD);
1800
1801 if (!callback)
1802 callback = generic_exit_callback;
1803
1804 s = source_new(e, !ret, SOURCE_POST);
1805 if (!s)
1806 return -ENOMEM;
1807
1808 s->post.callback = callback;
1809 s->userdata = userdata;
1810 s->enabled = SD_EVENT_ON;
1811
1812 r = set_ensure_put(&e->post_sources, NULL, s);
1813 if (r < 0)
1814 return r;
1815 assert(r > 0);
1816
1817 if (ret)
1818 *ret = s;
1819 TAKE_PTR(s);
1820
1821 return 0;
1822}
1823
1824_public_ int sd_event_add_exit(
1825 sd_event *e,
1826 sd_event_source **ret,
1827 sd_event_handler_t callback,
1828 void *userdata) {
1829
1830 _cleanup_(source_freep) sd_event_source *s = NULL;
1831 int r;
1832
1833 assert_return(e, -EINVAL);
1834 assert_return(e = event_resolve(e), -ENOPKG);
1835 assert_return(callback, -EINVAL);
1836 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1837 assert_return(!event_origin_changed(e), -ECHILD);
1838
1839 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1840 if (r < 0)
1841 return r;
1842
1843 s = source_new(e, !ret, SOURCE_EXIT);
1844 if (!s)
1845 return -ENOMEM;
1846
1847 s->exit.callback = callback;
1848 s->userdata = userdata;
1849 s->exit.prioq_index = PRIOQ_IDX_NULL;
1850 s->enabled = SD_EVENT_ONESHOT;
1851
1852 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1853 if (r < 0)
1854 return r;
1855
1856 if (ret)
1857 *ret = s;
1858 TAKE_PTR(s);
1859
1860 return 0;
1861}
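/* Exit sources are only dispatched after sd_event_exit() has been called, in priority order, while the
 * loop shuts down. Sketch (on_shutdown() is a hypothetical callback):
 *
 *     static int on_shutdown(sd_event_source *s, void *userdata) {
 *             // Release resources before sd_event_loop() returns.
 *             return 0;
 *     }
 *
 *     static int setup_shutdown_handler(sd_event *e) {
 *             return sd_event_add_exit(e, NULL, on_shutdown, NULL);
 *     }
 *
 * Later, (void) sd_event_exit(e, 0) queues the exit sources and makes the loop return 0. */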
1862
1863_public_ int sd_event_trim_memory(void) {
1864 int r;
1865
1866 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1867 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1868 * NULL callback parameter. */
1869
1870 log_debug("Memory pressure event, trimming malloc() memory.");
1871
1872 struct mallinfo2 before_mallinfo = mallinfo2();
1873
1874 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1875 hashmap_trim_pools();
1876 r = malloc_trim(0);
1877 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1878
1879 if (r > 0)
1880 log_debug("Successfully trimmed some memory.");
1881 else
1882 log_debug("Couldn't trim any memory.");
1883
1884 usec_t period = after_timestamp - before_timestamp;
1885
1886 struct mallinfo2 after_mallinfo = mallinfo2();
1887 size_t l = LESS_BY(before_mallinfo.hblkhd, after_mallinfo.hblkhd) +
1888 LESS_BY(before_mallinfo.arena, after_mallinfo.arena);
1889 log_struct(LOG_DEBUG,
1890 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1891 FORMAT_TIMESPAN(period, 0),
1892 FORMAT_BYTES(l)),
1893 LOG_MESSAGE_ID(SD_MESSAGE_MEMORY_TRIM_STR),
1894 LOG_ITEM("TRIMMED_BYTES=%zu", l),
1895 LOG_ITEM("TRIMMED_USEC=" USEC_FMT, period));
1896
1897 return 0;
1898}
1899
1900static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1901 assert(s);
1902
1903 sd_event_trim_memory();
1904 return 0;
1905}
1906
1907_public_ int sd_event_add_memory_pressure(
1908 sd_event *e,
1909 sd_event_source **ret,
1910 sd_event_handler_t callback,
1911 void *userdata) {
1912
1913 _cleanup_free_ char *w = NULL;
1914 _cleanup_(source_freep) sd_event_source *s = NULL;
1915 _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
1916 _cleanup_free_ void *write_buffer = NULL;
1917 const char *watch, *watch_fallback = NULL, *env;
1918 size_t write_buffer_size = 0;
1919 struct stat st;
1920 uint32_t events;
1921 bool locked;
1922 int r;
1923
1924 assert_return(e, -EINVAL);
1925 assert_return(e = event_resolve(e), -ENOPKG);
1926 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1927 assert_return(!event_origin_changed(e), -ECHILD);
1928
1929 if (!callback)
1930 callback = memory_pressure_callback;
1931
1932 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1933 if (!s)
1934 return -ENOMEM;
1935
1936 s->wakeup = WAKEUP_EVENT_SOURCE;
1937 s->memory_pressure.callback = callback;
1938 s->userdata = userdata;
1939 s->enabled = SD_EVENT_ON;
1940 s->memory_pressure.fd = -EBADF;
1941
1942 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1943 if (env) {
1944 if (isempty(env) || path_equal(env, "/dev/null"))
1945 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1946 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1947
1948 if (!path_is_absolute(env) || !path_is_normalized(env))
1949 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1950 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1951
1952 watch = env;
1953
1954 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1955 if (env) {
1956 r = unbase64mem(env, &write_buffer, &write_buffer_size);
1957 if (r < 0)
1958 return r;
1959 }
1960
1961 locked = true;
1962 } else {
1963
1964 r = is_pressure_supported();
1965 if (r < 0)
1966 return r;
1967 if (r == 0)
1968 return -EOPNOTSUPP;
1969
1970 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1971 * the system wide pressure if for some reason we cannot (which could be: memory controller
1972 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1973 * only use the system-wide logic. */
1974 r = cg_all_unified();
1975 if (r < 0)
1976 return r;
1977 if (r == 0)
1978 watch = "/proc/pressure/memory";
1979 else {
1980 _cleanup_free_ char *cg = NULL;
1981
1982 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
1983 if (r < 0)
1984 return r;
1985
1986 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
1987 if (!w)
1988 return -ENOMEM;
1989
1990 watch = w;
1991 watch_fallback = "/proc/pressure/memory";
1992 }
1993
1994 /* Android uses three levels in its userspace low memory killer logic:
1995 * some 70000 1000000
1996 * some 100000 1000000
1997 * full 70000 1000000
1998 *
1999 * GNOME's low memory monitor uses:
2000 * some 70000 1000000
2001 * some 100000 1000000
2002 * full 100000 1000000
2003 *
2004 * We'll default to the middle level that both agree on. Except we do it on a 2s window
2005 * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration the
2006 * kernel will allow us to use unprivileged, also in the future. */
2007 if (asprintf((char**) &write_buffer,
2008 "%s " USEC_FMT " " USEC_FMT,
2009 MEMORY_PRESSURE_DEFAULT_TYPE,
2010 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2011 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2012 return -ENOMEM;
2013
2014 write_buffer_size = strlen(write_buffer) + 1;
2015 locked = false;
2016 }
2017
2018 path_fd = open(watch, O_PATH|O_CLOEXEC);
2019 if (path_fd < 0) {
2020 if (errno != ENOENT)
2021 return -errno;
2022
2023 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2024 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2025 * the PSI service apparently is not supported) */
2026 if (!watch_fallback)
2027 return locked ? -ENOENT : -EOPNOTSUPP;
2028
2029 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
2030 if (path_fd < 0) {
2031 if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2032 return -EOPNOTSUPP;
2033 return -errno;
2034 }
2035 }
2036
2037 if (fstat(path_fd, &st) < 0)
2038 return -errno;
2039
2040 if (S_ISSOCK(st.st_mode)) {
2041 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2042 if (fd < 0)
2043 return -errno;
2044
2045 r = connect_unix_path(fd, path_fd, NULL);
2046 if (r < 0)
2047 return r;
2048
2049 events = EPOLLIN;
2050
2051 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2052 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2053 if (fd < 0)
2054 return fd;
2055
2056 if (S_ISREG(st.st_mode)) {
2057 struct statfs sfs;
2058
2059 /* If this is a regular file, validate that it lives on procfs or cgroupfs, where we look for EPOLLPRI */
2060
2061 if (fstatfs(fd, &sfs) < 0)
2062 return -errno;
2063
2064 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2065 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2066 return -ENOTTY;
2067
2068 events = EPOLLPRI;
2069 } else
2070 /* For fifos and char devices just watch for EPOLLIN */
2071 events = EPOLLIN;
2072
2073 } else if (S_ISDIR(st.st_mode))
2074 return -EISDIR;
2075 else
2076 return -EBADF;
2077
2078 s->memory_pressure.fd = TAKE_FD(fd);
2079 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2080 s->memory_pressure.write_buffer_size = write_buffer_size;
2081 s->memory_pressure.events = events;
2082 s->memory_pressure.locked = locked;
2083
2084 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2085 * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2086 * fd with the epoll right away. Instead, we just add the event source to a list of memory pressure
2087 * event sources on which writes must be executed before the first event loop iteration is
2088 * executed. (We could also write the data here, right away, but we want to give the caller the
2089 * freedom to call sd_event_source_set_memory_pressure_type() and
2090 * sd_event_source_set_memory_pressure_rate() before we write it.) */
2091
2092 if (s->memory_pressure.write_buffer_size > 0)
2093 source_memory_pressure_add_to_write_list(s);
2094 else {
2095 r = source_memory_pressure_register(s, s->enabled);
2096 if (r < 0)
2097 return r;
2098 }
2099
2100 if (ret)
2101 *ret = s;
2102 TAKE_PTR(s);
2103
2104 return 0;
2105}
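
/* Editor's note: a minimal usage sketch (not part of the original source) showing how a caller might
 * hook up a memory pressure source with the function above. It assumes the prototype declared in
 * sd-event.h, i.e. sd_event_add_memory_pressure(e, &s, callback, userdata) with a plain
 * sd_event_handler_t callback; on_memory_pressure() and the log messages are hypothetical.
 *
 *     static int on_memory_pressure(sd_event_source *s, void *userdata) {
 *             log_info("Memory pressure threshold hit, trimming caches.");
 *             return 0;
 *     }
 *
 *     r = sd_event_add_memory_pressure(event, &source, on_memory_pressure, NULL);
 *     if (r == -EOPNOTSUPP)
 *             log_debug("PSI not available, continuing without memory pressure handling.");
 *     else if (r < 0)
 *             return r;
 *
 * Note how -EOPNOTSUPP (no PSI support) and -EHOSTDOWN ($MEMORY_PRESSURE_WATCH set to /dev/null) are
 * reasonable to treat as "feature unavailable" rather than as hard failures. */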
2106
2107static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2108 assert(e);
2109
2110 if (!d)
2111 return;
2112
2113 assert(hashmap_isempty(d->inodes));
2114 assert(hashmap_isempty(d->wd));
2115
2116 if (d->buffer_filled > 0)
2117 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
2118
2119 hashmap_free(d->inodes);
2120 hashmap_free(d->wd);
2121
2122 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2123
2124 if (d->fd >= 0) {
2125 if (!event_origin_changed(e) &&
2126 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
2127 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2128
2129 safe_close(d->fd);
2130 }
2131 free(d);
2132}
2133
2134static int event_make_inotify_data(
2135 sd_event *e,
2136 int64_t priority,
2137 struct inotify_data **ret) {
2138
2139 _cleanup_close_ int fd = -EBADF;
2140 struct inotify_data *d;
2141 int r;
2142
2143 assert(e);
2144
2145 d = hashmap_get(e->inotify_data, &priority);
2146 if (d) {
2147 if (ret)
2148 *ret = d;
2149 return 0;
2150 }
2151
2152 fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2153 if (fd < 0)
2154 return -errno;
2155
2156 fd = fd_move_above_stdio(fd);
2157
2158 d = new(struct inotify_data, 1);
2159 if (!d)
2160 return -ENOMEM;
2161
2162 *d = (struct inotify_data) {
2163 .wakeup = WAKEUP_INOTIFY_DATA,
2164 .fd = TAKE_FD(fd),
2165 .priority = priority,
2166 };
2167
2168 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
2169 if (r < 0) {
2170 d->fd = safe_close(d->fd);
2171 free(d);
2172 return r;
2173 }
2174
2175 struct epoll_event ev = {
2176 .events = EPOLLIN,
2177 .data.ptr = d,
2178 };
2179
2180 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2181 r = -errno;
2182 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2183 * remove the fd from the epoll first, which we don't want as we couldn't
2184 * add it in the first place. */
2185 event_free_inotify_data(e, d);
2186 return r;
2187 }
2188
2189 if (ret)
2190 *ret = d;
2191
2192 return 1;
2193}
2194
2195static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
2196 int r;
2197
2198 assert(x);
2199 assert(y);
2200
2201 r = CMP(x->dev, y->dev);
2202 if (r != 0)
2203 return r;
2204
2205 return CMP(x->ino, y->ino);
2206}
2207
2208static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2209 assert(d);
2210
2211 siphash24_compress_typesafe(d->dev, state);
2212 siphash24_compress_typesafe(d->ino, state);
2213}
2214
2215DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
2216
2217static void event_free_inode_data(
2218 sd_event *e,
2219 struct inode_data *d) {
2220
2221 assert(e);
2222
2223 if (!d)
2224 return;
2225
2226 assert(!d->event_sources);
2227
2228 if (d->fd >= 0) {
2229 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
2230 safe_close(d->fd);
2231 }
2232
2233 if (d->inotify_data) {
2234
2235 if (d->wd >= 0) {
2236 if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
2237 /* So here's a problem. At the time this runs the watch descriptor might already be
2238 * invalidated, because an IN_IGNORED event might be queued right at the moment we enter
2239 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since this is quite
2240 * likely to happen. */
2241
2242 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2243 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2244 }
2245
2246 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2247 }
2248
2249 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2250 }
2251
2252 free(d->path);
2253 free(d);
2254}
2255
2256static void event_gc_inotify_data(
2257 sd_event *e,
2258 struct inotify_data *d) {
2259
2260 assert(e);
2261
2262 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2263 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2264 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2265 * (under the expectation that the GC is called again once the counter is decremented). */
2266
2267 if (!d)
2268 return;
2269
2270 if (!hashmap_isempty(d->inodes))
2271 return;
2272
2273 if (d->n_busy > 0)
2274 return;
2275
2276 event_free_inotify_data(e, d);
2277}
2278
2279static void event_gc_inode_data(
2280 sd_event *e,
2281 struct inode_data *d) {
2282
2283 struct inotify_data *inotify_data;
2284
2285 assert(e);
2286
2287 if (!d)
2288 return;
2289
2290 if (d->event_sources)
2291 return;
2292
2293 inotify_data = d->inotify_data;
2294 event_free_inode_data(e, d);
2295
2296 event_gc_inotify_data(e, inotify_data);
2297}
2298
2299static int event_make_inode_data(
2300 sd_event *e,
2301 struct inotify_data *inotify_data,
2302 dev_t dev,
2303 ino_t ino,
2304 struct inode_data **ret) {
2305
2306 struct inode_data *d, key;
2307 int r;
2308
2309 assert(e);
2310 assert(inotify_data);
2311
2312 key = (struct inode_data) {
2313 .ino = ino,
2314 .dev = dev,
2315 };
2316
2317 d = hashmap_get(inotify_data->inodes, &key);
2318 if (d) {
2319 if (ret)
2320 *ret = d;
2321
2322 return 0;
2323 }
2324
2325 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2326 if (r < 0)
2327 return r;
2328
2329 d = new(struct inode_data, 1);
2330 if (!d)
2331 return -ENOMEM;
2332
2333 *d = (struct inode_data) {
2334 .dev = dev,
2335 .ino = ino,
2336 .wd = -1,
2337 .fd = -EBADF,
2338 .inotify_data = inotify_data,
2339 };
2340
2341 r = hashmap_put(inotify_data->inodes, d, d);
2342 if (r < 0) {
2343 free(d);
2344 return r;
2345 }
2346
2347 if (ret)
2348 *ret = d;
2349
2350 return 1;
2351}
2352
2353static uint32_t inode_data_determine_mask(struct inode_data *d) {
2354 bool excl_unlink = true;
2355 uint32_t combined = 0;
2356
2357 assert(d);
2358
2359 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2360 * the IN_EXCL_UNLINK flag is ANDed instead.
2361 *
2362 * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or oneshot. That's
2363 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
2364 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
2365 * events we don't care for client-side. */
2366
2367 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2368
2369 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2370 excl_unlink = false;
2371
2372 combined |= s->inotify.mask;
2373 }
2374
2375 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2376}
2377
2378static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2379 uint32_t combined_mask;
2380 int wd, r;
2381
2382 assert(d);
2383 assert(d->fd >= 0);
2384
2385 combined_mask = inode_data_determine_mask(d);
2386
2387 if (d->wd >= 0 && combined_mask == d->combined_mask)
2388 return 0;
2389
2390 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2391 if (r < 0)
2392 return r;
2393
2394 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2395 if (wd < 0)
2396 return wd;
2397
2398 if (d->wd < 0) {
2399 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2400 if (r < 0) {
2401 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2402 return r;
2403 }
2404
2405 d->wd = wd;
2406
2407 } else if (d->wd != wd) {
2408
2409 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2410 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2411 return -EINVAL;
2412 }
2413
2414 d->combined_mask = combined_mask;
2415 return 1;
2416}
2417
2418static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2419 assert(s);
2420
2421 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2422}
2423
2424static int event_add_inotify_fd_internal(
2425 sd_event *e,
2426 sd_event_source **ret,
2427 int fd,
2428 bool donate,
2429 uint32_t mask,
2430 sd_event_inotify_handler_t callback,
2431 void *userdata) {
2432
2433 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
2434 _cleanup_(source_freep) sd_event_source *s = NULL;
2435 struct inotify_data *inotify_data = NULL;
2436 struct inode_data *inode_data = NULL;
2437 struct stat st;
2438 int r;
2439
2440 assert_return(e, -EINVAL);
2441 assert_return(e = event_resolve(e), -ENOPKG);
2442 assert_return(fd >= 0, -EBADF);
2443 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2444 assert_return(!event_origin_changed(e), -ECHILD);
2445
2446 if (!callback)
2447 callback = inotify_exit_callback;
2448
2449 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2450 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD-type operations for you, hence
2451 * callers cannot pass that flag themselves. */
2452 if (mask & IN_MASK_ADD)
2453 return -EINVAL;
2454
2455 if (fstat(fd, &st) < 0)
2456 return -errno;
2457
2458 s = source_new(e, !ret, SOURCE_INOTIFY);
2459 if (!s)
2460 return -ENOMEM;
2461
2462 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2463 s->inotify.mask = mask;
2464 s->inotify.callback = callback;
2465 s->userdata = userdata;
2466
2467 /* Allocate an inotify object for this priority, and an inode object within it */
2468 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2469 if (r < 0)
2470 return r;
2471
2472 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2473 if (r < 0) {
2474 event_gc_inotify_data(e, inotify_data);
2475 return r;
2476 }
2477
2478 /* Keep the O_PATH fd around until the first iteration of the event loop, so that until then we can still
2479 * change the priority of the event source, for which we need the original inode. */
2480 if (inode_data->fd < 0) {
2481 if (donated_fd >= 0)
2482 inode_data->fd = TAKE_FD(donated_fd);
2483 else {
2484 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2485 if (inode_data->fd < 0) {
2486 r = -errno;
2487 event_gc_inode_data(e, inode_data);
2488 return r;
2489 }
2490 }
2491
2492 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
2493
2494 _cleanup_free_ char *path = NULL;
2495 r = fd_get_path(inode_data->fd, &path);
2496 if (r < 0 && r != -ENOSYS) { /* The path is optional, hence ignore -ENOSYS. */
2497 event_gc_inode_data(e, inode_data);
2498 return r;
2499 }
2500
2501 free_and_replace(inode_data->path, path);
2502 }
2503
2504 /* Link our event source to the inode data object */
2505 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2506 s->inotify.inode_data = inode_data;
2507
2508 /* Actually realize the watch now */
2509 r = inode_data_realize_watch(e, inode_data);
2510 if (r < 0)
2511 return r;
2512
2513 if (ret)
2514 *ret = s;
2515 TAKE_PTR(s);
2516
2517 return 0;
2518}
2519
2520_public_ int sd_event_add_inotify_fd(
2521 sd_event *e,
2522 sd_event_source **ret,
2523 int fd,
2524 uint32_t mask,
2525 sd_event_inotify_handler_t callback,
2526 void *userdata) {
2527
2528 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2529}
2530
2531_public_ int sd_event_add_inotify(
2532 sd_event *e,
2533 sd_event_source **ret,
2534 const char *path,
2535 uint32_t mask,
2536 sd_event_inotify_handler_t callback,
2537 void *userdata) {
2538
2539 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2540 int fd, r;
2541
2542 assert_return(path, -EINVAL);
2543
2544 fd = open(path, O_PATH | O_CLOEXEC |
2545 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2546 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2547 if (fd < 0)
2548 return -errno;
2549
2550 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2551 if (r < 0)
2552 return r;
2553
2554 (void) sd_event_source_set_description(s, path);
2555
2556 if (ret)
2557 *ret = s;
2558
2559 return r;
2560}
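
/* Editor's note: an illustrative usage sketch (not part of the original source) for the two public
 * inotify constructors above. The handler signature matches sd_event_inotify_handler_t as used by
 * inotify_exit_callback(); on_dir_event() and the watched path are hypothetical.
 *
 *     static int on_dir_event(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
 *             log_debug("inotify mask 0x%x on %s", ev->mask, ev->len > 0 ? ev->name : "(inode)");
 *             return 0;
 *     }
 *
 *     r = sd_event_add_inotify(event, &source, "/run/mydaemon", IN_CREATE|IN_MOVED_TO|IN_ONLYDIR,
 *                              on_dir_event, NULL);
 *     if (r < 0)
 *             return r;
 *
 * Passing IN_MASK_ADD is rejected with -EINVAL, since watches on the same inode are coalesced
 * internally (see event_add_inotify_fd_internal() above). */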
2561
2562static sd_event_source* event_source_free(sd_event_source *s) {
2563 if (!s)
2564 return NULL;
2565
2566 /* Here's a special hack: when we are called from a
2567 * dispatch handler we won't free the event source
2568 * immediately, but we will detach the fd from the
2569 * epoll. This way it is safe for the caller to unref
2570 * the event source and immediately close the fd, but
2571 * we still retain a valid event source object after
2572 * the callback. */
2573
2574 if (s->dispatching)
2575 source_disconnect(s);
2576 else
2577 source_free(s);
2578
2579 return NULL;
2580}
2581
2582DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2583
2584_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2585 assert_return(s, -EINVAL);
2586 assert_return(!event_origin_changed(s->event), -ECHILD);
2587
2588 return free_and_strdup(&s->description, description);
2589}
2590
2591_public_ int sd_event_source_get_description(sd_event_source *s, const char **ret) {
2592 assert_return(s, -EINVAL);
2593 assert_return(ret, -EINVAL);
2594
2595 if (!s->description)
2596 return -ENXIO;
2597
2598 *ret = s->description;
2599 return 0;
2600}
2601
2602_public_ sd_event* sd_event_source_get_event(sd_event_source *s) {
2603 assert_return(s, NULL);
2604 assert_return(!event_origin_changed(s->event), NULL);
2605
2606 return s->event;
2607}
2608
2609_public_ int sd_event_source_get_pending(sd_event_source *s) {
2610 assert_return(s, -EINVAL);
2611 assert_return(s->type != SOURCE_EXIT, -EDOM);
2612 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2613 assert_return(!event_origin_changed(s->event), -ECHILD);
2614
2615 return s->pending;
2616}
2617
2618_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2619 assert_return(s, -EINVAL);
2620 assert_return(s->type == SOURCE_IO, -EDOM);
2621 assert_return(!event_origin_changed(s->event), -ECHILD);
2622
2623 return s->io.fd;
2624}
2625
2626_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2627 int saved_fd, r;
2628
2629 assert_return(s, -EINVAL);
2630 assert_return(fd >= 0, -EBADF);
2631 assert_return(s->type == SOURCE_IO, -EDOM);
2632 assert_return(!event_origin_changed(s->event), -ECHILD);
2633
2634 if (s->io.fd == fd)
2635 return 0;
2636
2637 saved_fd = s->io.fd;
2638 s->io.fd = fd;
2639
2640 assert(event_source_is_offline(s) == !s->io.registered);
2641
2642 if (s->io.registered) {
2643 s->io.registered = false;
2644
2645 r = source_io_register(s, s->enabled, s->io.events);
2646 if (r < 0) {
2647 s->io.fd = saved_fd;
2648 s->io.registered = true;
2649 return r;
2650 }
2651
2652 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2653 }
2654
2655 if (s->io.owned)
2656 safe_close(saved_fd);
2657
2658 return 0;
2659}
2660
2661_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2662 assert_return(s, -EINVAL);
2663 assert_return(s->type == SOURCE_IO, -EDOM);
2664 assert_return(!event_origin_changed(s->event), -ECHILD);
2665
2666 return s->io.owned;
2667}
2668
2669_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2670 assert_return(s, -EINVAL);
2671 assert_return(s->type == SOURCE_IO, -EDOM);
2672 assert_return(!event_origin_changed(s->event), -ECHILD);
2673
2674 s->io.owned = own;
2675 return 0;
2676}
2677
2678_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t *ret) {
2679 assert_return(s, -EINVAL);
2680 assert_return(ret, -EINVAL);
2681 assert_return(s->type == SOURCE_IO, -EDOM);
2682 assert_return(!event_origin_changed(s->event), -ECHILD);
2683
2684 *ret = s->io.events;
2685 return 0;
2686}
2687
2688_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2689 int r;
2690
2691 assert_return(s, -EINVAL);
2692 assert_return(s->type == SOURCE_IO, -EDOM);
2693 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2694 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2695 assert_return(!event_origin_changed(s->event), -ECHILD);
2696
2697 /* edge-triggered updates are never skipped, so we can reset edges */
2698 if (s->io.events == events && !(events & EPOLLET))
2699 return 0;
2700
2701 r = source_set_pending(s, false);
2702 if (r < 0)
2703 return r;
2704
2705 if (event_source_is_online(s)) {
2706 r = source_io_register(s, s->enabled, events);
2707 if (r < 0)
2708 return r;
2709 }
2710
2711 s->io.events = events;
2712
2713 return 0;
2714}
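
/* Editor's note: a small usage sketch (not part of the original source) for
 * sd_event_source_get_io_events()/sd_event_source_set_io_events(). A typical pattern is to subscribe
 * to EPOLLOUT only while there is pending output, and to drop it again once the output buffer has
 * drained; have_pending_output() is a hypothetical helper of the caller.
 *
 *     uint32_t ev = EPOLLIN | (have_pending_output(ctx) ? EPOLLOUT : 0);
 *
 *     r = sd_event_source_set_io_events(io_source, ev);
 *     if (r < 0)
 *             return r;
 *
 * Since the call is a no-op when the mask is unchanged (unless EPOLLET is set), it is cheap to invoke
 * unconditionally from the dispatch path. */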
2715
2716_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t *ret) {
2717 assert_return(s, -EINVAL);
2718 assert_return(ret, -EINVAL);
2719 assert_return(s->type == SOURCE_IO, -EDOM);
2720 assert_return(!event_origin_changed(s->event), -ECHILD);
2721
2722 if (!s->pending)
2723 return -ENODATA;
2724
2725 *ret = s->io.revents;
2726 return 0;
2727}
2728
2729_public_ int sd_event_source_get_signal(sd_event_source *s) {
2730 assert_return(s, -EINVAL);
2731 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2732 assert_return(!event_origin_changed(s->event), -ECHILD);
2733
2734 return s->signal.sig;
2735}
2736
2737_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *ret) {
2738 assert_return(s, -EINVAL);
2739 assert_return(ret, -EINVAL);
2740 assert_return(!event_origin_changed(s->event), -ECHILD);
2741
2742 *ret = s->priority;
2743 return 0;
2744}
2745
2746_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2747 bool rm_inotify = false, rm_inode = false;
2748 struct inotify_data *new_inotify_data = NULL;
2749 struct inode_data *new_inode_data = NULL;
2750 int r;
2751
2752 assert_return(s, -EINVAL);
2753 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2754 assert_return(!event_origin_changed(s->event), -ECHILD);
2755
2756 if (s->priority == priority)
2757 return 0;
2758
2759 if (s->type == SOURCE_INOTIFY) {
2760 struct inode_data *old_inode_data;
2761
2762 assert(s->inotify.inode_data);
2763 old_inode_data = s->inotify.inode_data;
2764
2765 /* We need the original fd to change the priority. If we don't have it, we can't change the priority
2766 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2767 * events we allow priority changes only until the first following iteration. */
2768 if (old_inode_data->fd < 0)
2769 return -EOPNOTSUPP;
2770
2771 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2772 if (r < 0)
2773 return r;
2774 rm_inotify = r > 0;
2775
2776 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2777 if (r < 0)
2778 goto fail;
2779 rm_inode = r > 0;
2780
2781 if (new_inode_data->fd < 0) {
2782 /* Duplicate the fd for the new inode object if we don't have any yet */
2783 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2784 if (new_inode_data->fd < 0) {
2785 r = -errno;
2786 goto fail;
2787 }
2788
2789 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
2790
2791 _cleanup_free_ char *path = NULL;
2792 r = fd_get_path(new_inode_data->fd, &path);
2793 if (r < 0 && r != -ENOSYS)
2794 goto fail;
2795
2796 free_and_replace(new_inode_data->path, path);
2797 }
2798
2799 /* Move the event source to the new inode data structure */
2800 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2801 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2802 s->inotify.inode_data = new_inode_data;
2803
2804 /* Now create the new watch */
2805 r = inode_data_realize_watch(s->event, new_inode_data);
2806 if (r < 0) {
2807 /* Move it back */
2808 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2809 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2810 s->inotify.inode_data = old_inode_data;
2811 goto fail;
2812 }
2813
2814 s->priority = priority;
2815
2816 event_gc_inode_data(s->event, old_inode_data);
2817
2818 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2819 struct signal_data *old, *d;
2820
2821 /* Move us from the signalfd belonging to the old
2822 * priority to the signalfd of the new priority */
2823
2824 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2825
2826 s->priority = priority;
2827
2828 r = event_make_signal_data(s->event, s->signal.sig, &d);
2829 if (r < 0) {
2830 s->priority = old->priority;
2831 return r;
2832 }
2833
2834 event_unmask_signal_data(s->event, old, s->signal.sig);
2835 } else
2836 s->priority = priority;
2837
2838 event_source_pp_prioq_reshuffle(s);
2839
2840 if (s->type == SOURCE_EXIT)
2841 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2842
2843 return 0;
2844
2845fail:
2846 if (rm_inode)
2847 event_free_inode_data(s->event, new_inode_data);
2848
2849 if (rm_inotify)
2850 event_free_inotify_data(s->event, new_inotify_data);
2851
2852 return r;
2853}
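
/* Editor's note: a brief usage sketch (not part of the original source) for
 * sd_event_source_set_priority(). Lower values dispatch earlier; the SD_EVENT_PRIORITY_* constants
 * from sd-event.h are convenient anchors. The source names used here are hypothetical.
 *
 *     (void) sd_event_source_set_priority(signal_source, SD_EVENT_PRIORITY_IMPORTANT);
 *     (void) sd_event_source_set_priority(housekeeping_source, SD_EVENT_PRIORITY_IDLE);
 *
 * For inotify sources the priority also selects which per-priority inotify object backs the watch,
 * which is why the function above may return -EOPNOTSUPP once the original O_PATH fd has been closed
 * after the first event loop iteration. */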
2854
2855_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2856 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2857 if (!s && !ret)
2858 return false;
2859
2860 assert_return(s, -EINVAL);
2861 assert_return(!event_origin_changed(s->event), -ECHILD);
2862
2863 if (ret)
2864 *ret = s->enabled;
2865
2866 return s->enabled != SD_EVENT_OFF;
2867}
2868
2869static int event_source_offline(
2870 sd_event_source *s,
2871 int enabled,
2872 bool ratelimited) {
2873
2874 bool was_offline;
2875 int r;
2876
2877 assert(s);
2878 assert(enabled == SD_EVENT_OFF || ratelimited);
2879
2880 /* Unset the pending flag when this event source is disabled */
2881 if (s->enabled != SD_EVENT_OFF &&
2882 enabled == SD_EVENT_OFF &&
2883 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2884 r = source_set_pending(s, false);
2885 if (r < 0)
2886 return r;
2887 }
2888
2889 was_offline = event_source_is_offline(s);
2890 s->enabled = enabled;
2891 s->ratelimited = ratelimited;
2892
2893 switch (s->type) {
2894
2895 case SOURCE_IO:
2896 source_io_unregister(s);
2897 break;
2898
2899 case SOURCE_SIGNAL:
2900 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2901 break;
2902
2903 case SOURCE_CHILD:
2904 if (!was_offline) {
2905 assert(s->event->n_online_child_sources > 0);
2906 s->event->n_online_child_sources--;
2907 }
2908
2909 if (EVENT_SOURCE_WATCH_PIDFD(s))
2910 source_child_pidfd_unregister(s);
2911 else
2912 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2913 break;
2914
2915 case SOURCE_EXIT:
2916 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2917 break;
2918
2919 case SOURCE_MEMORY_PRESSURE:
2920 source_memory_pressure_unregister(s);
2921 break;
2922
2923 case SOURCE_TIME_REALTIME:
2924 case SOURCE_TIME_BOOTTIME:
2925 case SOURCE_TIME_MONOTONIC:
2926 case SOURCE_TIME_REALTIME_ALARM:
2927 case SOURCE_TIME_BOOTTIME_ALARM:
2928 case SOURCE_DEFER:
2929 case SOURCE_POST:
2930 case SOURCE_INOTIFY:
2931 break;
2932
2933 default:
2934 assert_not_reached();
2935 }
2936
2937 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2938 event_source_time_prioq_reshuffle(s);
2939
2940 return 1;
2941}
2942
2943static int event_source_online(
2944 sd_event_source *s,
2945 int enabled,
2946 bool ratelimited) {
2947
2948 bool was_online;
2949 int r;
2950
2951 assert(s);
2952 assert(enabled != SD_EVENT_OFF || !ratelimited);
2953
2954 /* Unset the pending flag when this event source is enabled */
2955 if (s->enabled == SD_EVENT_OFF &&
2956 enabled != SD_EVENT_OFF &&
2957 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2958 r = source_set_pending(s, false);
2959 if (r < 0)
2960 return r;
2961 }
2962
2963 /* Are we really ready for onlining? */
2964 if (enabled == SD_EVENT_OFF || ratelimited) {
2965 /* Nope, we are not ready for onlining, so just update the precise state and exit */
2966 s->enabled = enabled;
2967 s->ratelimited = ratelimited;
2968 return 0;
2969 }
2970
2971 was_online = event_source_is_online(s);
2972
2973 switch (s->type) {
2974 case SOURCE_IO:
2975 r = source_io_register(s, enabled, s->io.events);
2976 if (r < 0)
2977 return r;
2978 break;
2979
2980 case SOURCE_SIGNAL:
2981 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2982 if (r < 0) {
2983 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2984 return r;
2985 }
2986
2987 break;
2988
2989 case SOURCE_CHILD:
2990 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2991 /* yes, we can rely on pidfd */
2992
2993 r = source_child_pidfd_register(s, enabled);
2994 if (r < 0)
2995 return r;
2996 } else {
2997 /* something other than WEXITED to watch for */
2998
2999 r = event_make_signal_data(s->event, SIGCHLD, NULL);
3000 if (r < 0) {
3001 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
3002 return r;
3003 }
3004 }
3005
3006 if (!was_online)
3007 s->event->n_online_child_sources++;
3008 break;
3009
3010 case SOURCE_MEMORY_PRESSURE:
3011 r = source_memory_pressure_register(s, enabled);
3012 if (r < 0)
3013 return r;
3014
3015 break;
3016
3017 case SOURCE_TIME_REALTIME:
3018 case SOURCE_TIME_BOOTTIME:
3019 case SOURCE_TIME_MONOTONIC:
3020 case SOURCE_TIME_REALTIME_ALARM:
3021 case SOURCE_TIME_BOOTTIME_ALARM:
3022 case SOURCE_EXIT:
3023 case SOURCE_DEFER:
3024 case SOURCE_POST:
3025 case SOURCE_INOTIFY:
3026 break;
3027
3028 default:
3029 assert_not_reached();
3030 }
3031
3032 s->enabled = enabled;
3033 s->ratelimited = ratelimited;
3034
3035 /* Non-failing operations below */
3036 if (s->type == SOURCE_EXIT)
3037 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
3038
3039 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3040 event_source_time_prioq_reshuffle(s);
3041
3042 return 1;
3043}
3044
3045_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3046 int r;
3047
3048 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
3049
3050 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3051 if (m == SD_EVENT_OFF && !s)
3052 return 0;
3053
3054 assert_return(s, -EINVAL);
3055 assert_return(!event_origin_changed(s->event), -ECHILD);
3056
3057 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3058 if (s->event->state == SD_EVENT_FINISHED)
3059 return m == SD_EVENT_OFF ? 0 : -ESTALE;
3060
3061 if (s->enabled == m) /* No change? */
3062 return 0;
3063
3064 if (m == SD_EVENT_OFF)
3065 r = event_source_offline(s, m, s->ratelimited);
3066 else {
3067 if (s->enabled != SD_EVENT_OFF) {
3068 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3069 * event source is already enabled after all. */
3070 s->enabled = m;
3071 return 0;
3072 }
3073
3074 r = event_source_online(s, m, s->ratelimited);
3075 }
3076 if (r < 0)
3077 return r;
3078
3079 event_source_pp_prioq_reshuffle(s);
3080 return 0;
3081}
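
/* Editor's note: a usage sketch (not part of the original source) for sd_event_source_set_enabled().
 * SD_EVENT_ONESHOT arms a source for exactly one dispatch, after which the loop switches it back to
 * SD_EVENT_OFF; the "quick mode" above makes turning off a NULL source a harmless no-op, which is
 * handy in cleanup paths. The timer_source variable is hypothetical.
 *
 *     r = sd_event_source_set_enabled(timer_source, SD_EVENT_ONESHOT);
 *     if (r < 0)
 *             return r;
 *
 *     ...
 *
 *     (void) sd_event_source_set_enabled(timer_source, SD_EVENT_OFF);   // fine even if timer_source is NULL
 */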
3082
3083_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *ret) {
3084 assert_return(s, -EINVAL);
3085 assert_return(ret, -EINVAL);
3086 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3087 assert_return(!event_origin_changed(s->event), -ECHILD);
3088
3089 *ret = s->time.next;
3090 return 0;
3091}
3092
3093_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
3094 int r;
3095
3096 assert_return(s, -EINVAL);
3097 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3098 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3099 assert_return(!event_origin_changed(s->event), -ECHILD);
3100
3101 r = source_set_pending(s, false);
3102 if (r < 0)
3103 return r;
3104
3105 s->time.next = usec;
3106
3107 event_source_time_prioq_reshuffle(s);
3108 return 0;
3109}
3110
3111_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3112 usec_t t;
3113 int r;
3114
3115 assert_return(s, -EINVAL);
3116 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3117 assert_return(!event_origin_changed(s->event), -ECHILD);
3118
3119 if (usec == USEC_INFINITY)
3120 return sd_event_source_set_time(s, USEC_INFINITY);
3121
3122 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3123 if (r < 0)
3124 return r;
3125
3126 usec = usec_add(t, usec);
3127 if (usec == USEC_INFINITY)
3128 return -EOVERFLOW;
3129
3130 return sd_event_source_set_time(s, usec);
3131}
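
/* Editor's note: an illustrative sketch (not part of the original source) of the common "repeating
 * timer" pattern built on sd_event_source_set_time_relative(): from within the timer callback, push
 * the deadline forward and re-arm the (oneshot) source. on_timer(), do_periodic_work() and the 30s
 * interval are hypothetical.
 *
 *     static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *             do_periodic_work(userdata);
 *
 *             (void) sd_event_source_set_time_relative(s, 30 * USEC_PER_SEC);
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
 *     }
 *
 * Relative deadlines are resolved against sd_event_now() for the source's clock, i.e. they are
 * measured from the timestamp of the current event loop iteration, not from the moment of the call. */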
3132
3133_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *ret) {
3134 assert_return(s, -EINVAL);
3135 assert_return(ret, -EINVAL);
3136 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3137 assert_return(!event_origin_changed(s->event), -ECHILD);
3138
3139 *ret = s->time.accuracy;
3140 return 0;
3141}
3142
3143_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
3144 int r;
3145
3146 assert_return(s, -EINVAL);
3147 assert_return(usec != UINT64_MAX, -EINVAL);
3148 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3149 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3150 assert_return(!event_origin_changed(s->event), -ECHILD);
3151
3152 r = source_set_pending(s, false);
3153 if (r < 0)
3154 return r;
3155
3156 if (usec == 0)
3157 usec = DEFAULT_ACCURACY_USEC;
3158
3159 s->time.accuracy = usec;
3160
3161 event_source_time_prioq_reshuffle(s);
3162 return 0;
3163}
3164
3165_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *ret) {
3166 assert_return(s, -EINVAL);
3167 assert_return(ret, -EINVAL);
3168 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3169 assert_return(!event_origin_changed(s->event), -ECHILD);
3170
3171 *ret = event_source_type_to_clock(s->type);
3172 return 0;
3173}
3174
3175_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *ret) {
3176 assert_return(s, -EINVAL);
3177 assert_return(ret, -EINVAL);
3178 assert_return(s->type == SOURCE_CHILD, -EDOM);
3179 assert_return(!event_origin_changed(s->event), -ECHILD);
3180
3181 *ret = s->child.pid;
3182 return 0;
3183}
3184
3185_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3186 assert_return(s, -EINVAL);
3187 assert_return(s->type == SOURCE_CHILD, -EDOM);
3188 assert_return(!event_origin_changed(s->event), -ECHILD);
3189
3190 return s->child.pidfd;
3191}
3192
3193_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3194 assert_return(s, -EINVAL);
3195 assert_return(s->type == SOURCE_CHILD, -EDOM);
3196 assert_return(!event_origin_changed(s->event), -ECHILD);
3197 assert_return(SIGNAL_VALID(sig), -EINVAL);
3198 assert(s->child.pidfd >= 0);
3199
3200 /* If we have already seen an indication that the process exited, refuse to send a signal early. */
3201 if (s->child.exited)
3202 return -ESRCH;
3203 assert(!s->child.waited);
3204
3205 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the structure here. */
3206 siginfo_t copy;
3207 if (si)
3208 copy = *si;
3209
3210 return RET_NERRNO(pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, flags));
3211}
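
/* Editor's note: a short usage sketch (not part of the original source) for
 * sd_event_source_send_child_signal(). It delivers the signal via the pidfd kept by the child source,
 * so there is no PID-reuse race; -ESRCH indicates the child is already known to have exited. The
 * child_source variable is hypothetical.
 *
 *     r = sd_event_source_send_child_signal(child_source, SIGTERM, NULL, 0);
 *     if (r < 0 && r != -ESRCH)
 *             return r;
 */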
3212
3213_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3214 assert_return(s, -EINVAL);
3215 assert_return(s->type == SOURCE_CHILD, -EDOM);
3216 assert_return(!event_origin_changed(s->event), -ECHILD);
3217 assert(s->child.pidfd >= 0);
3218
3219 return s->child.pidfd_owned;
3220}
3221
3222_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3223 assert_return(s, -EINVAL);
3224 assert_return(s->type == SOURCE_CHILD, -EDOM);
3225 assert_return(!event_origin_changed(s->event), -ECHILD);
3226 assert(s->child.pidfd >= 0);
3227
3228 s->child.pidfd_owned = own;
3229 return 0;
3230}
3231
3232_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3233 assert_return(s, -EINVAL);
3234 assert_return(s->type == SOURCE_CHILD, -EDOM);
3235 assert_return(!event_origin_changed(s->event), -ECHILD);
3236
3237 return s->child.process_owned;
3238}
3239
3240_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3241 assert_return(s, -EINVAL);
3242 assert_return(s->type == SOURCE_CHILD, -EDOM);
3243 assert_return(!event_origin_changed(s->event), -ECHILD);
3244
3245 s->child.process_owned = own;
3246 return 0;
3247}
3248
3249_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret) {
3250 assert_return(s, -EINVAL);
3251 assert_return(ret, -EINVAL);
3252 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3253 assert_return(!event_origin_changed(s->event), -ECHILD);
3254
3255 *ret = s->inotify.mask;
3256 return 0;
3257}
3258
3259_public_ int sd_event_source_get_inotify_path(sd_event_source *s, const char **ret) {
3260 assert_return(s, -EINVAL);
3261 assert_return(ret, -EINVAL);
3262 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3263 assert_return(!event_origin_changed(s->event), -ECHILD);
3264
3265 if (!s->inotify.inode_data)
3266 return -ESTALE; /* already disconnected. */
3267
3268 if (!s->inotify.inode_data->path)
3269 return -ENOSYS; /* /proc was not mounted? */
3270
3271 *ret = s->inotify.inode_data->path;
3272 return 0;
3273}
3274
3275_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
3276 int r;
3277
3278 assert_return(s, -EINVAL);
3279 assert_return(s->type != SOURCE_EXIT, -EDOM);
3280 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3281 assert_return(!event_origin_changed(s->event), -ECHILD);
3282
3283 if (s->prepare == callback)
3284 return 0;
3285
3286 if (callback && s->prepare) {
3287 s->prepare = callback;
3288 return 0;
3289 }
3290
3291 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3292 if (r < 0)
3293 return r;
3294
3295 s->prepare = callback;
3296
3297 if (callback) {
3298 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3299 if (r < 0)
3300 return r;
3301 } else
3302 prioq_remove(s->event->prepare, s, &s->prepare_index);
3303
3304 return 0;
3305}
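
/* Editor's note: an illustrative sketch (not part of the original source) for
 * sd_event_source_set_prepare(). Prepare callbacks run just before the event loop polls, which makes
 * them a good place to reconcile external library state with an sd-event source; the Context type,
 * the library_has_buffered_input() helper and the defer source are hypothetical.
 *
 *     static int on_prepare(sd_event_source *s, void *userdata) {
 *             Context *c = userdata;
 *
 *             // If the wrapped library already has buffered input, make sure the defer source
 *             // fires even though the fd will not become readable again.
 *             return sd_event_source_set_enabled(c->defer_source,
 *                                                library_has_buffered_input(c) ? SD_EVENT_ON : SD_EVENT_OFF);
 *     }
 *
 *     r = sd_event_source_set_prepare(io_source, on_prepare);
 *     if (r < 0)
 *             return r;
 */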
3306
3307_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
3308 assert_return(s, NULL);
3309 assert_return(!event_origin_changed(s->event), NULL);
3310
3311 return s->userdata;
3312}
3313
3314_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3315 void *ret;
3316
3317 assert_return(s, NULL);
3318 assert_return(!event_origin_changed(s->event), NULL);
3319
3320 ret = s->userdata;
3321 s->userdata = userdata;
3322
3323 return ret;
3324}
3325
3326static int event_source_enter_ratelimited(sd_event_source *s) {
3327 int r;
3328
3329 assert(s);
3330
3331 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, keyed
3332 * by the end of the rate limit time window, much as if it were a timer event source. */
3333
3334 if (s->ratelimited)
3335 return 0; /* Already ratelimited, hence this is a NOP */
3336
3337 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3338 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3339 if (r < 0)
3340 return r;
3341
3342 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
3343 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
3344 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
3345 if (EVENT_SOURCE_IS_TIME(s->type))
3346 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3347
3348 /* Now, let's add the event source to the monotonic clock instead */
3349 r = event_source_time_prioq_put(s, &s->event->monotonic);
3350 if (r < 0)
3351 goto fail;
3352
3353 /* And let's take the event source officially offline */
3354 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3355 if (r < 0) {
3356 event_source_time_prioq_remove(s, &s->event->monotonic);
3357 goto fail;
3358 }
3359
3360 event_source_pp_prioq_reshuffle(s);
3361
3362 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3363 return 0;
3364
3365fail:
3366 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
3367 * space for it should already be allocated. */
3368 if (EVENT_SOURCE_IS_TIME(s->type))
3369 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3370
3371 return r;
3372}
3373
3374static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
3375 int r;
3376
3377 assert(s);
3378
3379 if (!s->ratelimited)
3380 return 0;
3381
3382 /* Let's take the event source out of the monotonic prioq first. */
3383 event_source_time_prioq_remove(s, &s->event->monotonic);
3384
3385 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3386 if (EVENT_SOURCE_IS_TIME(s->type)) {
3387 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3388 if (r < 0)
3389 goto fail;
3390 }
3391
3392 /* Let's try to take it online again. */
3393 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3394 if (r < 0) {
3395 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3396 if (EVENT_SOURCE_IS_TIME(s->type))
3397 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3398
3399 goto fail;
3400 }
3401
3402 event_source_pp_prioq_reshuffle(s);
3403 ratelimit_reset(&s->rate_limit);
3404
3405 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3406
3407 if (run_callback && s->ratelimit_expire_callback) {
3408 s->dispatching = true;
3409 r = s->ratelimit_expire_callback(s, s->userdata);
3410 s->dispatching = false;
3411
3412 if (r < 0) {
3413 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3414 strna(s->description),
3415 event_source_type_to_string(s->type),
3416 s->exit_on_failure ? "exiting" : "disabling");
3417
3418 if (s->exit_on_failure)
3419 (void) sd_event_exit(s->event, r);
3420 }
3421
3422 if (s->n_ref == 0)
3423 source_free(s);
3424 else if (r < 0)
3425 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3426
3427 return 1;
3428 }
3429
3430 return 0;
3431
3432fail:
3433 /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
3434 * simply put it back in it, maybe we can then process it more successfully next iteration. */
3435 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3436
3437 return r;
3438}
3439
3440static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3441 usec_t c;
3442 assert(e);
3443 assert(a <= b);
3444
3445 if (a <= 0)
3446 return 0;
3447 if (a >= USEC_INFINITY)
3448 return USEC_INFINITY;
3449
3450 if (b <= a + 1)
3451 return a;
3452
3453 initialize_perturb(e);
3454
3455 /*
3456 Find a good time to wake up again between times a and b. We
3457 have two goals here:
3458
3459 a) We want to wake up as seldom as possible, hence prefer
3460 later times over earlier times.
3461
3462 b) But if we have to wake up, then let's make sure to
3463 dispatch as much as possible on the entire system.
3464
3465 We implement this by waking up everywhere at the same time
3466 within any given minute if we can, synchronised via the
3467 perturbation value determined from the boot ID. If we can't,
3468 then we try to find the same spot in every 10s, then 1s and
3469 then 250ms steps. Otherwise, we pick the last possible time
3470 to wake up.
3471 */
3472
3473 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3474 if (c >= b) {
3475 if (_unlikely_(c < USEC_PER_MINUTE))
3476 return b;
3477
3478 c -= USEC_PER_MINUTE;
3479 }
3480
3481 if (c >= a)
3482 return c;
3483
3484 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3485 if (c >= b) {
3486 if (_unlikely_(c < USEC_PER_SEC*10))
3487 return b;
3488
3489 c -= USEC_PER_SEC*10;
3490 }
3491
3492 if (c >= a)
3493 return c;
3494
3495 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3496 if (c >= b) {
3497 if (_unlikely_(c < USEC_PER_SEC))
3498 return b;
3499
3500 c -= USEC_PER_SEC;
3501 }
3502
3503 if (c >= a)
3504 return c;
3505
3506 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3507 if (c >= b) {
3508 if (_unlikely_(c < USEC_PER_MSEC*250))
3509 return b;
3510
3511 c -= USEC_PER_MSEC*250;
3512 }
3513
3514 if (c >= a)
3515 return c;
3516
3517 return b;
3518}
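
/* Editor's note: a worked example (not part of the original source) of the coalescing above, with
 * hypothetical numbers. Assume a = 62.0s, b = 63.5s (converted to usec) and a per-boot perturbation
 * of 2.7s:
 *
 *     c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + perturb
 *       = 60.0s + 2.7s = 62.7s
 *
 * 62.7s is below b and at or above a, so the loop wakes up at 62.7s: every timer on this machine
 * whose [a, b] window covers "minute boundary + perturbation" fires at that same instant, keeping
 * wakeups batched. Had the minute-aligned candidate fallen outside [a, b], the same calculation
 * would be retried at 10s, 1s and 250ms granularity before giving up and returning b. */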
3519
3520static int event_arm_timer(
3521 sd_event *e,
3522 struct clock_data *d) {
3523
3524 struct itimerspec its = {};
3525 sd_event_source *a, *b;
3526 usec_t t;
3527
3528 assert(e);
3529 assert(d);
3530
3531 if (!d->needs_rearm)
3532 return 0;
3533
3534 d->needs_rearm = false;
3535
3536 a = prioq_peek(d->earliest);
3537 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3538 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3539
3540 if (d->fd < 0)
3541 return 0;
3542
3543 if (d->next == USEC_INFINITY)
3544 return 0;
3545
3546 /* disarm */
3547 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3548 return -errno;
3549
3550 d->next = USEC_INFINITY;
3551 return 0;
3552 }
3553
3554 b = prioq_peek(d->latest);
3555 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3556 assert(b && b->enabled != SD_EVENT_OFF);
3557
3558 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3559 if (d->next == t)
3560 return 0;
3561
3562 assert_se(d->fd >= 0);
3563
3564 if (t == 0) {
3565 /* We don't want to disarm here, just mean some time looooong ago. */
3566 its.it_value.tv_sec = 0;
3567 its.it_value.tv_nsec = 1;
3568 } else
3569 timespec_store(&its.it_value, t);
3570
3571 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3572 return -errno;
3573
3574 d->next = t;
3575 return 0;
3576}
3577
3578static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3579 assert(e);
3580 assert(s);
3581 assert(s->type == SOURCE_IO);
3582
3583 /* If the event source was already pending, we just OR in the
3584 * new revents, otherwise we reset the value. The ORing is
3585 * necessary to handle EPOLLONESHOT events properly where
3586 * readability might happen independently of writability, and
3587 * we need to keep track of both */
3588
3589 if (s->pending)
3590 s->io.revents |= revents;
3591 else
3592 s->io.revents = revents;
3593
3594 return source_set_pending(s, true);
3595}
3596
3597static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3598 uint64_t x;
3599 ssize_t ss;
3600
3601 assert(e);
3602 assert(fd >= 0);
3603
3604 assert_return(events == EPOLLIN, -EIO);
3605
3606 ss = read(fd, &x, sizeof(x));
3607 if (ss < 0) {
3608 if (ERRNO_IS_TRANSIENT(errno))
3609 return 0;
3610
3611 return -errno;
3612 }
3613
3614 if (_unlikely_(ss != sizeof(x)))
3615 return -EIO;
3616
3617 if (next)
3618 *next = USEC_INFINITY;
3619
3620 return 0;
3621}
3622
3623static int process_timer(
3624 sd_event *e,
3625 usec_t n,
3626 struct clock_data *d) {
3627
3628 sd_event_source *s;
3629 bool callback_invoked = false;
3630 int r;
3631
3632 assert(e);
3633 assert(d);
3634
3635 for (;;) {
3636 s = prioq_peek(d->earliest);
3637 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3638
3639 if (!s || time_event_source_next(s) > n)
3640 break;
3641
3642 if (s->ratelimited) {
3643 /* This is an event source whose ratelimit window has ended. Let's turn it on
3644 * again. */
3645 assert(s->ratelimited);
3646
3647 r = event_source_leave_ratelimit(s, /* run_callback */ true);
3648 if (r < 0)
3649 return r;
3650 else if (r == 1)
3651 callback_invoked = true;
3652
3653 continue;
3654 }
3655
3656 if (s->enabled == SD_EVENT_OFF || s->pending)
3657 break;
3658
3659 r = source_set_pending(s, true);
3660 if (r < 0)
3661 return r;
3662
3663 event_source_time_prioq_reshuffle(s);
3664 }
3665
3666 return callback_invoked;
3667}
3668
3669static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3670 int64_t min_priority = threshold;
3671 bool something_new = false;
3672 sd_event_source *s;
3673 int r;
3674
3675 assert(e);
3676 assert(ret_min_priority);
3677
3678 if (!e->need_process_child) {
3679 *ret_min_priority = min_priority;
3680 return 0;
3681 }
3682
3683 e->need_process_child = false;
3684
3685 /* So, this is ugly. We iteratively invoke waitid() + WNOHANG with each child process we shall wait for,
3686 * instead of using P_ALL. This is because we only want to get child information of very specific
3687 * child processes, and not all of them. We might not have processed the SIGCHLD event
3688 * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3689 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3690 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3691 * to handle SIGCHLD yourself.
3692 *
3693 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3694 * source is dispatched so that the callback still sees the process as a zombie. */
3695
3696 HASHMAP_FOREACH(s, e->child_sources) {
3697 assert(s->type == SOURCE_CHILD);
3698 assert(s->child.pidfd >= 0);
3699
3700 if (s->priority > threshold)
3701 continue;
3702
3703 if (s->pending)
3704 continue;
3705
3706 if (event_source_is_offline(s))
3707 continue;
3708
3709 if (s->child.exited)
3710 continue;
3711
3712 if (EVENT_SOURCE_WATCH_PIDFD(s))
3713 /* There's a usable pidfd known for this event source? Then don't waitid() for
3714 * it here */
3715 continue;
3716
3717 zero(s->child.siginfo);
3718 if (waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo,
3719 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3720 return negative_errno();
3721
3722 if (s->child.siginfo.si_pid != 0) {
3723 bool zombie = SIGINFO_CODE_IS_DEAD(s->child.siginfo.si_code);
3724
3725 if (zombie)
3726 s->child.exited = true;
3727 else if (s->child.options & WEXITED) {
3728 /* If the child isn't dead then let's immediately remove the state change
3729 * from the queue, since there's no benefit in leaving it queued. */
3730
3731 assert(s->child.options & (WSTOPPED|WCONTINUED));
3732 (void) waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3733 }
3734
3735 r = source_set_pending(s, true);
3736 if (r < 0)
3737 return r;
3738 if (r > 0) {
3739 something_new = true;
3740 min_priority = MIN(min_priority, s->priority);
3741 }
3742 }
3743 }
3744
3745 *ret_min_priority = min_priority;
3746 return something_new;
3747}
3748
3749static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3750 assert(e);
3751 assert(s);
3752 assert(s->type == SOURCE_CHILD);
3753 assert(s->child.pidfd >= 0);
3754
3755 if (s->pending)
3756 return 0;
3757
3758 if (event_source_is_offline(s))
3759 return 0;
3760
3761 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3762 return 0;
3763
3764 /* Note that pidfd would also generate EPOLLHUP when the process gets reaped. But at this point we
3765 * only permit EPOLLIN, under the assumption that upon EPOLLHUP the child source should already
3766 * be set to pending, and we would have returned early above. */
3767 assert(!s->child.exited);
3768
3769 zero(s->child.siginfo);
3770 if (waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3771 return -errno;
3772
3773 if (s->child.siginfo.si_pid == 0)
3774 return 0;
3775
3776 if (SIGINFO_CODE_IS_DEAD(s->child.siginfo.si_code))
3777 s->child.exited = true;
3778
3779 return source_set_pending(s, true);
3780}
3781
3782static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3783 int r;
3784
3785 assert(e);
3786 assert(d);
3787 assert_return(events == EPOLLIN, -EIO);
3788 assert(min_priority);
3789
3790 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3791 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3792 * per priority, and if we dequeue one, a SIGCHLD enqueued behind it would go unnoticed even
3793 * though we might have higher-priority children we care about; hence we need to check for
3794 * them explicitly. */
3795
3796 if (sigismember(&d->sigset, SIGCHLD))
3797 e->need_process_child = true;
3798
3799 /* If there's already an event source pending for this priority we don't read another */
3800 if (d->current)
3801 return 0;
3802
3803 for (;;) {
3804 struct signalfd_siginfo si;
3805 ssize_t n;
3806 sd_event_source *s = NULL;
3807
3808 n = read(d->fd, &si, sizeof(si));
3809 if (n < 0) {
3810 if (ERRNO_IS_TRANSIENT(errno))
3811 return 0;
3812
3813 return -errno;
3814 }
3815
3816 if (_unlikely_(n != sizeof(si)))
3817 return -EIO;
3818
3819 if (_unlikely_(!SIGNAL_VALID(si.ssi_signo)))
3820 return -EIO;
3821
3822 if (e->signal_sources)
3823 s = e->signal_sources[si.ssi_signo];
3824 if (!s)
3825 continue;
3826 if (s->pending)
3827 continue;
3828
3829 s->signal.siginfo = si;
3830 d->current = s;
3831
3832 r = source_set_pending(s, true);
3833 if (r < 0)
3834 return r;
3835 if (r > 0 && *min_priority >= s->priority) {
3836 *min_priority = s->priority;
3837 return 1; /* an event source with smaller priority is queued. */
3838 }
3839
3840 return 0;
3841 }
3842}
3843
3844static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3845 ssize_t n;
3846
3847 assert(e);
3848 assert(d);
3849
3850 assert_return(revents == EPOLLIN, -EIO);
3851
3852 /* If there's already an event source pending for this priority, don't read another */
3853 if (d->n_pending > 0)
3854 return 0;
3855
3856 /* Is the read buffer non-empty? If so, let's not read more */
3857 if (d->buffer_filled > 0)
3858 return 0;
3859
3860 if (d->priority > threshold)
3861 return 0;
3862
3863 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3864 if (n < 0) {
3865 if (ERRNO_IS_TRANSIENT(errno))
3866 return 0;
3867
3868 return -errno;
3869 }
3870
3871 assert(n > 0);
3872 d->buffer_filled = (size_t) n;
3873 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
3874
3875 return 1;
3876}
3877
3878static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3879 assert(e);
3880 assert(d);
3881 assert(sz <= d->buffer_filled);
3882
3883 if (sz == 0)
3884 return;
3885
3886 /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3887 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3888 d->buffer_filled -= sz;
3889
3890 if (d->buffer_filled == 0)
3891 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
3892}
3893
3894static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3895 int r;
3896
3897 assert(e);
3898 assert(d);
3899
3900 /* If there's already an event source pending for this priority, don't read another */
3901 if (d->n_pending > 0)
3902 return 0;
3903
3904 while (d->buffer_filled > 0) {
3905 size_t sz;
3906
3907 /* Let's validate that the event structures are complete */
3908 if (d->buffer_filled < offsetof(struct inotify_event, name))
3909 return -EIO;
3910
3911 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3912 if (d->buffer_filled < sz)
3913 return -EIO;
3914
3915 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3916 struct inode_data *inode_data;
3917
3918 /* The queue overran, let's pass this event to all event sources connected to this inotify
3919 * object */
3920
3921 HASHMAP_FOREACH(inode_data, d->inodes)
3922 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3923
3924 if (event_source_is_offline(s))
3925 continue;
3926
3927 r = source_set_pending(s, true);
3928 if (r < 0)
3929 return r;
3930 }
3931 } else {
3932 struct inode_data *inode_data;
3933
3934 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3935 * our watch descriptor table. */
3936 if (d->buffer.ev.mask & IN_IGNORED) {
3937
3938 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3939 if (!inode_data) {
3940 event_inotify_data_drop(e, d, sz);
3941 continue;
3942 }
3943
3944 /* The watch descriptor was removed by the kernel, let's drop it here too */
3945 inode_data->wd = -1;
3946 } else {
3947 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3948 if (!inode_data) {
3949 event_inotify_data_drop(e, d, sz);
3950 continue;
3951 }
3952 }
3953
3954 /* Trigger all event sources that are interested in these events. Also trigger all event
3955 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3956 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3957
3958 if (event_source_is_offline(s))
3959 continue;
3960
3961 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3962 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3963 continue;
3964
3965 r = source_set_pending(s, true);
3966 if (r < 0)
3967 return r;
3968 }
3969 }
3970
3971 /* Something pending now? If so, let's finish, otherwise let's read more. */
3972 if (d->n_pending > 0)
3973 return 1;
3974 }
3975
3976 return 0;
3977}
3978
3979static int process_inotify(sd_event *e) {
3980 int r, done = 0;
3981
3982 assert(e);
3983
3984 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
3985 r = event_inotify_data_process(e, d);
3986 if (r < 0)
3987 return r;
3988 if (r > 0)
3989 done++;
3990 }
3991
3992 return done;
3993}
3994
3995static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
3996 assert(s);
3997 assert(s->type == SOURCE_MEMORY_PRESSURE);
3998
3999 if (s->pending)
4000 s->memory_pressure.revents |= revents;
4001 else
4002 s->memory_pressure.revents = revents;
4003
4004 return source_set_pending(s, true);
4005}
4006
4007static int source_memory_pressure_write(sd_event_source *s) {
4008 ssize_t n;
4009 int r;
4010
4011 assert(s);
4012 assert(s->type == SOURCE_MEMORY_PRESSURE);
4013
4014 /* Once we start writing, the buffer is locked; we allow no further changes. */
4015 s->memory_pressure.locked = true;
4016
4017 if (s->memory_pressure.write_buffer_size > 0) {
4018 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4019 if (n < 0) {
4020 if (!ERRNO_IS_TRANSIENT(errno)) {
4021 /* If the kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose the PSI
4022 * files, but then generate EOPNOTSUPP on read() and write() (instead of on
4023 * open()!). This sucks hard, since we can only detect this kind of failure
4024 * so late. Let's make the best of it, and turn off the event source like we
4025 * do for failed event source handlers. */
4026
4027 log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
4028 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4029 return 0;
4030 }
4031
4032 n = 0;
4033 }
4034 } else
4035 n = 0;
4036
4037 assert(n >= 0);
4038
4039 if ((size_t) n == s->memory_pressure.write_buffer_size) {
4040 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4041
4042 if (n > 0) {
4043 s->memory_pressure.write_buffer_size = 0;
4044
4045 /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4046 r = source_memory_pressure_register(s, s->enabled);
4047 if (r < 0)
4048 return r;
4049 }
4050 } else if (n > 0) {
4051 _cleanup_free_ void *c = NULL;
4052
4053 assert((size_t) n < s->memory_pressure.write_buffer_size);
4054
4055 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4056 if (!c)
4057 return -ENOMEM;
4058
4059 free_and_replace(s->memory_pressure.write_buffer, c);
4060 s->memory_pressure.write_buffer_size -= n;
4061 return 1;
4062 }
4063
4064 return 0;
4065}
4066
4067static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4068 int r;
4069
4070 assert(s);
4071 assert(s->type == SOURCE_MEMORY_PRESSURE);
4072
4073 r = source_memory_pressure_write(s);
4074 if (r < 0)
4075 return r;
4076 if (r > 0)
4077                return 1; /* If we wrote something, don't continue with dispatching the user callback. Instead,
4078                           * short-cut things so that we wait for the next EPOLLOUT immediately. */
4079
4080 /* No pending incoming IO? Then let's not continue further */
4081 if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4082
4083                /* Treat IO errors on the notifier the same way as errors returned from a callback */
4084 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4085 return -EIO;
4086
4087 return 1; /* leave dispatch, we already processed everything */
4088 }
4089
4090 if (s->memory_pressure.revents & EPOLLIN) {
4091 uint8_t pipe_buf[PIPE_BUF];
4092 ssize_t n;
4093
4094 /* If the fd is readable, then flush out anything that might be queued */
4095
4096 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4097 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4098 return -errno;
4099 }
4100
4101 return 0; /* go on, dispatch to user callback */
4102}
4103
4104static int source_dispatch(sd_event_source *s) {
4105 EventSourceType saved_type;
4106 sd_event *saved_event;
4107 int r = 0;
4108
4109 assert(s);
4110 assert(s->pending || s->type == SOURCE_EXIT);
4111
4112        /* Save the event source type here, so that we still know it after the event callback, which might
4113         * invalidate the event source. */
4114 saved_type = s->type;
4115
4116 /* Similarly, store a reference to the event loop object, so that we can still access it after the
4117 * callback might have invalidated/disconnected the event source. */
4118 saved_event = s->event;
4119 PROTECT_EVENT(saved_event);
4120
4121        /* Check if we hit the ratelimit for this event source, and if so, take it offline until the ratelimit interval is over. */
4122 assert(!s->ratelimited);
4123 if (!ratelimit_below(&s->rate_limit)) {
4124 r = event_source_enter_ratelimited(s);
4125 if (r < 0)
4126 return r;
4127
4128 return 1;
4129 }
4130
4131 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
4132 r = source_set_pending(s, false);
4133 if (r < 0)
4134 return r;
4135 }
4136
4137 if (s->type != SOURCE_POST) {
4138 sd_event_source *z;
4139
4140 /* If we execute a non-post source, let's mark all post sources as pending. */
4141
4142 SET_FOREACH(z, s->event->post_sources) {
4143 if (event_source_is_offline(z))
4144 continue;
4145
4146 r = source_set_pending(z, true);
4147 if (r < 0)
4148 return r;
4149 }
4150 }
4151
4152 if (s->type == SOURCE_MEMORY_PRESSURE) {
4153 r = source_memory_pressure_initiate_dispatch(s);
4154 if (r == -EIO) /* handle EIO errors similar to callback errors */
4155 goto finish;
4156 if (r < 0)
4157 return r;
4158 if (r > 0) /* already handled */
4159 return 1;
4160 }
4161
4162 if (s->enabled == SD_EVENT_ONESHOT) {
4163 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
4164 if (r < 0)
4165 return r;
4166 }
4167
4168 s->dispatching = true;
4169
4170 switch (s->type) {
4171
4172 case SOURCE_IO:
4173 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4174 break;
4175
4176 case SOURCE_TIME_REALTIME:
4177 case SOURCE_TIME_BOOTTIME:
4178 case SOURCE_TIME_MONOTONIC:
4179 case SOURCE_TIME_REALTIME_ALARM:
4180 case SOURCE_TIME_BOOTTIME_ALARM:
4181 r = s->time.callback(s, s->time.next, s->userdata);
4182 break;
4183
4184 case SOURCE_SIGNAL:
4185 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4186 break;
4187
4188 case SOURCE_CHILD: {
4189 bool zombie = SIGINFO_CODE_IS_DEAD(s->child.siginfo.si_code);
4190
4191 r = s->child.callback(s, &s->child.siginfo, s->userdata);
4192
4193 /* Now, reap the PID for good. */
4194 if (zombie) {
4195 (void) waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG|WEXITED);
4196 s->child.waited = true;
4197 }
4198
4199 break;
4200 }
4201
4202 case SOURCE_DEFER:
4203 r = s->defer.callback(s, s->userdata);
4204 break;
4205
4206 case SOURCE_POST:
4207 r = s->post.callback(s, s->userdata);
4208 break;
4209
4210 case SOURCE_EXIT:
4211 r = s->exit.callback(s, s->userdata);
4212 break;
4213
4214 case SOURCE_INOTIFY: {
4215 struct sd_event *e = s->event;
4216 struct inotify_data *d;
4217 size_t sz;
4218
4219 assert(s->inotify.inode_data);
4220 assert_se(d = s->inotify.inode_data->inotify_data);
4221
4222 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4223 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4224 assert(d->buffer_filled >= sz);
4225
4226 /* If the inotify callback destroys the event source then this likely means we don't need to
4227 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4228 * free it immediately, then we couldn't drop the event from the inotify event queue without
4229 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
4230 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4231 * explicitly GC it after we are done dropping the inotify event from the buffer. */
4232 d->n_busy++;
4233 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
4234 d->n_busy--;
4235
4236 /* When no event is pending anymore on this inotify object, then let's drop the event from
4237 * the inotify event queue buffer. */
4238 if (d->n_pending == 0)
4239 event_inotify_data_drop(e, d, sz);
4240
4241 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4242 event_gc_inotify_data(e, d);
4243 break;
4244 }
4245
4246 case SOURCE_MEMORY_PRESSURE:
4247 r = s->memory_pressure.callback(s, s->userdata);
4248 break;
4249
4250 case SOURCE_WATCHDOG:
4251 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
4252 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
4253 assert_not_reached();
4254 }
4255
4256 s->dispatching = false;
4257
4258finish:
4259 if (r < 0) {
4260 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4261 strna(s->description),
4262 event_source_type_to_string(saved_type),
4263 s->exit_on_failure ? "exiting" : "disabling");
4264
4265 if (s->exit_on_failure)
4266 (void) sd_event_exit(saved_event, r);
4267 }
4268
4269 if (s->n_ref == 0)
4270 source_free(s);
4271 else if (r < 0)
4272 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4273
4274 return 1;
4275}
4276
4277static int event_prepare(sd_event *e) {
4278 int r;
4279
4280 assert(e);
4281
4282 for (;;) {
4283 sd_event_source *s;
4284
4285 s = prioq_peek(e->prepare);
4286 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
4287 break;
4288
4289 s->prepare_iteration = e->iteration;
4290 prioq_reshuffle(e->prepare, s, &s->prepare_index);
4291
4292 assert(s->prepare);
4293 s->dispatching = true;
4294 r = s->prepare(s, s->userdata);
4295 s->dispatching = false;
4296
4297 if (r < 0) {
4298 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4299 strna(s->description),
4300 event_source_type_to_string(s->type),
4301 s->exit_on_failure ? "exiting" : "disabling");
4302
4303 if (s->exit_on_failure)
4304 (void) sd_event_exit(e, r);
4305 }
4306
4307 if (s->n_ref == 0)
4308 source_free(s);
4309 else if (r < 0)
4310 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4311 }
4312
4313 return 0;
4314}
4315
4316static int dispatch_exit(sd_event *e) {
4317 sd_event_source *p;
4318 int r;
4319
4320 assert(e);
4321
4322 p = prioq_peek(e->exit);
4323 assert(!p || p->type == SOURCE_EXIT);
4324
4325 if (!p || event_source_is_offline(p)) {
4326 e->state = SD_EVENT_FINISHED;
4327 return 0;
4328 }
4329
4330 PROTECT_EVENT(e);
4331 e->iteration++;
4332 e->state = SD_EVENT_EXITING;
4333 r = source_dispatch(p);
4334 e->state = SD_EVENT_INITIAL;
4335 return r;
4336}
4337
4338static sd_event_source* event_next_pending(sd_event *e) {
4339 sd_event_source *p;
4340
4341 assert(e);
4342
4343 p = prioq_peek(e->pending);
4344 if (!p)
4345 return NULL;
4346
4347 if (event_source_is_offline(p))
4348 return NULL;
4349
4350 return p;
4351}
4352
4353static int arm_watchdog(sd_event *e) {
4354 struct itimerspec its = {};
4355 usec_t t;
4356
4357 assert(e);
4358 assert(e->watchdog_fd >= 0);
4359
4360 t = sleep_between(e,
4361 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4362 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
4363
4364 timespec_store(&its.it_value, t);
4365
4366 /* Make sure we never set the watchdog to 0, which tells the
4367 * kernel to disable it. */
4368 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4369 its.it_value.tv_nsec = 1;
4370
4371 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
4372}
4373
4374static int process_watchdog(sd_event *e) {
4375 assert(e);
4376
4377 if (!e->watchdog)
4378 return 0;
4379
4380        /* Don't notify the watchdog too often */
4381 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4382 return 0;
4383
4384 sd_notify(false, "WATCHDOG=1");
4385 e->watchdog_last = e->timestamp.monotonic;
4386
4387 return arm_watchdog(e);
4388}
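
/* A worked example of the timing above, assuming the service manager passed WATCHDOG_USEC=30s (i.e.
 * watchdog_period is 30s) and the last ping went out at time T: process_watchdog() will not ping again
 * before T + 7.5s (period/4), while arm_watchdog() programs the timerfd to fire somewhere in
 * [T + 15s, T + 22.5s] (between period/2 and period*3/4), so a fresh "WATCHDOG=1" is always sent well
 * before the 30s deadline expires. */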
4389
4390static void event_close_inode_data_fds(sd_event *e) {
4391 struct inode_data *d;
4392
4393 assert(e);
4394
4395        /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4396         * filesystems. But we can't close them right away as we need them as long as the user still wants to make
4397         * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
4398         * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4399         * compromise. */
4400
4401 while ((d = e->inode_data_to_close_list)) {
4402 assert(d->fd >= 0);
4403 d->fd = safe_close(d->fd);
4404
4405 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
4406 }
4407}
4408
4409static int event_memory_pressure_write_list(sd_event *e) {
4410 int r;
4411
4412 assert(e);
4413
4414 for (;;) {
4415 sd_event_source *s;
4416
4417 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4418 if (!s)
4419 break;
4420
4421 assert(s->type == SOURCE_MEMORY_PRESSURE);
4422 assert(s->memory_pressure.write_buffer_size > 0);
4423 s->memory_pressure.in_write_list = false;
4424
4425 r = source_memory_pressure_write(s);
4426 if (r < 0)
4427 return r;
4428 }
4429
4430 return 0;
4431}
4432
4433_public_ int sd_event_prepare(sd_event *e) {
4434 int r;
4435
4436 assert_return(e, -EINVAL);
4437 assert_return(e = event_resolve(e), -ENOPKG);
4438 assert_return(!event_origin_changed(e), -ECHILD);
4439 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4440 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4441
4442        /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
4443         * this check here once, since gettid() is typically not cached, and we thus want to minimize
4444         * syscalls. */
4445 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4446
4447 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4448 PROTECT_EVENT(e);
4449
4450 if (e->exit_requested)
4451 goto pending;
4452
4453 e->iteration++;
4454
4455 e->state = SD_EVENT_PREPARING;
4456 r = event_prepare(e);
4457 e->state = SD_EVENT_INITIAL;
4458 if (r < 0)
4459 return r;
4460
4461 r = event_memory_pressure_write_list(e);
4462 if (r < 0)
4463 return r;
4464
4465 r = event_arm_timer(e, &e->realtime);
4466 if (r < 0)
4467 return r;
4468
4469 r = event_arm_timer(e, &e->boottime);
4470 if (r < 0)
4471 return r;
4472
4473 r = event_arm_timer(e, &e->monotonic);
4474 if (r < 0)
4475 return r;
4476
4477 r = event_arm_timer(e, &e->realtime_alarm);
4478 if (r < 0)
4479 return r;
4480
4481 r = event_arm_timer(e, &e->boottime_alarm);
4482 if (r < 0)
4483 return r;
4484
4485 event_close_inode_data_fds(e);
4486
4487 if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
4488 goto pending;
4489
4490 e->state = SD_EVENT_ARMED;
4491
4492 return 0;
4493
4494pending:
4495 e->state = SD_EVENT_ARMED;
4496 r = sd_event_wait(e, 0);
4497 if (r == 0)
4498 e->state = SD_EVENT_ARMED;
4499
4500 return r;
4501}
4502
4503static int epoll_wait_usec(
4504 int fd,
4505 struct epoll_event *events,
4506 int maxevents,
4507 usec_t timeout) {
4508
4509 int msec;
4510 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4511
4512#if HAVE_EPOLL_PWAIT2
4513 static bool epoll_pwait2_absent = false;
4514 int r;
4515
4516        /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4517         * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4518         * is not that obvious to implement given the libc and kernel definitions differ in the last
4519         * argument. Moreover, the only reason to use it is the more accurate timeouts (which is not a
4520         * biggie). Let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
4521         * missing. */
4522
4523 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
4524 r = epoll_pwait2(fd,
4525 events,
4526 maxevents,
4527 TIMESPEC_STORE(timeout),
4528 NULL);
4529 if (r >= 0)
4530 return r;
4531 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
4532                        return -errno; /* Only fall back to the old epoll_wait() if the syscall is masked or not
4533                                        * supported. */
4534
4535 epoll_pwait2_absent = true;
4536 }
4537#endif
4538
4539 if (timeout == USEC_INFINITY)
4540 msec = -1;
4541 else {
4542 usec_t k;
4543
4544 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4545 if (k >= INT_MAX)
4546 msec = INT_MAX; /* Saturate */
4547 else
4548 msec = (int) k;
4549 }
4550
4551 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
4552}
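
/* For example (illustrative values): a timeout of 2500µs is rounded up via DIV_ROUND_UP() to 3ms rather
 * than truncated to 2ms, so the fallback path never wakes up early; USEC_INFINITY maps to -1, i.e. "block
 * forever", and anything beyond INT_MAX milliseconds saturates at INT_MAX. */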
4553
4554static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
4555 size_t n_event_queue, m, n_event_max;
4556 int64_t min_priority = threshold;
4557 bool something_new = false;
4558 int r;
4559
4560 assert(e);
4561 assert(ret_min_priority);
4562
4563 n_event_queue = MAX(e->n_sources, 1u);
4564 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
4565 return -ENOMEM;
4566
4567 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4568
4569        /* If we still have inotify data buffered, then query the other fds, but don't block */
4570 if (e->buffered_inotify_data_list)
4571 timeout = 0;
4572
4573 for (;;) {
4574 r = epoll_wait_usec(
4575 e->epoll_fd,
4576 e->event_queue,
4577 n_event_max,
4578 timeout);
4579 if (r < 0)
4580 return r;
4581
4582 m = (size_t) r;
4583
4584 if (m < n_event_max)
4585 break;
4586
4587 if (n_event_max >= n_event_queue * 10)
4588 break;
4589
4590 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
4591 return -ENOMEM;
4592
4593 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4594 timeout = 0;
4595 }
4596
4597        /* Set the timestamp only when this is called for the first time. */
4598 if (threshold == INT64_MAX)
4599 triple_timestamp_now(&e->timestamp);
4600
4601 for (size_t i = 0; i < m; i++) {
4602
4603 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4604 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
4605 else {
4606 WakeupType *t = e->event_queue[i].data.ptr;
4607
4608 switch (*t) {
4609
4610 case WAKEUP_EVENT_SOURCE: {
4611 sd_event_source *s = e->event_queue[i].data.ptr;
4612
4613 assert(s);
4614
4615 if (s->priority > threshold)
4616 continue;
4617
4618 min_priority = MIN(min_priority, s->priority);
4619
4620 switch (s->type) {
4621
4622 case SOURCE_IO:
4623 r = process_io(e, s, e->event_queue[i].events);
4624 break;
4625
4626 case SOURCE_CHILD:
4627 r = process_pidfd(e, s, e->event_queue[i].events);
4628 break;
4629
4630 case SOURCE_MEMORY_PRESSURE:
4631 r = process_memory_pressure(s, e->event_queue[i].events);
4632 break;
4633
4634 default:
4635 assert_not_reached();
4636 }
4637
4638 break;
4639 }
4640
4641 case WAKEUP_CLOCK_DATA: {
4642 struct clock_data *d = e->event_queue[i].data.ptr;
4643
4644 assert(d);
4645
4646 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4647 break;
4648 }
4649
4650 case WAKEUP_SIGNAL_DATA:
4651 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4652 break;
4653
4654 case WAKEUP_INOTIFY_DATA:
4655 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4656 break;
4657
4658 default:
4659 assert_not_reached();
4660 }
4661 }
4662 if (r < 0)
4663 return r;
4664 if (r > 0)
4665 something_new = true;
4666 }
4667
4668 *ret_min_priority = min_priority;
4669 return something_new;
4670}
4671
4672_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4673 int r;
4674
4675 assert_return(e, -EINVAL);
4676 assert_return(e = event_resolve(e), -ENOPKG);
4677 assert_return(!event_origin_changed(e), -ECHILD);
4678 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4679 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4680
4681 if (e->exit_requested) {
4682 e->state = SD_EVENT_PENDING;
4683 return 1;
4684 }
4685
4686 for (int64_t threshold = INT64_MAX; ; threshold--) {
4687 int64_t epoll_min_priority, child_min_priority;
4688
4689                /* It is possible that new epoll (especially IO) and child events are triggered just
4690                 * after the process_epoll() call but before process_child(), and the new IO events
4691                 * may have higher priority than the child events. To salvage these events, let's call
4692                 * epoll_wait() again, but accept only events with a higher priority than the previous
4693                 * ones. See issue https://github.com/systemd/systemd/issues/18190 and comments
4694                 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4695                 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4696
4697 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4698 if (r == -EINTR) {
4699 e->state = SD_EVENT_PENDING;
4700 return 1;
4701 }
4702 if (r < 0)
4703 goto finish;
4704 if (r == 0 && threshold < INT64_MAX)
4705 /* No new epoll event. */
4706 break;
4707
4708 r = process_child(e, threshold, &child_min_priority);
4709 if (r < 0)
4710 goto finish;
4711 if (r == 0)
4712 /* No new child event. */
4713 break;
4714
4715 threshold = MIN(epoll_min_priority, child_min_priority);
4716 if (threshold == INT64_MIN)
4717 break;
4718
4719 timeout = 0;
4720 }
4721
4722 r = process_watchdog(e);
4723 if (r < 0)
4724 goto finish;
4725
4726 r = process_inotify(e);
4727 if (r < 0)
4728 goto finish;
4729
4730 r = process_timer(e, e->timestamp.realtime, &e->realtime);
4731 if (r < 0)
4732 goto finish;
4733
4734 r = process_timer(e, e->timestamp.boottime, &e->boottime);
4735 if (r < 0)
4736 goto finish;
4737
4738 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4739 if (r < 0)
4740 goto finish;
4741
4742 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4743 if (r < 0)
4744 goto finish;
4745
4746 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4747 if (r < 0)
4748 goto finish;
4749 else if (r == 1) {
4750                /* A ratelimit expiry callback was called. Let's postpone processing pending sources and
4751                 * put the loop into the initial state, in order to also evaluate (in the next iteration)
4752                 * sources that were potentially re-enabled by the callback.
4753                 *
4754                 * Wondering why we treat only this invocation of process_timer() differently? Once an event
4755                 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4756                 * ratelimit expiry callback is never called for any other timer type. */
4757 r = 0;
4758 goto finish;
4759 }
4760
4761 if (event_next_pending(e)) {
4762 e->state = SD_EVENT_PENDING;
4763 return 1;
4764 }
4765
4766 r = 0;
4767
4768finish:
4769 e->state = SD_EVENT_INITIAL;
4770
4771 return r;
4772}
4773
4774_public_ int sd_event_dispatch(sd_event *e) {
4775 sd_event_source *p;
4776 int r;
4777
4778 assert_return(e, -EINVAL);
4779 assert_return(e = event_resolve(e), -ENOPKG);
4780 assert_return(!event_origin_changed(e), -ECHILD);
4781 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4782 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4783
4784 if (e->exit_requested)
4785 return dispatch_exit(e);
4786
4787 p = event_next_pending(e);
4788 if (p) {
4789 PROTECT_EVENT(e);
4790
4791 e->state = SD_EVENT_RUNNING;
4792 r = source_dispatch(p);
4793 e->state = SD_EVENT_INITIAL;
4794 return r;
4795 }
4796
4797 e->state = SD_EVENT_INITIAL;
4798
4799 return 1;
4800}
4801
4802static void event_log_delays(sd_event *e) {
4803 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4804 size_t l;
4805
4806 p = b;
4807 l = sizeof(b);
4808 FOREACH_ELEMENT(delay, e->delays) {
4809 l = strpcpyf(&p, l, "%u ", *delay);
4810 *delay = 0;
4811 }
4812 log_debug("Event loop iterations: %s", b);
4813}
4814
4815_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4816 int r;
4817
4818 assert_return(e, -EINVAL);
4819 assert_return(e = event_resolve(e), -ENOPKG);
4820 assert_return(!event_origin_changed(e), -ECHILD);
4821 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4822 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4823
4824 if (e->profile_delays && e->last_run_usec != 0) {
4825 usec_t this_run;
4826 unsigned l;
4827
4828 this_run = now(CLOCK_MONOTONIC);
4829
4830 l = log2u64(this_run - e->last_run_usec);
4831 assert(l < ELEMENTSOF(e->delays));
4832 e->delays[l]++;
4833
4834 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4835 event_log_delays(e);
4836 e->last_log_usec = this_run;
4837 }
4838 }
4839
4840 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4841 PROTECT_EVENT(e);
4842
4843 r = sd_event_prepare(e);
4844 if (r == 0)
4845 /* There was nothing? Then wait... */
4846 r = sd_event_wait(e, timeout);
4847
4848 if (e->profile_delays)
4849 e->last_run_usec = now(CLOCK_MONOTONIC);
4850
4851 if (r > 0) {
4852                /* There's something now, so let's dispatch it */
4853 r = sd_event_dispatch(e);
4854 if (r < 0)
4855 return r;
4856
4857 return 1;
4858 }
4859
4860 return r;
4861}
4862
4863_public_ int sd_event_loop(sd_event *e) {
4864 int r;
4865
4866 assert_return(e, -EINVAL);
4867 assert_return(e = event_resolve(e), -ENOPKG);
4868 assert_return(!event_origin_changed(e), -ECHILD);
4869 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4870
4871 PROTECT_EVENT(e);
4872
4873 while (e->state != SD_EVENT_FINISHED) {
4874 r = sd_event_run(e, UINT64_MAX);
4875 if (r < 0)
4876 return r;
4877 }
4878
4879 return e->exit_code;
4880}
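
/* A minimal caller-side sketch of the high-level API above (not part of this file; the callback name and
 * fd are hypothetical): sd_event_loop() keeps running prepare/wait/dispatch cycles until some callback
 * calls sd_event_exit(), whose code then becomes the return value.
 *
 *     static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     static int run(int fd) {
 *             _cleanup_(sd_event_unrefp) sd_event *e = NULL;
 *             int r;
 *
 *             r = sd_event_default(&e);
 *             if (r < 0)
 *                     return r;
 *
 *             r = sd_event_add_io(e, NULL, fd, EPOLLIN, on_io, NULL);
 *             if (r < 0)
 *                     return r;
 *
 *             return sd_event_loop(e);
 *     }
 */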
4881
4882_public_ int sd_event_get_fd(sd_event *e) {
4883 assert_return(e, -EINVAL);
4884 assert_return(e = event_resolve(e), -ENOPKG);
4885 assert_return(!event_origin_changed(e), -ECHILD);
4886
4887 return e->epoll_fd;
4888}
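
/* A rough sketch of how the lower-level calls fit together when the loop is embedded into a foreign event
 * loop that polls on sd_event_get_fd() (essentially what sd_event_run() does internally; error handling
 * trimmed, names hypothetical):
 *
 *     for (;;) {
 *             r = sd_event_prepare(e);
 *             if (r < 0)
 *                     return r;
 *             if (r == 0) {
 *                     // Nothing pending: the outer loop polls sd_event_get_fd(e) for POLLIN first,
 *                     // then we collect whatever became ready without blocking.
 *                     r = sd_event_wait(e, 0);
 *                     if (r < 0)
 *                             return r;
 *             }
 *             if (r > 0) {
 *                     r = sd_event_dispatch(e);
 *                     if (r < 0)
 *                             return r;
 *             }
 *             if (sd_event_get_state(e) == SD_EVENT_FINISHED)
 *                     break;
 *     }
 */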
4889
4890_public_ int sd_event_get_state(sd_event *e) {
4891 assert_return(e, -EINVAL);
4892 assert_return(e = event_resolve(e), -ENOPKG);
4893 assert_return(!event_origin_changed(e), -ECHILD);
4894
4895 return e->state;
4896}
4897
4898_public_ int sd_event_get_exit_code(sd_event *e, int *ret) {
4899 assert_return(e, -EINVAL);
4900 assert_return(e = event_resolve(e), -ENOPKG);
4901 assert_return(!event_origin_changed(e), -ECHILD);
4902
4903 if (!e->exit_requested)
4904 return -ENODATA;
4905
4906 if (ret)
4907 *ret = e->exit_code;
4908 return 0;
4909}
4910
4911_public_ int sd_event_exit(sd_event *e, int code) {
4912 assert_return(e, -EINVAL);
4913 assert_return(e = event_resolve(e), -ENOPKG);
4914 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4915 assert_return(!event_origin_changed(e), -ECHILD);
4916
4917 e->exit_requested = true;
4918 e->exit_code = code;
4919
4920 return 0;
4921}
4922
4923_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *ret) {
4924 assert_return(e, -EINVAL);
4925 assert_return(e = event_resolve(e), -ENOPKG);
4926 assert_return(ret, -EINVAL);
4927 assert_return(!event_origin_changed(e), -ECHILD);
4928
4929 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4930 return -EOPNOTSUPP;
4931
4932 if (!triple_timestamp_is_set(&e->timestamp)) {
4933 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4934 *ret = now(clock);
4935 return 1;
4936 }
4937
4938 *ret = triple_timestamp_by_clock(&e->timestamp, clock);
4939 return 0;
4940}
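
/* Typical use (a sketch; the callback name is hypothetical): take the cached timestamp of the current
 * iteration as the base for a relative timer, so that all sources scheduled in the same iteration agree
 * on "now":
 *
 *     uint64_t usec;
 *     r = sd_event_now(e, CLOCK_MONOTONIC, &usec);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_add_time(e, NULL, CLOCK_MONOTONIC, usec + 5 * USEC_PER_SEC, 0, on_time, NULL);
 */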
4941
4942_public_ int sd_event_default(sd_event **ret) {
4943 sd_event *e = NULL;
4944 int r;
4945
4946 if (!ret)
4947 return !!default_event;
4948
4949 if (default_event) {
4950 *ret = sd_event_ref(default_event);
4951 return 0;
4952 }
4953
4954 r = sd_event_new(&e);
4955 if (r < 0)
4956 return r;
4957
4958 e->default_event_ptr = &default_event;
4959 e->tid = gettid();
4960 default_event = e;
4961
4962 *ret = e;
4963 return 1;
4964}
4965
4966_public_ int sd_event_get_tid(sd_event *e, pid_t *ret) {
4967 assert_return(e, -EINVAL);
4968 assert_return(e = event_resolve(e), -ENOPKG);
4969 assert_return(ret, -EINVAL);
4970 assert_return(!event_origin_changed(e), -ECHILD);
4971
4972 if (e->tid == 0)
4973 return -ENXIO;
4974
4975 *ret = e->tid;
4976 return 0;
4977}
4978
4979_public_ int sd_event_set_watchdog(sd_event *e, int b) {
4980 int r;
4981
4982 assert_return(e, -EINVAL);
4983 assert_return(e = event_resolve(e), -ENOPKG);
4984 assert_return(!event_origin_changed(e), -ECHILD);
4985
4986 if (e->watchdog == !!b)
4987 return e->watchdog;
4988
4989 if (b) {
4990 r = sd_watchdog_enabled(false, &e->watchdog_period);
4991 if (r <= 0)
4992 return r;
4993
4994 /* Issue first ping immediately */
4995 sd_notify(false, "WATCHDOG=1");
4996 e->watchdog_last = now(CLOCK_MONOTONIC);
4997
4998 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
4999 if (e->watchdog_fd < 0)
5000 return -errno;
5001
5002 r = arm_watchdog(e);
5003 if (r < 0)
5004 goto fail;
5005
5006 struct epoll_event ev = {
5007 .events = EPOLLIN,
5008 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
5009 };
5010
5011 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
5012 r = -errno;
5013 goto fail;
5014 }
5015
5016 } else {
5017 if (e->watchdog_fd >= 0) {
5018 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
5019 e->watchdog_fd = safe_close(e->watchdog_fd);
5020 }
5021 }
5022
5023 e->watchdog = b;
5024 return e->watchdog;
5025
5026fail:
5027 e->watchdog_fd = safe_close(e->watchdog_fd);
5028 return r;
5029}
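
/* Caller-side sketch: in a service unit with WatchdogSec= set, a single call is enough; the loop then
 * pings the service manager automatically from process_watchdog() above. A return of 0 merely means no
 * watchdog was requested via $WATCHDOG_USEC, which is usually not an error:
 *
 *     r = sd_event_set_watchdog(e, true);
 *     if (r < 0)
 *             return r;
 */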
5030
5031_public_ int sd_event_get_watchdog(sd_event *e) {
5032 assert_return(e, -EINVAL);
5033 assert_return(e = event_resolve(e), -ENOPKG);
5034 assert_return(!event_origin_changed(e), -ECHILD);
5035
5036 return e->watchdog;
5037}
5038
5039_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5040 assert_return(e, -EINVAL);
5041 assert_return(e = event_resolve(e), -ENOPKG);
5042 assert_return(!event_origin_changed(e), -ECHILD);
5043
5044 *ret = e->iteration;
5045 return 0;
5046}
5047
5048_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5049 assert_return(s, -EINVAL);
5050 assert_return(s->event, -EINVAL);
5051 assert_return(!event_origin_changed(s->event), -ECHILD);
5052
5053 s->destroy_callback = callback;
5054 return 0;
5055}
5056
5057_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5058 assert_return(s, -EINVAL);
5059 assert_return(!event_origin_changed(s->event), -ECHILD);
5060
5061 if (ret)
5062 *ret = s->destroy_callback;
5063
5064 return !!s->destroy_callback;
5065}
5066
5067_public_ int sd_event_source_get_floating(sd_event_source *s) {
5068 assert_return(s, -EINVAL);
5069 assert_return(!event_origin_changed(s->event), -ECHILD);
5070
5071 return s->floating;
5072}
5073
5074_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5075 assert_return(s, -EINVAL);
5076 assert_return(!event_origin_changed(s->event), -ECHILD);
5077
5078 if (s->floating == !!b)
5079 return 0;
5080
5081 if (!s->event) /* Already disconnected */
5082 return -ESTALE;
5083
5084 s->floating = b;
5085
5086 if (b) {
5087 sd_event_source_ref(s);
5088 sd_event_unref(s->event);
5089 } else {
5090 sd_event_ref(s->event);
5091 sd_event_source_unref(s);
5092 }
5093
5094 return 1;
5095}
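
/* A sketch of the ownership flip this implements: a floating source is kept alive by the event loop
 * rather than by the caller, so "fire and forget" sources can be created without keeping a reference
 * around (callback name hypothetical):
 *
 *     sd_event_source *s;
 *     r = sd_event_add_defer(e, &s, on_defer, NULL);
 *     if (r < 0)
 *             return r;
 *     (void) sd_event_source_set_floating(s, true);
 *     s = sd_event_source_unref(s);   // the loop now owns it; it lives until the loop is freed
 */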
5096
5097_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5098 assert_return(s, -EINVAL);
5099 assert_return(s->type != SOURCE_EXIT, -EDOM);
5100 assert_return(!event_origin_changed(s->event), -ECHILD);
5101
5102 return s->exit_on_failure;
5103}
5104
5105_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5106 assert_return(s, -EINVAL);
5107 assert_return(s->type != SOURCE_EXIT, -EDOM);
5108 assert_return(!event_origin_changed(s->event), -ECHILD);
5109
5110 if (s->exit_on_failure == !!b)
5111 return 0;
5112
5113 s->exit_on_failure = b;
5114 return 1;
5115}
5116
5117_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5118 int r;
5119
5120 assert_return(s, -EINVAL);
5121 assert_return(!event_origin_changed(s->event), -ECHILD);
5122
5123        /* Turning on ratelimiting on event source types that don't support it is a loggable offense, since
5124         * doing so is a programming error. */
5125 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5126
5127 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5128 * non-ratelimited. */
5129 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5130 if (r < 0)
5131 return r;
5132
5133 s->rate_limit = (RateLimit) { interval, burst };
5134 return 0;
5135}
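
/* Usage sketch: allow at most 10 dispatches of a source per second; once the burst is exceeded the
 * source is taken offline until the interval has passed (optionally with a callback on expiry, see
 * sd_event_source_set_ratelimit_expire_callback() below):
 *
 *     r = sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10);
 *     if (r < 0)
 *             return r;
 */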
5136
5137_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5138 assert_return(s, -EINVAL);
5139 assert_return(!event_origin_changed(s->event), -ECHILD);
5140
5141 s->ratelimit_expire_callback = callback;
5142 return 0;
5143}
5144
5145_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5146 assert_return(s, -EINVAL);
5147 assert_return(!event_origin_changed(s->event), -ECHILD);
5148
5149 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5150 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
5151 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5152 return -EDOM;
5153
5154 if (!ratelimit_configured(&s->rate_limit))
5155 return -ENOEXEC;
5156
5157 if (ret_interval)
5158 *ret_interval = s->rate_limit.interval;
5159 if (ret_burst)
5160 *ret_burst = s->rate_limit.burst;
5161
5162 return 0;
5163}
5164
5165_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5166 assert_return(s, -EINVAL);
5167 assert_return(!event_origin_changed(s->event), -ECHILD);
5168
5169 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5170 return false;
5171
5172 if (!ratelimit_configured(&s->rate_limit))
5173 return false;
5174
5175 return s->ratelimited;
5176}
5177
5178_public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
5179 int r;
5180
5181 assert_return(s, -EINVAL);
5182
5183 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5184 return 0;
5185
5186 if (!ratelimit_configured(&s->rate_limit))
5187 return 0;
5188
5189 if (!s->ratelimited)
5190 return 0;
5191
5192 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5193 if (r < 0)
5194 return r;
5195
5196 return 1; /* tell caller that we indeed just left the ratelimit state */
5197}
5198
5199_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5200 bool change = false;
5201 int r;
5202
5203 assert_return(e, -EINVAL);
5204 assert_return(e = event_resolve(e), -ENOPKG);
5205 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
5206 assert_return(!event_origin_changed(e), -ECHILD);
5207
5208 if (b) {
5209                /* We want to maintain pointers to these event sources, so that we can destroy them when told
5210                 * to. But we also don't want them to pin the event loop itself. Hence we mark them as
5211                 * floating after creation (and undo this before deleting them again). */
5212
5213 if (!e->sigint_event_source) {
5214 r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5215 if (r < 0)
5216 return r;
5217
5218 assert_se(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5219 change = true;
5220 }
5221
5222 if (!e->sigterm_event_source) {
5223 r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5224 if (r < 0) {
5225 if (change) {
5226 assert_se(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5227 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5228 }
5229
5230 return r;
5231 }
5232
5233 assert_se(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5234 change = true;
5235 }
5236
5237 } else {
5238 if (e->sigint_event_source) {
5239 assert_se(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5240 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5241 change = true;
5242 }
5243
5244 if (e->sigterm_event_source) {
5245 assert_se(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5246 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5247 change = true;
5248 }
5249 }
5250
5251 return change;
5252}
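
/* Usage sketch: make SIGINT/SIGTERM terminate the loop cleanly. The signals are consumed via signalfd
 * (SD_EVENT_SIGNAL_PROCMASK blocks them for the calling thread), so no traditional handlers are needed:
 *
 *     r = sd_event_set_signal_exit(e, true);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_loop(e);   // returns once SIGINT or SIGTERM arrives (or sd_event_exit() is called)
 */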
5253
5254_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5255 _cleanup_free_ char *b = NULL;
5256 _cleanup_free_ void *w = NULL;
5257
5258 assert_return(s, -EINVAL);
5259 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5260 assert_return(ty, -EINVAL);
5261 assert_return(!event_origin_changed(s->event), -ECHILD);
5262
5263 if (!STR_IN_SET(ty, "some", "full"))
5264 return -EINVAL;
5265
5266        if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5267 return -EBUSY;
5268
5269 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5270 if (!space)
5271 return -EINVAL;
5272
5273 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5274 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5275 if (!b)
5276 return -ENOMEM;
5277 if (!STR_IN_SET(b, "some", "full"))
5278 return -EINVAL;
5279
5280 if (streq(b, ty))
5281 return 0;
5282
5283 size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5284 w = new(char, nl);
5285 if (!w)
5286 return -ENOMEM;
5287
5288 memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5289
5290 free_and_replace(s->memory_pressure.write_buffer, w);
5291 s->memory_pressure.write_buffer_size = nl;
5292 s->memory_pressure.locked = false;
5293
5294 return 1;
5295}
5296
5297_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5298 _cleanup_free_ char *b = NULL;
5299 _cleanup_free_ void *w = NULL;
5300
5301 assert_return(s, -EINVAL);
5302 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5303 assert_return(!event_origin_changed(s->event), -ECHILD);
5304
5305 if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5306 return -ERANGE;
5307 if (window_usec <= 0 || window_usec >= UINT64_MAX)
5308 return -ERANGE;
5309 if (threshold_usec > window_usec)
5310 return -EINVAL;
5311
5312        if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5313 return -EBUSY;
5314
5315 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5316 if (!space)
5317 return -EINVAL;
5318
5319 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5320 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5321 if (!b)
5322 return -ENOMEM;
5323 if (!STR_IN_SET(b, "some", "full"))
5324 return -EINVAL;
5325
5326 if (asprintf((char**) &w,
5327 "%s " USEC_FMT " " USEC_FMT "",
5328 b,
5329 threshold_usec,
5330 window_usec) < 0)
5331                return -ENOMEM;
5332
5333 l = strlen(w) + 1;
5334 if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5335 return 0;
5336
5337 free_and_replace(s->memory_pressure.write_buffer, w);
5338 s->memory_pressure.write_buffer_size = l;
5339 s->memory_pressure.locked = false;
5340
5341 return 1;
5342}
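
/* Usage sketch tying the two setters above together (values illustrative, callback name hypothetical):
 * this configures a PSI trigger string like "some 200000 2000000", i.e. fire when tasks were stalled on
 * memory for at least 200ms within a 2s window:
 *
 *     r = sd_event_add_memory_pressure(e, &s, on_pressure, NULL);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_source_set_memory_pressure_type(s, "some");
 *     if (r < 0)
 *             return r;
 *     r = sd_event_source_set_memory_pressure_period(s, 200 * USEC_PER_MSEC, 2 * USEC_PER_SEC);
 *     if (r < 0)
 *             return r;
 */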