]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/libsystemd/sd-event/sd-event.c
sd-event: fix fd leak when fd is owned by IO event source
[thirdparty/systemd.git] / src / libsystemd / sd-event / sd-event.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/epoll.h>
4 #include <sys/timerfd.h>
5 #include <sys/wait.h>
6
7 #include "sd-daemon.h"
8 #include "sd-event.h"
9 #include "sd-id128.h"
10 #include "sd-messages.h"
11
12 #include "alloc-util.h"
13 #include "env-util.h"
14 #include "event-source.h"
15 #include "fd-util.h"
16 #include "fs-util.h"
17 #include "glyph-util.h"
18 #include "hashmap.h"
19 #include "hexdecoct.h"
20 #include "list.h"
21 #include "logarithm.h"
22 #include "macro.h"
23 #include "mallinfo-util.h"
24 #include "memory-util.h"
25 #include "missing_magic.h"
26 #include "missing_syscall.h"
27 #include "missing_threads.h"
28 #include "origin-id.h"
29 #include "path-util.h"
30 #include "prioq.h"
31 #include "process-util.h"
32 #include "psi-util.h"
33 #include "set.h"
34 #include "signal-util.h"
35 #include "socket-util.h"
36 #include "stat-util.h"
37 #include "string-table.h"
38 #include "string-util.h"
39 #include "strxcpyx.h"
40 #include "time-util.h"
41
42 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
43
44 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
45 /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
46 return s &&
47 s->type == SOURCE_CHILD &&
48 s->child.pidfd >= 0 &&
49 s->child.options == WEXITED;
50 }
51
52 static bool event_source_is_online(sd_event_source *s) {
53 assert(s);
54 return s->enabled != SD_EVENT_OFF && !s->ratelimited;
55 }
56
57 static bool event_source_is_offline(sd_event_source *s) {
58 assert(s);
59 return s->enabled == SD_EVENT_OFF || s->ratelimited;
60 }
61
62 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
63 [SOURCE_IO] = "io",
64 [SOURCE_TIME_REALTIME] = "realtime",
65 [SOURCE_TIME_BOOTTIME] = "boottime",
66 [SOURCE_TIME_MONOTONIC] = "monotonic",
67 [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
68 [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
69 [SOURCE_SIGNAL] = "signal",
70 [SOURCE_CHILD] = "child",
71 [SOURCE_DEFER] = "defer",
72 [SOURCE_POST] = "post",
73 [SOURCE_EXIT] = "exit",
74 [SOURCE_WATCHDOG] = "watchdog",
75 [SOURCE_INOTIFY] = "inotify",
76 [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
77 };
78
79 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
80
81 #define EVENT_SOURCE_IS_TIME(t) \
82 IN_SET((t), \
83 SOURCE_TIME_REALTIME, \
84 SOURCE_TIME_BOOTTIME, \
85 SOURCE_TIME_MONOTONIC, \
86 SOURCE_TIME_REALTIME_ALARM, \
87 SOURCE_TIME_BOOTTIME_ALARM)
88
89 #define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
90 IN_SET((t), \
91 SOURCE_IO, \
92 SOURCE_TIME_REALTIME, \
93 SOURCE_TIME_BOOTTIME, \
94 SOURCE_TIME_MONOTONIC, \
95 SOURCE_TIME_REALTIME_ALARM, \
96 SOURCE_TIME_BOOTTIME_ALARM, \
97 SOURCE_SIGNAL, \
98 SOURCE_DEFER, \
99 SOURCE_INOTIFY, \
100 SOURCE_MEMORY_PRESSURE)
101
102 /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
103 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
104 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
105 #define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
106
107 struct sd_event {
108 unsigned n_ref;
109
110 int epoll_fd;
111 int watchdog_fd;
112
113 Prioq *pending;
114 Prioq *prepare;
115
116 /* timerfd_create() only supports these five clocks so far. We
117 * can add support for more clocks when the kernel learns to
118 * deal with them, too. */
119 struct clock_data realtime;
120 struct clock_data boottime;
121 struct clock_data monotonic;
122 struct clock_data realtime_alarm;
123 struct clock_data boottime_alarm;
124
125 usec_t perturb;
126
127 sd_event_source **signal_sources; /* indexed by signal number */
128 Hashmap *signal_data; /* indexed by priority */
129
130 Hashmap *child_sources;
131 unsigned n_online_child_sources;
132
133 Set *post_sources;
134
135 Prioq *exit;
136
137 Hashmap *inotify_data; /* indexed by priority */
138
139 /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
140 LIST_HEAD(struct inode_data, inode_data_to_close_list);
141
142 /* A list of inotify objects that already have events buffered which aren't processed yet */
143 LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
144
145 /* A list of memory pressure event sources that still need their subscription string written */
146 LIST_HEAD(sd_event_source, memory_pressure_write_list);
147
148 uint64_t origin_id;
149
150 uint64_t iteration;
151 triple_timestamp timestamp;
152 int state;
153
154 bool exit_requested:1;
155 bool need_process_child:1;
156 bool watchdog:1;
157 bool profile_delays:1;
158
159 int exit_code;
160
161 pid_t tid;
162 sd_event **default_event_ptr;
163
164 usec_t watchdog_last, watchdog_period;
165
166 unsigned n_sources;
167
168 struct epoll_event *event_queue;
169
170 LIST_HEAD(sd_event_source, sources);
171
172 sd_event_source *sigint_event_source, *sigterm_event_source;
173
174 usec_t last_run_usec, last_log_usec;
175 unsigned delays[sizeof(usec_t) * 8];
176 };
177
178 DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);
179
180 static thread_local sd_event *default_event = NULL;
181
182 static void source_disconnect(sd_event_source *s);
183 static void event_gc_inode_data(sd_event *e, struct inode_data *d);
184
185 static sd_event *event_resolve(sd_event *e) {
186 return e == SD_EVENT_DEFAULT ? default_event : e;
187 }
188
189 static int pending_prioq_compare(const void *a, const void *b) {
190 const sd_event_source *x = a, *y = b;
191 int r;
192
193 assert(x->pending);
194 assert(y->pending);
195
196 /* Enabled ones first */
197 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
198 if (r != 0)
199 return r;
200
201 /* Non rate-limited ones first. */
202 r = CMP(!!x->ratelimited, !!y->ratelimited);
203 if (r != 0)
204 return r;
205
206 /* Lower priority values first */
207 r = CMP(x->priority, y->priority);
208 if (r != 0)
209 return r;
210
211 /* Older entries first */
212 return CMP(x->pending_iteration, y->pending_iteration);
213 }
214
215 static int prepare_prioq_compare(const void *a, const void *b) {
216 const sd_event_source *x = a, *y = b;
217 int r;
218
219 assert(x->prepare);
220 assert(y->prepare);
221
222 /* Enabled ones first */
223 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
224 if (r != 0)
225 return r;
226
227 /* Non rate-limited ones first. */
228 r = CMP(!!x->ratelimited, !!y->ratelimited);
229 if (r != 0)
230 return r;
231
232 /* Move most recently prepared ones last, so that we can stop
233 * preparing as soon as we hit one that has already been
234 * prepared in the current iteration */
235 r = CMP(x->prepare_iteration, y->prepare_iteration);
236 if (r != 0)
237 return r;
238
239 /* Lower priority values first */
240 return CMP(x->priority, y->priority);
241 }
242
243 static usec_t time_event_source_next(const sd_event_source *s) {
244 assert(s);
245
246 /* We have two kinds of event sources that have elapsation times associated with them: the actual
247 * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
248 * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
249 * looking at here. */
250
251 if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
252 assert(s->rate_limit.begin != 0);
253 assert(s->rate_limit.interval != 0);
254 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
255 }
256
257 /* Otherwise this must be a time event source, if not ratelimited */
258 if (EVENT_SOURCE_IS_TIME(s->type))
259 return s->time.next;
260
261 return USEC_INFINITY;
262 }
263
264 static usec_t time_event_source_latest(const sd_event_source *s) {
265 assert(s);
266
267 if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
268 * same, as we should avoid adding additional inaccuracy on an inaccuracy time
269 * window */
270 assert(s->rate_limit.begin != 0);
271 assert(s->rate_limit.interval != 0);
272 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
273 }
274
275 /* Must be a time event source, if not ratelimited */
276 if (EVENT_SOURCE_IS_TIME(s->type))
277 return usec_add(s->time.next, s->time.accuracy);
278
279 return USEC_INFINITY;
280 }
281
282 static bool event_source_timer_candidate(const sd_event_source *s) {
283 assert(s);
284
285 /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
286 * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
287 return !s->pending || s->ratelimited;
288 }
289
290 static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
291 const sd_event_source *x = a, *y = b;
292 int r;
293
294 /* Enabled ones first */
295 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
296 if (r != 0)
297 return r;
298
299 /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
300 r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
301 if (r != 0)
302 return r;
303
304 /* Order by time */
305 return CMP(time_func(x), time_func(y));
306 }
307
308 static int earliest_time_prioq_compare(const void *a, const void *b) {
309 return time_prioq_compare(a, b, time_event_source_next);
310 }
311
312 static int latest_time_prioq_compare(const void *a, const void *b) {
313 return time_prioq_compare(a, b, time_event_source_latest);
314 }
315
316 static int exit_prioq_compare(const void *a, const void *b) {
317 const sd_event_source *x = a, *y = b;
318 int r;
319
320 assert(x->type == SOURCE_EXIT);
321 assert(y->type == SOURCE_EXIT);
322
323 /* Enabled ones first */
324 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
325 if (r != 0)
326 return r;
327
328 /* Lower priority values first */
329 return CMP(x->priority, y->priority);
330 }
331
332 static void free_clock_data(struct clock_data *d) {
333 assert(d);
334 assert(d->wakeup == WAKEUP_CLOCK_DATA);
335
336 safe_close(d->fd);
337 prioq_free(d->earliest);
338 prioq_free(d->latest);
339 }
340
341 static sd_event *event_free(sd_event *e) {
342 sd_event_source *s;
343
344 assert(e);
345
346 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
347 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
348
349 while ((s = e->sources)) {
350 assert(s->floating);
351 source_disconnect(s);
352 sd_event_source_unref(s);
353 }
354
355 assert(e->n_sources == 0);
356
357 if (e->default_event_ptr)
358 *(e->default_event_ptr) = NULL;
359
360 safe_close(e->epoll_fd);
361 safe_close(e->watchdog_fd);
362
363 free_clock_data(&e->realtime);
364 free_clock_data(&e->boottime);
365 free_clock_data(&e->monotonic);
366 free_clock_data(&e->realtime_alarm);
367 free_clock_data(&e->boottime_alarm);
368
369 prioq_free(e->pending);
370 prioq_free(e->prepare);
371 prioq_free(e->exit);
372
373 free(e->signal_sources);
374 hashmap_free(e->signal_data);
375
376 hashmap_free(e->inotify_data);
377
378 hashmap_free(e->child_sources);
379 set_free(e->post_sources);
380
381 free(e->event_queue);
382
383 return mfree(e);
384 }
385
386 _public_ int sd_event_new(sd_event** ret) {
387 sd_event *e;
388 int r;
389
390 assert_return(ret, -EINVAL);
391
392 e = new(sd_event, 1);
393 if (!e)
394 return -ENOMEM;
395
396 *e = (sd_event) {
397 .n_ref = 1,
398 .epoll_fd = -EBADF,
399 .watchdog_fd = -EBADF,
400 .realtime.wakeup = WAKEUP_CLOCK_DATA,
401 .realtime.fd = -EBADF,
402 .realtime.next = USEC_INFINITY,
403 .boottime.wakeup = WAKEUP_CLOCK_DATA,
404 .boottime.fd = -EBADF,
405 .boottime.next = USEC_INFINITY,
406 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
407 .monotonic.fd = -EBADF,
408 .monotonic.next = USEC_INFINITY,
409 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
410 .realtime_alarm.fd = -EBADF,
411 .realtime_alarm.next = USEC_INFINITY,
412 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
413 .boottime_alarm.fd = -EBADF,
414 .boottime_alarm.next = USEC_INFINITY,
415 .perturb = USEC_INFINITY,
416 .origin_id = origin_id_query(),
417 };
418
419 r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
420 if (r < 0)
421 goto fail;
422
423 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
424 if (e->epoll_fd < 0) {
425 r = -errno;
426 goto fail;
427 }
428
429 e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
430
431 if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
432 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
433 special_glyph(SPECIAL_GLYPH_ELLIPSIS));
434 e->profile_delays = true;
435 }
436
437 *ret = e;
438 return 0;
439
440 fail:
441 event_free(e);
442 return r;
443 }
444
445 /* Define manually so we can add the origin check */
446 _public_ sd_event *sd_event_ref(sd_event *e) {
447 if (!e)
448 return NULL;
449 if (event_origin_changed(e))
450 return NULL;
451
452 e->n_ref++;
453
454 return e;
455 }
456
457 _public_ sd_event* sd_event_unref(sd_event *e) {
458 if (!e)
459 return NULL;
460 if (event_origin_changed(e))
461 return NULL;
462
463 assert(e->n_ref > 0);
464 if (--e->n_ref > 0)
465 return NULL;
466
467 return event_free(e);
468 }
469
470 #define PROTECT_EVENT(e) \
471 _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
472
473 _public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
474 if (s)
475 (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
476 return sd_event_source_unref(s);
477 }
478
479 static void source_io_unregister(sd_event_source *s) {
480 assert(s);
481 assert(s->type == SOURCE_IO);
482
483 if (event_origin_changed(s->event))
484 return;
485
486 if (!s->io.registered)
487 return;
488
489 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
490 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
491 strna(s->description), event_source_type_to_string(s->type));
492
493 s->io.registered = false;
494 }
495
496 static int source_io_register(
497 sd_event_source *s,
498 int enabled,
499 uint32_t events) {
500
501 assert(s);
502 assert(s->type == SOURCE_IO);
503 assert(enabled != SD_EVENT_OFF);
504
505 struct epoll_event ev = {
506 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
507 .data.ptr = s,
508 };
509
510 if (epoll_ctl(s->event->epoll_fd,
511 s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
512 s->io.fd, &ev) < 0)
513 return -errno;
514
515 s->io.registered = true;
516
517 return 0;
518 }
519
520 static void source_child_pidfd_unregister(sd_event_source *s) {
521 assert(s);
522 assert(s->type == SOURCE_CHILD);
523
524 if (event_origin_changed(s->event))
525 return;
526
527 if (!s->child.registered)
528 return;
529
530 if (EVENT_SOURCE_WATCH_PIDFD(s))
531 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
532 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
533 strna(s->description), event_source_type_to_string(s->type));
534
535 s->child.registered = false;
536 }
537
538 static int source_child_pidfd_register(sd_event_source *s, int enabled) {
539 assert(s);
540 assert(s->type == SOURCE_CHILD);
541 assert(enabled != SD_EVENT_OFF);
542
543 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
544 struct epoll_event ev = {
545 .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
546 .data.ptr = s,
547 };
548
549 if (epoll_ctl(s->event->epoll_fd,
550 s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
551 s->child.pidfd, &ev) < 0)
552 return -errno;
553 }
554
555 s->child.registered = true;
556 return 0;
557 }
558
559 static void source_memory_pressure_unregister(sd_event_source *s) {
560 assert(s);
561 assert(s->type == SOURCE_MEMORY_PRESSURE);
562
563 if (event_origin_changed(s->event))
564 return;
565
566 if (!s->memory_pressure.registered)
567 return;
568
569 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
570 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
571 strna(s->description), event_source_type_to_string(s->type));
572
573 s->memory_pressure.registered = false;
574 }
575
576 static int source_memory_pressure_register(sd_event_source *s, int enabled) {
577 assert(s);
578 assert(s->type == SOURCE_MEMORY_PRESSURE);
579 assert(enabled != SD_EVENT_OFF);
580
581 struct epoll_event ev = {
582 .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
583 (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
584 .data.ptr = s,
585 };
586
587 if (epoll_ctl(s->event->epoll_fd,
588 s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
589 s->memory_pressure.fd, &ev) < 0)
590 return -errno;
591
592 s->memory_pressure.registered = true;
593 return 0;
594 }
595
596 static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
597 assert(s);
598 assert(s->type == SOURCE_MEMORY_PRESSURE);
599
600 if (s->memory_pressure.in_write_list)
601 return;
602
603 LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
604 s->memory_pressure.in_write_list = true;
605 }
606
607 static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
608 assert(s);
609 assert(s->type == SOURCE_MEMORY_PRESSURE);
610
611 if (!s->memory_pressure.in_write_list)
612 return;
613
614 LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
615 s->memory_pressure.in_write_list = false;
616 }
617
618 static clockid_t event_source_type_to_clock(EventSourceType t) {
619
620 switch (t) {
621
622 case SOURCE_TIME_REALTIME:
623 return CLOCK_REALTIME;
624
625 case SOURCE_TIME_BOOTTIME:
626 return CLOCK_BOOTTIME;
627
628 case SOURCE_TIME_MONOTONIC:
629 return CLOCK_MONOTONIC;
630
631 case SOURCE_TIME_REALTIME_ALARM:
632 return CLOCK_REALTIME_ALARM;
633
634 case SOURCE_TIME_BOOTTIME_ALARM:
635 return CLOCK_BOOTTIME_ALARM;
636
637 default:
638 return (clockid_t) -1;
639 }
640 }
641
642 static EventSourceType clock_to_event_source_type(clockid_t clock) {
643
644 switch (clock) {
645
646 case CLOCK_REALTIME:
647 return SOURCE_TIME_REALTIME;
648
649 case CLOCK_BOOTTIME:
650 return SOURCE_TIME_BOOTTIME;
651
652 case CLOCK_MONOTONIC:
653 return SOURCE_TIME_MONOTONIC;
654
655 case CLOCK_REALTIME_ALARM:
656 return SOURCE_TIME_REALTIME_ALARM;
657
658 case CLOCK_BOOTTIME_ALARM:
659 return SOURCE_TIME_BOOTTIME_ALARM;
660
661 default:
662 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
663 }
664 }
665
666 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
667 assert(e);
668
669 switch (t) {
670
671 case SOURCE_TIME_REALTIME:
672 return &e->realtime;
673
674 case SOURCE_TIME_BOOTTIME:
675 return &e->boottime;
676
677 case SOURCE_TIME_MONOTONIC:
678 return &e->monotonic;
679
680 case SOURCE_TIME_REALTIME_ALARM:
681 return &e->realtime_alarm;
682
683 case SOURCE_TIME_BOOTTIME_ALARM:
684 return &e->boottime_alarm;
685
686 default:
687 return NULL;
688 }
689 }
690
691 static void event_free_signal_data(sd_event *e, struct signal_data *d) {
692 assert(e);
693
694 if (!d)
695 return;
696
697 hashmap_remove(e->signal_data, &d->priority);
698 safe_close(d->fd);
699 free(d);
700 }
701
702 static int event_make_signal_data(
703 sd_event *e,
704 int sig,
705 struct signal_data **ret) {
706
707 struct signal_data *d;
708 bool added = false;
709 sigset_t ss_copy;
710 int64_t priority;
711 int r;
712
713 assert(e);
714
715 if (event_origin_changed(e))
716 return -ECHILD;
717
718 if (e->signal_sources && e->signal_sources[sig])
719 priority = e->signal_sources[sig]->priority;
720 else
721 priority = SD_EVENT_PRIORITY_NORMAL;
722
723 d = hashmap_get(e->signal_data, &priority);
724 if (d) {
725 if (sigismember(&d->sigset, sig) > 0) {
726 if (ret)
727 *ret = d;
728 return 0;
729 }
730 } else {
731 d = new(struct signal_data, 1);
732 if (!d)
733 return -ENOMEM;
734
735 *d = (struct signal_data) {
736 .wakeup = WAKEUP_SIGNAL_DATA,
737 .fd = -EBADF,
738 .priority = priority,
739 };
740
741 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
742 if (r < 0) {
743 free(d);
744 return r;
745 }
746
747 added = true;
748 }
749
750 ss_copy = d->sigset;
751 assert_se(sigaddset(&ss_copy, sig) >= 0);
752
753 r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
754 &ss_copy,
755 SFD_NONBLOCK|SFD_CLOEXEC);
756 if (r < 0) {
757 r = -errno;
758 goto fail;
759 }
760
761 d->sigset = ss_copy;
762
763 if (d->fd >= 0) {
764 if (ret)
765 *ret = d;
766 return 0;
767 }
768
769 d->fd = fd_move_above_stdio(r);
770
771 struct epoll_event ev = {
772 .events = EPOLLIN,
773 .data.ptr = d,
774 };
775
776 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
777 r = -errno;
778 goto fail;
779 }
780
781 if (ret)
782 *ret = d;
783
784 return 0;
785
786 fail:
787 if (added)
788 event_free_signal_data(e, d);
789
790 return r;
791 }
792
793 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
794 assert(e);
795 assert(d);
796
797 /* Turns off the specified signal in the signal data
798 * object. If the signal mask of the object becomes empty that
799 * way removes it. */
800
801 if (sigismember(&d->sigset, sig) == 0)
802 return;
803
804 assert_se(sigdelset(&d->sigset, sig) >= 0);
805
806 if (sigisemptyset(&d->sigset)) {
807 /* If all the mask is all-zero we can get rid of the structure */
808 event_free_signal_data(e, d);
809 return;
810 }
811
812 if (event_origin_changed(e))
813 return;
814
815 assert(d->fd >= 0);
816
817 if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
818 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
819 }
820
821 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
822 struct signal_data *d;
823 static const int64_t zero_priority = 0;
824
825 assert(e);
826
827 /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
828 * and possibly drop the signalfd for it. */
829
830 if (sig == SIGCHLD &&
831 e->n_online_child_sources > 0)
832 return;
833
834 if (e->signal_sources &&
835 e->signal_sources[sig] &&
836 event_source_is_online(e->signal_sources[sig]))
837 return;
838
839 /*
840 * The specified signal might be enabled in three different queues:
841 *
842 * 1) the one that belongs to the priority passed (if it is non-NULL)
843 * 2) the one that belongs to the priority of the event source of the signal (if there is one)
844 * 3) the 0 priority (to cover the SIGCHLD case)
845 *
846 * Hence, let's remove it from all three here.
847 */
848
849 if (priority) {
850 d = hashmap_get(e->signal_data, priority);
851 if (d)
852 event_unmask_signal_data(e, d, sig);
853 }
854
855 if (e->signal_sources && e->signal_sources[sig]) {
856 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
857 if (d)
858 event_unmask_signal_data(e, d, sig);
859 }
860
861 d = hashmap_get(e->signal_data, &zero_priority);
862 if (d)
863 event_unmask_signal_data(e, d, sig);
864 }
865
866 static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
867 assert(s);
868
869 /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
870 * they are enabled/disabled or marked pending and such. */
871
872 if (s->pending)
873 prioq_reshuffle(s->event->pending, s, &s->pending_index);
874
875 if (s->prepare)
876 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
877 }
878
879 static void event_source_time_prioq_reshuffle(sd_event_source *s) {
880 struct clock_data *d;
881
882 assert(s);
883
884 /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
885 * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
886 * properly again. */
887
888 if (s->ratelimited)
889 d = &s->event->monotonic;
890 else if (EVENT_SOURCE_IS_TIME(s->type))
891 assert_se(d = event_get_clock_data(s->event, s->type));
892 else
893 return; /* no-op for an event source which is neither a timer nor ratelimited. */
894
895 prioq_reshuffle(d->earliest, s, &s->earliest_index);
896 prioq_reshuffle(d->latest, s, &s->latest_index);
897 d->needs_rearm = true;
898 }
899
900 static void event_source_time_prioq_remove(
901 sd_event_source *s,
902 struct clock_data *d) {
903
904 assert(s);
905 assert(d);
906
907 prioq_remove(d->earliest, s, &s->earliest_index);
908 prioq_remove(d->latest, s, &s->latest_index);
909 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
910 d->needs_rearm = true;
911 }
912
913 static void source_disconnect(sd_event_source *s) {
914 sd_event *event;
915 int r;
916
917 assert(s);
918
919 if (!s->event)
920 return;
921
922 assert(s->event->n_sources > 0);
923
924 switch (s->type) {
925
926 case SOURCE_IO:
927 if (s->io.fd >= 0)
928 source_io_unregister(s);
929
930 break;
931
932 case SOURCE_TIME_REALTIME:
933 case SOURCE_TIME_BOOTTIME:
934 case SOURCE_TIME_MONOTONIC:
935 case SOURCE_TIME_REALTIME_ALARM:
936 case SOURCE_TIME_BOOTTIME_ALARM:
937 /* Only remove this event source from the time event source here if it is not ratelimited. If
938 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
939 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
940
941 if (!s->ratelimited) {
942 struct clock_data *d;
943 assert_se(d = event_get_clock_data(s->event, s->type));
944 event_source_time_prioq_remove(s, d);
945 }
946
947 break;
948
949 case SOURCE_SIGNAL:
950 if (s->signal.sig > 0) {
951
952 if (s->event->signal_sources)
953 s->event->signal_sources[s->signal.sig] = NULL;
954
955 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
956
957 if (s->signal.unblock) {
958 sigset_t new_ss;
959
960 if (sigemptyset(&new_ss) < 0)
961 log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
962 else if (sigaddset(&new_ss, s->signal.sig) < 0)
963 log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
964 else {
965 r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
966 if (r != 0)
967 log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
968 }
969 }
970 }
971
972 break;
973
974 case SOURCE_CHILD:
975 if (event_origin_changed(s->event))
976 s->child.process_owned = false;
977
978 if (s->child.pid > 0) {
979 if (event_source_is_online(s)) {
980 assert(s->event->n_online_child_sources > 0);
981 s->event->n_online_child_sources--;
982 }
983
984 (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
985 }
986
987 if (EVENT_SOURCE_WATCH_PIDFD(s))
988 source_child_pidfd_unregister(s);
989 else
990 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
991
992 break;
993
994 case SOURCE_DEFER:
995 /* nothing */
996 break;
997
998 case SOURCE_POST:
999 set_remove(s->event->post_sources, s);
1000 break;
1001
1002 case SOURCE_EXIT:
1003 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
1004 break;
1005
1006 case SOURCE_INOTIFY: {
1007 struct inode_data *inode_data;
1008
1009 inode_data = s->inotify.inode_data;
1010 if (inode_data) {
1011 struct inotify_data *inotify_data;
1012 assert_se(inotify_data = inode_data->inotify_data);
1013
1014 /* Detach this event source from the inode object */
1015 LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
1016 s->inotify.inode_data = NULL;
1017
1018 if (s->pending) {
1019 assert(inotify_data->n_pending > 0);
1020 inotify_data->n_pending--;
1021 }
1022
1023 /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is
1024 * continued to being watched. That's because inotify doesn't really have an API for that: we
1025 * can only change watch masks with access to the original inode either by fd or by path. But
1026 * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
1027 * continuously and keeping the mount busy which we can't really do. We could reconstruct the
1028 * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
1029 * there), but given the need for open_by_handle_at() which is privileged and not universally
1030 * available this would be quite an incomplete solution. Hence we go the other way, leave the
1031 * mask set, even if it is not minimized now, and ignore all events we aren't interested in
1032 * anymore after reception. Yes, this sucks, but … Linux … */
1033
1034 /* Maybe release the inode data (and its inotify) */
1035 event_gc_inode_data(s->event, inode_data);
1036 }
1037
1038 break;
1039 }
1040
1041 case SOURCE_MEMORY_PRESSURE:
1042 source_memory_pressure_remove_from_write_list(s);
1043 source_memory_pressure_unregister(s);
1044 break;
1045
1046 default:
1047 assert_not_reached();
1048 }
1049
1050 if (s->pending)
1051 prioq_remove(s->event->pending, s, &s->pending_index);
1052
1053 if (s->prepare)
1054 prioq_remove(s->event->prepare, s, &s->prepare_index);
1055
1056 if (s->ratelimited)
1057 event_source_time_prioq_remove(s, &s->event->monotonic);
1058
1059 event = TAKE_PTR(s->event);
1060 LIST_REMOVE(sources, event->sources, s);
1061 event->n_sources--;
1062
1063 /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
1064 * pidfd associated with this event source, which we'll do only on source_free(). */
1065
1066 if (!s->floating)
1067 sd_event_unref(event);
1068 }
1069
1070 static sd_event_source* source_free(sd_event_source *s) {
1071 assert(s);
1072
1073 source_disconnect(s);
1074
1075 if (s->type == SOURCE_IO && s->io.owned)
1076 s->io.fd = safe_close(s->io.fd);
1077
1078 if (s->type == SOURCE_CHILD) {
1079 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
1080
1081 if (s->child.process_owned) {
1082
1083 if (!s->child.exited) {
1084 bool sent = false;
1085
1086 if (s->child.pidfd >= 0) {
1087 if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
1088 if (errno == ESRCH) /* Already dead */
1089 sent = true;
1090 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
1091 log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
1092 s->child.pid);
1093 } else
1094 sent = true;
1095 }
1096
1097 if (!sent)
1098 if (kill(s->child.pid, SIGKILL) < 0)
1099 if (errno != ESRCH) /* Already dead */
1100 log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
1101 s->child.pid);
1102 }
1103
1104 if (!s->child.waited) {
1105 siginfo_t si = {};
1106
1107 /* Reap the child if we can */
1108 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
1109 }
1110 }
1111
1112 if (s->child.pidfd_owned)
1113 s->child.pidfd = safe_close(s->child.pidfd);
1114 }
1115
1116 if (s->type == SOURCE_MEMORY_PRESSURE) {
1117 s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
1118 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
1119 }
1120
1121 if (s->destroy_callback)
1122 s->destroy_callback(s->userdata);
1123
1124 free(s->description);
1125 return mfree(s);
1126 }
1127 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
1128
1129 static int source_set_pending(sd_event_source *s, bool b) {
1130 int r;
1131
1132 assert(s);
1133 assert(s->type != SOURCE_EXIT);
1134
1135 if (s->pending == b)
1136 return 0;
1137
1138 s->pending = b;
1139
1140 if (b) {
1141 s->pending_iteration = s->event->iteration;
1142
1143 r = prioq_put(s->event->pending, s, &s->pending_index);
1144 if (r < 0) {
1145 s->pending = false;
1146 return r;
1147 }
1148 } else
1149 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1150
1151 if (EVENT_SOURCE_IS_TIME(s->type))
1152 event_source_time_prioq_reshuffle(s);
1153
1154 if (s->type == SOURCE_SIGNAL && !b) {
1155 struct signal_data *d;
1156
1157 d = hashmap_get(s->event->signal_data, &s->priority);
1158 if (d && d->current == s)
1159 d->current = NULL;
1160 }
1161
1162 if (s->type == SOURCE_INOTIFY) {
1163
1164 assert(s->inotify.inode_data);
1165 assert(s->inotify.inode_data->inotify_data);
1166
1167 if (b)
1168 s->inotify.inode_data->inotify_data->n_pending++;
1169 else {
1170 assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1171 s->inotify.inode_data->inotify_data->n_pending--;
1172 }
1173 }
1174
1175 return 1;
1176 }
1177
1178 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1179
1180 /* Let's allocate exactly what we need. Note that the difference of the smallest event source
1181 * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
1182 * lines. */
1183 static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
1184 [SOURCE_IO] = endoffsetof_field(sd_event_source, io),
1185 [SOURCE_TIME_REALTIME] = endoffsetof_field(sd_event_source, time),
1186 [SOURCE_TIME_BOOTTIME] = endoffsetof_field(sd_event_source, time),
1187 [SOURCE_TIME_MONOTONIC] = endoffsetof_field(sd_event_source, time),
1188 [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1189 [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1190 [SOURCE_SIGNAL] = endoffsetof_field(sd_event_source, signal),
1191 [SOURCE_CHILD] = endoffsetof_field(sd_event_source, child),
1192 [SOURCE_DEFER] = endoffsetof_field(sd_event_source, defer),
1193 [SOURCE_POST] = endoffsetof_field(sd_event_source, post),
1194 [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit),
1195 [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify),
1196 [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure),
1197 };
1198
1199 sd_event_source *s;
1200
1201 assert(e);
1202 assert(type >= 0);
1203 assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
1204 assert(size_table[type] > 0);
1205
1206 s = malloc0(size_table[type]);
1207 if (!s)
1208 return NULL;
1209 /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
1210 * size, even if we only allocate the initial part we need. */
1211 s = expand_to_usable(s, sizeof(sd_event_source));
1212
1213 /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
1214 * than what we allocated here. */
1215 s->n_ref = 1;
1216 s->event = e;
1217 s->floating = floating;
1218 s->type = type;
1219 s->pending_index = PRIOQ_IDX_NULL;
1220 s->prepare_index = PRIOQ_IDX_NULL;
1221
1222 if (!floating)
1223 sd_event_ref(e);
1224
1225 LIST_PREPEND(sources, e->sources, s);
1226 e->n_sources++;
1227
1228 return s;
1229 }
1230
1231 static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1232 assert(s);
1233
1234 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1235 }
1236
1237 _public_ int sd_event_add_io(
1238 sd_event *e,
1239 sd_event_source **ret,
1240 int fd,
1241 uint32_t events,
1242 sd_event_io_handler_t callback,
1243 void *userdata) {
1244
1245 _cleanup_(source_freep) sd_event_source *s = NULL;
1246 int r;
1247
1248 assert_return(e, -EINVAL);
1249 assert_return(e = event_resolve(e), -ENOPKG);
1250 assert_return(fd >= 0, -EBADF);
1251 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1252 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1253 assert_return(!event_origin_changed(e), -ECHILD);
1254
1255 if (!callback)
1256 callback = io_exit_callback;
1257
1258 s = source_new(e, !ret, SOURCE_IO);
1259 if (!s)
1260 return -ENOMEM;
1261
1262 s->wakeup = WAKEUP_EVENT_SOURCE;
1263 s->io.fd = fd;
1264 s->io.events = events;
1265 s->io.callback = callback;
1266 s->userdata = userdata;
1267 s->enabled = SD_EVENT_ON;
1268
1269 r = source_io_register(s, s->enabled, events);
1270 if (r < 0)
1271 return r;
1272
1273 if (ret)
1274 *ret = s;
1275 TAKE_PTR(s);
1276
1277 return 0;
1278 }
1279
1280 static void initialize_perturb(sd_event *e) {
1281 sd_id128_t id = {};
1282
1283 /* When we sleep for longer, we try to realign the wakeup to the same time within each
1284 * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
1285 * wakeup. However, let's take some system-specific randomness for this value, so that in a network
1286 * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
1287 * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */
1288
1289 if (_likely_(e->perturb != USEC_INFINITY))
1290 return;
1291
1292 if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
1293 e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
1294 else
1295 e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
1296 }
1297
1298 static int event_setup_timer_fd(
1299 sd_event *e,
1300 struct clock_data *d,
1301 clockid_t clock) {
1302
1303 assert(e);
1304 assert(d);
1305
1306 if (_likely_(d->fd >= 0))
1307 return 0;
1308
1309 _cleanup_close_ int fd = -EBADF;
1310
1311 fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1312 if (fd < 0)
1313 return -errno;
1314
1315 fd = fd_move_above_stdio(fd);
1316
1317 struct epoll_event ev = {
1318 .events = EPOLLIN,
1319 .data.ptr = d,
1320 };
1321
1322 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1323 return -errno;
1324
1325 d->fd = TAKE_FD(fd);
1326 return 0;
1327 }
1328
1329 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1330 assert(s);
1331
1332 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1333 }
1334
1335 static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1336 int r;
1337
1338 assert(d);
1339
1340 if (d->fd < 0) {
1341 r = event_setup_timer_fd(e, d, clock);
1342 if (r < 0)
1343 return r;
1344 }
1345
1346 r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1347 if (r < 0)
1348 return r;
1349
1350 r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1351 if (r < 0)
1352 return r;
1353
1354 return 0;
1355 }
1356
1357 static int event_source_time_prioq_put(
1358 sd_event_source *s,
1359 struct clock_data *d) {
1360
1361 int r;
1362
1363 assert(s);
1364 assert(d);
1365 assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1366
1367 r = prioq_put(d->earliest, s, &s->earliest_index);
1368 if (r < 0)
1369 return r;
1370
1371 r = prioq_put(d->latest, s, &s->latest_index);
1372 if (r < 0) {
1373 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1374 s->earliest_index = PRIOQ_IDX_NULL;
1375 return r;
1376 }
1377
1378 d->needs_rearm = true;
1379 return 0;
1380 }
1381
1382 _public_ int sd_event_add_time(
1383 sd_event *e,
1384 sd_event_source **ret,
1385 clockid_t clock,
1386 uint64_t usec,
1387 uint64_t accuracy,
1388 sd_event_time_handler_t callback,
1389 void *userdata) {
1390
1391 EventSourceType type;
1392 _cleanup_(source_freep) sd_event_source *s = NULL;
1393 struct clock_data *d;
1394 int r;
1395
1396 assert_return(e, -EINVAL);
1397 assert_return(e = event_resolve(e), -ENOPKG);
1398 assert_return(accuracy != UINT64_MAX, -EINVAL);
1399 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1400 assert_return(!event_origin_changed(e), -ECHILD);
1401
1402 if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1403 return -EOPNOTSUPP;
1404
1405 type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1406 if (type < 0)
1407 return -EOPNOTSUPP;
1408
1409 if (!callback)
1410 callback = time_exit_callback;
1411
1412 assert_se(d = event_get_clock_data(e, type));
1413
1414 r = setup_clock_data(e, d, clock);
1415 if (r < 0)
1416 return r;
1417
1418 s = source_new(e, !ret, type);
1419 if (!s)
1420 return -ENOMEM;
1421
1422 s->time.next = usec;
1423 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1424 s->time.callback = callback;
1425 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1426 s->userdata = userdata;
1427 s->enabled = SD_EVENT_ONESHOT;
1428
1429 r = event_source_time_prioq_put(s, d);
1430 if (r < 0)
1431 return r;
1432
1433 if (ret)
1434 *ret = s;
1435 TAKE_PTR(s);
1436
1437 return 0;
1438 }
1439
1440 _public_ int sd_event_add_time_relative(
1441 sd_event *e,
1442 sd_event_source **ret,
1443 clockid_t clock,
1444 uint64_t usec,
1445 uint64_t accuracy,
1446 sd_event_time_handler_t callback,
1447 void *userdata) {
1448
1449 usec_t t;
1450 int r;
1451
1452 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1453 * checks for overflow. */
1454
1455 r = sd_event_now(e, clock, &t);
1456 if (r < 0)
1457 return r;
1458
1459 if (usec >= USEC_INFINITY - t)
1460 return -EOVERFLOW;
1461
1462 return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1463 }
1464
1465 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1466 assert(s);
1467
1468 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1469 }
1470
1471 _public_ int sd_event_add_signal(
1472 sd_event *e,
1473 sd_event_source **ret,
1474 int sig,
1475 sd_event_signal_handler_t callback,
1476 void *userdata) {
1477
1478 _cleanup_(source_freep) sd_event_source *s = NULL;
1479 struct signal_data *d;
1480 sigset_t new_ss;
1481 bool block_it;
1482 int r;
1483
1484 assert_return(e, -EINVAL);
1485 assert_return(e = event_resolve(e), -ENOPKG);
1486 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1487 assert_return(!event_origin_changed(e), -ECHILD);
1488
1489 /* Let's make sure our special flag stays outside of the valid signal range */
1490 assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);
1491
1492 if (sig & SD_EVENT_SIGNAL_PROCMASK) {
1493 sig &= ~SD_EVENT_SIGNAL_PROCMASK;
1494 assert_return(SIGNAL_VALID(sig), -EINVAL);
1495
1496 block_it = true;
1497 } else {
1498 assert_return(SIGNAL_VALID(sig), -EINVAL);
1499
1500 r = signal_is_blocked(sig);
1501 if (r < 0)
1502 return r;
1503 if (r == 0)
1504 return -EBUSY;
1505
1506 block_it = false;
1507 }
1508
1509 if (!callback)
1510 callback = signal_exit_callback;
1511
1512 if (!e->signal_sources) {
1513 e->signal_sources = new0(sd_event_source*, _NSIG);
1514 if (!e->signal_sources)
1515 return -ENOMEM;
1516 } else if (e->signal_sources[sig])
1517 return -EBUSY;
1518
1519 s = source_new(e, !ret, SOURCE_SIGNAL);
1520 if (!s)
1521 return -ENOMEM;
1522
1523 s->signal.sig = sig;
1524 s->signal.callback = callback;
1525 s->userdata = userdata;
1526 s->enabled = SD_EVENT_ON;
1527
1528 e->signal_sources[sig] = s;
1529
1530 if (block_it) {
1531 sigset_t old_ss;
1532
1533 if (sigemptyset(&new_ss) < 0)
1534 return -errno;
1535
1536 if (sigaddset(&new_ss, sig) < 0)
1537 return -errno;
1538
1539 r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
1540 if (r != 0)
1541 return -r;
1542
1543 r = sigismember(&old_ss, sig);
1544 if (r < 0)
1545 return -errno;
1546
1547 s->signal.unblock = !r;
1548 } else
1549 s->signal.unblock = false;
1550
1551 r = event_make_signal_data(e, sig, &d);
1552 if (r < 0) {
1553 if (s->signal.unblock)
1554 (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
1555
1556 return r;
1557 }
1558
1559 /* Use the signal name as description for the event source by default */
1560 (void) sd_event_source_set_description(s, signal_to_string(sig));
1561
1562 if (ret)
1563 *ret = s;
1564 TAKE_PTR(s);
1565
1566 return 0;
1567 }
1568
1569 static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1570 assert(s);
1571
1572 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1573 }
1574
1575 static bool shall_use_pidfd(void) {
1576 /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
1577 return secure_getenv_bool("SYSTEMD_PIDFD") != 0;
1578 }
1579
1580 _public_ int sd_event_add_child(
1581 sd_event *e,
1582 sd_event_source **ret,
1583 pid_t pid,
1584 int options,
1585 sd_event_child_handler_t callback,
1586 void *userdata) {
1587
1588 _cleanup_(source_freep) sd_event_source *s = NULL;
1589 int r;
1590
1591 assert_return(e, -EINVAL);
1592 assert_return(e = event_resolve(e), -ENOPKG);
1593 assert_return(pid > 1, -EINVAL);
1594 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1595 assert_return(options != 0, -EINVAL);
1596 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1597 assert_return(!event_origin_changed(e), -ECHILD);
1598
1599 if (!callback)
1600 callback = child_exit_callback;
1601
1602 if (e->n_online_child_sources == 0) {
1603 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1604 * for compatibility with pre-pidfd and because we don't want the reap the child processes
1605 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1606 * take effect.
1607 *
1608 * (As an optimization we only do this check on the first child event source created.) */
1609 r = signal_is_blocked(SIGCHLD);
1610 if (r < 0)
1611 return r;
1612 if (r == 0)
1613 return -EBUSY;
1614 }
1615
1616 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1617 if (r < 0)
1618 return r;
1619
1620 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1621 return -EBUSY;
1622
1623 s = source_new(e, !ret, SOURCE_CHILD);
1624 if (!s)
1625 return -ENOMEM;
1626
1627 s->wakeup = WAKEUP_EVENT_SOURCE;
1628 s->child.options = options;
1629 s->child.callback = callback;
1630 s->userdata = userdata;
1631 s->enabled = SD_EVENT_ONESHOT;
1632
1633 /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
1634 * pin the PID, and make regular waitid() handling race-free. */
1635
1636 if (shall_use_pidfd()) {
1637 s->child.pidfd = pidfd_open(pid, 0);
1638 if (s->child.pidfd < 0) {
1639 /* Propagate errors unless the syscall is not supported or blocked */
1640 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
1641 return -errno;
1642 } else
1643 s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
1644 } else
1645 s->child.pidfd = -EBADF;
1646
1647 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1648 /* We have a pidfd and we only want to watch for exit */
1649 r = source_child_pidfd_register(s, s->enabled);
1650 if (r < 0)
1651 return r;
1652
1653 } else {
1654 /* We have no pidfd or we shall wait for some other event than WEXITED */
1655 r = event_make_signal_data(e, SIGCHLD, NULL);
1656 if (r < 0)
1657 return r;
1658
1659 e->need_process_child = true;
1660 }
1661
1662 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1663 if (r < 0)
1664 return r;
1665
1666 /* These must be done after everything succeeds. */
1667 s->child.pid = pid;
1668 e->n_online_child_sources++;
1669
1670 if (ret)
1671 *ret = s;
1672 TAKE_PTR(s);
1673 return 0;
1674 }
1675
1676 _public_ int sd_event_add_child_pidfd(
1677 sd_event *e,
1678 sd_event_source **ret,
1679 int pidfd,
1680 int options,
1681 sd_event_child_handler_t callback,
1682 void *userdata) {
1683
1684
1685 _cleanup_(source_freep) sd_event_source *s = NULL;
1686 pid_t pid;
1687 int r;
1688
1689 assert_return(e, -EINVAL);
1690 assert_return(e = event_resolve(e), -ENOPKG);
1691 assert_return(pidfd >= 0, -EBADF);
1692 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1693 assert_return(options != 0, -EINVAL);
1694 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1695 assert_return(!event_origin_changed(e), -ECHILD);
1696
1697 if (!callback)
1698 callback = child_exit_callback;
1699
1700 if (e->n_online_child_sources == 0) {
1701 r = signal_is_blocked(SIGCHLD);
1702 if (r < 0)
1703 return r;
1704 if (r == 0)
1705 return -EBUSY;
1706 }
1707
1708 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1709 if (r < 0)
1710 return r;
1711
1712 r = pidfd_get_pid(pidfd, &pid);
1713 if (r < 0)
1714 return r;
1715
1716 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1717 return -EBUSY;
1718
1719 s = source_new(e, !ret, SOURCE_CHILD);
1720 if (!s)
1721 return -ENOMEM;
1722
1723 s->wakeup = WAKEUP_EVENT_SOURCE;
1724 s->child.pidfd = pidfd;
1725 s->child.pid = pid;
1726 s->child.options = options;
1727 s->child.callback = callback;
1728 s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1729 s->userdata = userdata;
1730 s->enabled = SD_EVENT_ONESHOT;
1731
1732 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1733 if (r < 0)
1734 return r;
1735
1736 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1737 /* We only want to watch for WEXITED */
1738 r = source_child_pidfd_register(s, s->enabled);
1739 if (r < 0)
1740 return r;
1741 } else {
1742 /* We shall wait for some other event than WEXITED */
1743 r = event_make_signal_data(e, SIGCHLD, NULL);
1744 if (r < 0)
1745 return r;
1746
1747 e->need_process_child = true;
1748 }
1749
1750 e->n_online_child_sources++;
1751
1752 if (ret)
1753 *ret = s;
1754 TAKE_PTR(s);
1755 return 0;
1756 }
1757
1758 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1759 assert(s);
1760
1761 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1762 }
1763
1764 _public_ int sd_event_add_defer(
1765 sd_event *e,
1766 sd_event_source **ret,
1767 sd_event_handler_t callback,
1768 void *userdata) {
1769
1770 _cleanup_(source_freep) sd_event_source *s = NULL;
1771 int r;
1772
1773 assert_return(e, -EINVAL);
1774 assert_return(e = event_resolve(e), -ENOPKG);
1775 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1776 assert_return(!event_origin_changed(e), -ECHILD);
1777
1778 if (!callback)
1779 callback = generic_exit_callback;
1780
1781 s = source_new(e, !ret, SOURCE_DEFER);
1782 if (!s)
1783 return -ENOMEM;
1784
1785 s->defer.callback = callback;
1786 s->userdata = userdata;
1787 s->enabled = SD_EVENT_ONESHOT;
1788
1789 r = source_set_pending(s, true);
1790 if (r < 0)
1791 return r;
1792
1793 if (ret)
1794 *ret = s;
1795 TAKE_PTR(s);
1796
1797 return 0;
1798 }
1799
1800 _public_ int sd_event_add_post(
1801 sd_event *e,
1802 sd_event_source **ret,
1803 sd_event_handler_t callback,
1804 void *userdata) {
1805
1806 _cleanup_(source_freep) sd_event_source *s = NULL;
1807 int r;
1808
1809 assert_return(e, -EINVAL);
1810 assert_return(e = event_resolve(e), -ENOPKG);
1811 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1812 assert_return(!event_origin_changed(e), -ECHILD);
1813
1814 if (!callback)
1815 callback = generic_exit_callback;
1816
1817 s = source_new(e, !ret, SOURCE_POST);
1818 if (!s)
1819 return -ENOMEM;
1820
1821 s->post.callback = callback;
1822 s->userdata = userdata;
1823 s->enabled = SD_EVENT_ON;
1824
1825 r = set_ensure_put(&e->post_sources, NULL, s);
1826 if (r < 0)
1827 return r;
1828 assert(r > 0);
1829
1830 if (ret)
1831 *ret = s;
1832 TAKE_PTR(s);
1833
1834 return 0;
1835 }
1836
1837 _public_ int sd_event_add_exit(
1838 sd_event *e,
1839 sd_event_source **ret,
1840 sd_event_handler_t callback,
1841 void *userdata) {
1842
1843 _cleanup_(source_freep) sd_event_source *s = NULL;
1844 int r;
1845
1846 assert_return(e, -EINVAL);
1847 assert_return(e = event_resolve(e), -ENOPKG);
1848 assert_return(callback, -EINVAL);
1849 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1850 assert_return(!event_origin_changed(e), -ECHILD);
1851
1852 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1853 if (r < 0)
1854 return r;
1855
1856 s = source_new(e, !ret, SOURCE_EXIT);
1857 if (!s)
1858 return -ENOMEM;
1859
1860 s->exit.callback = callback;
1861 s->userdata = userdata;
1862 s->exit.prioq_index = PRIOQ_IDX_NULL;
1863 s->enabled = SD_EVENT_ONESHOT;
1864
1865 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1866 if (r < 0)
1867 return r;
1868
1869 if (ret)
1870 *ret = s;
1871 TAKE_PTR(s);
1872
1873 return 0;
1874 }
1875
1876 _public_ int sd_event_trim_memory(void) {
1877 int r;
1878
1879 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1880 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1881 * NULL callback parameter. */
1882
1883 log_debug("Memory pressure event, trimming malloc() memory.");
1884
1885 #if HAVE_GENERIC_MALLINFO
1886 generic_mallinfo before_mallinfo = generic_mallinfo_get();
1887 #endif
1888
1889 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1890 hashmap_trim_pools();
1891 r = malloc_trim(0);
1892 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1893
1894 if (r > 0)
1895 log_debug("Successfully trimmed some memory.");
1896 else
1897 log_debug("Couldn't trim any memory.");
1898
1899 usec_t period = after_timestamp - before_timestamp;
1900
1901 #if HAVE_GENERIC_MALLINFO
1902 generic_mallinfo after_mallinfo = generic_mallinfo_get();
1903 size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1904 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1905 log_struct(LOG_DEBUG,
1906 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1907 FORMAT_TIMESPAN(period, 0),
1908 FORMAT_BYTES(l)),
1909 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1910 "TRIMMED_BYTES=%zu", l,
1911 "TRIMMED_USEC=" USEC_FMT, period);
1912 #else
1913 log_struct(LOG_DEBUG,
1914 LOG_MESSAGE("Memory trimming took %s.",
1915 FORMAT_TIMESPAN(period, 0)),
1916 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1917 "TRIMMED_USEC=" USEC_FMT, period);
1918 #endif
1919
1920 return 0;
1921 }
1922
1923 static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1924 assert(s);
1925
1926 sd_event_trim_memory();
1927 return 0;
1928 }
1929
1930 _public_ int sd_event_add_memory_pressure(
1931 sd_event *e,
1932 sd_event_source **ret,
1933 sd_event_handler_t callback,
1934 void *userdata) {
1935
1936 _cleanup_free_ char *w = NULL;
1937 _cleanup_(source_freep) sd_event_source *s = NULL;
1938 _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
1939 _cleanup_free_ void *write_buffer = NULL;
1940 const char *watch, *watch_fallback = NULL, *env;
1941 size_t write_buffer_size = 0;
1942 struct stat st;
1943 uint32_t events;
1944 bool locked;
1945 int r;
1946
1947 assert_return(e, -EINVAL);
1948 assert_return(e = event_resolve(e), -ENOPKG);
1949 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1950 assert_return(!event_origin_changed(e), -ECHILD);
1951
1952 if (!callback)
1953 callback = memory_pressure_callback;
1954
1955 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1956 if (!s)
1957 return -ENOMEM;
1958
1959 s->wakeup = WAKEUP_EVENT_SOURCE;
1960 s->memory_pressure.callback = callback;
1961 s->userdata = userdata;
1962 s->enabled = SD_EVENT_ON;
1963 s->memory_pressure.fd = -EBADF;
1964
1965 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1966 if (env) {
1967 if (isempty(env) || path_equal(env, "/dev/null"))
1968 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1969 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1970
1971 if (!path_is_absolute(env) || !path_is_normalized(env))
1972 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1973 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1974
1975 watch = env;
1976
1977 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1978 if (env) {
1979 r = unbase64mem(env, &write_buffer, &write_buffer_size);
1980 if (r < 0)
1981 return r;
1982 }
1983
1984 locked = true;
1985 } else {
1986
1987 r = is_pressure_supported();
1988 if (r < 0)
1989 return r;
1990 if (r == 0)
1991 return -EOPNOTSUPP;
1992
1993 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1994 * the system wide pressure if for some reason we cannot (which could be: memory controller
1995 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1996 * only use the system-wide logic. */
1997 r = cg_all_unified();
1998 if (r < 0)
1999 return r;
2000 if (r == 0)
2001 watch = "/proc/pressure/memory";
2002 else {
2003 _cleanup_free_ char *cg = NULL;
2004
2005 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
2006 if (r < 0)
2007 return r;
2008
2009 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
2010 if (!w)
2011 return -ENOMEM;
2012
2013 watch = w;
2014 watch_fallback = "/proc/pressure/memory";
2015 }
2016
2017 /* Android uses three levels in its userspace low memory killer logic:
2018 * some 70000 1000000
2019 * some 100000 1000000
2020 * full 70000 1000000
2021 *
2022 * GNOME's low memory monitor uses:
2023 * some 70000 1000000
2024 * some 100000 1000000
2025 * full 100000 1000000
2026 *
2027 * We'll default to the middle level that both agree on. Except we do it on a 2s window
2028 * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration the
2029 * kernel will allow us to do unprivileged, also in the future. */
2030 if (asprintf((char**) &write_buffer,
2031 "%s " USEC_FMT " " USEC_FMT,
2032 MEMORY_PRESSURE_DEFAULT_TYPE,
2033 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2034 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2035 return -ENOMEM;
2036
2037 write_buffer_size = strlen(write_buffer) + 1;
2038 locked = false;
2039 }
2040
2041 path_fd = open(watch, O_PATH|O_CLOEXEC);
2042 if (path_fd < 0) {
2043 if (errno != ENOENT)
2044 return -errno;
2045
2046 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2047 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2048 * the PSI service apparently is not supported) */
2049 if (!watch_fallback)
2050 return locked ? -ENOENT : -EOPNOTSUPP;
2051
2052 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
2053 if (path_fd < 0) {
2054 if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2055 return -EOPNOTSUPP;
2056 return -errno;
2057 }
2058 }
2059
2060 if (fstat(path_fd, &st) < 0)
2061 return -errno;
2062
2063 if (S_ISSOCK(st.st_mode)) {
2064 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2065 if (fd < 0)
2066 return -errno;
2067
2068 r = connect_unix_path(fd, path_fd, NULL);
2069 if (r < 0)
2070 return r;
2071
2072 events = EPOLLIN;
2073
2074 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2075 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2076 if (fd < 0)
2077 return fd;
2078
2079 if (S_ISREG(st.st_mode)) {
2080 struct statfs sfs;
2081
2082 /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
2083
2084 if (fstatfs(fd, &sfs) < 0)
2085 return -errno;
2086
2087 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2088 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2089 return -ENOTTY;
2090
2091 events = EPOLLPRI;
2092 } else
2093 /* For fifos and char devices just watch for EPOLLIN */
2094 events = EPOLLIN;
2095
2096 } else if (S_ISDIR(st.st_mode))
2097 return -EISDIR;
2098 else
2099 return -EBADF;
2100
2101 s->memory_pressure.fd = TAKE_FD(fd);
2102 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2103 s->memory_pressure.write_buffer_size = write_buffer_size;
2104 s->memory_pressure.events = events;
2105 s->memory_pressure.locked = locked;
2106
2107 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2108 * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2109 * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
2110 * event sources on which writes must be executed before the first event loop iteration is
2111 * executed. (We could also write the data here, right away, but we want to give the caller the
2112 * freedom to call sd_event_source_set_memory_pressure_type() and
2113 * sd_event_source_set_memory_pressure_rate() before we write it. */
2114
2115 if (s->memory_pressure.write_buffer_size > 0)
2116 source_memory_pressure_add_to_write_list(s);
2117 else {
2118 r = source_memory_pressure_register(s, s->enabled);
2119 if (r < 0)
2120 return r;
2121 }
2122
2123 if (ret)
2124 *ret = s;
2125 TAKE_PTR(s);
2126
2127 return 0;
2128 }
2129
2130 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2131 assert(e);
2132
2133 if (!d)
2134 return;
2135
2136 assert(hashmap_isempty(d->inodes));
2137 assert(hashmap_isempty(d->wd));
2138
2139 if (d->buffer_filled > 0)
2140 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
2141
2142 hashmap_free(d->inodes);
2143 hashmap_free(d->wd);
2144
2145 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2146
2147 if (d->fd >= 0) {
2148 if (!event_origin_changed(e) &&
2149 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
2150 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2151
2152 safe_close(d->fd);
2153 }
2154 free(d);
2155 }
2156
2157 static int event_make_inotify_data(
2158 sd_event *e,
2159 int64_t priority,
2160 struct inotify_data **ret) {
2161
2162 _cleanup_close_ int fd = -EBADF;
2163 struct inotify_data *d;
2164 int r;
2165
2166 assert(e);
2167
2168 d = hashmap_get(e->inotify_data, &priority);
2169 if (d) {
2170 if (ret)
2171 *ret = d;
2172 return 0;
2173 }
2174
2175 fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC);
2176 if (fd < 0)
2177 return -errno;
2178
2179 fd = fd_move_above_stdio(fd);
2180
2181 d = new(struct inotify_data, 1);
2182 if (!d)
2183 return -ENOMEM;
2184
2185 *d = (struct inotify_data) {
2186 .wakeup = WAKEUP_INOTIFY_DATA,
2187 .fd = TAKE_FD(fd),
2188 .priority = priority,
2189 };
2190
2191 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
2192 if (r < 0) {
2193 d->fd = safe_close(d->fd);
2194 free(d);
2195 return r;
2196 }
2197
2198 struct epoll_event ev = {
2199 .events = EPOLLIN,
2200 .data.ptr = d,
2201 };
2202
2203 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2204 r = -errno;
2205 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2206 * remove the fd from the epoll first, which we don't want as we couldn't
2207 * add it in the first place. */
2208 event_free_inotify_data(e, d);
2209 return r;
2210 }
2211
2212 if (ret)
2213 *ret = d;
2214
2215 return 1;
2216 }
2217
2218 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
2219 int r;
2220
2221 assert(x);
2222 assert(y);
2223
2224 r = CMP(x->dev, y->dev);
2225 if (r != 0)
2226 return r;
2227
2228 return CMP(x->ino, y->ino);
2229 }
2230
2231 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2232 assert(d);
2233
2234 siphash24_compress_typesafe(d->dev, state);
2235 siphash24_compress_typesafe(d->ino, state);
2236 }
2237
2238 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
2239
2240 static void event_free_inode_data(
2241 sd_event *e,
2242 struct inode_data *d) {
2243
2244 assert(e);
2245
2246 if (!d)
2247 return;
2248
2249 assert(!d->event_sources);
2250
2251 if (d->fd >= 0) {
2252 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
2253 safe_close(d->fd);
2254 }
2255
2256 if (d->inotify_data) {
2257
2258 if (d->wd >= 0) {
2259 if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
2260 /* So here's a problem. At the time this runs the watch descriptor might already be
2261 * invalidated, because an IN_IGNORED event might be queued right the moment we enter
2262 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
2263 * likely case to happen. */
2264
2265 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2266 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2267 }
2268
2269 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2270 }
2271
2272 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2273 }
2274
2275 free(d->path);
2276 free(d);
2277 }
2278
2279 static void event_gc_inotify_data(
2280 sd_event *e,
2281 struct inotify_data *d) {
2282
2283 assert(e);
2284
2285 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2286 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2287 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2288 * (under the expectation that the GC is called again once the counter is decremented). */
2289
2290 if (!d)
2291 return;
2292
2293 if (!hashmap_isempty(d->inodes))
2294 return;
2295
2296 if (d->n_busy > 0)
2297 return;
2298
2299 event_free_inotify_data(e, d);
2300 }
2301
2302 static void event_gc_inode_data(
2303 sd_event *e,
2304 struct inode_data *d) {
2305
2306 struct inotify_data *inotify_data;
2307
2308 assert(e);
2309
2310 if (!d)
2311 return;
2312
2313 if (d->event_sources)
2314 return;
2315
2316 inotify_data = d->inotify_data;
2317 event_free_inode_data(e, d);
2318
2319 event_gc_inotify_data(e, inotify_data);
2320 }
2321
2322 static int event_make_inode_data(
2323 sd_event *e,
2324 struct inotify_data *inotify_data,
2325 dev_t dev,
2326 ino_t ino,
2327 struct inode_data **ret) {
2328
2329 struct inode_data *d, key;
2330 int r;
2331
2332 assert(e);
2333 assert(inotify_data);
2334
2335 key = (struct inode_data) {
2336 .ino = ino,
2337 .dev = dev,
2338 };
2339
2340 d = hashmap_get(inotify_data->inodes, &key);
2341 if (d) {
2342 if (ret)
2343 *ret = d;
2344
2345 return 0;
2346 }
2347
2348 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2349 if (r < 0)
2350 return r;
2351
2352 d = new(struct inode_data, 1);
2353 if (!d)
2354 return -ENOMEM;
2355
2356 *d = (struct inode_data) {
2357 .dev = dev,
2358 .ino = ino,
2359 .wd = -1,
2360 .fd = -EBADF,
2361 .inotify_data = inotify_data,
2362 };
2363
2364 r = hashmap_put(inotify_data->inodes, d, d);
2365 if (r < 0) {
2366 free(d);
2367 return r;
2368 }
2369
2370 if (ret)
2371 *ret = d;
2372
2373 return 1;
2374 }
2375
2376 static uint32_t inode_data_determine_mask(struct inode_data *d) {
2377 bool excl_unlink = true;
2378 uint32_t combined = 0;
2379
2380 assert(d);
2381
2382 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2383 * the IN_EXCL_UNLINK flag is ANDed instead.
2384 *
2385 * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
2386 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
2387 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
2388 * events we don't care for client-side. */
2389
2390 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2391
2392 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2393 excl_unlink = false;
2394
2395 combined |= s->inotify.mask;
2396 }
2397
2398 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2399 }
2400
2401 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2402 uint32_t combined_mask;
2403 int wd, r;
2404
2405 assert(d);
2406 assert(d->fd >= 0);
2407
2408 combined_mask = inode_data_determine_mask(d);
2409
2410 if (d->wd >= 0 && combined_mask == d->combined_mask)
2411 return 0;
2412
2413 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2414 if (r < 0)
2415 return r;
2416
2417 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2418 if (wd < 0)
2419 return wd;
2420
2421 if (d->wd < 0) {
2422 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2423 if (r < 0) {
2424 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2425 return r;
2426 }
2427
2428 d->wd = wd;
2429
2430 } else if (d->wd != wd) {
2431
2432 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2433 (void) inotify_rm_watch(d->fd, wd);
2434 return -EINVAL;
2435 }
2436
2437 d->combined_mask = combined_mask;
2438 return 1;
2439 }
2440
2441 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2442 assert(s);
2443
2444 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2445 }
2446
2447 static int event_add_inotify_fd_internal(
2448 sd_event *e,
2449 sd_event_source **ret,
2450 int fd,
2451 bool donate,
2452 uint32_t mask,
2453 sd_event_inotify_handler_t callback,
2454 void *userdata) {
2455
2456 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
2457 _cleanup_(source_freep) sd_event_source *s = NULL;
2458 struct inotify_data *inotify_data = NULL;
2459 struct inode_data *inode_data = NULL;
2460 struct stat st;
2461 int r;
2462
2463 assert_return(e, -EINVAL);
2464 assert_return(e = event_resolve(e), -ENOPKG);
2465 assert_return(fd >= 0, -EBADF);
2466 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2467 assert_return(!event_origin_changed(e), -ECHILD);
2468
2469 if (!callback)
2470 callback = inotify_exit_callback;
2471
2472 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2473 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
2474 * the user can't use them for us. */
2475 if (mask & IN_MASK_ADD)
2476 return -EINVAL;
2477
2478 if (fstat(fd, &st) < 0)
2479 return -errno;
2480
2481 s = source_new(e, !ret, SOURCE_INOTIFY);
2482 if (!s)
2483 return -ENOMEM;
2484
2485 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2486 s->inotify.mask = mask;
2487 s->inotify.callback = callback;
2488 s->userdata = userdata;
2489
2490 /* Allocate an inotify object for this priority, and an inode object within it */
2491 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2492 if (r < 0)
2493 return r;
2494
2495 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2496 if (r < 0) {
2497 event_gc_inotify_data(e, inotify_data);
2498 return r;
2499 }
2500
2501 /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2502 * the event source, until then, for which we need the original inode. */
2503 if (inode_data->fd < 0) {
2504 if (donated_fd >= 0)
2505 inode_data->fd = TAKE_FD(donated_fd);
2506 else {
2507 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2508 if (inode_data->fd < 0) {
2509 r = -errno;
2510 event_gc_inode_data(e, inode_data);
2511 return r;
2512 }
2513 }
2514
2515 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
2516
2517 _cleanup_free_ char *path = NULL;
2518 r = fd_get_path(inode_data->fd, &path);
2519 if (r < 0 && r != -ENOSYS) { /* The path is optional, hence ignore -ENOSYS. */
2520 event_gc_inode_data(e, inode_data);
2521 return r;
2522 }
2523
2524 free_and_replace(inode_data->path, path);
2525 }
2526
2527 /* Link our event source to the inode data object */
2528 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2529 s->inotify.inode_data = inode_data;
2530
2531 /* Actually realize the watch now */
2532 r = inode_data_realize_watch(e, inode_data);
2533 if (r < 0)
2534 return r;
2535
2536 if (ret)
2537 *ret = s;
2538 TAKE_PTR(s);
2539
2540 return 0;
2541 }
2542
2543 _public_ int sd_event_add_inotify_fd(
2544 sd_event *e,
2545 sd_event_source **ret,
2546 int fd,
2547 uint32_t mask,
2548 sd_event_inotify_handler_t callback,
2549 void *userdata) {
2550
2551 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2552 }
2553
2554 _public_ int sd_event_add_inotify(
2555 sd_event *e,
2556 sd_event_source **ret,
2557 const char *path,
2558 uint32_t mask,
2559 sd_event_inotify_handler_t callback,
2560 void *userdata) {
2561
2562 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2563 int fd, r;
2564
2565 assert_return(path, -EINVAL);
2566
2567 fd = open(path, O_PATH | O_CLOEXEC |
2568 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2569 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2570 if (fd < 0)
2571 return -errno;
2572
2573 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2574 if (r < 0)
2575 return r;
2576
2577 (void) sd_event_source_set_description(s, path);
2578
2579 if (ret)
2580 *ret = s;
2581
2582 return r;
2583 }
2584
2585 static sd_event_source* event_source_free(sd_event_source *s) {
2586 if (!s)
2587 return NULL;
2588
2589 /* Here's a special hack: when we are called from a
2590 * dispatch handler we won't free the event source
2591 * immediately, but we will detach the fd from the
2592 * epoll. This way it is safe for the caller to unref
2593 * the event source and immediately close the fd, but
2594 * we still retain a valid event source object after
2595 * the callback. */
2596
2597 if (s->dispatching)
2598 source_disconnect(s);
2599 else
2600 source_free(s);
2601
2602 return NULL;
2603 }
2604
2605 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2606
2607 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2608 assert_return(s, -EINVAL);
2609 assert_return(!event_origin_changed(s->event), -ECHILD);
2610
2611 return free_and_strdup(&s->description, description);
2612 }
2613
2614 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2615 assert_return(s, -EINVAL);
2616 assert_return(description, -EINVAL);
2617
2618 if (!s->description)
2619 return -ENXIO;
2620
2621 *description = s->description;
2622 return 0;
2623 }
2624
2625 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2626 assert_return(s, NULL);
2627 assert_return(!event_origin_changed(s->event), NULL);
2628
2629 return s->event;
2630 }
2631
2632 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2633 assert_return(s, -EINVAL);
2634 assert_return(s->type != SOURCE_EXIT, -EDOM);
2635 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2636 assert_return(!event_origin_changed(s->event), -ECHILD);
2637
2638 return s->pending;
2639 }
2640
2641 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2642 assert_return(s, -EINVAL);
2643 assert_return(s->type == SOURCE_IO, -EDOM);
2644 assert_return(!event_origin_changed(s->event), -ECHILD);
2645
2646 return s->io.fd;
2647 }
2648
2649 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2650 int saved_fd, r;
2651
2652 assert_return(s, -EINVAL);
2653 assert_return(fd >= 0, -EBADF);
2654 assert_return(s->type == SOURCE_IO, -EDOM);
2655 assert_return(!event_origin_changed(s->event), -ECHILD);
2656
2657 if (s->io.fd == fd)
2658 return 0;
2659
2660 saved_fd = s->io.fd;
2661 s->io.fd = fd;
2662
2663 assert(event_source_is_offline(s) == !s->io.registered);
2664
2665 if (s->io.registered) {
2666 s->io.registered = false;
2667
2668 r = source_io_register(s, s->enabled, s->io.events);
2669 if (r < 0) {
2670 s->io.fd = saved_fd;
2671 s->io.registered = true;
2672 return r;
2673 }
2674
2675 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2676 }
2677
2678 if (s->io.owned)
2679 safe_close(saved_fd);
2680
2681 return 0;
2682 }
2683
2684 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2685 assert_return(s, -EINVAL);
2686 assert_return(s->type == SOURCE_IO, -EDOM);
2687 assert_return(!event_origin_changed(s->event), -ECHILD);
2688
2689 return s->io.owned;
2690 }
2691
2692 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2693 assert_return(s, -EINVAL);
2694 assert_return(s->type == SOURCE_IO, -EDOM);
2695 assert_return(!event_origin_changed(s->event), -ECHILD);
2696
2697 s->io.owned = own;
2698 return 0;
2699 }
2700
2701 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2702 assert_return(s, -EINVAL);
2703 assert_return(events, -EINVAL);
2704 assert_return(s->type == SOURCE_IO, -EDOM);
2705 assert_return(!event_origin_changed(s->event), -ECHILD);
2706
2707 *events = s->io.events;
2708 return 0;
2709 }
2710
2711 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2712 int r;
2713
2714 assert_return(s, -EINVAL);
2715 assert_return(s->type == SOURCE_IO, -EDOM);
2716 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2717 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2718 assert_return(!event_origin_changed(s->event), -ECHILD);
2719
2720 /* edge-triggered updates are never skipped, so we can reset edges */
2721 if (s->io.events == events && !(events & EPOLLET))
2722 return 0;
2723
2724 r = source_set_pending(s, false);
2725 if (r < 0)
2726 return r;
2727
2728 if (event_source_is_online(s)) {
2729 r = source_io_register(s, s->enabled, events);
2730 if (r < 0)
2731 return r;
2732 }
2733
2734 s->io.events = events;
2735
2736 return 0;
2737 }
2738
2739 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2740 assert_return(s, -EINVAL);
2741 assert_return(revents, -EINVAL);
2742 assert_return(s->type == SOURCE_IO, -EDOM);
2743 assert_return(s->pending, -ENODATA);
2744 assert_return(!event_origin_changed(s->event), -ECHILD);
2745
2746 *revents = s->io.revents;
2747 return 0;
2748 }
2749
2750 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2751 assert_return(s, -EINVAL);
2752 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2753 assert_return(!event_origin_changed(s->event), -ECHILD);
2754
2755 return s->signal.sig;
2756 }
2757
2758 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2759 assert_return(s, -EINVAL);
2760 assert_return(!event_origin_changed(s->event), -ECHILD);
2761
2762 *priority = s->priority;
2763 return 0;
2764 }
2765
2766 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2767 bool rm_inotify = false, rm_inode = false;
2768 struct inotify_data *new_inotify_data = NULL;
2769 struct inode_data *new_inode_data = NULL;
2770 int r;
2771
2772 assert_return(s, -EINVAL);
2773 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2774 assert_return(!event_origin_changed(s->event), -ECHILD);
2775
2776 if (s->priority == priority)
2777 return 0;
2778
2779 if (s->type == SOURCE_INOTIFY) {
2780 struct inode_data *old_inode_data;
2781
2782 assert(s->inotify.inode_data);
2783 old_inode_data = s->inotify.inode_data;
2784
2785 /* We need the original fd to change the priority. If we don't have it we can't change the priority,
2786 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2787 * events we allow priority changes only until the first following iteration. */
2788 if (old_inode_data->fd < 0)
2789 return -EOPNOTSUPP;
2790
2791 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2792 if (r < 0)
2793 return r;
2794 rm_inotify = r > 0;
2795
2796 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2797 if (r < 0)
2798 goto fail;
2799 rm_inode = r > 0;
2800
2801 if (new_inode_data->fd < 0) {
2802 /* Duplicate the fd for the new inode object if we don't have any yet */
2803 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2804 if (new_inode_data->fd < 0) {
2805 r = -errno;
2806 goto fail;
2807 }
2808
2809 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
2810
2811 _cleanup_free_ char *path = NULL;
2812 r = fd_get_path(new_inode_data->fd, &path);
2813 if (r < 0 && r != -ENOSYS)
2814 goto fail;
2815
2816 free_and_replace(new_inode_data->path, path);
2817 }
2818
2819 /* Move the event source to the new inode data structure */
2820 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2821 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2822 s->inotify.inode_data = new_inode_data;
2823
2824 /* Now create the new watch */
2825 r = inode_data_realize_watch(s->event, new_inode_data);
2826 if (r < 0) {
2827 /* Move it back */
2828 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2829 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2830 s->inotify.inode_data = old_inode_data;
2831 goto fail;
2832 }
2833
2834 s->priority = priority;
2835
2836 event_gc_inode_data(s->event, old_inode_data);
2837
2838 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2839 struct signal_data *old, *d;
2840
2841 /* Move us from the signalfd belonging to the old
2842 * priority to the signalfd of the new priority */
2843
2844 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2845
2846 s->priority = priority;
2847
2848 r = event_make_signal_data(s->event, s->signal.sig, &d);
2849 if (r < 0) {
2850 s->priority = old->priority;
2851 return r;
2852 }
2853
2854 event_unmask_signal_data(s->event, old, s->signal.sig);
2855 } else
2856 s->priority = priority;
2857
2858 event_source_pp_prioq_reshuffle(s);
2859
2860 if (s->type == SOURCE_EXIT)
2861 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2862
2863 return 0;
2864
2865 fail:
2866 if (rm_inode)
2867 event_free_inode_data(s->event, new_inode_data);
2868
2869 if (rm_inotify)
2870 event_free_inotify_data(s->event, new_inotify_data);
2871
2872 return r;
2873 }
2874
2875 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2876 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2877 if (!s && !ret)
2878 return false;
2879
2880 assert_return(s, -EINVAL);
2881 assert_return(!event_origin_changed(s->event), -ECHILD);
2882
2883 if (ret)
2884 *ret = s->enabled;
2885
2886 return s->enabled != SD_EVENT_OFF;
2887 }
2888
2889 static int event_source_offline(
2890 sd_event_source *s,
2891 int enabled,
2892 bool ratelimited) {
2893
2894 bool was_offline;
2895 int r;
2896
2897 assert(s);
2898 assert(enabled == SD_EVENT_OFF || ratelimited);
2899
2900 /* Unset the pending flag when this event source is disabled */
2901 if (s->enabled != SD_EVENT_OFF &&
2902 enabled == SD_EVENT_OFF &&
2903 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2904 r = source_set_pending(s, false);
2905 if (r < 0)
2906 return r;
2907 }
2908
2909 was_offline = event_source_is_offline(s);
2910 s->enabled = enabled;
2911 s->ratelimited = ratelimited;
2912
2913 switch (s->type) {
2914
2915 case SOURCE_IO:
2916 source_io_unregister(s);
2917 break;
2918
2919 case SOURCE_SIGNAL:
2920 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2921 break;
2922
2923 case SOURCE_CHILD:
2924 if (!was_offline) {
2925 assert(s->event->n_online_child_sources > 0);
2926 s->event->n_online_child_sources--;
2927 }
2928
2929 if (EVENT_SOURCE_WATCH_PIDFD(s))
2930 source_child_pidfd_unregister(s);
2931 else
2932 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2933 break;
2934
2935 case SOURCE_EXIT:
2936 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2937 break;
2938
2939 case SOURCE_MEMORY_PRESSURE:
2940 source_memory_pressure_unregister(s);
2941 break;
2942
2943 case SOURCE_TIME_REALTIME:
2944 case SOURCE_TIME_BOOTTIME:
2945 case SOURCE_TIME_MONOTONIC:
2946 case SOURCE_TIME_REALTIME_ALARM:
2947 case SOURCE_TIME_BOOTTIME_ALARM:
2948 case SOURCE_DEFER:
2949 case SOURCE_POST:
2950 case SOURCE_INOTIFY:
2951 break;
2952
2953 default:
2954 assert_not_reached();
2955 }
2956
2957 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2958 event_source_time_prioq_reshuffle(s);
2959
2960 return 1;
2961 }
2962
2963 static int event_source_online(
2964 sd_event_source *s,
2965 int enabled,
2966 bool ratelimited) {
2967
2968 bool was_online;
2969 int r;
2970
2971 assert(s);
2972 assert(enabled != SD_EVENT_OFF || !ratelimited);
2973
2974 /* Unset the pending flag when this event source is enabled */
2975 if (s->enabled == SD_EVENT_OFF &&
2976 enabled != SD_EVENT_OFF &&
2977 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2978 r = source_set_pending(s, false);
2979 if (r < 0)
2980 return r;
2981 }
2982
2983 /* Are we really ready for onlining? */
2984 if (enabled == SD_EVENT_OFF || ratelimited) {
2985 /* Nope, we are not ready for onlining, then just update the precise state and exit */
2986 s->enabled = enabled;
2987 s->ratelimited = ratelimited;
2988 return 0;
2989 }
2990
2991 was_online = event_source_is_online(s);
2992
2993 switch (s->type) {
2994 case SOURCE_IO:
2995 r = source_io_register(s, enabled, s->io.events);
2996 if (r < 0)
2997 return r;
2998 break;
2999
3000 case SOURCE_SIGNAL:
3001 r = event_make_signal_data(s->event, s->signal.sig, NULL);
3002 if (r < 0) {
3003 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
3004 return r;
3005 }
3006
3007 break;
3008
3009 case SOURCE_CHILD:
3010 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
3011 /* yes, we have pidfd */
3012
3013 r = source_child_pidfd_register(s, enabled);
3014 if (r < 0)
3015 return r;
3016 } else {
3017 /* no pidfd, or something other to watch for than WEXITED */
3018
3019 r = event_make_signal_data(s->event, SIGCHLD, NULL);
3020 if (r < 0) {
3021 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
3022 return r;
3023 }
3024 }
3025
3026 if (!was_online)
3027 s->event->n_online_child_sources++;
3028 break;
3029
3030 case SOURCE_MEMORY_PRESSURE:
3031 r = source_memory_pressure_register(s, enabled);
3032 if (r < 0)
3033 return r;
3034
3035 break;
3036
3037 case SOURCE_TIME_REALTIME:
3038 case SOURCE_TIME_BOOTTIME:
3039 case SOURCE_TIME_MONOTONIC:
3040 case SOURCE_TIME_REALTIME_ALARM:
3041 case SOURCE_TIME_BOOTTIME_ALARM:
3042 case SOURCE_EXIT:
3043 case SOURCE_DEFER:
3044 case SOURCE_POST:
3045 case SOURCE_INOTIFY:
3046 break;
3047
3048 default:
3049 assert_not_reached();
3050 }
3051
3052 s->enabled = enabled;
3053 s->ratelimited = ratelimited;
3054
3055 /* Non-failing operations below */
3056 if (s->type == SOURCE_EXIT)
3057 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
3058
3059 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3060 event_source_time_prioq_reshuffle(s);
3061
3062 return 1;
3063 }
3064
3065 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3066 int r;
3067
3068 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
3069
3070 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3071 if (m == SD_EVENT_OFF && !s)
3072 return 0;
3073
3074 assert_return(s, -EINVAL);
3075 assert_return(!event_origin_changed(s->event), -ECHILD);
3076
3077 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3078 if (s->event->state == SD_EVENT_FINISHED)
3079 return m == SD_EVENT_OFF ? 0 : -ESTALE;
3080
3081 if (s->enabled == m) /* No change? */
3082 return 0;
3083
3084 if (m == SD_EVENT_OFF)
3085 r = event_source_offline(s, m, s->ratelimited);
3086 else {
3087 if (s->enabled != SD_EVENT_OFF) {
3088 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3089 * event source is already enabled after all. */
3090 s->enabled = m;
3091 return 0;
3092 }
3093
3094 r = event_source_online(s, m, s->ratelimited);
3095 }
3096 if (r < 0)
3097 return r;
3098
3099 event_source_pp_prioq_reshuffle(s);
3100 return 0;
3101 }
3102
3103 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
3104 assert_return(s, -EINVAL);
3105 assert_return(usec, -EINVAL);
3106 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3107 assert_return(!event_origin_changed(s->event), -ECHILD);
3108
3109 *usec = s->time.next;
3110 return 0;
3111 }
3112
3113 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
3114 int r;
3115
3116 assert_return(s, -EINVAL);
3117 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3118 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3119 assert_return(!event_origin_changed(s->event), -ECHILD);
3120
3121 r = source_set_pending(s, false);
3122 if (r < 0)
3123 return r;
3124
3125 s->time.next = usec;
3126
3127 event_source_time_prioq_reshuffle(s);
3128 return 0;
3129 }
3130
3131 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3132 usec_t t;
3133 int r;
3134
3135 assert_return(s, -EINVAL);
3136 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3137 assert_return(!event_origin_changed(s->event), -ECHILD);
3138
3139 if (usec == USEC_INFINITY)
3140 return sd_event_source_set_time(s, USEC_INFINITY);
3141
3142 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3143 if (r < 0)
3144 return r;
3145
3146 usec = usec_add(t, usec);
3147 if (usec == USEC_INFINITY)
3148 return -EOVERFLOW;
3149
3150 return sd_event_source_set_time(s, usec);
3151 }
3152
3153 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
3154 assert_return(s, -EINVAL);
3155 assert_return(usec, -EINVAL);
3156 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3157 assert_return(!event_origin_changed(s->event), -ECHILD);
3158
3159 *usec = s->time.accuracy;
3160 return 0;
3161 }
3162
3163 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
3164 int r;
3165
3166 assert_return(s, -EINVAL);
3167 assert_return(usec != UINT64_MAX, -EINVAL);
3168 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3169 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3170 assert_return(!event_origin_changed(s->event), -ECHILD);
3171
3172 r = source_set_pending(s, false);
3173 if (r < 0)
3174 return r;
3175
3176 if (usec == 0)
3177 usec = DEFAULT_ACCURACY_USEC;
3178
3179 s->time.accuracy = usec;
3180
3181 event_source_time_prioq_reshuffle(s);
3182 return 0;
3183 }
3184
3185 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3186 assert_return(s, -EINVAL);
3187 assert_return(clock, -EINVAL);
3188 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3189 assert_return(!event_origin_changed(s->event), -ECHILD);
3190
3191 *clock = event_source_type_to_clock(s->type);
3192 return 0;
3193 }
3194
3195 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
3196 assert_return(s, -EINVAL);
3197 assert_return(pid, -EINVAL);
3198 assert_return(s->type == SOURCE_CHILD, -EDOM);
3199 assert_return(!event_origin_changed(s->event), -ECHILD);
3200
3201 *pid = s->child.pid;
3202 return 0;
3203 }
3204
3205 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3206 assert_return(s, -EINVAL);
3207 assert_return(s->type == SOURCE_CHILD, -EDOM);
3208 assert_return(!event_origin_changed(s->event), -ECHILD);
3209
3210 if (s->child.pidfd < 0)
3211 return -EOPNOTSUPP;
3212
3213 return s->child.pidfd;
3214 }
3215
3216 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3217 assert_return(s, -EINVAL);
3218 assert_return(s->type == SOURCE_CHILD, -EDOM);
3219 assert_return(!event_origin_changed(s->event), -ECHILD);
3220 assert_return(SIGNAL_VALID(sig), -EINVAL);
3221
3222 /* If we already have seen indication the process exited refuse sending a signal early. This way we
3223 * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3224 * available. */
3225 if (s->child.exited)
3226 return -ESRCH;
3227
3228 if (s->child.pidfd >= 0) {
3229 siginfo_t copy;
3230
3231 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
3232 * structure here */
3233 if (si)
3234 copy = *si;
3235
3236 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3237 /* Let's propagate the error only if the system call is not implemented or prohibited */
3238 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3239 return -errno;
3240 } else
3241 return 0;
3242 }
3243
3244 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3245 * this here. */
3246 if (flags != 0)
3247 return -EOPNOTSUPP;
3248
3249 if (si) {
3250 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3251 siginfo_t copy = *si;
3252
3253 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3254 return -errno;
3255 } else if (kill(s->child.pid, sig) < 0)
3256 return -errno;
3257
3258 return 0;
3259 }
3260
3261 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3262 assert_return(s, -EINVAL);
3263 assert_return(s->type == SOURCE_CHILD, -EDOM);
3264 assert_return(!event_origin_changed(s->event), -ECHILD);
3265
3266 if (s->child.pidfd < 0)
3267 return -EOPNOTSUPP;
3268
3269 return s->child.pidfd_owned;
3270 }
3271
3272 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3273 assert_return(s, -EINVAL);
3274 assert_return(s->type == SOURCE_CHILD, -EDOM);
3275 assert_return(!event_origin_changed(s->event), -ECHILD);
3276
3277 if (s->child.pidfd < 0)
3278 return -EOPNOTSUPP;
3279
3280 s->child.pidfd_owned = own;
3281 return 0;
3282 }
3283
3284 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3285 assert_return(s, -EINVAL);
3286 assert_return(s->type == SOURCE_CHILD, -EDOM);
3287 assert_return(!event_origin_changed(s->event), -ECHILD);
3288
3289 return s->child.process_owned;
3290 }
3291
3292 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3293 assert_return(s, -EINVAL);
3294 assert_return(s->type == SOURCE_CHILD, -EDOM);
3295 assert_return(!event_origin_changed(s->event), -ECHILD);
3296
3297 s->child.process_owned = own;
3298 return 0;
3299 }
3300
3301 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret) {
3302 assert_return(s, -EINVAL);
3303 assert_return(ret, -EINVAL);
3304 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3305 assert_return(!event_origin_changed(s->event), -ECHILD);
3306
3307 *ret = s->inotify.mask;
3308 return 0;
3309 }
3310
3311 _public_ int sd_event_source_get_inotify_path(sd_event_source *s, const char **ret) {
3312 assert_return(s, -EINVAL);
3313 assert_return(ret, -EINVAL);
3314 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3315 assert_return(!event_origin_changed(s->event), -ECHILD);
3316
3317 if (!s->inotify.inode_data)
3318 return -ESTALE; /* already disconnected. */
3319
3320 if (!s->inotify.inode_data->path)
3321 return -ENOSYS; /* /proc was not mounted? */
3322
3323 *ret = s->inotify.inode_data->path;
3324 return 0;
3325 }
3326
3327 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
3328 int r;
3329
3330 assert_return(s, -EINVAL);
3331 assert_return(s->type != SOURCE_EXIT, -EDOM);
3332 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3333 assert_return(!event_origin_changed(s->event), -ECHILD);
3334
3335 if (s->prepare == callback)
3336 return 0;
3337
3338 if (callback && s->prepare) {
3339 s->prepare = callback;
3340 return 0;
3341 }
3342
3343 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3344 if (r < 0)
3345 return r;
3346
3347 s->prepare = callback;
3348
3349 if (callback) {
3350 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3351 if (r < 0)
3352 return r;
3353 } else
3354 prioq_remove(s->event->prepare, s, &s->prepare_index);
3355
3356 return 0;
3357 }
3358
3359 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
3360 assert_return(s, NULL);
3361 assert_return(!event_origin_changed(s->event), NULL);
3362
3363 return s->userdata;
3364 }
3365
3366 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3367 void *ret;
3368
3369 assert_return(s, NULL);
3370 assert_return(!event_origin_changed(s->event), NULL);
3371
3372 ret = s->userdata;
3373 s->userdata = userdata;
3374
3375 return ret;
3376 }
3377
3378 static int event_source_enter_ratelimited(sd_event_source *s) {
3379 int r;
3380
3381 assert(s);
3382
3383 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
3384 * the end of the rate limit time window, much as if it was a timer event source. */
3385
3386 if (s->ratelimited)
3387 return 0; /* Already ratelimited, this is a NOP hence */
3388
3389 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3390 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3391 if (r < 0)
3392 return r;
3393
3394 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
3395 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
3396 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
3397 if (EVENT_SOURCE_IS_TIME(s->type))
3398 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3399
3400 /* Now, let's add the event source to the monotonic clock instead */
3401 r = event_source_time_prioq_put(s, &s->event->monotonic);
3402 if (r < 0)
3403 goto fail;
3404
3405 /* And let's take the event source officially offline */
3406 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3407 if (r < 0) {
3408 event_source_time_prioq_remove(s, &s->event->monotonic);
3409 goto fail;
3410 }
3411
3412 event_source_pp_prioq_reshuffle(s);
3413
3414 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3415 return 0;
3416
3417 fail:
3418 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
3419 * space for it should already be allocated. */
3420 if (EVENT_SOURCE_IS_TIME(s->type))
3421 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3422
3423 return r;
3424 }
3425
3426 static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
3427 int r;
3428
3429 assert(s);
3430
3431 if (!s->ratelimited)
3432 return 0;
3433
3434 /* Let's take the event source out of the monotonic prioq first. */
3435 event_source_time_prioq_remove(s, &s->event->monotonic);
3436
3437 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3438 if (EVENT_SOURCE_IS_TIME(s->type)) {
3439 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3440 if (r < 0)
3441 goto fail;
3442 }
3443
3444 /* Let's try to take it online again. */
3445 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3446 if (r < 0) {
3447 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3448 if (EVENT_SOURCE_IS_TIME(s->type))
3449 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3450
3451 goto fail;
3452 }
3453
3454 event_source_pp_prioq_reshuffle(s);
3455 ratelimit_reset(&s->rate_limit);
3456
3457 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3458
3459 if (run_callback && s->ratelimit_expire_callback) {
3460 s->dispatching = true;
3461 r = s->ratelimit_expire_callback(s, s->userdata);
3462 s->dispatching = false;
3463
3464 if (r < 0) {
3465 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3466 strna(s->description),
3467 event_source_type_to_string(s->type),
3468 s->exit_on_failure ? "exiting" : "disabling");
3469
3470 if (s->exit_on_failure)
3471 (void) sd_event_exit(s->event, r);
3472 }
3473
3474 if (s->n_ref == 0)
3475 source_free(s);
3476 else if (r < 0)
3477 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3478
3479 return 1;
3480 }
3481
3482 return 0;
3483
3484 fail:
3485 /* Do something somewhat reasonable when we cannot move an event sources out of ratelimited mode:
3486 * simply put it back in it, maybe we can then process it more successfully next iteration. */
3487 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3488
3489 return r;
3490 }
3491
3492 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3493 usec_t c;
3494 assert(e);
3495 assert(a <= b);
3496
3497 if (a <= 0)
3498 return 0;
3499 if (a >= USEC_INFINITY)
3500 return USEC_INFINITY;
3501
3502 if (b <= a + 1)
3503 return a;
3504
3505 initialize_perturb(e);
3506
3507 /*
3508 Find a good time to wake up again between times a and b. We
3509 have two goals here:
3510
3511 a) We want to wake up as seldom as possible, hence prefer
3512 later times over earlier times.
3513
3514 b) But if we have to wake up, then let's make sure to
3515 dispatch as much as possible on the entire system.
3516
3517 We implement this by waking up everywhere at the same time
3518 within any given minute if we can, synchronised via the
3519 perturbation value determined from the boot ID. If we can't,
3520 then we try to find the same spot in every 10s, then 1s and
3521 then 250ms step. Otherwise, we pick the last possible time
3522 to wake up.
3523 */
3524
3525 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3526 if (c >= b) {
3527 if (_unlikely_(c < USEC_PER_MINUTE))
3528 return b;
3529
3530 c -= USEC_PER_MINUTE;
3531 }
3532
3533 if (c >= a)
3534 return c;
3535
3536 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3537 if (c >= b) {
3538 if (_unlikely_(c < USEC_PER_SEC*10))
3539 return b;
3540
3541 c -= USEC_PER_SEC*10;
3542 }
3543
3544 if (c >= a)
3545 return c;
3546
3547 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3548 if (c >= b) {
3549 if (_unlikely_(c < USEC_PER_SEC))
3550 return b;
3551
3552 c -= USEC_PER_SEC;
3553 }
3554
3555 if (c >= a)
3556 return c;
3557
3558 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3559 if (c >= b) {
3560 if (_unlikely_(c < USEC_PER_MSEC*250))
3561 return b;
3562
3563 c -= USEC_PER_MSEC*250;
3564 }
3565
3566 if (c >= a)
3567 return c;
3568
3569 return b;
3570 }
3571
3572 static int event_arm_timer(
3573 sd_event *e,
3574 struct clock_data *d) {
3575
3576 struct itimerspec its = {};
3577 sd_event_source *a, *b;
3578 usec_t t;
3579
3580 assert(e);
3581 assert(d);
3582
3583 if (!d->needs_rearm)
3584 return 0;
3585
3586 d->needs_rearm = false;
3587
3588 a = prioq_peek(d->earliest);
3589 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3590 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3591
3592 if (d->fd < 0)
3593 return 0;
3594
3595 if (d->next == USEC_INFINITY)
3596 return 0;
3597
3598 /* disarm */
3599 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3600 return -errno;
3601
3602 d->next = USEC_INFINITY;
3603 return 0;
3604 }
3605
3606 b = prioq_peek(d->latest);
3607 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3608 assert(b && b->enabled != SD_EVENT_OFF);
3609
3610 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3611 if (d->next == t)
3612 return 0;
3613
3614 assert_se(d->fd >= 0);
3615
3616 if (t == 0) {
3617 /* We don't want to disarm here, just mean some time looooong ago. */
3618 its.it_value.tv_sec = 0;
3619 its.it_value.tv_nsec = 1;
3620 } else
3621 timespec_store(&its.it_value, t);
3622
3623 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3624 return -errno;
3625
3626 d->next = t;
3627 return 0;
3628 }
3629
3630 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3631 assert(e);
3632 assert(s);
3633 assert(s->type == SOURCE_IO);
3634
3635 /* If the event source was already pending, we just OR in the
3636 * new revents, otherwise we reset the value. The ORing is
3637 * necessary to handle EPOLLONESHOT events properly where
3638 * readability might happen independently of writability, and
3639 * we need to keep track of both */
3640
3641 if (s->pending)
3642 s->io.revents |= revents;
3643 else
3644 s->io.revents = revents;
3645
3646 return source_set_pending(s, true);
3647 }
3648
3649 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3650 uint64_t x;
3651 ssize_t ss;
3652
3653 assert(e);
3654 assert(fd >= 0);
3655
3656 assert_return(events == EPOLLIN, -EIO);
3657
3658 ss = read(fd, &x, sizeof(x));
3659 if (ss < 0) {
3660 if (ERRNO_IS_TRANSIENT(errno))
3661 return 0;
3662
3663 return -errno;
3664 }
3665
3666 if (_unlikely_(ss != sizeof(x)))
3667 return -EIO;
3668
3669 if (next)
3670 *next = USEC_INFINITY;
3671
3672 return 0;
3673 }
3674
3675 static int process_timer(
3676 sd_event *e,
3677 usec_t n,
3678 struct clock_data *d) {
3679
3680 sd_event_source *s;
3681 bool callback_invoked = false;
3682 int r;
3683
3684 assert(e);
3685 assert(d);
3686
3687 for (;;) {
3688 s = prioq_peek(d->earliest);
3689 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3690
3691 if (!s || time_event_source_next(s) > n)
3692 break;
3693
3694 if (s->ratelimited) {
3695 /* This is an event sources whose ratelimit window has ended. Let's turn it on
3696 * again. */
3697 assert(s->ratelimited);
3698
3699 r = event_source_leave_ratelimit(s, /* run_callback */ true);
3700 if (r < 0)
3701 return r;
3702 else if (r == 1)
3703 callback_invoked = true;
3704
3705 continue;
3706 }
3707
3708 if (s->enabled == SD_EVENT_OFF || s->pending)
3709 break;
3710
3711 r = source_set_pending(s, true);
3712 if (r < 0)
3713 return r;
3714
3715 event_source_time_prioq_reshuffle(s);
3716 }
3717
3718 return callback_invoked;
3719 }
3720
3721 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3722 int64_t min_priority = threshold;
3723 bool something_new = false;
3724 sd_event_source *s;
3725 int r;
3726
3727 assert(e);
3728 assert(ret_min_priority);
3729
3730 if (!e->need_process_child) {
3731 *ret_min_priority = min_priority;
3732 return 0;
3733 }
3734
3735 e->need_process_child = false;
3736
3737 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3738 * for, instead of using P_ALL. This is because we only want to get child information of very
3739 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3740 * of a previous invocation and we don't want to maintain a unbounded *per-child* event queue,
3741 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3742 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3743 * to handle SIGCHLD yourself.
3744 *
3745 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3746 * source is dispatched so that the callback still sees the process as a zombie. */
3747
3748 HASHMAP_FOREACH(s, e->child_sources) {
3749 assert(s->type == SOURCE_CHILD);
3750
3751 if (s->priority > threshold)
3752 continue;
3753
3754 if (s->pending)
3755 continue;
3756
3757 if (event_source_is_offline(s))
3758 continue;
3759
3760 if (s->child.exited)
3761 continue;
3762
3763 if (EVENT_SOURCE_WATCH_PIDFD(s))
3764 /* There's a usable pidfd known for this event source? Then don't waitid() for
3765 * it here */
3766 continue;
3767
3768 zero(s->child.siginfo);
3769 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3770 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3771 return negative_errno();
3772
3773 if (s->child.siginfo.si_pid != 0) {
3774 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3775
3776 if (zombie)
3777 s->child.exited = true;
3778
3779 if (!zombie && (s->child.options & WEXITED)) {
3780 /* If the child isn't dead then let's immediately remove the state
3781 * change from the queue, since there's no benefit in leaving it
3782 * queued. */
3783
3784 assert(s->child.options & (WSTOPPED|WCONTINUED));
3785 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3786 }
3787
3788 r = source_set_pending(s, true);
3789 if (r < 0)
3790 return r;
3791 if (r > 0) {
3792 something_new = true;
3793 min_priority = MIN(min_priority, s->priority);
3794 }
3795 }
3796 }
3797
3798 *ret_min_priority = min_priority;
3799 return something_new;
3800 }
3801
3802 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3803 assert(e);
3804 assert(s);
3805 assert(s->type == SOURCE_CHILD);
3806
3807 if (s->pending)
3808 return 0;
3809
3810 if (event_source_is_offline(s))
3811 return 0;
3812
3813 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3814 return 0;
3815
3816 zero(s->child.siginfo);
3817 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3818 return -errno;
3819
3820 if (s->child.siginfo.si_pid == 0)
3821 return 0;
3822
3823 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3824 s->child.exited = true;
3825
3826 return source_set_pending(s, true);
3827 }
3828
3829 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3830 int r;
3831
3832 assert(e);
3833 assert(d);
3834 assert_return(events == EPOLLIN, -EIO);
3835 assert(min_priority);
3836
3837 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3838 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3839 * per priority, and if we dequeue one, and SIGCHLD might be enqueued later we wouldn't know,
3840 * but we might have higher priority children we care about hence we need to check that
3841 * explicitly. */
3842
3843 if (sigismember(&d->sigset, SIGCHLD))
3844 e->need_process_child = true;
3845
3846 /* If there's already an event source pending for this priority we don't read another */
3847 if (d->current)
3848 return 0;
3849
3850 for (;;) {
3851 struct signalfd_siginfo si;
3852 ssize_t n;
3853 sd_event_source *s = NULL;
3854
3855 n = read(d->fd, &si, sizeof(si));
3856 if (n < 0) {
3857 if (ERRNO_IS_TRANSIENT(errno))
3858 return 0;
3859
3860 return -errno;
3861 }
3862
3863 if (_unlikely_(n != sizeof(si)))
3864 return -EIO;
3865
3866 assert(SIGNAL_VALID(si.ssi_signo));
3867
3868 if (e->signal_sources)
3869 s = e->signal_sources[si.ssi_signo];
3870 if (!s)
3871 continue;
3872 if (s->pending)
3873 continue;
3874
3875 s->signal.siginfo = si;
3876 d->current = s;
3877
3878 r = source_set_pending(s, true);
3879 if (r < 0)
3880 return r;
3881 if (r > 0 && *min_priority >= s->priority) {
3882 *min_priority = s->priority;
3883 return 1; /* an event source with smaller priority is queued. */
3884 }
3885
3886 return 0;
3887 }
3888 }
3889
3890 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3891 ssize_t n;
3892
3893 assert(e);
3894 assert(d);
3895
3896 assert_return(revents == EPOLLIN, -EIO);
3897
3898 /* If there's already an event source pending for this priority, don't read another */
3899 if (d->n_pending > 0)
3900 return 0;
3901
3902 /* Is the read buffer non-empty? If so, let's not read more */
3903 if (d->buffer_filled > 0)
3904 return 0;
3905
3906 if (d->priority > threshold)
3907 return 0;
3908
3909 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3910 if (n < 0) {
3911 if (ERRNO_IS_TRANSIENT(errno))
3912 return 0;
3913
3914 return -errno;
3915 }
3916
3917 assert(n > 0);
3918 d->buffer_filled = (size_t) n;
3919 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
3920
3921 return 1;
3922 }
3923
3924 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3925 assert(e);
3926 assert(d);
3927 assert(sz <= d->buffer_filled);
3928
3929 if (sz == 0)
3930 return;
3931
3932 /* Move the rest to the buffer to the front, in order to get things properly aligned again */
3933 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3934 d->buffer_filled -= sz;
3935
3936 if (d->buffer_filled == 0)
3937 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
3938 }
3939
3940 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3941 int r;
3942
3943 assert(e);
3944 assert(d);
3945
3946 /* If there's already an event source pending for this priority, don't read another */
3947 if (d->n_pending > 0)
3948 return 0;
3949
3950 while (d->buffer_filled > 0) {
3951 size_t sz;
3952
3953 /* Let's validate that the event structures are complete */
3954 if (d->buffer_filled < offsetof(struct inotify_event, name))
3955 return -EIO;
3956
3957 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3958 if (d->buffer_filled < sz)
3959 return -EIO;
3960
3961 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3962 struct inode_data *inode_data;
3963
3964 /* The queue overran, let's pass this event to all event sources connected to this inotify
3965 * object */
3966
3967 HASHMAP_FOREACH(inode_data, d->inodes)
3968 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3969
3970 if (event_source_is_offline(s))
3971 continue;
3972
3973 r = source_set_pending(s, true);
3974 if (r < 0)
3975 return r;
3976 }
3977 } else {
3978 struct inode_data *inode_data;
3979
3980 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3981 * our watch descriptor table. */
3982 if (d->buffer.ev.mask & IN_IGNORED) {
3983
3984 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3985 if (!inode_data) {
3986 event_inotify_data_drop(e, d, sz);
3987 continue;
3988 }
3989
3990 /* The watch descriptor was removed by the kernel, let's drop it here too */
3991 inode_data->wd = -1;
3992 } else {
3993 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3994 if (!inode_data) {
3995 event_inotify_data_drop(e, d, sz);
3996 continue;
3997 }
3998 }
3999
4000 /* Trigger all event sources that are interested in these events. Also trigger all event
4001 * sources if IN_IGNORED or IN_UNMOUNT is set. */
4002 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
4003
4004 if (event_source_is_offline(s))
4005 continue;
4006
4007 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
4008 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
4009 continue;
4010
4011 r = source_set_pending(s, true);
4012 if (r < 0)
4013 return r;
4014 }
4015 }
4016
4017 /* Something pending now? If so, let's finish, otherwise let's read more. */
4018 if (d->n_pending > 0)
4019 return 1;
4020 }
4021
4022 return 0;
4023 }
4024
4025 static int process_inotify(sd_event *e) {
4026 int r, done = 0;
4027
4028 assert(e);
4029
4030 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
4031 r = event_inotify_data_process(e, d);
4032 if (r < 0)
4033 return r;
4034 if (r > 0)
4035 done++;
4036 }
4037
4038 return done;
4039 }
4040
4041 static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
4042 assert(s);
4043 assert(s->type == SOURCE_MEMORY_PRESSURE);
4044
4045 if (s->pending)
4046 s->memory_pressure.revents |= revents;
4047 else
4048 s->memory_pressure.revents = revents;
4049
4050 return source_set_pending(s, true);
4051 }
4052
4053 static int source_memory_pressure_write(sd_event_source *s) {
4054 ssize_t n;
4055 int r;
4056
4057 assert(s);
4058 assert(s->type == SOURCE_MEMORY_PRESSURE);
4059
4060 /* once we start writing, the buffer is locked, we allow no further changes. */
4061 s->memory_pressure.locked = true;
4062
4063 if (s->memory_pressure.write_buffer_size > 0) {
4064 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4065 if (n < 0) {
4066 if (!ERRNO_IS_TRANSIENT(errno)) {
4067 /* If kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
4068 * files, but then generates EOPNOSUPP on read() and write() (instead of on
4069 * open()!). This sucks hard, since we can only detect this kind of failure
4070 * so late. Let's make the best of it, and turn off the event source like we
4071 * do for failed event source handlers. */
4072
4073 log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
4074 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4075 return 0;
4076 }
4077
4078 n = 0;
4079 }
4080 } else
4081 n = 0;
4082
4083 assert(n >= 0);
4084
4085 if ((size_t) n == s->memory_pressure.write_buffer_size) {
4086 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4087
4088 if (n > 0) {
4089 s->memory_pressure.write_buffer_size = 0;
4090
4091 /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4092 r = source_memory_pressure_register(s, s->enabled);
4093 if (r < 0)
4094 return r;
4095 }
4096 } else if (n > 0) {
4097 _cleanup_free_ void *c = NULL;
4098
4099 assert((size_t) n < s->memory_pressure.write_buffer_size);
4100
4101 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4102 if (!c)
4103 return -ENOMEM;
4104
4105 free_and_replace(s->memory_pressure.write_buffer, c);
4106 s->memory_pressure.write_buffer_size -= n;
4107 return 1;
4108 }
4109
4110 return 0;
4111 }
4112
4113 static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4114 int r;
4115
4116 assert(s);
4117 assert(s->type == SOURCE_MEMORY_PRESSURE);
4118
4119 r = source_memory_pressure_write(s);
4120 if (r < 0)
4121 return r;
4122 if (r > 0)
4123 return 1; /* if we wrote something, then don't continue with dispatching user dispatch
4124 * function. Instead, shortcut it so that we wait for next EPOLLOUT immediately. */
4125
4126 /* No pending incoming IO? Then let's not continue further */
4127 if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4128
4129 /* Treat IO errors on the notifier the same ways errors returned from a callback */
4130 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4131 return -EIO;
4132
4133 return 1; /* leave dispatch, we already processed everything */
4134 }
4135
4136 if (s->memory_pressure.revents & EPOLLIN) {
4137 uint8_t pipe_buf[PIPE_BUF];
4138 ssize_t n;
4139
4140 /* If the fd is readable, then flush out anything that might be queued */
4141
4142 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4143 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4144 return -errno;
4145 }
4146
4147 return 0; /* go on, dispatch to user callback */
4148 }
4149
4150 static int source_dispatch(sd_event_source *s) {
4151 EventSourceType saved_type;
4152 sd_event *saved_event;
4153 int r = 0;
4154
4155 assert(s);
4156 assert(s->pending || s->type == SOURCE_EXIT);
4157
4158 /* Save the event source type, here, so that we still know it after the event callback which might
4159 * invalidate the event. */
4160 saved_type = s->type;
4161
4162 /* Similarly, store a reference to the event loop object, so that we can still access it after the
4163 * callback might have invalidated/disconnected the event source. */
4164 saved_event = s->event;
4165 PROTECT_EVENT(saved_event);
4166
4167 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
4168 assert(!s->ratelimited);
4169 if (!ratelimit_below(&s->rate_limit)) {
4170 r = event_source_enter_ratelimited(s);
4171 if (r < 0)
4172 return r;
4173
4174 return 1;
4175 }
4176
4177 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
4178 r = source_set_pending(s, false);
4179 if (r < 0)
4180 return r;
4181 }
4182
4183 if (s->type != SOURCE_POST) {
4184 sd_event_source *z;
4185
4186 /* If we execute a non-post source, let's mark all post sources as pending. */
4187
4188 SET_FOREACH(z, s->event->post_sources) {
4189 if (event_source_is_offline(z))
4190 continue;
4191
4192 r = source_set_pending(z, true);
4193 if (r < 0)
4194 return r;
4195 }
4196 }
4197
4198 if (s->type == SOURCE_MEMORY_PRESSURE) {
4199 r = source_memory_pressure_initiate_dispatch(s);
4200 if (r == -EIO) /* handle EIO errors similar to callback errors */
4201 goto finish;
4202 if (r < 0)
4203 return r;
4204 if (r > 0) /* already handled */
4205 return 1;
4206 }
4207
4208 if (s->enabled == SD_EVENT_ONESHOT) {
4209 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
4210 if (r < 0)
4211 return r;
4212 }
4213
4214 s->dispatching = true;
4215
4216 switch (s->type) {
4217
4218 case SOURCE_IO:
4219 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4220 break;
4221
4222 case SOURCE_TIME_REALTIME:
4223 case SOURCE_TIME_BOOTTIME:
4224 case SOURCE_TIME_MONOTONIC:
4225 case SOURCE_TIME_REALTIME_ALARM:
4226 case SOURCE_TIME_BOOTTIME_ALARM:
4227 r = s->time.callback(s, s->time.next, s->userdata);
4228 break;
4229
4230 case SOURCE_SIGNAL:
4231 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4232 break;
4233
4234 case SOURCE_CHILD: {
4235 bool zombie;
4236
4237 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
4238
4239 r = s->child.callback(s, &s->child.siginfo, s->userdata);
4240
4241 /* Now, reap the PID for good. */
4242 if (zombie) {
4243 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
4244 s->child.waited = true;
4245 }
4246
4247 break;
4248 }
4249
4250 case SOURCE_DEFER:
4251 r = s->defer.callback(s, s->userdata);
4252 break;
4253
4254 case SOURCE_POST:
4255 r = s->post.callback(s, s->userdata);
4256 break;
4257
4258 case SOURCE_EXIT:
4259 r = s->exit.callback(s, s->userdata);
4260 break;
4261
4262 case SOURCE_INOTIFY: {
4263 struct sd_event *e = s->event;
4264 struct inotify_data *d;
4265 size_t sz;
4266
4267 assert(s->inotify.inode_data);
4268 assert_se(d = s->inotify.inode_data->inotify_data);
4269
4270 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4271 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4272 assert(d->buffer_filled >= sz);
4273
4274 /* If the inotify callback destroys the event source then this likely means we don't need to
4275 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4276 * free it immediately, then we couldn't drop the event from the inotify event queue without
4277 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
4278 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4279 * explicitly GC it after we are done dropping the inotify event from the buffer. */
4280 d->n_busy++;
4281 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
4282 d->n_busy--;
4283
4284 /* When no event is pending anymore on this inotify object, then let's drop the event from
4285 * the inotify event queue buffer. */
4286 if (d->n_pending == 0)
4287 event_inotify_data_drop(e, d, sz);
4288
4289 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4290 event_gc_inotify_data(e, d);
4291 break;
4292 }
4293
4294 case SOURCE_MEMORY_PRESSURE:
4295 r = s->memory_pressure.callback(s, s->userdata);
4296 break;
4297
4298 case SOURCE_WATCHDOG:
4299 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
4300 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
4301 assert_not_reached();
4302 }
4303
4304 s->dispatching = false;
4305
4306 finish:
4307 if (r < 0) {
4308 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4309 strna(s->description),
4310 event_source_type_to_string(saved_type),
4311 s->exit_on_failure ? "exiting" : "disabling");
4312
4313 if (s->exit_on_failure)
4314 (void) sd_event_exit(saved_event, r);
4315 }
4316
4317 if (s->n_ref == 0)
4318 source_free(s);
4319 else if (r < 0)
4320 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4321
4322 return 1;
4323 }
4324
4325 static int event_prepare(sd_event *e) {
4326 int r;
4327
4328 assert(e);
4329
4330 for (;;) {
4331 sd_event_source *s;
4332
4333 s = prioq_peek(e->prepare);
4334 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
4335 break;
4336
4337 s->prepare_iteration = e->iteration;
4338 prioq_reshuffle(e->prepare, s, &s->prepare_index);
4339
4340 assert(s->prepare);
4341 s->dispatching = true;
4342 r = s->prepare(s, s->userdata);
4343 s->dispatching = false;
4344
4345 if (r < 0) {
4346 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4347 strna(s->description),
4348 event_source_type_to_string(s->type),
4349 s->exit_on_failure ? "exiting" : "disabling");
4350
4351 if (s->exit_on_failure)
4352 (void) sd_event_exit(e, r);
4353 }
4354
4355 if (s->n_ref == 0)
4356 source_free(s);
4357 else if (r < 0)
4358 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4359 }
4360
4361 return 0;
4362 }
4363
4364 static int dispatch_exit(sd_event *e) {
4365 sd_event_source *p;
4366 int r;
4367
4368 assert(e);
4369
4370 p = prioq_peek(e->exit);
4371 assert(!p || p->type == SOURCE_EXIT);
4372
4373 if (!p || event_source_is_offline(p)) {
4374 e->state = SD_EVENT_FINISHED;
4375 return 0;
4376 }
4377
4378 PROTECT_EVENT(e);
4379 e->iteration++;
4380 e->state = SD_EVENT_EXITING;
4381 r = source_dispatch(p);
4382 e->state = SD_EVENT_INITIAL;
4383 return r;
4384 }
4385
4386 static sd_event_source* event_next_pending(sd_event *e) {
4387 sd_event_source *p;
4388
4389 assert(e);
4390
4391 p = prioq_peek(e->pending);
4392 if (!p)
4393 return NULL;
4394
4395 if (event_source_is_offline(p))
4396 return NULL;
4397
4398 return p;
4399 }
4400
4401 static int arm_watchdog(sd_event *e) {
4402 struct itimerspec its = {};
4403 usec_t t;
4404
4405 assert(e);
4406 assert(e->watchdog_fd >= 0);
4407
4408 t = sleep_between(e,
4409 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4410 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
4411
4412 timespec_store(&its.it_value, t);
4413
4414 /* Make sure we never set the watchdog to 0, which tells the
4415 * kernel to disable it. */
4416 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4417 its.it_value.tv_nsec = 1;
4418
4419 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
4420 }
4421
4422 static int process_watchdog(sd_event *e) {
4423 assert(e);
4424
4425 if (!e->watchdog)
4426 return 0;
4427
4428 /* Don't notify watchdog too often */
4429 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4430 return 0;
4431
4432 sd_notify(false, "WATCHDOG=1");
4433 e->watchdog_last = e->timestamp.monotonic;
4434
4435 return arm_watchdog(e);
4436 }
4437
4438 static void event_close_inode_data_fds(sd_event *e) {
4439 struct inode_data *d;
4440
4441 assert(e);
4442
4443 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4444 * filesystems. But we can't close them right-away as we need them as long as the user still wants to make
4445 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
4446 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4447 * compromise. */
4448
4449 while ((d = e->inode_data_to_close_list)) {
4450 assert(d->fd >= 0);
4451 d->fd = safe_close(d->fd);
4452
4453 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
4454 }
4455 }
4456
4457 static int event_memory_pressure_write_list(sd_event *e) {
4458 int r;
4459
4460 assert(e);
4461
4462 for (;;) {
4463 sd_event_source *s;
4464
4465 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4466 if (!s)
4467 break;
4468
4469 assert(s->type == SOURCE_MEMORY_PRESSURE);
4470 assert(s->memory_pressure.write_buffer_size > 0);
4471 s->memory_pressure.in_write_list = false;
4472
4473 r = source_memory_pressure_write(s);
4474 if (r < 0)
4475 return r;
4476 }
4477
4478 return 0;
4479 }
4480
4481 _public_ int sd_event_prepare(sd_event *e) {
4482 int r;
4483
4484 assert_return(e, -EINVAL);
4485 assert_return(e = event_resolve(e), -ENOPKG);
4486 assert_return(!event_origin_changed(e), -ECHILD);
4487 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4488 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4489
4490 /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
4491 * this check here once, since gettid() is typically not cached, and thus want to minimize
4492 * syscalls */
4493 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4494
4495 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4496 PROTECT_EVENT(e);
4497
4498 if (e->exit_requested)
4499 goto pending;
4500
4501 e->iteration++;
4502
4503 e->state = SD_EVENT_PREPARING;
4504 r = event_prepare(e);
4505 e->state = SD_EVENT_INITIAL;
4506 if (r < 0)
4507 return r;
4508
4509 r = event_memory_pressure_write_list(e);
4510 if (r < 0)
4511 return r;
4512
4513 r = event_arm_timer(e, &e->realtime);
4514 if (r < 0)
4515 return r;
4516
4517 r = event_arm_timer(e, &e->boottime);
4518 if (r < 0)
4519 return r;
4520
4521 r = event_arm_timer(e, &e->monotonic);
4522 if (r < 0)
4523 return r;
4524
4525 r = event_arm_timer(e, &e->realtime_alarm);
4526 if (r < 0)
4527 return r;
4528
4529 r = event_arm_timer(e, &e->boottime_alarm);
4530 if (r < 0)
4531 return r;
4532
4533 event_close_inode_data_fds(e);
4534
4535 if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
4536 goto pending;
4537
4538 e->state = SD_EVENT_ARMED;
4539
4540 return 0;
4541
4542 pending:
4543 e->state = SD_EVENT_ARMED;
4544 r = sd_event_wait(e, 0);
4545 if (r == 0)
4546 e->state = SD_EVENT_ARMED;
4547
4548 return r;
4549 }
4550
4551 static int epoll_wait_usec(
4552 int fd,
4553 struct epoll_event *events,
4554 int maxevents,
4555 usec_t timeout) {
4556
4557 int msec;
4558 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4559
4560 #if HAVE_EPOLL_PWAIT2
4561 static bool epoll_pwait2_absent = false;
4562 int r;
4563
4564 /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4565 * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4566 * is not that obvious to implement given the libc and kernel definitions differ in the last
4567 * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
4568 * biggie), let's hence rely on glibc's definitions, and fallback to epoll_pwait() when that's
4569 * missing. */
4570
4571 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
4572 r = epoll_pwait2(fd,
4573 events,
4574 maxevents,
4575 TIMESPEC_STORE(timeout),
4576 NULL);
4577 if (r >= 0)
4578 return r;
4579 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
4580 return -errno; /* Only fallback to old epoll_wait() if the syscall is masked or not
4581 * supported. */
4582
4583 epoll_pwait2_absent = true;
4584 }
4585 #endif
4586
4587 if (timeout == USEC_INFINITY)
4588 msec = -1;
4589 else {
4590 usec_t k;
4591
4592 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4593 if (k >= INT_MAX)
4594 msec = INT_MAX; /* Saturate */
4595 else
4596 msec = (int) k;
4597 }
4598
4599 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
4600 }
4601
4602 static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
4603 size_t n_event_queue, m, n_event_max;
4604 int64_t min_priority = threshold;
4605 bool something_new = false;
4606 int r;
4607
4608 assert(e);
4609 assert(ret_min_priority);
4610
4611 n_event_queue = MAX(e->n_sources, 1u);
4612 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
4613 return -ENOMEM;
4614
4615 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4616
4617 /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
4618 if (e->buffered_inotify_data_list)
4619 timeout = 0;
4620
4621 for (;;) {
4622 r = epoll_wait_usec(
4623 e->epoll_fd,
4624 e->event_queue,
4625 n_event_max,
4626 timeout);
4627 if (r < 0)
4628 return r;
4629
4630 m = (size_t) r;
4631
4632 if (m < n_event_max)
4633 break;
4634
4635 if (n_event_max >= n_event_queue * 10)
4636 break;
4637
4638 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
4639 return -ENOMEM;
4640
4641 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4642 timeout = 0;
4643 }
4644
4645 /* Set timestamp only when this is called first time. */
4646 if (threshold == INT64_MAX)
4647 triple_timestamp_now(&e->timestamp);
4648
4649 for (size_t i = 0; i < m; i++) {
4650
4651 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4652 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
4653 else {
4654 WakeupType *t = e->event_queue[i].data.ptr;
4655
4656 switch (*t) {
4657
4658 case WAKEUP_EVENT_SOURCE: {
4659 sd_event_source *s = e->event_queue[i].data.ptr;
4660
4661 assert(s);
4662
4663 if (s->priority > threshold)
4664 continue;
4665
4666 min_priority = MIN(min_priority, s->priority);
4667
4668 switch (s->type) {
4669
4670 case SOURCE_IO:
4671 r = process_io(e, s, e->event_queue[i].events);
4672 break;
4673
4674 case SOURCE_CHILD:
4675 r = process_pidfd(e, s, e->event_queue[i].events);
4676 break;
4677
4678 case SOURCE_MEMORY_PRESSURE:
4679 r = process_memory_pressure(s, e->event_queue[i].events);
4680 break;
4681
4682 default:
4683 assert_not_reached();
4684 }
4685
4686 break;
4687 }
4688
4689 case WAKEUP_CLOCK_DATA: {
4690 struct clock_data *d = e->event_queue[i].data.ptr;
4691
4692 assert(d);
4693
4694 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4695 break;
4696 }
4697
4698 case WAKEUP_SIGNAL_DATA:
4699 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4700 break;
4701
4702 case WAKEUP_INOTIFY_DATA:
4703 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4704 break;
4705
4706 default:
4707 assert_not_reached();
4708 }
4709 }
4710 if (r < 0)
4711 return r;
4712 if (r > 0)
4713 something_new = true;
4714 }
4715
4716 *ret_min_priority = min_priority;
4717 return something_new;
4718 }
4719
4720 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4721 int r;
4722
4723 assert_return(e, -EINVAL);
4724 assert_return(e = event_resolve(e), -ENOPKG);
4725 assert_return(!event_origin_changed(e), -ECHILD);
4726 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4727 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4728
4729 if (e->exit_requested) {
4730 e->state = SD_EVENT_PENDING;
4731 return 1;
4732 }
4733
4734 for (int64_t threshold = INT64_MAX; ; threshold--) {
4735 int64_t epoll_min_priority, child_min_priority;
4736
4737 /* There may be a possibility that new epoll (especially IO) and child events are
4738 * triggered just after process_epoll() call but before process_child(), and the new IO
4739 * events may have higher priority than the child events. To salvage these events,
4740 * let's call epoll_wait() again, but accepts only events with higher priority than the
4741 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
4742 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4743 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4744
4745 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4746 if (r == -EINTR) {
4747 e->state = SD_EVENT_PENDING;
4748 return 1;
4749 }
4750 if (r < 0)
4751 goto finish;
4752 if (r == 0 && threshold < INT64_MAX)
4753 /* No new epoll event. */
4754 break;
4755
4756 r = process_child(e, threshold, &child_min_priority);
4757 if (r < 0)
4758 goto finish;
4759 if (r == 0)
4760 /* No new child event. */
4761 break;
4762
4763 threshold = MIN(epoll_min_priority, child_min_priority);
4764 if (threshold == INT64_MIN)
4765 break;
4766
4767 timeout = 0;
4768 }
4769
4770 r = process_watchdog(e);
4771 if (r < 0)
4772 goto finish;
4773
4774 r = process_inotify(e);
4775 if (r < 0)
4776 goto finish;
4777
4778 r = process_timer(e, e->timestamp.realtime, &e->realtime);
4779 if (r < 0)
4780 goto finish;
4781
4782 r = process_timer(e, e->timestamp.boottime, &e->boottime);
4783 if (r < 0)
4784 goto finish;
4785
4786 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4787 if (r < 0)
4788 goto finish;
4789
4790 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4791 if (r < 0)
4792 goto finish;
4793
4794 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4795 if (r < 0)
4796 goto finish;
4797 else if (r == 1) {
4798 /* Ratelimit expiry callback was called. Let's postpone processing pending sources and
4799 * put loop in the initial state in order to evaluate (in the next iteration) also sources
4800 * there were potentially re-enabled by the callback.
4801 *
4802 * Wondering why we treat only this invocation of process_timer() differently? Once event
4803 * source is ratelimited we essentially transform it into CLOCK_MONOTONIC timer hence
4804 * ratelimit expiry callback is never called for any other timer type. */
4805 r = 0;
4806 goto finish;
4807 }
4808
4809 if (event_next_pending(e)) {
4810 e->state = SD_EVENT_PENDING;
4811 return 1;
4812 }
4813
4814 r = 0;
4815
4816 finish:
4817 e->state = SD_EVENT_INITIAL;
4818
4819 return r;
4820 }
4821
4822 _public_ int sd_event_dispatch(sd_event *e) {
4823 sd_event_source *p;
4824 int r;
4825
4826 assert_return(e, -EINVAL);
4827 assert_return(e = event_resolve(e), -ENOPKG);
4828 assert_return(!event_origin_changed(e), -ECHILD);
4829 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4830 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4831
4832 if (e->exit_requested)
4833 return dispatch_exit(e);
4834
4835 p = event_next_pending(e);
4836 if (p) {
4837 PROTECT_EVENT(e);
4838
4839 e->state = SD_EVENT_RUNNING;
4840 r = source_dispatch(p);
4841 e->state = SD_EVENT_INITIAL;
4842 return r;
4843 }
4844
4845 e->state = SD_EVENT_INITIAL;
4846
4847 return 1;
4848 }
4849
4850 static void event_log_delays(sd_event *e) {
4851 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4852 size_t l, i;
4853
4854 p = b;
4855 l = sizeof(b);
4856 for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4857 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4858 e->delays[i] = 0;
4859 }
4860 log_debug("Event loop iterations: %s", b);
4861 }
4862
4863 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4864 int r;
4865
4866 assert_return(e, -EINVAL);
4867 assert_return(e = event_resolve(e), -ENOPKG);
4868 assert_return(!event_origin_changed(e), -ECHILD);
4869 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4870 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4871
4872 if (e->profile_delays && e->last_run_usec != 0) {
4873 usec_t this_run;
4874 unsigned l;
4875
4876 this_run = now(CLOCK_MONOTONIC);
4877
4878 l = log2u64(this_run - e->last_run_usec);
4879 assert(l < ELEMENTSOF(e->delays));
4880 e->delays[l]++;
4881
4882 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4883 event_log_delays(e);
4884 e->last_log_usec = this_run;
4885 }
4886 }
4887
4888 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4889 PROTECT_EVENT(e);
4890
4891 r = sd_event_prepare(e);
4892 if (r == 0)
4893 /* There was nothing? Then wait... */
4894 r = sd_event_wait(e, timeout);
4895
4896 if (e->profile_delays)
4897 e->last_run_usec = now(CLOCK_MONOTONIC);
4898
4899 if (r > 0) {
4900 /* There's something now, then let's dispatch it */
4901 r = sd_event_dispatch(e);
4902 if (r < 0)
4903 return r;
4904
4905 return 1;
4906 }
4907
4908 return r;
4909 }
4910
4911 _public_ int sd_event_loop(sd_event *e) {
4912 int r;
4913
4914 assert_return(e, -EINVAL);
4915 assert_return(e = event_resolve(e), -ENOPKG);
4916 assert_return(!event_origin_changed(e), -ECHILD);
4917 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4918
4919
4920 PROTECT_EVENT(e);
4921
4922 while (e->state != SD_EVENT_FINISHED) {
4923 r = sd_event_run(e, UINT64_MAX);
4924 if (r < 0)
4925 return r;
4926 }
4927
4928 return e->exit_code;
4929 }
4930
4931 _public_ int sd_event_get_fd(sd_event *e) {
4932 assert_return(e, -EINVAL);
4933 assert_return(e = event_resolve(e), -ENOPKG);
4934 assert_return(!event_origin_changed(e), -ECHILD);
4935
4936 return e->epoll_fd;
4937 }
4938
4939 _public_ int sd_event_get_state(sd_event *e) {
4940 assert_return(e, -EINVAL);
4941 assert_return(e = event_resolve(e), -ENOPKG);
4942 assert_return(!event_origin_changed(e), -ECHILD);
4943
4944 return e->state;
4945 }
4946
4947 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4948 assert_return(e, -EINVAL);
4949 assert_return(e = event_resolve(e), -ENOPKG);
4950 assert_return(!event_origin_changed(e), -ECHILD);
4951
4952 if (!e->exit_requested)
4953 return -ENODATA;
4954
4955 if (code)
4956 *code = e->exit_code;
4957 return 0;
4958 }
4959
4960 _public_ int sd_event_exit(sd_event *e, int code) {
4961 assert_return(e, -EINVAL);
4962 assert_return(e = event_resolve(e), -ENOPKG);
4963 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4964 assert_return(!event_origin_changed(e), -ECHILD);
4965
4966 e->exit_requested = true;
4967 e->exit_code = code;
4968
4969 return 0;
4970 }
4971
4972 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4973 assert_return(e, -EINVAL);
4974 assert_return(e = event_resolve(e), -ENOPKG);
4975 assert_return(usec, -EINVAL);
4976 assert_return(!event_origin_changed(e), -ECHILD);
4977
4978 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4979 return -EOPNOTSUPP;
4980
4981 if (!triple_timestamp_is_set(&e->timestamp)) {
4982 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4983 *usec = now(clock);
4984 return 1;
4985 }
4986
4987 *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4988 return 0;
4989 }
4990
4991 _public_ int sd_event_default(sd_event **ret) {
4992 sd_event *e = NULL;
4993 int r;
4994
4995 if (!ret)
4996 return !!default_event;
4997
4998 if (default_event) {
4999 *ret = sd_event_ref(default_event);
5000 return 0;
5001 }
5002
5003 r = sd_event_new(&e);
5004 if (r < 0)
5005 return r;
5006
5007 e->default_event_ptr = &default_event;
5008 e->tid = gettid();
5009 default_event = e;
5010
5011 *ret = e;
5012 return 1;
5013 }
5014
5015 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
5016 assert_return(e, -EINVAL);
5017 assert_return(e = event_resolve(e), -ENOPKG);
5018 assert_return(tid, -EINVAL);
5019 assert_return(!event_origin_changed(e), -ECHILD);
5020
5021 if (e->tid != 0) {
5022 *tid = e->tid;
5023 return 0;
5024 }
5025
5026 return -ENXIO;
5027 }
5028
5029 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
5030 int r;
5031
5032 assert_return(e, -EINVAL);
5033 assert_return(e = event_resolve(e), -ENOPKG);
5034 assert_return(!event_origin_changed(e), -ECHILD);
5035
5036 if (e->watchdog == !!b)
5037 return e->watchdog;
5038
5039 if (b) {
5040 r = sd_watchdog_enabled(false, &e->watchdog_period);
5041 if (r <= 0)
5042 return r;
5043
5044 /* Issue first ping immediately */
5045 sd_notify(false, "WATCHDOG=1");
5046 e->watchdog_last = now(CLOCK_MONOTONIC);
5047
5048 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
5049 if (e->watchdog_fd < 0)
5050 return -errno;
5051
5052 r = arm_watchdog(e);
5053 if (r < 0)
5054 goto fail;
5055
5056 struct epoll_event ev = {
5057 .events = EPOLLIN,
5058 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
5059 };
5060
5061 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
5062 r = -errno;
5063 goto fail;
5064 }
5065
5066 } else {
5067 if (e->watchdog_fd >= 0) {
5068 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
5069 e->watchdog_fd = safe_close(e->watchdog_fd);
5070 }
5071 }
5072
5073 e->watchdog = b;
5074 return e->watchdog;
5075
5076 fail:
5077 e->watchdog_fd = safe_close(e->watchdog_fd);
5078 return r;
5079 }
5080
5081 _public_ int sd_event_get_watchdog(sd_event *e) {
5082 assert_return(e, -EINVAL);
5083 assert_return(e = event_resolve(e), -ENOPKG);
5084 assert_return(!event_origin_changed(e), -ECHILD);
5085
5086 return e->watchdog;
5087 }
5088
5089 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5090 assert_return(e, -EINVAL);
5091 assert_return(e = event_resolve(e), -ENOPKG);
5092 assert_return(!event_origin_changed(e), -ECHILD);
5093
5094 *ret = e->iteration;
5095 return 0;
5096 }
5097
5098 _public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5099 assert_return(s, -EINVAL);
5100 assert_return(s->event, -EINVAL);
5101 assert_return(!event_origin_changed(s->event), -ECHILD);
5102
5103 s->destroy_callback = callback;
5104 return 0;
5105 }
5106
5107 _public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5108 assert_return(s, -EINVAL);
5109 assert_return(!event_origin_changed(s->event), -ECHILD);
5110
5111 if (ret)
5112 *ret = s->destroy_callback;
5113
5114 return !!s->destroy_callback;
5115 }
5116
5117 _public_ int sd_event_source_get_floating(sd_event_source *s) {
5118 assert_return(s, -EINVAL);
5119 assert_return(!event_origin_changed(s->event), -ECHILD);
5120
5121 return s->floating;
5122 }
5123
5124 _public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5125 assert_return(s, -EINVAL);
5126 assert_return(!event_origin_changed(s->event), -ECHILD);
5127
5128 if (s->floating == !!b)
5129 return 0;
5130
5131 if (!s->event) /* Already disconnected */
5132 return -ESTALE;
5133
5134 s->floating = b;
5135
5136 if (b) {
5137 sd_event_source_ref(s);
5138 sd_event_unref(s->event);
5139 } else {
5140 sd_event_ref(s->event);
5141 sd_event_source_unref(s);
5142 }
5143
5144 return 1;
5145 }
5146
5147 _public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5148 assert_return(s, -EINVAL);
5149 assert_return(s->type != SOURCE_EXIT, -EDOM);
5150 assert_return(!event_origin_changed(s->event), -ECHILD);
5151
5152 return s->exit_on_failure;
5153 }
5154
5155 _public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5156 assert_return(s, -EINVAL);
5157 assert_return(s->type != SOURCE_EXIT, -EDOM);
5158 assert_return(!event_origin_changed(s->event), -ECHILD);
5159
5160 if (s->exit_on_failure == !!b)
5161 return 0;
5162
5163 s->exit_on_failure = b;
5164 return 1;
5165 }
5166
5167 _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5168 int r;
5169
5170 assert_return(s, -EINVAL);
5171 assert_return(!event_origin_changed(s->event), -ECHILD);
5172
5173 /* Turning on ratelimiting on event source types that don't support it, is a loggable offense. Doing
5174 * so is a programming error. */
5175 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5176
5177 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5178 * non-ratelimited. */
5179 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5180 if (r < 0)
5181 return r;
5182
5183 s->rate_limit = (RateLimit) { interval, burst };
5184 return 0;
5185 }
5186
5187 _public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5188 assert_return(s, -EINVAL);
5189 assert_return(!event_origin_changed(s->event), -ECHILD);
5190
5191 s->ratelimit_expire_callback = callback;
5192 return 0;
5193 }
5194
5195 _public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5196 assert_return(s, -EINVAL);
5197 assert_return(!event_origin_changed(s->event), -ECHILD);
5198
5199 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5200 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
5201 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5202 return -EDOM;
5203
5204 if (!ratelimit_configured(&s->rate_limit))
5205 return -ENOEXEC;
5206
5207 if (ret_interval)
5208 *ret_interval = s->rate_limit.interval;
5209 if (ret_burst)
5210 *ret_burst = s->rate_limit.burst;
5211
5212 return 0;
5213 }
5214
5215 _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5216 assert_return(s, -EINVAL);
5217 assert_return(!event_origin_changed(s->event), -ECHILD);
5218
5219 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5220 return false;
5221
5222 if (!ratelimit_configured(&s->rate_limit))
5223 return false;
5224
5225 return s->ratelimited;
5226 }
5227
5228 _public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
5229 int r;
5230
5231 assert_return(s, -EINVAL);
5232
5233 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5234 return 0;
5235
5236 if (!ratelimit_configured(&s->rate_limit))
5237 return 0;
5238
5239 if (!s->ratelimited)
5240 return 0;
5241
5242 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5243 if (r < 0)
5244 return r;
5245
5246 return 1; /* tell caller that we indeed just left the ratelimit state */
5247 }
5248
5249 _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5250 bool change = false;
5251 int r;
5252
5253 assert_return(e, -EINVAL);
5254
5255 if (b) {
5256 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5257 * so. But we also don't want them to pin the event loop itself. Hence we mark them as
5258 * floating after creation (and undo this before deleting them again). */
5259
5260 if (!e->sigint_event_source) {
5261 r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5262 if (r < 0)
5263 return r;
5264
5265 assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5266 change = true;
5267 }
5268
5269 if (!e->sigterm_event_source) {
5270 r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5271 if (r < 0) {
5272 if (change) {
5273 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5274 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5275 }
5276
5277 return r;
5278 }
5279
5280 assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5281 change = true;
5282 }
5283
5284 } else {
5285 if (e->sigint_event_source) {
5286 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5287 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5288 change = true;
5289 }
5290
5291 if (e->sigterm_event_source) {
5292 assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5293 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5294 change = true;
5295 }
5296 }
5297
5298 return change;
5299 }
5300
5301 _public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5302 _cleanup_free_ char *b = NULL;
5303 _cleanup_free_ void *w = NULL;
5304
5305 assert_return(s, -EINVAL);
5306 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5307 assert_return(ty, -EINVAL);
5308 assert_return(!event_origin_changed(s->event), -ECHILD);
5309
5310 if (!STR_IN_SET(ty, "some", "full"))
5311 return -EINVAL;
5312
5313 if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5314 return -EBUSY;
5315
5316 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5317 if (!space)
5318 return -EINVAL;
5319
5320 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5321 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5322 if (!b)
5323 return -ENOMEM;
5324 if (!STR_IN_SET(b, "some", "full"))
5325 return -EINVAL;
5326
5327 if (streq(b, ty))
5328 return 0;
5329
5330 size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5331 w = new(char, nl);
5332 if (!w)
5333 return -ENOMEM;
5334
5335 memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5336
5337 free_and_replace(s->memory_pressure.write_buffer, w);
5338 s->memory_pressure.write_buffer_size = nl;
5339 s->memory_pressure.locked = false;
5340
5341 return 1;
5342 }
5343
5344 _public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5345 _cleanup_free_ char *b = NULL;
5346 _cleanup_free_ void *w = NULL;
5347
5348 assert_return(s, -EINVAL);
5349 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5350 assert_return(!event_origin_changed(s->event), -ECHILD);
5351
5352 if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5353 return -ERANGE;
5354 if (window_usec <= 0 || window_usec >= UINT64_MAX)
5355 return -ERANGE;
5356 if (threshold_usec > window_usec)
5357 return -EINVAL;
5358
5359 if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5360 return -EBUSY;
5361
5362 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5363 if (!space)
5364 return -EINVAL;
5365
5366 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5367 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5368 if (!b)
5369 return -ENOMEM;
5370 if (!STR_IN_SET(b, "some", "full"))
5371 return -EINVAL;
5372
5373 if (asprintf((char**) &w,
5374 "%s " USEC_FMT " " USEC_FMT "",
5375 b,
5376 threshold_usec,
5377 window_usec) < 0)
5378 return -EINVAL;
5379
5380 l = strlen(w) + 1;
5381 if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5382 return 0;
5383
5384 free_and_replace(s->memory_pressure.write_buffer, w);
5385 s->memory_pressure.write_buffer_size = l;
5386 s->memory_pressure.locked = false;
5387
5388 return 1;
5389 }