1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/epoll.h>
4 #if HAVE_PIDFD_OPEN
5 #include <sys/pidfd.h>
6 #endif
7 #include <sys/timerfd.h>
8 #include <sys/wait.h>
9
10 #include "sd-daemon.h"
11 #include "sd-event.h"
12 #include "sd-id128.h"
13 #include "sd-messages.h"
14
15 #include "alloc-util.h"
16 #include "env-util.h"
17 #include "event-source.h"
18 #include "fd-util.h"
19 #include "fs-util.h"
20 #include "glyph-util.h"
21 #include "hashmap.h"
22 #include "hexdecoct.h"
23 #include "list.h"
24 #include "logarithm.h"
25 #include "macro.h"
26 #include "mallinfo-util.h"
27 #include "memory-util.h"
28 #include "missing_magic.h"
29 #include "missing_syscall.h"
30 #include "missing_threads.h"
31 #include "origin-id.h"
32 #include "path-util.h"
33 #include "prioq.h"
34 #include "process-util.h"
35 #include "psi-util.h"
36 #include "set.h"
37 #include "signal-util.h"
38 #include "socket-util.h"
39 #include "stat-util.h"
40 #include "string-table.h"
41 #include "string-util.h"
42 #include "strxcpyx.h"
43 #include "time-util.h"
44
45 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
46
47 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
48 /* Returns true if this is a PID event source that can be implemented by watching EPOLLIN on the pidfd */
49 return s &&
50 s->type == SOURCE_CHILD &&
51 s->child.pidfd >= 0 &&
52 s->child.options == WEXITED;
53 }
54
55 static bool event_source_is_online(sd_event_source *s) {
56 assert(s);
57 return s->enabled != SD_EVENT_OFF && !s->ratelimited;
58 }
59
60 static bool event_source_is_offline(sd_event_source *s) {
61 assert(s);
62 return s->enabled == SD_EVENT_OFF || s->ratelimited;
63 }
64
65 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
66 [SOURCE_IO] = "io",
67 [SOURCE_TIME_REALTIME] = "realtime",
68 [SOURCE_TIME_BOOTTIME] = "boottime",
69 [SOURCE_TIME_MONOTONIC] = "monotonic",
70 [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
71 [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
72 [SOURCE_SIGNAL] = "signal",
73 [SOURCE_CHILD] = "child",
74 [SOURCE_DEFER] = "defer",
75 [SOURCE_POST] = "post",
76 [SOURCE_EXIT] = "exit",
77 [SOURCE_WATCHDOG] = "watchdog",
78 [SOURCE_INOTIFY] = "inotify",
79 [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
80 };
81
82 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
83
84 #define EVENT_SOURCE_IS_TIME(t) \
85 IN_SET((t), \
86 SOURCE_TIME_REALTIME, \
87 SOURCE_TIME_BOOTTIME, \
88 SOURCE_TIME_MONOTONIC, \
89 SOURCE_TIME_REALTIME_ALARM, \
90 SOURCE_TIME_BOOTTIME_ALARM)
91
92 #define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
93 IN_SET((t), \
94 SOURCE_IO, \
95 SOURCE_TIME_REALTIME, \
96 SOURCE_TIME_BOOTTIME, \
97 SOURCE_TIME_MONOTONIC, \
98 SOURCE_TIME_REALTIME_ALARM, \
99 SOURCE_TIME_BOOTTIME_ALARM, \
100 SOURCE_SIGNAL, \
101 SOURCE_DEFER, \
102 SOURCE_INOTIFY, \
103 SOURCE_MEMORY_PRESSURE)
104
105 /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
106 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
107 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
108 #define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
109
110 struct sd_event {
111 unsigned n_ref;
112
113 int epoll_fd;
114 int watchdog_fd;
115
116 Prioq *pending;
117 Prioq *prepare;
118
119 /* timerfd_create() only supports these five clocks so far. We
120 * can add support for more clocks when the kernel learns to
121 * deal with them, too. */
122 struct clock_data realtime;
123 struct clock_data boottime;
124 struct clock_data monotonic;
125 struct clock_data realtime_alarm;
126 struct clock_data boottime_alarm;
127
128 usec_t perturb;
129
130 sd_event_source **signal_sources; /* indexed by signal number */
131 Hashmap *signal_data; /* indexed by priority */
132
133 Hashmap *child_sources;
134 unsigned n_online_child_sources;
135
136 Set *post_sources;
137
138 Prioq *exit;
139
140 Hashmap *inotify_data; /* indexed by priority */
141
142 /* A list of inode structures that still have an fd open, which we need to close before the next loop iteration */
143 LIST_HEAD(struct inode_data, inode_data_to_close_list);
144
145 /* A list of inotify objects that already have events buffered which aren't processed yet */
146 LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
147
148 /* A list of memory pressure event sources that still need their subscription string written */
149 LIST_HEAD(sd_event_source, memory_pressure_write_list);
150
151 uint64_t origin_id;
152
153 uint64_t iteration;
154 triple_timestamp timestamp;
155 int state;
156
157 bool exit_requested:1;
158 bool need_process_child:1;
159 bool watchdog:1;
160 bool profile_delays:1;
161
162 int exit_code;
163
164 pid_t tid;
165 sd_event **default_event_ptr;
166
167 usec_t watchdog_last, watchdog_period;
168
169 unsigned n_sources;
170
171 struct epoll_event *event_queue;
172
173 LIST_HEAD(sd_event_source, sources);
174
175 sd_event_source *sigint_event_source, *sigterm_event_source;
176
177 usec_t last_run_usec, last_log_usec;
178 unsigned delays[sizeof(usec_t) * 8];
179 };
180
181 DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);
182
183 static thread_local sd_event *default_event = NULL;
184
185 static void source_disconnect(sd_event_source *s);
186 static void event_gc_inode_data(sd_event *e, struct inode_data *d);
187
188 static sd_event *event_resolve(sd_event *e) {
189 return e == SD_EVENT_DEFAULT ? default_event : e;
190 }
191
192 static int pending_prioq_compare(const void *a, const void *b) {
193 const sd_event_source *x = a, *y = b;
194 int r;
195
196 assert(x->pending);
197 assert(y->pending);
198
199 /* Enabled ones first */
200 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
201 if (r != 0)
202 return r;
203
204 /* Non rate-limited ones first. */
205 r = CMP(!!x->ratelimited, !!y->ratelimited);
206 if (r != 0)
207 return r;
208
209 /* Lower priority values first */
210 r = CMP(x->priority, y->priority);
211 if (r != 0)
212 return r;
213
214 /* Older entries first */
215 return CMP(x->pending_iteration, y->pending_iteration);
216 }
217
218 static int prepare_prioq_compare(const void *a, const void *b) {
219 const sd_event_source *x = a, *y = b;
220 int r;
221
222 assert(x->prepare);
223 assert(y->prepare);
224
225 /* Enabled ones first */
226 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
227 if (r != 0)
228 return r;
229
230 /* Non rate-limited ones first. */
231 r = CMP(!!x->ratelimited, !!y->ratelimited);
232 if (r != 0)
233 return r;
234
235 /* Move most recently prepared ones last, so that we can stop
236 * preparing as soon as we hit one that has already been
237 * prepared in the current iteration */
238 r = CMP(x->prepare_iteration, y->prepare_iteration);
239 if (r != 0)
240 return r;
241
242 /* Lower priority values first */
243 return CMP(x->priority, y->priority);
244 }
245
246 static usec_t time_event_source_next(const sd_event_source *s) {
247 assert(s);
248
249 /* We have two kinds of event sources that have elapsation times associated with them: the actual
250 * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
251 * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
252 * looking at here. */
253
254 if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
255 assert(s->rate_limit.begin != 0);
256 assert(s->rate_limit.interval != 0);
257 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
258 }
259
260 /* Otherwise this must be a time event source, if not ratelimited */
261 if (EVENT_SOURCE_IS_TIME(s->type))
262 return s->time.next;
263
264 return USEC_INFINITY;
265 }
266
267 static usec_t time_event_source_latest(const sd_event_source *s) {
268 assert(s);
269
270 if (s->ratelimited) { /* For ratelimited event sources the earliest and the latest time shall actually be the
271 * same, as we should avoid adding additional inaccuracy on top of the ratelimit time
272 * window */
273 assert(s->rate_limit.begin != 0);
274 assert(s->rate_limit.interval != 0);
275 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
276 }
277
278 /* Must be a time event source, if not ratelimited */
279 if (EVENT_SOURCE_IS_TIME(s->type))
280 return usec_add(s->time.next, s->time.accuracy);
281
282 return USEC_INFINITY;
283 }
284
285 static bool event_source_timer_candidate(const sd_event_source *s) {
286 assert(s);
287
288 /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
289 * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
290 return !s->pending || s->ratelimited;
291 }
292
293 static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
294 const sd_event_source *x = a, *y = b;
295 int r;
296
297 /* Enabled ones first */
298 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
299 if (r != 0)
300 return r;
301
302 /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
303 r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
304 if (r != 0)
305 return r;
306
307 /* Order by time */
308 return CMP(time_func(x), time_func(y));
309 }
310
311 static int earliest_time_prioq_compare(const void *a, const void *b) {
312 return time_prioq_compare(a, b, time_event_source_next);
313 }
314
315 static int latest_time_prioq_compare(const void *a, const void *b) {
316 return time_prioq_compare(a, b, time_event_source_latest);
317 }
318
319 static int exit_prioq_compare(const void *a, const void *b) {
320 const sd_event_source *x = a, *y = b;
321 int r;
322
323 assert(x->type == SOURCE_EXIT);
324 assert(y->type == SOURCE_EXIT);
325
326 /* Enabled ones first */
327 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
328 if (r != 0)
329 return r;
330
331 /* Lower priority values first */
332 return CMP(x->priority, y->priority);
333 }
334
335 static void free_clock_data(struct clock_data *d) {
336 assert(d);
337 assert(d->wakeup == WAKEUP_CLOCK_DATA);
338
339 safe_close(d->fd);
340 prioq_free(d->earliest);
341 prioq_free(d->latest);
342 }
343
344 static sd_event *event_free(sd_event *e) {
345 sd_event_source *s;
346
347 assert(e);
348
349 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
350 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
351
352 while ((s = e->sources)) {
353 assert(s->floating);
354 source_disconnect(s);
355 sd_event_source_unref(s);
356 }
357
358 assert(e->n_sources == 0);
359
360 if (e->default_event_ptr)
361 *(e->default_event_ptr) = NULL;
362
363 safe_close(e->epoll_fd);
364 safe_close(e->watchdog_fd);
365
366 free_clock_data(&e->realtime);
367 free_clock_data(&e->boottime);
368 free_clock_data(&e->monotonic);
369 free_clock_data(&e->realtime_alarm);
370 free_clock_data(&e->boottime_alarm);
371
372 prioq_free(e->pending);
373 prioq_free(e->prepare);
374 prioq_free(e->exit);
375
376 free(e->signal_sources);
377 hashmap_free(e->signal_data);
378
379 hashmap_free(e->inotify_data);
380
381 hashmap_free(e->child_sources);
382 set_free(e->post_sources);
383
384 free(e->event_queue);
385
386 return mfree(e);
387 }
388
389 _public_ int sd_event_new(sd_event** ret) {
390 sd_event *e;
391 int r;
392
393 assert_return(ret, -EINVAL);
394
395 e = new(sd_event, 1);
396 if (!e)
397 return -ENOMEM;
398
399 *e = (sd_event) {
400 .n_ref = 1,
401 .epoll_fd = -EBADF,
402 .watchdog_fd = -EBADF,
403 .realtime.wakeup = WAKEUP_CLOCK_DATA,
404 .realtime.fd = -EBADF,
405 .realtime.next = USEC_INFINITY,
406 .boottime.wakeup = WAKEUP_CLOCK_DATA,
407 .boottime.fd = -EBADF,
408 .boottime.next = USEC_INFINITY,
409 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
410 .monotonic.fd = -EBADF,
411 .monotonic.next = USEC_INFINITY,
412 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
413 .realtime_alarm.fd = -EBADF,
414 .realtime_alarm.next = USEC_INFINITY,
415 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
416 .boottime_alarm.fd = -EBADF,
417 .boottime_alarm.next = USEC_INFINITY,
418 .perturb = USEC_INFINITY,
419 .origin_id = origin_id_query(),
420 };
421
422 r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
423 if (r < 0)
424 goto fail;
425
426 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
427 if (e->epoll_fd < 0) {
428 r = -errno;
429 goto fail;
430 }
431
432 e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
433
434 if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
435 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
436 special_glyph(SPECIAL_GLYPH_ELLIPSIS));
437 e->profile_delays = true;
438 }
439
440 *ret = e;
441 return 0;
442
443 fail:
444 event_free(e);
445 return r;
446 }
447
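/* A minimal usage sketch of the allocation/run/cleanup lifecycle (illustrative only, not part of this
 * file's logic): a loop with a single one-shot timer that requests exit after five seconds. The handler
 * name "on_timeout", the helper "run_loop" and the timeout value are assumptions made up for the example.
 *
 *     #include <systemd/sd-event.h>
 *
 *     static int on_timeout(sd_event_source *s, uint64_t usec, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     static int run_loop(void) {
 *             sd_event *e = NULL;
 *             int r;
 *
 *             r = sd_event_new(&e);
 *             if (r < 0)
 *                     return r;
 *
 *             // 5000000 µs = 5s; passing NULL as source pointer makes the source "floating", owned by the loop
 *             r = sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC, 5000000, 0, on_timeout, NULL);
 *             if (r >= 0)
 *                     r = sd_event_loop(e);
 *
 *             sd_event_unref(e);
 *             return r;
 *     }
 */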
448 /* Define manually so we can add the origin check */
449 _public_ sd_event *sd_event_ref(sd_event *e) {
450 if (!e)
451 return NULL;
452 if (event_origin_changed(e))
453 return NULL;
454
455 e->n_ref++;
456
457 return e;
458 }
459
460 _public_ sd_event* sd_event_unref(sd_event *e) {
461 if (!e)
462 return NULL;
463 if (event_origin_changed(e))
464 return NULL;
465
466 assert(e->n_ref > 0);
467 if (--e->n_ref > 0)
468 return NULL;
469
470 return event_free(e);
471 }
472
473 #define PROTECT_EVENT(e) \
474 _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
475
476 _public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
477 if (s)
478 (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
479 return sd_event_source_unref(s);
480 }
481
482 static void source_io_unregister(sd_event_source *s) {
483 assert(s);
484 assert(s->type == SOURCE_IO);
485
486 if (event_origin_changed(s->event))
487 return;
488
489 if (!s->io.registered)
490 return;
491
492 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
493 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
494 strna(s->description), event_source_type_to_string(s->type));
495
496 s->io.registered = false;
497 }
498
499 static int source_io_register(
500 sd_event_source *s,
501 int enabled,
502 uint32_t events) {
503
504 assert(s);
505 assert(s->type == SOURCE_IO);
506 assert(enabled != SD_EVENT_OFF);
507
508 struct epoll_event ev = {
509 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
510 .data.ptr = s,
511 };
512
513 if (epoll_ctl(s->event->epoll_fd,
514 s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
515 s->io.fd, &ev) < 0)
516 return -errno;
517
518 s->io.registered = true;
519
520 return 0;
521 }
522
523 static void source_child_pidfd_unregister(sd_event_source *s) {
524 assert(s);
525 assert(s->type == SOURCE_CHILD);
526
527 if (event_origin_changed(s->event))
528 return;
529
530 if (!s->child.registered)
531 return;
532
533 if (EVENT_SOURCE_WATCH_PIDFD(s))
534 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
535 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
536 strna(s->description), event_source_type_to_string(s->type));
537
538 s->child.registered = false;
539 }
540
541 static int source_child_pidfd_register(sd_event_source *s, int enabled) {
542 assert(s);
543 assert(s->type == SOURCE_CHILD);
544 assert(enabled != SD_EVENT_OFF);
545
546 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
547 struct epoll_event ev = {
548 .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
549 .data.ptr = s,
550 };
551
552 if (epoll_ctl(s->event->epoll_fd,
553 s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
554 s->child.pidfd, &ev) < 0)
555 return -errno;
556 }
557
558 s->child.registered = true;
559 return 0;
560 }
561
562 static void source_memory_pressure_unregister(sd_event_source *s) {
563 assert(s);
564 assert(s->type == SOURCE_MEMORY_PRESSURE);
565
566 if (event_origin_changed(s->event))
567 return;
568
569 if (!s->memory_pressure.registered)
570 return;
571
572 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
573 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
574 strna(s->description), event_source_type_to_string(s->type));
575
576 s->memory_pressure.registered = false;
577 }
578
579 static int source_memory_pressure_register(sd_event_source *s, int enabled) {
580 assert(s);
581 assert(s->type == SOURCE_MEMORY_PRESSURE);
582 assert(enabled != SD_EVENT_OFF);
583
584 struct epoll_event ev = {
585 .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
586 (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
587 .data.ptr = s,
588 };
589
590 if (epoll_ctl(s->event->epoll_fd,
591 s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
592 s->memory_pressure.fd, &ev) < 0)
593 return -errno;
594
595 s->memory_pressure.registered = true;
596 return 0;
597 }
598
599 static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
600 assert(s);
601 assert(s->type == SOURCE_MEMORY_PRESSURE);
602
603 if (s->memory_pressure.in_write_list)
604 return;
605
606 LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
607 s->memory_pressure.in_write_list = true;
608 }
609
610 static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
611 assert(s);
612 assert(s->type == SOURCE_MEMORY_PRESSURE);
613
614 if (!s->memory_pressure.in_write_list)
615 return;
616
617 LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
618 s->memory_pressure.in_write_list = false;
619 }
620
621 static clockid_t event_source_type_to_clock(EventSourceType t) {
622
623 switch (t) {
624
625 case SOURCE_TIME_REALTIME:
626 return CLOCK_REALTIME;
627
628 case SOURCE_TIME_BOOTTIME:
629 return CLOCK_BOOTTIME;
630
631 case SOURCE_TIME_MONOTONIC:
632 return CLOCK_MONOTONIC;
633
634 case SOURCE_TIME_REALTIME_ALARM:
635 return CLOCK_REALTIME_ALARM;
636
637 case SOURCE_TIME_BOOTTIME_ALARM:
638 return CLOCK_BOOTTIME_ALARM;
639
640 default:
641 return (clockid_t) -1;
642 }
643 }
644
645 static EventSourceType clock_to_event_source_type(clockid_t clock) {
646
647 switch (clock) {
648
649 case CLOCK_REALTIME:
650 return SOURCE_TIME_REALTIME;
651
652 case CLOCK_BOOTTIME:
653 return SOURCE_TIME_BOOTTIME;
654
655 case CLOCK_MONOTONIC:
656 return SOURCE_TIME_MONOTONIC;
657
658 case CLOCK_REALTIME_ALARM:
659 return SOURCE_TIME_REALTIME_ALARM;
660
661 case CLOCK_BOOTTIME_ALARM:
662 return SOURCE_TIME_BOOTTIME_ALARM;
663
664 default:
665 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
666 }
667 }
668
669 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
670 assert(e);
671
672 switch (t) {
673
674 case SOURCE_TIME_REALTIME:
675 return &e->realtime;
676
677 case SOURCE_TIME_BOOTTIME:
678 return &e->boottime;
679
680 case SOURCE_TIME_MONOTONIC:
681 return &e->monotonic;
682
683 case SOURCE_TIME_REALTIME_ALARM:
684 return &e->realtime_alarm;
685
686 case SOURCE_TIME_BOOTTIME_ALARM:
687 return &e->boottime_alarm;
688
689 default:
690 return NULL;
691 }
692 }
693
694 static void event_free_signal_data(sd_event *e, struct signal_data *d) {
695 assert(e);
696
697 if (!d)
698 return;
699
700 hashmap_remove(e->signal_data, &d->priority);
701 safe_close(d->fd);
702 free(d);
703 }
704
705 static int event_make_signal_data(
706 sd_event *e,
707 int sig,
708 struct signal_data **ret) {
709
710 struct signal_data *d;
711 bool added = false;
712 sigset_t ss_copy;
713 int64_t priority;
714 int r;
715
716 assert(e);
717
718 if (event_origin_changed(e))
719 return -ECHILD;
720
721 if (e->signal_sources && e->signal_sources[sig])
722 priority = e->signal_sources[sig]->priority;
723 else
724 priority = SD_EVENT_PRIORITY_NORMAL;
725
726 d = hashmap_get(e->signal_data, &priority);
727 if (d) {
728 if (sigismember(&d->sigset, sig) > 0) {
729 if (ret)
730 *ret = d;
731 return 0;
732 }
733 } else {
734 d = new(struct signal_data, 1);
735 if (!d)
736 return -ENOMEM;
737
738 *d = (struct signal_data) {
739 .wakeup = WAKEUP_SIGNAL_DATA,
740 .fd = -EBADF,
741 .priority = priority,
742 };
743
744 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
745 if (r < 0) {
746 free(d);
747 return r;
748 }
749
750 added = true;
751 }
752
753 ss_copy = d->sigset;
754 assert_se(sigaddset(&ss_copy, sig) >= 0);
755
756 r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
757 &ss_copy,
758 SFD_NONBLOCK|SFD_CLOEXEC);
759 if (r < 0) {
760 r = -errno;
761 goto fail;
762 }
763
764 d->sigset = ss_copy;
765
766 if (d->fd >= 0) {
767 if (ret)
768 *ret = d;
769 return 0;
770 }
771
772 d->fd = fd_move_above_stdio(r);
773
774 struct epoll_event ev = {
775 .events = EPOLLIN,
776 .data.ptr = d,
777 };
778
779 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
780 r = -errno;
781 goto fail;
782 }
783
784 if (ret)
785 *ret = d;
786
787 return 0;
788
789 fail:
790 if (added)
791 event_free_signal_data(e, d);
792
793 return r;
794 }
795
796 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
797 assert(e);
798 assert(d);
799
800 /* Turns off the specified signal in the signal data
801 * object. If the signal mask of the object becomes empty that
802 * way, the object is removed altogether. */
803
804 if (sigismember(&d->sigset, sig) == 0)
805 return;
806
807 assert_se(sigdelset(&d->sigset, sig) >= 0);
808
809 if (sigisemptyset(&d->sigset)) {
810 /* If the mask is now all-zero we can get rid of the structure */
811 event_free_signal_data(e, d);
812 return;
813 }
814
815 if (event_origin_changed(e))
816 return;
817
818 assert(d->fd >= 0);
819
820 if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
821 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
822 }
823
824 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
825 struct signal_data *d;
826 static const int64_t zero_priority = 0;
827
828 assert(e);
829
830 /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
831 * and possibly drop the signalfd for it. */
832
833 if (sig == SIGCHLD &&
834 e->n_online_child_sources > 0)
835 return;
836
837 if (e->signal_sources &&
838 e->signal_sources[sig] &&
839 event_source_is_online(e->signal_sources[sig]))
840 return;
841
842 /*
843 * The specified signal might be enabled in three different queues:
844 *
845 * 1) the one that belongs to the priority passed (if it is non-NULL)
846 * 2) the one that belongs to the priority of the event source of the signal (if there is one)
847 * 3) the 0 priority (to cover the SIGCHLD case)
848 *
849 * Hence, let's remove it from all three here.
850 */
851
852 if (priority) {
853 d = hashmap_get(e->signal_data, priority);
854 if (d)
855 event_unmask_signal_data(e, d, sig);
856 }
857
858 if (e->signal_sources && e->signal_sources[sig]) {
859 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
860 if (d)
861 event_unmask_signal_data(e, d, sig);
862 }
863
864 d = hashmap_get(e->signal_data, &zero_priority);
865 if (d)
866 event_unmask_signal_data(e, d, sig);
867 }
868
869 static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
870 assert(s);
871
872 /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
873 * they are enabled/disabled or marked pending and such. */
874
875 if (s->pending)
876 prioq_reshuffle(s->event->pending, s, &s->pending_index);
877
878 if (s->prepare)
879 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
880 }
881
882 static void event_source_time_prioq_reshuffle(sd_event_source *s) {
883 struct clock_data *d;
884
885 assert(s);
886
887 /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
888 * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
889 * properly again. */
890
891 if (s->ratelimited)
892 d = &s->event->monotonic;
893 else if (EVENT_SOURCE_IS_TIME(s->type))
894 assert_se(d = event_get_clock_data(s->event, s->type));
895 else
896 return; /* no-op for an event source which is neither a timer nor ratelimited. */
897
898 prioq_reshuffle(d->earliest, s, &s->earliest_index);
899 prioq_reshuffle(d->latest, s, &s->latest_index);
900 d->needs_rearm = true;
901 }
902
903 static void event_source_time_prioq_remove(
904 sd_event_source *s,
905 struct clock_data *d) {
906
907 assert(s);
908 assert(d);
909
910 prioq_remove(d->earliest, s, &s->earliest_index);
911 prioq_remove(d->latest, s, &s->latest_index);
912 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
913 d->needs_rearm = true;
914 }
915
916 static void source_disconnect(sd_event_source *s) {
917 sd_event *event;
918 int r;
919
920 assert(s);
921
922 if (!s->event)
923 return;
924
925 assert(s->event->n_sources > 0);
926
927 switch (s->type) {
928
929 case SOURCE_IO:
930 if (s->io.fd >= 0)
931 source_io_unregister(s);
932
933 break;
934
935 case SOURCE_TIME_REALTIME:
936 case SOURCE_TIME_BOOTTIME:
937 case SOURCE_TIME_MONOTONIC:
938 case SOURCE_TIME_REALTIME_ALARM:
939 case SOURCE_TIME_BOOTTIME_ALARM:
940 /* Only remove this event source from the time prioq here if it is not ratelimited. If
941 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
942 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
943
944 if (!s->ratelimited) {
945 struct clock_data *d;
946 assert_se(d = event_get_clock_data(s->event, s->type));
947 event_source_time_prioq_remove(s, d);
948 }
949
950 break;
951
952 case SOURCE_SIGNAL:
953 if (s->signal.sig > 0) {
954
955 if (s->event->signal_sources)
956 s->event->signal_sources[s->signal.sig] = NULL;
957
958 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
959
960 if (s->signal.unblock) {
961 sigset_t new_ss;
962
963 if (sigemptyset(&new_ss) < 0)
964 log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
965 else if (sigaddset(&new_ss, s->signal.sig) < 0)
966 log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
967 else {
968 r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
969 if (r != 0)
970 log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
971 }
972 }
973 }
974
975 break;
976
977 case SOURCE_CHILD:
978 if (event_origin_changed(s->event))
979 s->child.process_owned = false;
980
981 if (s->child.pid > 0) {
982 if (event_source_is_online(s)) {
983 assert(s->event->n_online_child_sources > 0);
984 s->event->n_online_child_sources--;
985 }
986
987 (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
988 }
989
990 if (EVENT_SOURCE_WATCH_PIDFD(s))
991 source_child_pidfd_unregister(s);
992 else
993 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
994
995 break;
996
997 case SOURCE_DEFER:
998 /* nothing */
999 break;
1000
1001 case SOURCE_POST:
1002 set_remove(s->event->post_sources, s);
1003 break;
1004
1005 case SOURCE_EXIT:
1006 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
1007 break;
1008
1009 case SOURCE_INOTIFY: {
1010 struct inode_data *inode_data;
1011
1012 inode_data = s->inotify.inode_data;
1013 if (inode_data) {
1014 struct inotify_data *inotify_data;
1015 assert_se(inotify_data = inode_data->inotify_data);
1016
1017 /* Detach this event source from the inode object */
1018 LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
1019 s->inotify.inode_data = NULL;
1020
1021 if (s->pending) {
1022 assert(inotify_data->n_pending > 0);
1023 inotify_data->n_pending--;
1024 }
1025
1026 /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
1027 * continues to be watched. That's because inotify doesn't really have an API for that: we
1028 * can only change watch masks with access to the original inode either by fd or by path. But
1029 * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
1030 * continuously and keeping the mount busy which we can't really do. We could reconstruct the
1031 * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
1032 * there), but given the need for open_by_handle_at() which is privileged and not universally
1033 * available this would be quite an incomplete solution. Hence we go the other way, leave the
1034 * mask set, even if it is not minimized now, and ignore all events we aren't interested in
1035 * anymore after reception. Yes, this sucks, but … Linux … */
1036
1037 /* Maybe release the inode data (and its inotify) */
1038 event_gc_inode_data(s->event, inode_data);
1039 }
1040
1041 break;
1042 }
1043
1044 case SOURCE_MEMORY_PRESSURE:
1045 source_memory_pressure_remove_from_write_list(s);
1046 source_memory_pressure_unregister(s);
1047 break;
1048
1049 default:
1050 assert_not_reached();
1051 }
1052
1053 if (s->pending)
1054 prioq_remove(s->event->pending, s, &s->pending_index);
1055
1056 if (s->prepare)
1057 prioq_remove(s->event->prepare, s, &s->prepare_index);
1058
1059 if (s->ratelimited)
1060 event_source_time_prioq_remove(s, &s->event->monotonic);
1061
1062 event = TAKE_PTR(s->event);
1063 LIST_REMOVE(sources, event->sources, s);
1064 event->n_sources--;
1065
1066 /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
1067 * pidfd associated with this event source, which we'll do only on source_free(). */
1068
1069 if (!s->floating)
1070 sd_event_unref(event);
1071 }
1072
1073 static sd_event_source* source_free(sd_event_source *s) {
1074 assert(s);
1075
1076 source_disconnect(s);
1077
1078 if (s->type == SOURCE_IO && s->io.owned)
1079 s->io.fd = safe_close(s->io.fd);
1080
1081 if (s->type == SOURCE_CHILD) {
1082 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
1083
1084 if (s->child.process_owned) {
1085
1086 if (!s->child.exited) {
1087 bool sent = false;
1088
1089 if (s->child.pidfd >= 0) {
1090 if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
1091 if (errno == ESRCH) /* Already dead */
1092 sent = true;
1093 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
1094 log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
1095 s->child.pid);
1096 } else
1097 sent = true;
1098 }
1099
1100 if (!sent)
1101 if (kill(s->child.pid, SIGKILL) < 0)
1102 if (errno != ESRCH) /* Already dead */
1103 log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
1104 s->child.pid);
1105 }
1106
1107 if (!s->child.waited) {
1108 siginfo_t si = {};
1109
1110 /* Reap the child if we can */
1111 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
1112 }
1113 }
1114
1115 if (s->child.pidfd_owned)
1116 s->child.pidfd = safe_close(s->child.pidfd);
1117 }
1118
1119 if (s->type == SOURCE_MEMORY_PRESSURE) {
1120 s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
1121 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
1122 }
1123
1124 if (s->destroy_callback)
1125 s->destroy_callback(s->userdata);
1126
1127 free(s->description);
1128 return mfree(s);
1129 }
1130 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
1131
1132 static int source_set_pending(sd_event_source *s, bool b) {
1133 int r;
1134
1135 assert(s);
1136 assert(s->type != SOURCE_EXIT);
1137
1138 if (s->pending == b)
1139 return 0;
1140
1141 s->pending = b;
1142
1143 if (b) {
1144 s->pending_iteration = s->event->iteration;
1145
1146 r = prioq_put(s->event->pending, s, &s->pending_index);
1147 if (r < 0) {
1148 s->pending = false;
1149 return r;
1150 }
1151 } else
1152 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1153
1154 if (EVENT_SOURCE_IS_TIME(s->type))
1155 event_source_time_prioq_reshuffle(s);
1156
1157 if (s->type == SOURCE_SIGNAL && !b) {
1158 struct signal_data *d;
1159
1160 d = hashmap_get(s->event->signal_data, &s->priority);
1161 if (d && d->current == s)
1162 d->current = NULL;
1163 }
1164
1165 if (s->type == SOURCE_INOTIFY) {
1166
1167 assert(s->inotify.inode_data);
1168 assert(s->inotify.inode_data->inotify_data);
1169
1170 if (b)
1171 s->inotify.inode_data->inotify_data->n_pending++;
1172 else {
1173 assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1174 s->inotify.inode_data->inotify_data->n_pending--;
1175 }
1176 }
1177
1178 return 1;
1179 }
1180
1181 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1182
1183 /* Let's allocate exactly what we need. Note that the difference between the smallest and the largest
1184 * event source structure is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
1185 * lines. */
1186 static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
1187 [SOURCE_IO] = endoffsetof_field(sd_event_source, io),
1188 [SOURCE_TIME_REALTIME] = endoffsetof_field(sd_event_source, time),
1189 [SOURCE_TIME_BOOTTIME] = endoffsetof_field(sd_event_source, time),
1190 [SOURCE_TIME_MONOTONIC] = endoffsetof_field(sd_event_source, time),
1191 [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1192 [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1193 [SOURCE_SIGNAL] = endoffsetof_field(sd_event_source, signal),
1194 [SOURCE_CHILD] = endoffsetof_field(sd_event_source, child),
1195 [SOURCE_DEFER] = endoffsetof_field(sd_event_source, defer),
1196 [SOURCE_POST] = endoffsetof_field(sd_event_source, post),
1197 [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit),
1198 [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify),
1199 [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure),
1200 };
1201
1202 sd_event_source *s;
1203
1204 assert(e);
1205 assert(type >= 0);
1206 assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
1207 assert(size_table[type] > 0);
1208
1209 s = malloc0(size_table[type]);
1210 if (!s)
1211 return NULL;
1212 /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
1213 * size, even if we only allocate the initial part we need. */
1214 s = expand_to_usable(s, sizeof(sd_event_source));
1215
1216 /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
1217 * than what we allocated here. */
1218 s->n_ref = 1;
1219 s->event = e;
1220 s->floating = floating;
1221 s->type = type;
1222 s->pending_index = PRIOQ_IDX_NULL;
1223 s->prepare_index = PRIOQ_IDX_NULL;
1224
1225 if (!floating)
1226 sd_event_ref(e);
1227
1228 LIST_PREPEND(sources, e->sources, s);
1229 e->n_sources++;
1230
1231 return s;
1232 }
1233
1234 static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1235 assert(s);
1236
1237 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1238 }
1239
1240 _public_ int sd_event_add_io(
1241 sd_event *e,
1242 sd_event_source **ret,
1243 int fd,
1244 uint32_t events,
1245 sd_event_io_handler_t callback,
1246 void *userdata) {
1247
1248 _cleanup_(source_freep) sd_event_source *s = NULL;
1249 int r;
1250
1251 assert_return(e, -EINVAL);
1252 assert_return(e = event_resolve(e), -ENOPKG);
1253 assert_return(fd >= 0, -EBADF);
1254 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1255 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1256 assert_return(!event_origin_changed(e), -ECHILD);
1257
1258 if (!callback)
1259 callback = io_exit_callback;
1260
1261 s = source_new(e, !ret, SOURCE_IO);
1262 if (!s)
1263 return -ENOMEM;
1264
1265 s->wakeup = WAKEUP_EVENT_SOURCE;
1266 s->io.fd = fd;
1267 s->io.events = events;
1268 s->io.callback = callback;
1269 s->userdata = userdata;
1270 s->enabled = SD_EVENT_ON;
1271
1272 r = source_io_register(s, s->enabled, events);
1273 if (r < 0)
1274 return r;
1275
1276 if (ret)
1277 *ret = s;
1278 TAKE_PTR(s);
1279
1280 return 0;
1281 }
1282
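/* A hedged usage sketch for an IO event source (illustrative only; "on_readable", "io_source" and
 * "conn_fd" are names invented for the example): watch a non-blocking fd for input and disable the
 * source on EOF or error. Edge-triggered (EPOLLET) operation would additionally require draining the
 * fd completely in the handler.
 *
 *     static int on_readable(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             char buf[4096];
 *             ssize_t n;
 *
 *             if (revents & (EPOLLHUP|EPOLLERR))
 *                     return sd_event_source_set_enabled(s, SD_EVENT_OFF);
 *
 *             n = read(fd, buf, sizeof(buf));
 *             if (n < 0)
 *                     return errno == EAGAIN ? 0 : -errno;
 *             if (n == 0) // EOF
 *                     return sd_event_source_set_enabled(s, SD_EVENT_OFF);
 *
 *             // ... process the n bytes read ...
 *             return 0;
 *     }
 *
 *     // registration, e.g. somewhere during setup:
 *     //     r = sd_event_add_io(e, &io_source, conn_fd, EPOLLIN, on_readable, NULL);
 */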
1283 static void initialize_perturb(sd_event *e) {
1284 sd_id128_t id = {};
1285
1286 /* When we sleep for longer, we try to realign the wakeup to the same time within each
1287 * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
1288 * wakeup. However, let's take some system-specific randomness for this value, so that in a network
1289 * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
1290 * perturbation usec offset from the boot ID (or the machine ID if that fails, e.g. because /proc is not mounted). */
1291
1292 if (_likely_(e->perturb != USEC_INFINITY))
1293 return;
1294
1295 if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
1296 e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
1297 else
1298 e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
1299 }
1300
1301 static int event_setup_timer_fd(
1302 sd_event *e,
1303 struct clock_data *d,
1304 clockid_t clock) {
1305
1306 assert(e);
1307 assert(d);
1308
1309 if (_likely_(d->fd >= 0))
1310 return 0;
1311
1312 _cleanup_close_ int fd = -EBADF;
1313
1314 fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1315 if (fd < 0)
1316 return -errno;
1317
1318 fd = fd_move_above_stdio(fd);
1319
1320 struct epoll_event ev = {
1321 .events = EPOLLIN,
1322 .data.ptr = d,
1323 };
1324
1325 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1326 return -errno;
1327
1328 d->fd = TAKE_FD(fd);
1329 return 0;
1330 }
1331
1332 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1333 assert(s);
1334
1335 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1336 }
1337
1338 static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1339 int r;
1340
1341 assert(d);
1342
1343 if (d->fd < 0) {
1344 r = event_setup_timer_fd(e, d, clock);
1345 if (r < 0)
1346 return r;
1347 }
1348
1349 r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1350 if (r < 0)
1351 return r;
1352
1353 r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1354 if (r < 0)
1355 return r;
1356
1357 return 0;
1358 }
1359
1360 static int event_source_time_prioq_put(
1361 sd_event_source *s,
1362 struct clock_data *d) {
1363
1364 int r;
1365
1366 assert(s);
1367 assert(d);
1368 assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1369
1370 r = prioq_put(d->earliest, s, &s->earliest_index);
1371 if (r < 0)
1372 return r;
1373
1374 r = prioq_put(d->latest, s, &s->latest_index);
1375 if (r < 0) {
1376 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1377 s->earliest_index = PRIOQ_IDX_NULL;
1378 return r;
1379 }
1380
1381 d->needs_rearm = true;
1382 return 0;
1383 }
1384
1385 _public_ int sd_event_add_time(
1386 sd_event *e,
1387 sd_event_source **ret,
1388 clockid_t clock,
1389 uint64_t usec,
1390 uint64_t accuracy,
1391 sd_event_time_handler_t callback,
1392 void *userdata) {
1393
1394 EventSourceType type;
1395 _cleanup_(source_freep) sd_event_source *s = NULL;
1396 struct clock_data *d;
1397 int r;
1398
1399 assert_return(e, -EINVAL);
1400 assert_return(e = event_resolve(e), -ENOPKG);
1401 assert_return(accuracy != UINT64_MAX, -EINVAL);
1402 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1403 assert_return(!event_origin_changed(e), -ECHILD);
1404
1405 if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1406 return -EOPNOTSUPP;
1407
1408 type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1409 if (type < 0)
1410 return -EOPNOTSUPP;
1411
1412 if (!callback)
1413 callback = time_exit_callback;
1414
1415 assert_se(d = event_get_clock_data(e, type));
1416
1417 r = setup_clock_data(e, d, clock);
1418 if (r < 0)
1419 return r;
1420
1421 s = source_new(e, !ret, type);
1422 if (!s)
1423 return -ENOMEM;
1424
1425 s->time.next = usec;
1426 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1427 s->time.callback = callback;
1428 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1429 s->userdata = userdata;
1430 s->enabled = SD_EVENT_ONESHOT;
1431
1432 r = event_source_time_prioq_put(s, d);
1433 if (r < 0)
1434 return r;
1435
1436 if (ret)
1437 *ret = s;
1438 TAKE_PTR(s);
1439
1440 return 0;
1441 }
1442
1443 _public_ int sd_event_add_time_relative(
1444 sd_event *e,
1445 sd_event_source **ret,
1446 clockid_t clock,
1447 uint64_t usec,
1448 uint64_t accuracy,
1449 sd_event_time_handler_t callback,
1450 void *userdata) {
1451
1452 usec_t t;
1453 int r;
1454
1455 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1456 * checks for overflow. */
1457
1458 r = sd_event_now(e, clock, &t);
1459 if (r < 0)
1460 return r;
1461
1462 if (usec >= USEC_INFINITY - t)
1463 return -EOVERFLOW;
1464
1465 return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1466 }
1467
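/* A hedged sketch of a periodic timer built on top of this call (illustrative only; the 1s interval
 * and the handler name "on_tick" are assumptions): since time event sources default to
 * SD_EVENT_ONESHOT, a repeating timer simply re-arms itself from its own handler.
 *
 *     static int on_tick(sd_event_source *s, uint64_t usec, void *userdata) {
 *             int r;
 *
 *             // ... periodic work ...
 *
 *             r = sd_event_source_set_time_relative(s, 1000000); // fire again in 1s
 *             if (r < 0)
 *                     return r;
 *
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
 *     }
 *
 *     // initial arming:
 *     //     r = sd_event_add_time_relative(e, &timer_source, CLOCK_MONOTONIC, 1000000, 0, on_tick, NULL);
 */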
1468 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1469 assert(s);
1470
1471 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1472 }
1473
1474 _public_ int sd_event_add_signal(
1475 sd_event *e,
1476 sd_event_source **ret,
1477 int sig,
1478 sd_event_signal_handler_t callback,
1479 void *userdata) {
1480
1481 _cleanup_(source_freep) sd_event_source *s = NULL;
1482 struct signal_data *d;
1483 sigset_t new_ss;
1484 bool block_it;
1485 int r;
1486
1487 assert_return(e, -EINVAL);
1488 assert_return(e = event_resolve(e), -ENOPKG);
1489 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1490 assert_return(!event_origin_changed(e), -ECHILD);
1491
1492 /* Let's make sure our special flag stays outside of the valid signal range */
1493 assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);
1494
1495 if (sig & SD_EVENT_SIGNAL_PROCMASK) {
1496 sig &= ~SD_EVENT_SIGNAL_PROCMASK;
1497 assert_return(SIGNAL_VALID(sig), -EINVAL);
1498
1499 block_it = true;
1500 } else {
1501 assert_return(SIGNAL_VALID(sig), -EINVAL);
1502
1503 r = signal_is_blocked(sig);
1504 if (r < 0)
1505 return r;
1506 if (r == 0)
1507 return -EBUSY;
1508
1509 block_it = false;
1510 }
1511
1512 if (!callback)
1513 callback = signal_exit_callback;
1514
1515 if (!e->signal_sources) {
1516 e->signal_sources = new0(sd_event_source*, _NSIG);
1517 if (!e->signal_sources)
1518 return -ENOMEM;
1519 } else if (e->signal_sources[sig])
1520 return -EBUSY;
1521
1522 s = source_new(e, !ret, SOURCE_SIGNAL);
1523 if (!s)
1524 return -ENOMEM;
1525
1526 s->signal.sig = sig;
1527 s->signal.callback = callback;
1528 s->userdata = userdata;
1529 s->enabled = SD_EVENT_ON;
1530
1531 e->signal_sources[sig] = s;
1532
1533 if (block_it) {
1534 sigset_t old_ss;
1535
1536 if (sigemptyset(&new_ss) < 0)
1537 return -errno;
1538
1539 if (sigaddset(&new_ss, sig) < 0)
1540 return -errno;
1541
1542 r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
1543 if (r != 0)
1544 return -r;
1545
1546 r = sigismember(&old_ss, sig);
1547 if (r < 0)
1548 return -errno;
1549
1550 s->signal.unblock = !r;
1551 } else
1552 s->signal.unblock = false;
1553
1554 r = event_make_signal_data(e, sig, &d);
1555 if (r < 0) {
1556 if (s->signal.unblock)
1557 (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
1558
1559 return r;
1560 }
1561
1562 /* Use the signal name as description for the event source by default */
1563 (void) sd_event_source_set_description(s, signal_to_string(sig));
1564
1565 if (ret)
1566 *ret = s;
1567 TAKE_PTR(s);
1568
1569 return 0;
1570 }
1571
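/* A hedged usage sketch (illustrative only): with the SD_EVENT_SIGNAL_PROCMASK flag the event loop
 * blocks the signal by itself, so no explicit sigprocmask() call is needed; and since a NULL callback
 * falls back to signal_exit_callback() above, this single call is enough to make SIGTERM terminate the
 * loop cleanly:
 *
 *     r = sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
 *     if (r < 0)
 *             return r;
 */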
1572 static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1573 assert(s);
1574
1575 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1576 }
1577
1578 static bool shall_use_pidfd(void) {
1579 /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
1580 return secure_getenv_bool("SYSTEMD_PIDFD") != 0;
1581 }
1582
1583 _public_ int sd_event_add_child(
1584 sd_event *e,
1585 sd_event_source **ret,
1586 pid_t pid,
1587 int options,
1588 sd_event_child_handler_t callback,
1589 void *userdata) {
1590
1591 _cleanup_(source_freep) sd_event_source *s = NULL;
1592 int r;
1593
1594 assert_return(e, -EINVAL);
1595 assert_return(e = event_resolve(e), -ENOPKG);
1596 assert_return(pid > 1, -EINVAL);
1597 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1598 assert_return(options != 0, -EINVAL);
1599 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1600 assert_return(!event_origin_changed(e), -ECHILD);
1601
1602 if (!callback)
1603 callback = child_exit_callback;
1604
1605 if (e->n_online_child_sources == 0) {
1606 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1607 * for compatibility with pre-pidfd and because we don't want to reap the child processes
1608 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1609 * take effect.
1610 *
1611 * (As an optimization we only do this check on the first child event source created.) */
1612 r = signal_is_blocked(SIGCHLD);
1613 if (r < 0)
1614 return r;
1615 if (r == 0)
1616 return -EBUSY;
1617 }
1618
1619 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1620 if (r < 0)
1621 return r;
1622
1623 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1624 return -EBUSY;
1625
1626 s = source_new(e, !ret, SOURCE_CHILD);
1627 if (!s)
1628 return -ENOMEM;
1629
1630 s->wakeup = WAKEUP_EVENT_SOURCE;
1631 s->child.options = options;
1632 s->child.callback = callback;
1633 s->userdata = userdata;
1634 s->enabled = SD_EVENT_ONESHOT;
1635
1636 /* We always take a pidfd here if we can, even if we wait for something other than WEXITED, so that we
1637 * pin the PID, and make regular waitid() handling race-free. */
1638
1639 if (shall_use_pidfd()) {
1640 s->child.pidfd = pidfd_open(pid, 0);
1641 if (s->child.pidfd < 0) {
1642 /* Propagate errors unless the syscall is not supported or blocked */
1643 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
1644 return -errno;
1645 } else
1646 s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
1647 } else
1648 s->child.pidfd = -EBADF;
1649
1650 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1651 /* We have a pidfd and we only want to watch for exit */
1652 r = source_child_pidfd_register(s, s->enabled);
1653 if (r < 0)
1654 return r;
1655
1656 } else {
1657 /* We have no pidfd or we shall wait for some event other than WEXITED */
1658 r = event_make_signal_data(e, SIGCHLD, NULL);
1659 if (r < 0)
1660 return r;
1661
1662 e->need_process_child = true;
1663 }
1664
1665 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1666 if (r < 0)
1667 return r;
1668
1669 /* These must be done after everything succeeds. */
1670 s->child.pid = pid;
1671 e->n_online_child_sources++;
1672
1673 if (ret)
1674 *ret = s;
1675 TAKE_PTR(s);
1676 return 0;
1677 }
1678
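/* A hedged usage sketch (illustrative only; "on_child_exit" and the fork() placement are assumptions
 * made for the example): as required above, SIGCHLD must be blocked in the calling thread before the
 * child source is added, even when a pidfd is used internally.
 *
 *     static int on_child_exit(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *             log_debug("Child " PID_FMT " exited with status %i.", si->si_pid, si->si_status);
 *             return 0;
 *     }
 *
 *     ...
 *     sigset_t ss;
 *     pid_t pid;
 *
 *     assert_se(sigemptyset(&ss) >= 0);
 *     assert_se(sigaddset(&ss, SIGCHLD) >= 0);
 *     assert_se(sigprocmask(SIG_BLOCK, &ss, NULL) >= 0);
 *
 *     pid = fork();
 *     if (pid < 0)
 *             return -errno;
 *     if (pid == 0) {
 *             // child: exec or do the work, then exit
 *             _exit(EXIT_SUCCESS);
 *     }
 *
 *     r = sd_event_add_child(e, NULL, pid, WEXITED, on_child_exit, NULL);
 *     if (r < 0)
 *             return r;
 */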
1679 _public_ int sd_event_add_child_pidfd(
1680 sd_event *e,
1681 sd_event_source **ret,
1682 int pidfd,
1683 int options,
1684 sd_event_child_handler_t callback,
1685 void *userdata) {
1686
1687
1688 _cleanup_(source_freep) sd_event_source *s = NULL;
1689 pid_t pid;
1690 int r;
1691
1692 assert_return(e, -EINVAL);
1693 assert_return(e = event_resolve(e), -ENOPKG);
1694 assert_return(pidfd >= 0, -EBADF);
1695 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1696 assert_return(options != 0, -EINVAL);
1697 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1698 assert_return(!event_origin_changed(e), -ECHILD);
1699
1700 if (!callback)
1701 callback = child_exit_callback;
1702
1703 if (e->n_online_child_sources == 0) {
1704 r = signal_is_blocked(SIGCHLD);
1705 if (r < 0)
1706 return r;
1707 if (r == 0)
1708 return -EBUSY;
1709 }
1710
1711 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1712 if (r < 0)
1713 return r;
1714
1715 r = pidfd_get_pid(pidfd, &pid);
1716 if (r < 0)
1717 return r;
1718
1719 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1720 return -EBUSY;
1721
1722 s = source_new(e, !ret, SOURCE_CHILD);
1723 if (!s)
1724 return -ENOMEM;
1725
1726 s->wakeup = WAKEUP_EVENT_SOURCE;
1727 s->child.pidfd = pidfd;
1728 s->child.pid = pid;
1729 s->child.options = options;
1730 s->child.callback = callback;
1731 s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1732 s->userdata = userdata;
1733 s->enabled = SD_EVENT_ONESHOT;
1734
1735 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1736 if (r < 0)
1737 return r;
1738
1739 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1740 /* We only want to watch for WEXITED */
1741 r = source_child_pidfd_register(s, s->enabled);
1742 if (r < 0)
1743 return r;
1744 } else {
1745 /* We shall wait for some event other than WEXITED */
1746 r = event_make_signal_data(e, SIGCHLD, NULL);
1747 if (r < 0)
1748 return r;
1749
1750 e->need_process_child = true;
1751 }
1752
1753 e->n_online_child_sources++;
1754
1755 if (ret)
1756 *ret = s;
1757 TAKE_PTR(s);
1758 return 0;
1759 }
1760
1761 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1762 assert(s);
1763
1764 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1765 }
1766
1767 _public_ int sd_event_add_defer(
1768 sd_event *e,
1769 sd_event_source **ret,
1770 sd_event_handler_t callback,
1771 void *userdata) {
1772
1773 _cleanup_(source_freep) sd_event_source *s = NULL;
1774 int r;
1775
1776 assert_return(e, -EINVAL);
1777 assert_return(e = event_resolve(e), -ENOPKG);
1778 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1779 assert_return(!event_origin_changed(e), -ECHILD);
1780
1781 if (!callback)
1782 callback = generic_exit_callback;
1783
1784 s = source_new(e, !ret, SOURCE_DEFER);
1785 if (!s)
1786 return -ENOMEM;
1787
1788 s->defer.callback = callback;
1789 s->userdata = userdata;
1790 s->enabled = SD_EVENT_ONESHOT;
1791
1792 r = source_set_pending(s, true);
1793 if (r < 0)
1794 return r;
1795
1796 if (ret)
1797 *ret = s;
1798 TAKE_PTR(s);
1799
1800 return 0;
1801 }
1802
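/* A hedged usage sketch (illustrative only; "on_idle" is an invented name): a defer source created
 * this way starts out pending and SD_EVENT_ONESHOT, i.e. its callback runs once during the next event
 * loop iteration and the source is then disabled again. Re-enable it with
 * sd_event_source_set_enabled(s, SD_EVENT_ONESHOT) to schedule another run.
 *
 *     static int on_idle(sd_event_source *s, void *userdata) {
 *             // ... work deferred out of the current dispatch ...
 *             return 0;
 *     }
 *
 *     r = sd_event_add_defer(e, NULL, on_idle, NULL);
 *     if (r < 0)
 *             return r;
 */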
1803 _public_ int sd_event_add_post(
1804 sd_event *e,
1805 sd_event_source **ret,
1806 sd_event_handler_t callback,
1807 void *userdata) {
1808
1809 _cleanup_(source_freep) sd_event_source *s = NULL;
1810 int r;
1811
1812 assert_return(e, -EINVAL);
1813 assert_return(e = event_resolve(e), -ENOPKG);
1814 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1815 assert_return(!event_origin_changed(e), -ECHILD);
1816
1817 if (!callback)
1818 callback = generic_exit_callback;
1819
1820 s = source_new(e, !ret, SOURCE_POST);
1821 if (!s)
1822 return -ENOMEM;
1823
1824 s->post.callback = callback;
1825 s->userdata = userdata;
1826 s->enabled = SD_EVENT_ON;
1827
1828 r = set_ensure_put(&e->post_sources, NULL, s);
1829 if (r < 0)
1830 return r;
1831 assert(r > 0);
1832
1833 if (ret)
1834 *ret = s;
1835 TAKE_PTR(s);
1836
1837 return 0;
1838 }
1839
1840 _public_ int sd_event_add_exit(
1841 sd_event *e,
1842 sd_event_source **ret,
1843 sd_event_handler_t callback,
1844 void *userdata) {
1845
1846 _cleanup_(source_freep) sd_event_source *s = NULL;
1847 int r;
1848
1849 assert_return(e, -EINVAL);
1850 assert_return(e = event_resolve(e), -ENOPKG);
1851 assert_return(callback, -EINVAL);
1852 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1853 assert_return(!event_origin_changed(e), -ECHILD);
1854
1855 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1856 if (r < 0)
1857 return r;
1858
1859 s = source_new(e, !ret, SOURCE_EXIT);
1860 if (!s)
1861 return -ENOMEM;
1862
1863 s->exit.callback = callback;
1864 s->userdata = userdata;
1865 s->exit.prioq_index = PRIOQ_IDX_NULL;
1866 s->enabled = SD_EVENT_ONESHOT;
1867
1868 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1869 if (r < 0)
1870 return r;
1871
1872 if (ret)
1873 *ret = s;
1874 TAKE_PTR(s);
1875
1876 return 0;
1877 }
1878
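/* A hedged usage sketch (illustrative only; "on_exit_cleanup" is an invented name): exit sources are
 * dispatched only once sd_event_exit() has been called, in priority order, which makes them a
 * convenient place for teardown work that should happen before sd_event_loop() returns.
 *
 *     static int on_exit_cleanup(sd_event_source *s, void *userdata) {
 *             // ... flush state, close connections, etc. ...
 *             return 0;
 *     }
 *
 *     r = sd_event_add_exit(e, NULL, on_exit_cleanup, NULL);
 *     if (r < 0)
 *             return r;
 */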
1879 _public_ int sd_event_trim_memory(void) {
1880 int r;
1881
1882 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1883 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1884 * NULL callback parameter. */
1885
1886 log_debug("Memory pressure event, trimming malloc() memory.");
1887
1888 #if HAVE_GENERIC_MALLINFO
1889 generic_mallinfo before_mallinfo = generic_mallinfo_get();
1890 #endif
1891
1892 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1893 hashmap_trim_pools();
1894 r = malloc_trim(0);
1895 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1896
1897 if (r > 0)
1898 log_debug("Successfully trimmed some memory.");
1899 else
1900 log_debug("Couldn't trim any memory.");
1901
1902 usec_t period = after_timestamp - before_timestamp;
1903
1904 #if HAVE_GENERIC_MALLINFO
1905 generic_mallinfo after_mallinfo = generic_mallinfo_get();
1906 size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1907 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1908 log_struct(LOG_DEBUG,
1909 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1910 FORMAT_TIMESPAN(period, 0),
1911 FORMAT_BYTES(l)),
1912 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1913 "TRIMMED_BYTES=%zu", l,
1914 "TRIMMED_USEC=" USEC_FMT, period);
1915 #else
1916 log_struct(LOG_DEBUG,
1917 LOG_MESSAGE("Memory trimming took %s.",
1918 FORMAT_TIMESPAN(period, 0)),
1919 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1920 "TRIMMED_USEC=" USEC_FMT, period);
1921 #endif
1922
1923 return 0;
1924 }
1925
1926 static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1927 assert(s);
1928
1929 sd_event_trim_memory();
1930 return 0;
1931 }
1932
1933 _public_ int sd_event_add_memory_pressure(
1934 sd_event *e,
1935 sd_event_source **ret,
1936 sd_event_handler_t callback,
1937 void *userdata) {
1938
1939 _cleanup_free_ char *w = NULL;
1940 _cleanup_(source_freep) sd_event_source *s = NULL;
1941 _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
1942 _cleanup_free_ void *write_buffer = NULL;
1943 const char *watch, *watch_fallback = NULL, *env;
1944 size_t write_buffer_size = 0;
1945 struct stat st;
1946 uint32_t events;
1947 bool locked;
1948 int r;
1949
1950 assert_return(e, -EINVAL);
1951 assert_return(e = event_resolve(e), -ENOPKG);
1952 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1953 assert_return(!event_origin_changed(e), -ECHILD);
1954
1955 if (!callback)
1956 callback = memory_pressure_callback;
1957
1958 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1959 if (!s)
1960 return -ENOMEM;
1961
1962 s->wakeup = WAKEUP_EVENT_SOURCE;
1963 s->memory_pressure.callback = callback;
1964 s->userdata = userdata;
1965 s->enabled = SD_EVENT_ON;
1966 s->memory_pressure.fd = -EBADF;
1967
1968 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1969 if (env) {
1970 if (isempty(env) || path_equal(env, "/dev/null"))
1971 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1972 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1973
1974 if (!path_is_absolute(env) || !path_is_normalized(env))
1975 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1976 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1977
1978 watch = env;
1979
1980 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1981 if (env) {
1982 r = unbase64mem(env, &write_buffer, &write_buffer_size);
1983 if (r < 0)
1984 return r;
1985 }
1986
1987 locked = true;
1988 } else {
1989
1990 r = is_pressure_supported();
1991 if (r < 0)
1992 return r;
1993 if (r == 0)
1994 return -EOPNOTSUPP;
1995
1996 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1997 * the system wide pressure if for some reason we cannot (which could be: memory controller
1998 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1999 * only use the system-wide logic. */
2000 r = cg_all_unified();
2001 if (r < 0)
2002 return r;
2003 if (r == 0)
2004 watch = "/proc/pressure/memory";
2005 else {
2006 _cleanup_free_ char *cg = NULL;
2007
2008 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
2009 if (r < 0)
2010 return r;
2011
2012 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
2013 if (!w)
2014 return -ENOMEM;
2015
2016 watch = w;
2017 watch_fallback = "/proc/pressure/memory";
2018 }
2019
2020 /* Android uses three levels in its userspace low memory killer logic:
2021 * some 70000 1000000
2022 * some 100000 1000000
2023 * full 70000 1000000
2024 *
2025 * GNOME's low memory monitor uses:
2026 * some 70000 1000000
2027 * some 100000 1000000
2028 * full 100000 1000000
2029 *
2030 * We'll default to the middle level that both agree on. Except we do it on a 2s window
2031                  * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration the
2032                  * kernel allows unprivileged processes to use, also in the future. */
2033 if (asprintf((char**) &write_buffer,
2034 "%s " USEC_FMT " " USEC_FMT,
2035 MEMORY_PRESSURE_DEFAULT_TYPE,
2036 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2037 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2038 return -ENOMEM;
2039
2040 write_buffer_size = strlen(write_buffer) + 1;
2041 locked = false;
2042 }
2043
2044 path_fd = open(watch, O_PATH|O_CLOEXEC);
2045 if (path_fd < 0) {
2046 if (errno != ENOENT)
2047 return -errno;
2048
2049 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2050 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2051 * the PSI service apparently is not supported) */
2052 if (!watch_fallback)
2053 return locked ? -ENOENT : -EOPNOTSUPP;
2054
2055 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
2056 if (path_fd < 0) {
2057 if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2058 return -EOPNOTSUPP;
2059 return -errno;
2060 }
2061 }
2062
2063 if (fstat(path_fd, &st) < 0)
2064 return -errno;
2065
2066 if (S_ISSOCK(st.st_mode)) {
2067 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2068 if (fd < 0)
2069 return -errno;
2070
2071 r = connect_unix_path(fd, path_fd, NULL);
2072 if (r < 0)
2073 return r;
2074
2075 events = EPOLLIN;
2076
2077 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2078 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2079 if (fd < 0)
2080 return fd;
2081
2082 if (S_ISREG(st.st_mode)) {
2083 struct statfs sfs;
2084
2085 /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
2086
2087 if (fstatfs(fd, &sfs) < 0)
2088 return -errno;
2089
2090 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2091 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2092 return -ENOTTY;
2093
2094 events = EPOLLPRI;
2095 } else
2096 /* For fifos and char devices just watch for EPOLLIN */
2097 events = EPOLLIN;
2098
2099 } else if (S_ISDIR(st.st_mode))
2100 return -EISDIR;
2101 else
2102 return -EBADF;
2103
2104 s->memory_pressure.fd = TAKE_FD(fd);
2105 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2106 s->memory_pressure.write_buffer_size = write_buffer_size;
2107 s->memory_pressure.events = events;
2108 s->memory_pressure.locked = locked;
2109
2110 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2111 * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2112 * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
2113 * event sources on which writes must be executed before the first event loop iteration is
2114 * executed. (We could also write the data here, right away, but we want to give the caller the
2115 * freedom to call sd_event_source_set_memory_pressure_type() and
2116          * sd_event_source_set_memory_pressure_rate() before we write it.) */
2117
2118 if (s->memory_pressure.write_buffer_size > 0)
2119 source_memory_pressure_add_to_write_list(s);
2120 else {
2121 r = source_memory_pressure_register(s, s->enabled);
2122 if (r < 0)
2123 return r;
2124 }
2125
2126 if (ret)
2127 *ret = s;
2128 TAKE_PTR(s);
2129
2130 return 0;
2131 }
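
/* Illustrative usage sketch (not part of this file's logic): a caller that is happy with the default
 * behaviour, i.e. trimming allocator caches under pressure via sd_event_trim_memory(), can pass a NULL
 * callback and treat lack of PSI support as non-fatal. The error codes match what
 * sd_event_add_memory_pressure() returns above (-EOPNOTSUPP if PSI is unavailable, -EHOSTDOWN if it was
 * explicitly disabled via $MEMORY_PRESSURE_WATCH):
 *
 *     sd_event *e = NULL;
 *     sd_event_source *mp = NULL;
 *     int r;
 *
 *     r = sd_event_default(&e);
 *     if (r < 0)
 *             return r;
 *
 *     r = sd_event_add_memory_pressure(e, &mp, NULL, NULL);
 *     if (r < 0 && r != -EOPNOTSUPP && r != -EHOSTDOWN)
 *             return r;
 *
 *     return sd_event_loop(e);
 */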
2132
2133 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2134 assert(e);
2135
2136 if (!d)
2137 return;
2138
2139 assert(hashmap_isempty(d->inodes));
2140 assert(hashmap_isempty(d->wd));
2141
2142 if (d->buffer_filled > 0)
2143 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
2144
2145 hashmap_free(d->inodes);
2146 hashmap_free(d->wd);
2147
2148 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2149
2150 if (d->fd >= 0) {
2151 if (!event_origin_changed(e) &&
2152 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
2153 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2154
2155 safe_close(d->fd);
2156 }
2157 free(d);
2158 }
2159
2160 static int event_make_inotify_data(
2161 sd_event *e,
2162 int64_t priority,
2163 struct inotify_data **ret) {
2164
2165 _cleanup_close_ int fd = -EBADF;
2166 struct inotify_data *d;
2167 int r;
2168
2169 assert(e);
2170
2171 d = hashmap_get(e->inotify_data, &priority);
2172 if (d) {
2173 if (ret)
2174 *ret = d;
2175 return 0;
2176 }
2177
2178         fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2179 if (fd < 0)
2180 return -errno;
2181
2182 fd = fd_move_above_stdio(fd);
2183
2184 d = new(struct inotify_data, 1);
2185 if (!d)
2186 return -ENOMEM;
2187
2188 *d = (struct inotify_data) {
2189 .wakeup = WAKEUP_INOTIFY_DATA,
2190 .fd = TAKE_FD(fd),
2191 .priority = priority,
2192 };
2193
2194 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
2195 if (r < 0) {
2196 d->fd = safe_close(d->fd);
2197 free(d);
2198 return r;
2199 }
2200
2201 struct epoll_event ev = {
2202 .events = EPOLLIN,
2203 .data.ptr = d,
2204 };
2205
2206 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2207 r = -errno;
2208 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2209 * remove the fd from the epoll first, which we don't want as we couldn't
2210 * add it in the first place. */
2211 event_free_inotify_data(e, d);
2212 return r;
2213 }
2214
2215 if (ret)
2216 *ret = d;
2217
2218 return 1;
2219 }
2220
2221 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
2222 int r;
2223
2224 assert(x);
2225 assert(y);
2226
2227 r = CMP(x->dev, y->dev);
2228 if (r != 0)
2229 return r;
2230
2231 return CMP(x->ino, y->ino);
2232 }
2233
2234 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2235 assert(d);
2236
2237 siphash24_compress_typesafe(d->dev, state);
2238 siphash24_compress_typesafe(d->ino, state);
2239 }
2240
2241 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
2242
2243 static void event_free_inode_data(
2244 sd_event *e,
2245 struct inode_data *d) {
2246
2247 assert(e);
2248
2249 if (!d)
2250 return;
2251
2252 assert(!d->event_sources);
2253
2254 if (d->fd >= 0) {
2255 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
2256 safe_close(d->fd);
2257 }
2258
2259 if (d->inotify_data) {
2260
2261 if (d->wd >= 0) {
2262 if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
2263 /* So here's a problem. At the time this runs the watch descriptor might already be
2264                                  * invalidated, because an IN_IGNORED event might be queued right at the moment we
2265                                  * enter the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's
2266                                  * quite likely to happen. */
2267
2268 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2269 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2270 }
2271
2272 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2273 }
2274
2275 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2276 }
2277
2278 free(d->path);
2279 free(d);
2280 }
2281
2282 static void event_gc_inotify_data(
2283 sd_event *e,
2284 struct inotify_data *d) {
2285
2286 assert(e);
2287
2288 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2289 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2290 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2291 * (under the expectation that the GC is called again once the counter is decremented). */
2292
2293 if (!d)
2294 return;
2295
2296 if (!hashmap_isempty(d->inodes))
2297 return;
2298
2299 if (d->n_busy > 0)
2300 return;
2301
2302 event_free_inotify_data(e, d);
2303 }
2304
2305 static void event_gc_inode_data(
2306 sd_event *e,
2307 struct inode_data *d) {
2308
2309 struct inotify_data *inotify_data;
2310
2311 assert(e);
2312
2313 if (!d)
2314 return;
2315
2316 if (d->event_sources)
2317 return;
2318
2319 inotify_data = d->inotify_data;
2320 event_free_inode_data(e, d);
2321
2322 event_gc_inotify_data(e, inotify_data);
2323 }
2324
2325 static int event_make_inode_data(
2326 sd_event *e,
2327 struct inotify_data *inotify_data,
2328 dev_t dev,
2329 ino_t ino,
2330 struct inode_data **ret) {
2331
2332 struct inode_data *d, key;
2333 int r;
2334
2335 assert(e);
2336 assert(inotify_data);
2337
2338 key = (struct inode_data) {
2339 .ino = ino,
2340 .dev = dev,
2341 };
2342
2343 d = hashmap_get(inotify_data->inodes, &key);
2344 if (d) {
2345 if (ret)
2346 *ret = d;
2347
2348 return 0;
2349 }
2350
2351 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2352 if (r < 0)
2353 return r;
2354
2355 d = new(struct inode_data, 1);
2356 if (!d)
2357 return -ENOMEM;
2358
2359 *d = (struct inode_data) {
2360 .dev = dev,
2361 .ino = ino,
2362 .wd = -1,
2363 .fd = -EBADF,
2364 .inotify_data = inotify_data,
2365 };
2366
2367 r = hashmap_put(inotify_data->inodes, d, d);
2368 if (r < 0) {
2369 free(d);
2370 return r;
2371 }
2372
2373 if (ret)
2374 *ret = d;
2375
2376 return 1;
2377 }
2378
2379 static uint32_t inode_data_determine_mask(struct inode_data *d) {
2380 bool excl_unlink = true;
2381 uint32_t combined = 0;
2382
2383 assert(d);
2384
2385 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2386 * the IN_EXCL_UNLINK flag is ANDed instead.
2387 *
2388          * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or oneshot. That's
2389 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
2390 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
2391 * events we don't care for client-side. */
2392
2393 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2394
2395 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2396 excl_unlink = false;
2397
2398 combined |= s->inotify.mask;
2399 }
2400
2401 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2402 }
2403
2404 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2405 uint32_t combined_mask;
2406 int wd, r;
2407
2408 assert(d);
2409 assert(d->fd >= 0);
2410
2411 combined_mask = inode_data_determine_mask(d);
2412
2413 if (d->wd >= 0 && combined_mask == d->combined_mask)
2414 return 0;
2415
2416 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2417 if (r < 0)
2418 return r;
2419
2420 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2421 if (wd < 0)
2422 return wd;
2423
2424 if (d->wd < 0) {
2425 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2426 if (r < 0) {
2427 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2428 return r;
2429 }
2430
2431 d->wd = wd;
2432
2433 } else if (d->wd != wd) {
2434
2435 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2436                 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2437 return -EINVAL;
2438 }
2439
2440 d->combined_mask = combined_mask;
2441 return 1;
2442 }
2443
2444 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2445 assert(s);
2446
2447 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2448 }
2449
2450 static int event_add_inotify_fd_internal(
2451 sd_event *e,
2452 sd_event_source **ret,
2453 int fd,
2454 bool donate,
2455 uint32_t mask,
2456 sd_event_inotify_handler_t callback,
2457 void *userdata) {
2458
2459 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
2460 _cleanup_(source_freep) sd_event_source *s = NULL;
2461 struct inotify_data *inotify_data = NULL;
2462 struct inode_data *inode_data = NULL;
2463 struct stat st;
2464 int r;
2465
2466 assert_return(e, -EINVAL);
2467 assert_return(e = event_resolve(e), -ENOPKG);
2468 assert_return(fd >= 0, -EBADF);
2469 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2470 assert_return(!event_origin_changed(e), -ECHILD);
2471
2472 if (!callback)
2473 callback = inotify_exit_callback;
2474
2475 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2476 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
2477 * the user can't use them for us. */
2478 if (mask & IN_MASK_ADD)
2479 return -EINVAL;
2480
2481 if (fstat(fd, &st) < 0)
2482 return -errno;
2483
2484 s = source_new(e, !ret, SOURCE_INOTIFY);
2485 if (!s)
2486 return -ENOMEM;
2487
2488 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2489 s->inotify.mask = mask;
2490 s->inotify.callback = callback;
2491 s->userdata = userdata;
2492
2493 /* Allocate an inotify object for this priority, and an inode object within it */
2494 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2495 if (r < 0)
2496 return r;
2497
2498 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2499 if (r < 0) {
2500 event_gc_inotify_data(e, inotify_data);
2501 return r;
2502 }
2503
2504         /* Keep the O_PATH fd around until the first iteration of the event loop, so that we can still change
2505          * the priority of the event source until then; for that we need an fd for the original inode. */
2506 if (inode_data->fd < 0) {
2507 if (donated_fd >= 0)
2508 inode_data->fd = TAKE_FD(donated_fd);
2509 else {
2510 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2511 if (inode_data->fd < 0) {
2512 r = -errno;
2513 event_gc_inode_data(e, inode_data);
2514 return r;
2515 }
2516 }
2517
2518 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
2519
2520 _cleanup_free_ char *path = NULL;
2521 r = fd_get_path(inode_data->fd, &path);
2522 if (r < 0 && r != -ENOSYS) { /* The path is optional, hence ignore -ENOSYS. */
2523 event_gc_inode_data(e, inode_data);
2524 return r;
2525 }
2526
2527 free_and_replace(inode_data->path, path);
2528 }
2529
2530 /* Link our event source to the inode data object */
2531 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2532 s->inotify.inode_data = inode_data;
2533
2534 /* Actually realize the watch now */
2535 r = inode_data_realize_watch(e, inode_data);
2536 if (r < 0)
2537 return r;
2538
2539 if (ret)
2540 *ret = s;
2541 TAKE_PTR(s);
2542
2543 return 0;
2544 }
2545
2546 _public_ int sd_event_add_inotify_fd(
2547 sd_event *e,
2548 sd_event_source **ret,
2549 int fd,
2550 uint32_t mask,
2551 sd_event_inotify_handler_t callback,
2552 void *userdata) {
2553
2554 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2555 }
2556
2557 _public_ int sd_event_add_inotify(
2558 sd_event *e,
2559 sd_event_source **ret,
2560 const char *path,
2561 uint32_t mask,
2562 sd_event_inotify_handler_t callback,
2563 void *userdata) {
2564
2565 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2566 int fd, r;
2567
2568 assert_return(path, -EINVAL);
2569
2570 fd = open(path, O_PATH | O_CLOEXEC |
2571 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2572 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2573 if (fd < 0)
2574 return -errno;
2575
2576 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2577 if (r < 0)
2578 return r;
2579
2580 (void) sd_event_source_set_description(s, path);
2581
2582 if (ret)
2583 *ret = s;
2584
2585 return r;
2586 }
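
/* Illustrative usage sketch: watching a directory for newly appearing files with the public API added
 * above. The handler signature matches sd_event_inotify_handler_t; the path and handler name are made up
 * for the example:
 *
 *     static int on_dir_event(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
 *             if (ev->len > 0)
 *                     log_debug("Saw inotify event for '%s'.", ev->name);
 *             return 0;
 *     }
 *
 *     r = sd_event_add_inotify(e, &src, "/run/my-dir", IN_CREATE|IN_MOVED_TO, on_dir_event, NULL);
 *
 * sd_event_add_inotify_fd() is equivalent but takes an already opened (e.g. O_PATH) file descriptor,
 * which the caller keeps ownership of, since the call duplicates it internally. */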
2587
2588 static sd_event_source* event_source_free(sd_event_source *s) {
2589 if (!s)
2590 return NULL;
2591
2592 /* Here's a special hack: when we are called from a
2593 * dispatch handler we won't free the event source
2594 * immediately, but we will detach the fd from the
2595 * epoll. This way it is safe for the caller to unref
2596 * the event source and immediately close the fd, but
2597 * we still retain a valid event source object after
2598 * the callback. */
2599
2600 if (s->dispatching)
2601 source_disconnect(s);
2602 else
2603 source_free(s);
2604
2605 return NULL;
2606 }
2607
2608 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2609
2610 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2611 assert_return(s, -EINVAL);
2612 assert_return(!event_origin_changed(s->event), -ECHILD);
2613
2614 return free_and_strdup(&s->description, description);
2615 }
2616
2617 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2618 assert_return(s, -EINVAL);
2619 assert_return(description, -EINVAL);
2620
2621 if (!s->description)
2622 return -ENXIO;
2623
2624 *description = s->description;
2625 return 0;
2626 }
2627
2628 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2629 assert_return(s, NULL);
2630 assert_return(!event_origin_changed(s->event), NULL);
2631
2632 return s->event;
2633 }
2634
2635 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2636 assert_return(s, -EINVAL);
2637 assert_return(s->type != SOURCE_EXIT, -EDOM);
2638 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2639 assert_return(!event_origin_changed(s->event), -ECHILD);
2640
2641 return s->pending;
2642 }
2643
2644 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2645 assert_return(s, -EINVAL);
2646 assert_return(s->type == SOURCE_IO, -EDOM);
2647 assert_return(!event_origin_changed(s->event), -ECHILD);
2648
2649 return s->io.fd;
2650 }
2651
2652 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2653 int saved_fd, r;
2654
2655 assert_return(s, -EINVAL);
2656 assert_return(fd >= 0, -EBADF);
2657 assert_return(s->type == SOURCE_IO, -EDOM);
2658 assert_return(!event_origin_changed(s->event), -ECHILD);
2659
2660 if (s->io.fd == fd)
2661 return 0;
2662
2663 saved_fd = s->io.fd;
2664 s->io.fd = fd;
2665
2666 assert(event_source_is_offline(s) == !s->io.registered);
2667
2668 if (s->io.registered) {
2669 s->io.registered = false;
2670
2671 r = source_io_register(s, s->enabled, s->io.events);
2672 if (r < 0) {
2673 s->io.fd = saved_fd;
2674 s->io.registered = true;
2675 return r;
2676 }
2677
2678 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2679 }
2680
2681 if (s->io.owned)
2682 safe_close(saved_fd);
2683
2684 return 0;
2685 }
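
/* Illustrative usage sketch: swapping the file descriptor of an existing IO event source, e.g. after
 * reconnecting a socket, without destroying the source. connect_to_server() is a hypothetical helper:
 *
 *     int new_fd = connect_to_server();
 *     if (new_fd >= 0) {
 *             r = sd_event_source_set_io_fd(source, new_fd);
 *             if (r < 0)
 *                     safe_close(new_fd);
 *     }
 *
 * If the source owns its fd (see sd_event_source_set_io_fd_own() below), the previously set fd is closed
 * automatically, as implemented above. */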
2686
2687 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2688 assert_return(s, -EINVAL);
2689 assert_return(s->type == SOURCE_IO, -EDOM);
2690 assert_return(!event_origin_changed(s->event), -ECHILD);
2691
2692 return s->io.owned;
2693 }
2694
2695 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2696 assert_return(s, -EINVAL);
2697 assert_return(s->type == SOURCE_IO, -EDOM);
2698 assert_return(!event_origin_changed(s->event), -ECHILD);
2699
2700 s->io.owned = own;
2701 return 0;
2702 }
2703
2704 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2705 assert_return(s, -EINVAL);
2706 assert_return(events, -EINVAL);
2707 assert_return(s->type == SOURCE_IO, -EDOM);
2708 assert_return(!event_origin_changed(s->event), -ECHILD);
2709
2710 *events = s->io.events;
2711 return 0;
2712 }
2713
2714 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2715 int r;
2716
2717 assert_return(s, -EINVAL);
2718 assert_return(s->type == SOURCE_IO, -EDOM);
2719 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2720 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2721 assert_return(!event_origin_changed(s->event), -ECHILD);
2722
2723 /* edge-triggered updates are never skipped, so we can reset edges */
2724 if (s->io.events == events && !(events & EPOLLET))
2725 return 0;
2726
2727 r = source_set_pending(s, false);
2728 if (r < 0)
2729 return r;
2730
2731 if (event_source_is_online(s)) {
2732 r = source_io_register(s, s->enabled, events);
2733 if (r < 0)
2734 return r;
2735 }
2736
2737 s->io.events = events;
2738
2739 return 0;
2740 }
2741
2742 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2743 assert_return(s, -EINVAL);
2744 assert_return(revents, -EINVAL);
2745 assert_return(s->type == SOURCE_IO, -EDOM);
2746 assert_return(s->pending, -ENODATA);
2747 assert_return(!event_origin_changed(s->event), -ECHILD);
2748
2749 *revents = s->io.revents;
2750 return 0;
2751 }
2752
2753 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2754 assert_return(s, -EINVAL);
2755 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2756 assert_return(!event_origin_changed(s->event), -ECHILD);
2757
2758 return s->signal.sig;
2759 }
2760
2761 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2762 assert_return(s, -EINVAL);
2763 assert_return(!event_origin_changed(s->event), -ECHILD);
2764
2765 *priority = s->priority;
2766 return 0;
2767 }
2768
2769 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2770 bool rm_inotify = false, rm_inode = false;
2771 struct inotify_data *new_inotify_data = NULL;
2772 struct inode_data *new_inode_data = NULL;
2773 int r;
2774
2775 assert_return(s, -EINVAL);
2776 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2777 assert_return(!event_origin_changed(s->event), -ECHILD);
2778
2779 if (s->priority == priority)
2780 return 0;
2781
2782 if (s->type == SOURCE_INOTIFY) {
2783 struct inode_data *old_inode_data;
2784
2785 assert(s->inotify.inode_data);
2786 old_inode_data = s->inotify.inode_data;
2787
2788                 /* We need the original fd to change the priority. If we don't have it we can't change the priority
2789                  * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2790 * events we allow priority changes only until the first following iteration. */
2791 if (old_inode_data->fd < 0)
2792 return -EOPNOTSUPP;
2793
2794 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2795 if (r < 0)
2796 return r;
2797 rm_inotify = r > 0;
2798
2799 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2800 if (r < 0)
2801 goto fail;
2802 rm_inode = r > 0;
2803
2804 if (new_inode_data->fd < 0) {
2805 /* Duplicate the fd for the new inode object if we don't have any yet */
2806 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2807 if (new_inode_data->fd < 0) {
2808 r = -errno;
2809 goto fail;
2810 }
2811
2812 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
2813
2814 _cleanup_free_ char *path = NULL;
2815 r = fd_get_path(new_inode_data->fd, &path);
2816 if (r < 0 && r != -ENOSYS)
2817 goto fail;
2818
2819 free_and_replace(new_inode_data->path, path);
2820 }
2821
2822 /* Move the event source to the new inode data structure */
2823 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2824 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2825 s->inotify.inode_data = new_inode_data;
2826
2827 /* Now create the new watch */
2828 r = inode_data_realize_watch(s->event, new_inode_data);
2829 if (r < 0) {
2830 /* Move it back */
2831 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2832 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2833 s->inotify.inode_data = old_inode_data;
2834 goto fail;
2835 }
2836
2837 s->priority = priority;
2838
2839 event_gc_inode_data(s->event, old_inode_data);
2840
2841 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2842 struct signal_data *old, *d;
2843
2844 /* Move us from the signalfd belonging to the old
2845 * priority to the signalfd of the new priority */
2846
2847 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2848
2849 s->priority = priority;
2850
2851 r = event_make_signal_data(s->event, s->signal.sig, &d);
2852 if (r < 0) {
2853 s->priority = old->priority;
2854 return r;
2855 }
2856
2857 event_unmask_signal_data(s->event, old, s->signal.sig);
2858 } else
2859 s->priority = priority;
2860
2861 event_source_pp_prioq_reshuffle(s);
2862
2863 if (s->type == SOURCE_EXIT)
2864 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2865
2866 return 0;
2867
2868 fail:
2869 if (rm_inode)
2870 event_free_inode_data(s->event, new_inode_data);
2871
2872 if (rm_inotify)
2873 event_free_inotify_data(s->event, new_inotify_data);
2874
2875 return r;
2876 }
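
/* Illustrative usage sketch: bumping the priority of an event source so it is dispatched before others at
 * the default priority. Note the caveat implemented above: for inotify sources the priority can only be
 * changed until the first event loop iteration; afterwards -EOPNOTSUPP is returned.
 *
 *     r = sd_event_source_set_priority(src, SD_EVENT_PRIORITY_IMPORTANT);
 *     if (r < 0)
 *             return r;
 */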
2877
2878 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2879 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2880 if (!s && !ret)
2881 return false;
2882
2883 assert_return(s, -EINVAL);
2884 assert_return(!event_origin_changed(s->event), -ECHILD);
2885
2886 if (ret)
2887 *ret = s->enabled;
2888
2889 return s->enabled != SD_EVENT_OFF;
2890 }
2891
2892 static int event_source_offline(
2893 sd_event_source *s,
2894 int enabled,
2895 bool ratelimited) {
2896
2897 bool was_offline;
2898 int r;
2899
2900 assert(s);
2901 assert(enabled == SD_EVENT_OFF || ratelimited);
2902
2903 /* Unset the pending flag when this event source is disabled */
2904 if (s->enabled != SD_EVENT_OFF &&
2905 enabled == SD_EVENT_OFF &&
2906 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2907 r = source_set_pending(s, false);
2908 if (r < 0)
2909 return r;
2910 }
2911
2912 was_offline = event_source_is_offline(s);
2913 s->enabled = enabled;
2914 s->ratelimited = ratelimited;
2915
2916 switch (s->type) {
2917
2918 case SOURCE_IO:
2919 source_io_unregister(s);
2920 break;
2921
2922 case SOURCE_SIGNAL:
2923 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2924 break;
2925
2926 case SOURCE_CHILD:
2927 if (!was_offline) {
2928 assert(s->event->n_online_child_sources > 0);
2929 s->event->n_online_child_sources--;
2930 }
2931
2932 if (EVENT_SOURCE_WATCH_PIDFD(s))
2933 source_child_pidfd_unregister(s);
2934 else
2935 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2936 break;
2937
2938 case SOURCE_EXIT:
2939 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2940 break;
2941
2942 case SOURCE_MEMORY_PRESSURE:
2943 source_memory_pressure_unregister(s);
2944 break;
2945
2946 case SOURCE_TIME_REALTIME:
2947 case SOURCE_TIME_BOOTTIME:
2948 case SOURCE_TIME_MONOTONIC:
2949 case SOURCE_TIME_REALTIME_ALARM:
2950 case SOURCE_TIME_BOOTTIME_ALARM:
2951 case SOURCE_DEFER:
2952 case SOURCE_POST:
2953 case SOURCE_INOTIFY:
2954 break;
2955
2956 default:
2957 assert_not_reached();
2958 }
2959
2960 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2961 event_source_time_prioq_reshuffle(s);
2962
2963 return 1;
2964 }
2965
2966 static int event_source_online(
2967 sd_event_source *s,
2968 int enabled,
2969 bool ratelimited) {
2970
2971 bool was_online;
2972 int r;
2973
2974 assert(s);
2975 assert(enabled != SD_EVENT_OFF || !ratelimited);
2976
2977 /* Unset the pending flag when this event source is enabled */
2978 if (s->enabled == SD_EVENT_OFF &&
2979 enabled != SD_EVENT_OFF &&
2980 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2981 r = source_set_pending(s, false);
2982 if (r < 0)
2983 return r;
2984 }
2985
2986 /* Are we really ready for onlining? */
2987 if (enabled == SD_EVENT_OFF || ratelimited) {
2988                 /* Nope, we are not ready for onlining, so just update the precise state and exit */
2989 s->enabled = enabled;
2990 s->ratelimited = ratelimited;
2991 return 0;
2992 }
2993
2994 was_online = event_source_is_online(s);
2995
2996 switch (s->type) {
2997 case SOURCE_IO:
2998 r = source_io_register(s, enabled, s->io.events);
2999 if (r < 0)
3000 return r;
3001 break;
3002
3003 case SOURCE_SIGNAL:
3004 r = event_make_signal_data(s->event, s->signal.sig, NULL);
3005 if (r < 0) {
3006 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
3007 return r;
3008 }
3009
3010 break;
3011
3012 case SOURCE_CHILD:
3013 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
3014 /* yes, we have pidfd */
3015
3016 r = source_child_pidfd_register(s, enabled);
3017 if (r < 0)
3018 return r;
3019 } else {
3020                         /* no pidfd, or something other than WEXITED to watch for */
3021
3022 r = event_make_signal_data(s->event, SIGCHLD, NULL);
3023 if (r < 0) {
3024 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
3025 return r;
3026 }
3027 }
3028
3029 if (!was_online)
3030 s->event->n_online_child_sources++;
3031 break;
3032
3033 case SOURCE_MEMORY_PRESSURE:
3034 r = source_memory_pressure_register(s, enabled);
3035 if (r < 0)
3036 return r;
3037
3038 break;
3039
3040 case SOURCE_TIME_REALTIME:
3041 case SOURCE_TIME_BOOTTIME:
3042 case SOURCE_TIME_MONOTONIC:
3043 case SOURCE_TIME_REALTIME_ALARM:
3044 case SOURCE_TIME_BOOTTIME_ALARM:
3045 case SOURCE_EXIT:
3046 case SOURCE_DEFER:
3047 case SOURCE_POST:
3048 case SOURCE_INOTIFY:
3049 break;
3050
3051 default:
3052 assert_not_reached();
3053 }
3054
3055 s->enabled = enabled;
3056 s->ratelimited = ratelimited;
3057
3058 /* Non-failing operations below */
3059 if (s->type == SOURCE_EXIT)
3060 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
3061
3062 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3063 event_source_time_prioq_reshuffle(s);
3064
3065 return 1;
3066 }
3067
3068 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3069 int r;
3070
3071 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
3072
3073 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3074 if (m == SD_EVENT_OFF && !s)
3075 return 0;
3076
3077 assert_return(s, -EINVAL);
3078 assert_return(!event_origin_changed(s->event), -ECHILD);
3079
3080 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3081 if (s->event->state == SD_EVENT_FINISHED)
3082 return m == SD_EVENT_OFF ? 0 : -ESTALE;
3083
3084 if (s->enabled == m) /* No change? */
3085 return 0;
3086
3087 if (m == SD_EVENT_OFF)
3088 r = event_source_offline(s, m, s->ratelimited);
3089 else {
3090 if (s->enabled != SD_EVENT_OFF) {
3091 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3092 * event source is already enabled after all. */
3093 s->enabled = m;
3094 return 0;
3095 }
3096
3097 r = event_source_online(s, m, s->ratelimited);
3098 }
3099 if (r < 0)
3100 return r;
3101
3102 event_source_pp_prioq_reshuffle(s);
3103 return 0;
3104 }
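
/* Illustrative usage sketch: thanks to the "quick mode" above, sources that may or may not have been
 * allocated can be turned off unconditionally without a NULL check:
 *
 *     (void) sd_event_source_set_enabled(maybe_source, SD_EVENT_OFF);
 *
 * where maybe_source is a possibly-NULL sd_event_source pointer owned by the caller. */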
3105
3106 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
3107 assert_return(s, -EINVAL);
3108 assert_return(usec, -EINVAL);
3109 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3110 assert_return(!event_origin_changed(s->event), -ECHILD);
3111
3112 *usec = s->time.next;
3113 return 0;
3114 }
3115
3116 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
3117 int r;
3118
3119 assert_return(s, -EINVAL);
3120 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3121 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3122 assert_return(!event_origin_changed(s->event), -ECHILD);
3123
3124 r = source_set_pending(s, false);
3125 if (r < 0)
3126 return r;
3127
3128 s->time.next = usec;
3129
3130 event_source_time_prioq_reshuffle(s);
3131 return 0;
3132 }
3133
3134 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3135 usec_t t;
3136 int r;
3137
3138 assert_return(s, -EINVAL);
3139 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3140 assert_return(!event_origin_changed(s->event), -ECHILD);
3141
3142 if (usec == USEC_INFINITY)
3143 return sd_event_source_set_time(s, USEC_INFINITY);
3144
3145 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3146 if (r < 0)
3147 return r;
3148
3149 usec = usec_add(t, usec);
3150 if (usec == USEC_INFINITY)
3151 return -EOVERFLOW;
3152
3153 return sd_event_source_set_time(s, usec);
3154 }
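
/* Illustrative usage sketch: a simple periodic timer built from a oneshot time source that re-arms itself
 * relative to the current time on each dispatch. on_tick(), do_periodic_work() and the 5s interval are
 * made up for the example:
 *
 *     static int on_tick(sd_event_source *s, uint64_t usec, void *userdata) {
 *             do_periodic_work();
 *             (void) sd_event_source_set_time_relative(s, 5 * USEC_PER_SEC);
 *             (void) sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
 *             return 0;
 *     }
 *
 *     r = sd_event_add_time_relative(e, &timer, CLOCK_MONOTONIC, 5 * USEC_PER_SEC,
 *                                    250 * USEC_PER_MSEC, on_tick, NULL);
 */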
3155
3156 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
3157 assert_return(s, -EINVAL);
3158 assert_return(usec, -EINVAL);
3159 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3160 assert_return(!event_origin_changed(s->event), -ECHILD);
3161
3162 *usec = s->time.accuracy;
3163 return 0;
3164 }
3165
3166 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
3167 int r;
3168
3169 assert_return(s, -EINVAL);
3170 assert_return(usec != UINT64_MAX, -EINVAL);
3171 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3172 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3173 assert_return(!event_origin_changed(s->event), -ECHILD);
3174
3175 r = source_set_pending(s, false);
3176 if (r < 0)
3177 return r;
3178
3179 if (usec == 0)
3180 usec = DEFAULT_ACCURACY_USEC;
3181
3182 s->time.accuracy = usec;
3183
3184 event_source_time_prioq_reshuffle(s);
3185 return 0;
3186 }
3187
3188 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3189 assert_return(s, -EINVAL);
3190 assert_return(clock, -EINVAL);
3191 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3192 assert_return(!event_origin_changed(s->event), -ECHILD);
3193
3194 *clock = event_source_type_to_clock(s->type);
3195 return 0;
3196 }
3197
3198 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
3199 assert_return(s, -EINVAL);
3200 assert_return(pid, -EINVAL);
3201 assert_return(s->type == SOURCE_CHILD, -EDOM);
3202 assert_return(!event_origin_changed(s->event), -ECHILD);
3203
3204 *pid = s->child.pid;
3205 return 0;
3206 }
3207
3208 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3209 assert_return(s, -EINVAL);
3210 assert_return(s->type == SOURCE_CHILD, -EDOM);
3211 assert_return(!event_origin_changed(s->event), -ECHILD);
3212
3213 if (s->child.pidfd < 0)
3214 return -EOPNOTSUPP;
3215
3216 return s->child.pidfd;
3217 }
3218
3219 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3220 assert_return(s, -EINVAL);
3221 assert_return(s->type == SOURCE_CHILD, -EDOM);
3222 assert_return(!event_origin_changed(s->event), -ECHILD);
3223 assert_return(SIGNAL_VALID(sig), -EINVAL);
3224
3225         /* If we have already seen an indication that the process exited, refuse sending a signal early. This way we
3226 * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3227 * available. */
3228 if (s->child.exited)
3229 return -ESRCH;
3230
3231 if (s->child.pidfd >= 0) {
3232 siginfo_t copy;
3233
3234                 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, hence let's copy the
3235 * structure here */
3236 if (si)
3237 copy = *si;
3238
3239 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3240 /* Let's propagate the error only if the system call is not implemented or prohibited */
3241 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3242 return -errno;
3243 } else
3244 return 0;
3245 }
3246
3247 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3248 * this here. */
3249 if (flags != 0)
3250 return -EOPNOTSUPP;
3251
3252 if (si) {
3253 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3254 siginfo_t copy = *si;
3255
3256 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3257 return -errno;
3258 } else if (kill(s->child.pid, sig) < 0)
3259 return -errno;
3260
3261 return 0;
3262 }
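
/* Illustrative usage sketch: asking a watched child to terminate. As implemented above, this prefers
 * pidfd_send_signal() when a pidfd is available and refuses with -ESRCH once the child is known to have
 * exited, so there is no risk of signalling a recycled PID:
 *
 *     r = sd_event_source_send_child_signal(child_src, SIGTERM, NULL, 0);
 *     if (r < 0 && r != -ESRCH)
 *             return r;
 */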
3263
3264 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3265 assert_return(s, -EINVAL);
3266 assert_return(s->type == SOURCE_CHILD, -EDOM);
3267 assert_return(!event_origin_changed(s->event), -ECHILD);
3268
3269 if (s->child.pidfd < 0)
3270 return -EOPNOTSUPP;
3271
3272 return s->child.pidfd_owned;
3273 }
3274
3275 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3276 assert_return(s, -EINVAL);
3277 assert_return(s->type == SOURCE_CHILD, -EDOM);
3278 assert_return(!event_origin_changed(s->event), -ECHILD);
3279
3280 if (s->child.pidfd < 0)
3281 return -EOPNOTSUPP;
3282
3283 s->child.pidfd_owned = own;
3284 return 0;
3285 }
3286
3287 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3288 assert_return(s, -EINVAL);
3289 assert_return(s->type == SOURCE_CHILD, -EDOM);
3290 assert_return(!event_origin_changed(s->event), -ECHILD);
3291
3292 return s->child.process_owned;
3293 }
3294
3295 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3296 assert_return(s, -EINVAL);
3297 assert_return(s->type == SOURCE_CHILD, -EDOM);
3298 assert_return(!event_origin_changed(s->event), -ECHILD);
3299
3300 s->child.process_owned = own;
3301 return 0;
3302 }
3303
3304 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret) {
3305 assert_return(s, -EINVAL);
3306 assert_return(ret, -EINVAL);
3307 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3308 assert_return(!event_origin_changed(s->event), -ECHILD);
3309
3310 *ret = s->inotify.mask;
3311 return 0;
3312 }
3313
3314 _public_ int sd_event_source_get_inotify_path(sd_event_source *s, const char **ret) {
3315 assert_return(s, -EINVAL);
3316 assert_return(ret, -EINVAL);
3317 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3318 assert_return(!event_origin_changed(s->event), -ECHILD);
3319
3320 if (!s->inotify.inode_data)
3321 return -ESTALE; /* already disconnected. */
3322
3323 if (!s->inotify.inode_data->path)
3324 return -ENOSYS; /* /proc was not mounted? */
3325
3326 *ret = s->inotify.inode_data->path;
3327 return 0;
3328 }
3329
3330 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
3331 int r;
3332
3333 assert_return(s, -EINVAL);
3334 assert_return(s->type != SOURCE_EXIT, -EDOM);
3335 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3336 assert_return(!event_origin_changed(s->event), -ECHILD);
3337
3338 if (s->prepare == callback)
3339 return 0;
3340
3341 if (callback && s->prepare) {
3342 s->prepare = callback;
3343 return 0;
3344 }
3345
3346 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3347 if (r < 0)
3348 return r;
3349
3350 s->prepare = callback;
3351
3352 if (callback) {
3353 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3354 if (r < 0)
3355 return r;
3356 } else
3357 prioq_remove(s->event->prepare, s, &s->prepare_index);
3358
3359 return 0;
3360 }
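
/* Illustrative usage sketch: prepare callbacks are typically used to bridge libraries that buffer events
 * internally. Right before polling, the callback checks the library's internal queue and enables a defer
 * source if there is already something to process. The names below are hypothetical:
 *
 *     static int my_prepare(sd_event_source *s, void *userdata) {
 *             MyLib *l = userdata;
 *
 *             return sd_event_source_set_enabled(s, my_lib_has_buffered_events(l) ? SD_EVENT_ONESHOT : SD_EVENT_OFF);
 *     }
 *
 *     r = sd_event_source_set_prepare(defer_src, my_prepare);
 */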
3361
3362 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
3363 assert_return(s, NULL);
3364 assert_return(!event_origin_changed(s->event), NULL);
3365
3366 return s->userdata;
3367 }
3368
3369 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3370 void *ret;
3371
3372 assert_return(s, NULL);
3373 assert_return(!event_origin_changed(s->event), NULL);
3374
3375 ret = s->userdata;
3376 s->userdata = userdata;
3377
3378 return ret;
3379 }
3380
3381 static int event_source_enter_ratelimited(sd_event_source *s) {
3382 int r;
3383
3384 assert(s);
3385
3386         /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, keyed
3387          * by the end of its rate limit time window, much as if it were a timer event source. */
3388
3389 if (s->ratelimited)
3390 return 0; /* Already ratelimited, this is a NOP hence */
3391
3392 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3393 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3394 if (r < 0)
3395 return r;
3396
3397         /* Timer event sources are already using the earliest/latest queues for their timer scheduling. Let's
3398          * first remove them from the prioq appropriate for their own clock, so that we can then reuse the
3399          * prioq fields of the event source for adding it to the CLOCK_MONOTONIC prioq instead. */
3400 if (EVENT_SOURCE_IS_TIME(s->type))
3401 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3402
3403 /* Now, let's add the event source to the monotonic clock instead */
3404 r = event_source_time_prioq_put(s, &s->event->monotonic);
3405 if (r < 0)
3406 goto fail;
3407
3408 /* And let's take the event source officially offline */
3409 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3410 if (r < 0) {
3411 event_source_time_prioq_remove(s, &s->event->monotonic);
3412 goto fail;
3413 }
3414
3415 event_source_pp_prioq_reshuffle(s);
3416
3417 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3418 return 0;
3419
3420 fail:
3421         /* Reinstall the time event source in its priority queue as before. This shouldn't fail, since the
3422          * queue space for it should already be allocated. */
3423 if (EVENT_SOURCE_IS_TIME(s->type))
3424 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3425
3426 return r;
3427 }
3428
3429 static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
3430 int r;
3431
3432 assert(s);
3433
3434 if (!s->ratelimited)
3435 return 0;
3436
3437 /* Let's take the event source out of the monotonic prioq first. */
3438 event_source_time_prioq_remove(s, &s->event->monotonic);
3439
3440 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3441 if (EVENT_SOURCE_IS_TIME(s->type)) {
3442 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3443 if (r < 0)
3444 goto fail;
3445 }
3446
3447 /* Let's try to take it online again. */
3448 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3449 if (r < 0) {
3450 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3451 if (EVENT_SOURCE_IS_TIME(s->type))
3452 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3453
3454 goto fail;
3455 }
3456
3457 event_source_pp_prioq_reshuffle(s);
3458 ratelimit_reset(&s->rate_limit);
3459
3460 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3461
3462 if (run_callback && s->ratelimit_expire_callback) {
3463 s->dispatching = true;
3464 r = s->ratelimit_expire_callback(s, s->userdata);
3465 s->dispatching = false;
3466
3467 if (r < 0) {
3468 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3469 strna(s->description),
3470 event_source_type_to_string(s->type),
3471 s->exit_on_failure ? "exiting" : "disabling");
3472
3473 if (s->exit_on_failure)
3474 (void) sd_event_exit(s->event, r);
3475 }
3476
3477 if (s->n_ref == 0)
3478 source_free(s);
3479 else if (r < 0)
3480 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3481
3482 return 1;
3483 }
3484
3485 return 0;
3486
3487 fail:
3488         /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
3489 * simply put it back in it, maybe we can then process it more successfully next iteration. */
3490 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3491
3492 return r;
3493 }
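
/* Illustrative usage sketch of the public side of the rate limiting implemented above: capping how often
 * a busy IO source may be dispatched, and getting notified when the limit expires. The interval and burst
 * values are arbitrary:
 *
 *     r = sd_event_source_set_ratelimit(io_src, 1 * USEC_PER_SEC, 10);
 *     if (r < 0)
 *             return r;
 *
 *     (void) sd_event_source_set_ratelimit_expire_callback(io_src, on_ratelimit_expired);
 *
 * where on_ratelimit_expired() is a caller-supplied sd_event_handler_t, invoked via
 * event_source_leave_ratelimit() above once the window ends. */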
3494
3495 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3496 usec_t c;
3497 assert(e);
3498 assert(a <= b);
3499
3500 if (a <= 0)
3501 return 0;
3502 if (a >= USEC_INFINITY)
3503 return USEC_INFINITY;
3504
3505 if (b <= a + 1)
3506 return a;
3507
3508 initialize_perturb(e);
3509
3510 /*
3511 Find a good time to wake up again between times a and b. We
3512 have two goals here:
3513
3514 a) We want to wake up as seldom as possible, hence prefer
3515 later times over earlier times.
3516
3517 b) But if we have to wake up, then let's make sure to
3518 dispatch as much as possible on the entire system.
3519
3520 We implement this by waking up everywhere at the same time
3521 within any given minute if we can, synchronised via the
3522 perturbation value determined from the boot ID. If we can't,
3523        then we try to find the same spot within every 10s, then 1s and
3524        then 250ms window. Otherwise, we pick the last possible time
3525 to wake up.
3526 */
3527
3528 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3529 if (c >= b) {
3530 if (_unlikely_(c < USEC_PER_MINUTE))
3531 return b;
3532
3533 c -= USEC_PER_MINUTE;
3534 }
3535
3536 if (c >= a)
3537 return c;
3538
3539 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3540 if (c >= b) {
3541 if (_unlikely_(c < USEC_PER_SEC*10))
3542 return b;
3543
3544 c -= USEC_PER_SEC*10;
3545 }
3546
3547 if (c >= a)
3548 return c;
3549
3550 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3551 if (c >= b) {
3552 if (_unlikely_(c < USEC_PER_SEC))
3553 return b;
3554
3555 c -= USEC_PER_SEC;
3556 }
3557
3558 if (c >= a)
3559 return c;
3560
3561 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3562 if (c >= b) {
3563 if (_unlikely_(c < USEC_PER_MSEC*250))
3564 return b;
3565
3566 c -= USEC_PER_MSEC*250;
3567 }
3568
3569 if (c >= a)
3570 return c;
3571
3572 return b;
3573 }
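
/* Worked example for sleep_between(), with assumed numbers: say a=100s, b=150s (in usec) and the
 * boot-ID-derived perturbation is 37.5s. The minute-granularity candidate is (150/60)*60 + 37.5 = 157.5s,
 * which is >= b, so one minute is subtracted, giving 97.5s, which is < a and therefore unusable. The
 * 10s-granularity candidate is (150/10)*10 + (37.5 mod 10) = 157.5s, again >= b, so 10s is subtracted,
 * giving 147.5s, which lies within [a, b] and is returned. All local event loops sharing this
 * perturbation thus wake up at the same :07.5 offset within each 10s window. */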
3574
3575 static int event_arm_timer(
3576 sd_event *e,
3577 struct clock_data *d) {
3578
3579 struct itimerspec its = {};
3580 sd_event_source *a, *b;
3581 usec_t t;
3582
3583 assert(e);
3584 assert(d);
3585
3586 if (!d->needs_rearm)
3587 return 0;
3588
3589 d->needs_rearm = false;
3590
3591 a = prioq_peek(d->earliest);
3592 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3593 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3594
3595 if (d->fd < 0)
3596 return 0;
3597
3598 if (d->next == USEC_INFINITY)
3599 return 0;
3600
3601 /* disarm */
3602 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3603 return -errno;
3604
3605 d->next = USEC_INFINITY;
3606 return 0;
3607 }
3608
3609 b = prioq_peek(d->latest);
3610 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3611 assert(b && b->enabled != SD_EVENT_OFF);
3612
3613 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3614 if (d->next == t)
3615 return 0;
3616
3617 assert_se(d->fd >= 0);
3618
3619 if (t == 0) {
3620                 /* We don't want to disarm here, just indicate some time looooong ago. */
3621 its.it_value.tv_sec = 0;
3622 its.it_value.tv_nsec = 1;
3623 } else
3624 timespec_store(&its.it_value, t);
3625
3626 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3627 return -errno;
3628
3629 d->next = t;
3630 return 0;
3631 }
3632
3633 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3634 assert(e);
3635 assert(s);
3636 assert(s->type == SOURCE_IO);
3637
3638 /* If the event source was already pending, we just OR in the
3639 * new revents, otherwise we reset the value. The ORing is
3640 * necessary to handle EPOLLONESHOT events properly where
3641 * readability might happen independently of writability, and
3642 * we need to keep track of both */
3643
3644 if (s->pending)
3645 s->io.revents |= revents;
3646 else
3647 s->io.revents = revents;
3648
3649 return source_set_pending(s, true);
3650 }
3651
3652 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3653 uint64_t x;
3654 ssize_t ss;
3655
3656 assert(e);
3657 assert(fd >= 0);
3658
3659 assert_return(events == EPOLLIN, -EIO);
3660
3661 ss = read(fd, &x, sizeof(x));
3662 if (ss < 0) {
3663 if (ERRNO_IS_TRANSIENT(errno))
3664 return 0;
3665
3666 return -errno;
3667 }
3668
3669 if (_unlikely_(ss != sizeof(x)))
3670 return -EIO;
3671
3672 if (next)
3673 *next = USEC_INFINITY;
3674
3675 return 0;
3676 }
3677
3678 static int process_timer(
3679 sd_event *e,
3680 usec_t n,
3681 struct clock_data *d) {
3682
3683 sd_event_source *s;
3684 bool callback_invoked = false;
3685 int r;
3686
3687 assert(e);
3688 assert(d);
3689
3690 for (;;) {
3691 s = prioq_peek(d->earliest);
3692 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3693
3694 if (!s || time_event_source_next(s) > n)
3695 break;
3696
3697 if (s->ratelimited) {
3698                         /* This is an event source whose ratelimit window has ended. Let's turn it on
3699 * again. */
3700 assert(s->ratelimited);
3701
3702 r = event_source_leave_ratelimit(s, /* run_callback */ true);
3703 if (r < 0)
3704 return r;
3705 else if (r == 1)
3706 callback_invoked = true;
3707
3708 continue;
3709 }
3710
3711 if (s->enabled == SD_EVENT_OFF || s->pending)
3712 break;
3713
3714 r = source_set_pending(s, true);
3715 if (r < 0)
3716 return r;
3717
3718 event_source_time_prioq_reshuffle(s);
3719 }
3720
3721 return callback_invoked;
3722 }
3723
3724 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3725 int64_t min_priority = threshold;
3726 bool something_new = false;
3727 sd_event_source *s;
3728 int r;
3729
3730 assert(e);
3731 assert(ret_min_priority);
3732
3733 if (!e->need_process_child) {
3734 *ret_min_priority = min_priority;
3735 return 0;
3736 }
3737
3738 e->need_process_child = false;
3739
3740 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3741 * for, instead of using P_ALL. This is because we only want to get child information of very
3742 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3743          * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3744 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3745 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3746 * to handle SIGCHLD yourself.
3747 *
3748 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3749 * source is dispatched so that the callback still sees the process as a zombie. */
3750
3751 HASHMAP_FOREACH(s, e->child_sources) {
3752 assert(s->type == SOURCE_CHILD);
3753
3754 if (s->priority > threshold)
3755 continue;
3756
3757 if (s->pending)
3758 continue;
3759
3760 if (event_source_is_offline(s))
3761 continue;
3762
3763 if (s->child.exited)
3764 continue;
3765
3766 if (EVENT_SOURCE_WATCH_PIDFD(s))
3767 /* There's a usable pidfd known for this event source? Then don't waitid() for
3768 * it here */
3769 continue;
3770
3771 zero(s->child.siginfo);
3772 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3773 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3774 return negative_errno();
3775
3776 if (s->child.siginfo.si_pid != 0) {
3777 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3778
3779 if (zombie)
3780 s->child.exited = true;
3781
3782 if (!zombie && (s->child.options & WEXITED)) {
3783 /* If the child isn't dead then let's immediately remove the state
3784 * change from the queue, since there's no benefit in leaving it
3785 * queued. */
3786
3787 assert(s->child.options & (WSTOPPED|WCONTINUED));
3788 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3789 }
3790
3791 r = source_set_pending(s, true);
3792 if (r < 0)
3793 return r;
3794 if (r > 0) {
3795 something_new = true;
3796 min_priority = MIN(min_priority, s->priority);
3797 }
3798 }
3799 }
3800
3801 *ret_min_priority = min_priority;
3802 return something_new;
3803 }
3804
3805 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3806 assert(e);
3807 assert(s);
3808 assert(s->type == SOURCE_CHILD);
3809
3810 if (s->pending)
3811 return 0;
3812
3813 if (event_source_is_offline(s))
3814 return 0;
3815
3816 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3817 return 0;
3818
3819 zero(s->child.siginfo);
3820 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3821 return -errno;
3822
3823 if (s->child.siginfo.si_pid == 0)
3824 return 0;
3825
3826 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3827 s->child.exited = true;
3828
3829 return source_set_pending(s, true);
3830 }
3831
3832 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3833 int r;
3834
3835 assert(e);
3836 assert(d);
3837 assert_return(events == EPOLLIN, -EIO);
3838 assert(min_priority);
3839
3840 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3841 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3842          * per priority, and if we dequeue one, a SIGCHLD enqueued later might go unnoticed, even though
3843          * we might have higher priority children we care about, hence we need to check that
3844          * explicitly. */
3845
3846 if (sigismember(&d->sigset, SIGCHLD))
3847 e->need_process_child = true;
3848
3849 /* If there's already an event source pending for this priority we don't read another */
3850 if (d->current)
3851 return 0;
3852
3853 for (;;) {
3854 struct signalfd_siginfo si;
3855 ssize_t n;
3856 sd_event_source *s = NULL;
3857
3858 n = read(d->fd, &si, sizeof(si));
3859 if (n < 0) {
3860 if (ERRNO_IS_TRANSIENT(errno))
3861 return 0;
3862
3863 return -errno;
3864 }
3865
3866 if (_unlikely_(n != sizeof(si)))
3867 return -EIO;
3868
3869 assert(SIGNAL_VALID(si.ssi_signo));
3870
3871 if (e->signal_sources)
3872 s = e->signal_sources[si.ssi_signo];
3873 if (!s)
3874 continue;
3875 if (s->pending)
3876 continue;
3877
3878 s->signal.siginfo = si;
3879 d->current = s;
3880
3881 r = source_set_pending(s, true);
3882 if (r < 0)
3883 return r;
3884 if (r > 0 && *min_priority >= s->priority) {
3885 *min_priority = s->priority;
3886 return 1; /* an event source with smaller priority is queued. */
3887 }
3888
3889 return 0;
3890 }
3891 }
3892
3893 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3894 ssize_t n;
3895
3896 assert(e);
3897 assert(d);
3898
3899 assert_return(revents == EPOLLIN, -EIO);
3900
3901 /* If there's already an event source pending for this priority, don't read another */
3902 if (d->n_pending > 0)
3903 return 0;
3904
3905 /* Is the read buffer non-empty? If so, let's not read more */
3906 if (d->buffer_filled > 0)
3907 return 0;
3908
3909 if (d->priority > threshold)
3910 return 0;
3911
3912 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3913 if (n < 0) {
3914 if (ERRNO_IS_TRANSIENT(errno))
3915 return 0;
3916
3917 return -errno;
3918 }
3919
3920 assert(n > 0);
3921 d->buffer_filled = (size_t) n;
3922 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
3923
3924 return 1;
3925 }
3926
3927 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3928 assert(e);
3929 assert(d);
3930 assert(sz <= d->buffer_filled);
3931
3932 if (sz == 0)
3933 return;
3934
3935         /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3936 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3937 d->buffer_filled -= sz;
3938
3939 if (d->buffer_filled == 0)
3940 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
3941 }
3942
3943 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3944 int r;
3945
3946 assert(e);
3947 assert(d);
3948
3949 /* If there's already an event source pending for this priority, don't read another */
3950 if (d->n_pending > 0)
3951 return 0;
3952
3953 while (d->buffer_filled > 0) {
3954 size_t sz;
3955
3956 /* Let's validate that the event structures are complete */
3957 if (d->buffer_filled < offsetof(struct inotify_event, name))
3958 return -EIO;
3959
3960 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3961 if (d->buffer_filled < sz)
3962 return -EIO;
3963
3964 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3965 struct inode_data *inode_data;
3966
3967 /* The queue overran, let's pass this event to all event sources connected to this inotify
3968 * object */
3969
3970 HASHMAP_FOREACH(inode_data, d->inodes)
3971 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3972
3973 if (event_source_is_offline(s))
3974 continue;
3975
3976 r = source_set_pending(s, true);
3977 if (r < 0)
3978 return r;
3979 }
3980 } else {
3981 struct inode_data *inode_data;
3982
3983 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3984 * our watch descriptor table. */
3985 if (d->buffer.ev.mask & IN_IGNORED) {
3986
3987 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3988 if (!inode_data) {
3989 event_inotify_data_drop(e, d, sz);
3990 continue;
3991 }
3992
3993 /* The watch descriptor was removed by the kernel, let's drop it here too */
3994 inode_data->wd = -1;
3995 } else {
3996 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3997 if (!inode_data) {
3998 event_inotify_data_drop(e, d, sz);
3999 continue;
4000 }
4001 }
4002
4003 /* Trigger all event sources that are interested in these events. Also trigger all event
4004 * sources if IN_IGNORED or IN_UNMOUNT is set. */
4005 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
4006
4007 if (event_source_is_offline(s))
4008 continue;
4009
4010 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
4011 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
4012 continue;
4013
4014 r = source_set_pending(s, true);
4015 if (r < 0)
4016 return r;
4017 }
4018 }
4019
4020 /* Something pending now? If so, let's finish, otherwise let's read more. */
4021 if (d->n_pending > 0)
4022 return 1;
4023 }
4024
4025 return 0;
4026 }
4027
4028 static int process_inotify(sd_event *e) {
4029 int r, done = 0;
4030
4031 assert(e);
4032
4033 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
4034 r = event_inotify_data_process(e, d);
4035 if (r < 0)
4036 return r;
4037 if (r > 0)
4038 done++;
4039 }
4040
4041 return done;
4042 }
4043
4044 static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
4045 assert(s);
4046 assert(s->type == SOURCE_MEMORY_PRESSURE);
4047
4048 if (s->pending)
4049 s->memory_pressure.revents |= revents;
4050 else
4051 s->memory_pressure.revents = revents;
4052
4053 return source_set_pending(s, true);
4054 }
4055
4056 static int source_memory_pressure_write(sd_event_source *s) {
4057 ssize_t n;
4058 int r;
4059
4060 assert(s);
4061 assert(s->type == SOURCE_MEMORY_PRESSURE);
4062
4063         /* Once we start writing, the buffer is locked; we allow no further changes. */
4064 s->memory_pressure.locked = true;
4065
4066 if (s->memory_pressure.write_buffer_size > 0) {
4067 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4068 if (n < 0) {
4069 if (!ERRNO_IS_TRANSIENT(errno)) {
4070                                 /* If the kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
4071                                  * files, but then generate EOPNOTSUPP on read() and write() (instead of on
4072 * open()!). This sucks hard, since we can only detect this kind of failure
4073 * so late. Let's make the best of it, and turn off the event source like we
4074 * do for failed event source handlers. */
4075
4076 log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
4077 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4078 return 0;
4079 }
4080
4081 n = 0;
4082 }
4083 } else
4084 n = 0;
4085
4086 assert(n >= 0);
4087
4088 if ((size_t) n == s->memory_pressure.write_buffer_size) {
4089 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4090
4091 if (n > 0) {
4092 s->memory_pressure.write_buffer_size = 0;
4093
4094 /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4095 r = source_memory_pressure_register(s, s->enabled);
4096 if (r < 0)
4097 return r;
4098 }
4099 } else if (n > 0) {
4100 _cleanup_free_ void *c = NULL;
4101
4102 assert((size_t) n < s->memory_pressure.write_buffer_size);
4103
4104 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4105 if (!c)
4106 return -ENOMEM;
4107
4108 free_and_replace(s->memory_pressure.write_buffer, c);
4109 s->memory_pressure.write_buffer_size -= n;
4110 return 1;
4111 }
4112
4113 return 0;
4114 }
4115
4116 static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4117 int r;
4118
4119 assert(s);
4120 assert(s->type == SOURCE_MEMORY_PRESSURE);
4121
4122 r = source_memory_pressure_write(s);
4123 if (r < 0)
4124 return r;
4125 if (r > 0)
4126                 return 1; /* If we wrote something, then don't continue with dispatching the user callback.
4127                            * Instead, short-circuit here so that we wait for the next EPOLLOUT immediately. */
4128
4129 /* No pending incoming IO? Then let's not continue further */
4130 if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4131
4132                 /* Treat IO errors on the notifier the same way as errors returned from a callback */
4133 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4134 return -EIO;
4135
4136 return 1; /* leave dispatch, we already processed everything */
4137 }
4138
4139 if (s->memory_pressure.revents & EPOLLIN) {
4140 uint8_t pipe_buf[PIPE_BUF];
4141 ssize_t n;
4142
4143 /* If the fd is readable, then flush out anything that might be queued */
4144
4145 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4146 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4147 return -errno;
4148 }
4149
4150 return 0; /* go on, dispatch to user callback */
4151 }
4152
4153 static int source_dispatch(sd_event_source *s) {
4154 EventSourceType saved_type;
4155 sd_event *saved_event;
4156 int r = 0;
4157
4158 assert(s);
4159 assert(s->pending || s->type == SOURCE_EXIT);
4160
4161 /* Save the event source type, here, so that we still know it after the event callback which might
4162 * invalidate the event. */
4163 saved_type = s->type;
4164
4165 /* Similarly, store a reference to the event loop object, so that we can still access it after the
4166 * callback might have invalidated/disconnected the event source. */
4167 saved_event = s->event;
4168 PROTECT_EVENT(saved_event);
4169
4170 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
4171 assert(!s->ratelimited);
4172 if (!ratelimit_below(&s->rate_limit)) {
4173 r = event_source_enter_ratelimited(s);
4174 if (r < 0)
4175 return r;
4176
4177 return 1;
4178 }
4179
4180 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
4181 r = source_set_pending(s, false);
4182 if (r < 0)
4183 return r;
4184 }
4185
4186 if (s->type != SOURCE_POST) {
4187 sd_event_source *z;
4188
4189 /* If we execute a non-post source, let's mark all post sources as pending. */
4190
4191 SET_FOREACH(z, s->event->post_sources) {
4192 if (event_source_is_offline(z))
4193 continue;
4194
4195 r = source_set_pending(z, true);
4196 if (r < 0)
4197 return r;
4198 }
4199 }
4200
4201 if (s->type == SOURCE_MEMORY_PRESSURE) {
4202 r = source_memory_pressure_initiate_dispatch(s);
4203 if (r == -EIO) /* handle EIO errors similar to callback errors */
4204 goto finish;
4205 if (r < 0)
4206 return r;
4207 if (r > 0) /* already handled */
4208 return 1;
4209 }
4210
4211 if (s->enabled == SD_EVENT_ONESHOT) {
4212 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
4213 if (r < 0)
4214 return r;
4215 }
4216
4217 s->dispatching = true;
4218
4219 switch (s->type) {
4220
4221 case SOURCE_IO:
4222 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4223 break;
4224
4225 case SOURCE_TIME_REALTIME:
4226 case SOURCE_TIME_BOOTTIME:
4227 case SOURCE_TIME_MONOTONIC:
4228 case SOURCE_TIME_REALTIME_ALARM:
4229 case SOURCE_TIME_BOOTTIME_ALARM:
4230 r = s->time.callback(s, s->time.next, s->userdata);
4231 break;
4232
4233 case SOURCE_SIGNAL:
4234 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4235 break;
4236
4237 case SOURCE_CHILD: {
4238 bool zombie;
4239
4240 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
4241
4242 r = s->child.callback(s, &s->child.siginfo, s->userdata);
4243
4244 /* Now, reap the PID for good. */
4245 if (zombie) {
4246 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
4247 s->child.waited = true;
4248 }
4249
4250 break;
4251 }
4252
4253 case SOURCE_DEFER:
4254 r = s->defer.callback(s, s->userdata);
4255 break;
4256
4257 case SOURCE_POST:
4258 r = s->post.callback(s, s->userdata);
4259 break;
4260
4261 case SOURCE_EXIT:
4262 r = s->exit.callback(s, s->userdata);
4263 break;
4264
4265 case SOURCE_INOTIFY: {
4266 struct sd_event *e = s->event;
4267 struct inotify_data *d;
4268 size_t sz;
4269
4270 assert(s->inotify.inode_data);
4271 assert_se(d = s->inotify.inode_data->inotify_data);
4272
4273 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4274 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4275 assert(d->buffer_filled >= sz);
4276
4277 /* If the inotify callback destroys the event source then this likely means we don't need to
4278 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4279                  * free it immediately, then we couldn't drop the event from the inotify event queue without
4280                  * memory corruption anymore, as is done below. Hence, let's not free it immediately, but mark it
4281 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4282 * explicitly GC it after we are done dropping the inotify event from the buffer. */
4283 d->n_busy++;
4284 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
4285 d->n_busy--;
4286
4287 /* When no event is pending anymore on this inotify object, then let's drop the event from
4288 * the inotify event queue buffer. */
4289 if (d->n_pending == 0)
4290 event_inotify_data_drop(e, d, sz);
4291
4292 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4293 event_gc_inotify_data(e, d);
4294 break;
4295 }
4296
4297 case SOURCE_MEMORY_PRESSURE:
4298 r = s->memory_pressure.callback(s, s->userdata);
4299 break;
4300
4301 case SOURCE_WATCHDOG:
4302 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
4303 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
4304 assert_not_reached();
4305 }
4306
4307 s->dispatching = false;
4308
4309 finish:
4310 if (r < 0) {
4311 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4312 strna(s->description),
4313 event_source_type_to_string(saved_type),
4314 s->exit_on_failure ? "exiting" : "disabling");
4315
4316 if (s->exit_on_failure)
4317 (void) sd_event_exit(saved_event, r);
4318 }
4319
4320 if (s->n_ref == 0)
4321 source_free(s);
4322 else if (r < 0)
4323 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4324
4325 return 1;
4326 }
4327
4328 static int event_prepare(sd_event *e) {
4329 int r;
4330
4331 assert(e);
4332
4333 for (;;) {
4334 sd_event_source *s;
4335
4336 s = prioq_peek(e->prepare);
4337 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
4338 break;
4339
4340 s->prepare_iteration = e->iteration;
4341 prioq_reshuffle(e->prepare, s, &s->prepare_index);
4342
4343 assert(s->prepare);
4344 s->dispatching = true;
4345 r = s->prepare(s, s->userdata);
4346 s->dispatching = false;
4347
4348 if (r < 0) {
4349 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4350 strna(s->description),
4351 event_source_type_to_string(s->type),
4352 s->exit_on_failure ? "exiting" : "disabling");
4353
4354 if (s->exit_on_failure)
4355 (void) sd_event_exit(e, r);
4356 }
4357
4358 if (s->n_ref == 0)
4359 source_free(s);
4360 else if (r < 0)
4361 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4362 }
4363
4364 return 0;
4365 }
4366
4367 static int dispatch_exit(sd_event *e) {
4368 sd_event_source *p;
4369 int r;
4370
4371 assert(e);
4372
4373 p = prioq_peek(e->exit);
4374 assert(!p || p->type == SOURCE_EXIT);
4375
4376 if (!p || event_source_is_offline(p)) {
4377 e->state = SD_EVENT_FINISHED;
4378 return 0;
4379 }
4380
4381 PROTECT_EVENT(e);
4382 e->iteration++;
4383 e->state = SD_EVENT_EXITING;
4384 r = source_dispatch(p);
4385 e->state = SD_EVENT_INITIAL;
4386 return r;
4387 }
4388
4389 static sd_event_source* event_next_pending(sd_event *e) {
4390 sd_event_source *p;
4391
4392 assert(e);
4393
4394 p = prioq_peek(e->pending);
4395 if (!p)
4396 return NULL;
4397
4398 if (event_source_is_offline(p))
4399 return NULL;
4400
4401 return p;
4402 }
4403
4404 static int arm_watchdog(sd_event *e) {
4405 struct itimerspec its = {};
4406 usec_t t;
4407
4408 assert(e);
4409 assert(e->watchdog_fd >= 0);
4410
4411 t = sleep_between(e,
4412 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4413 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
4414
4415 timespec_store(&its.it_value, t);
4416
4417 /* Make sure we never set the watchdog to 0, which tells the
4418 * kernel to disable it. */
4419 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4420 its.it_value.tv_nsec = 1;
4421
4422 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
4423 }
4424
4425 static int process_watchdog(sd_event *e) {
4426 assert(e);
4427
4428 if (!e->watchdog)
4429 return 0;
4430
4431         /* Don't notify the watchdog too often */
4432 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4433 return 0;
4434
4435 sd_notify(false, "WATCHDOG=1");
4436 e->watchdog_last = e->timestamp.monotonic;
4437
4438 return arm_watchdog(e);
4439 }
4440
4441 static void event_close_inode_data_fds(sd_event *e) {
4442 struct inode_data *d;
4443
4444 assert(e);
4445
4446 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4447          * filesystems. But we can't close them right away as we need them as long as the user still wants to make
4448 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
4449 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4450 * compromise. */
4451
4452 while ((d = e->inode_data_to_close_list)) {
4453 assert(d->fd >= 0);
4454 d->fd = safe_close(d->fd);
4455
4456 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
4457 }
4458 }
4459
4460 static int event_memory_pressure_write_list(sd_event *e) {
4461 int r;
4462
4463 assert(e);
4464
4465 for (;;) {
4466 sd_event_source *s;
4467
4468 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4469 if (!s)
4470 break;
4471
4472 assert(s->type == SOURCE_MEMORY_PRESSURE);
4473 assert(s->memory_pressure.write_buffer_size > 0);
4474 s->memory_pressure.in_write_list = false;
4475
4476 r = source_memory_pressure_write(s);
4477 if (r < 0)
4478 return r;
4479 }
4480
4481 return 0;
4482 }
4483
4484 _public_ int sd_event_prepare(sd_event *e) {
4485 int r;
4486
4487 assert_return(e, -EINVAL);
4488 assert_return(e = event_resolve(e), -ENOPKG);
4489 assert_return(!event_origin_changed(e), -ECHILD);
4490 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4491 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4492
4493 /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
4494          * this check here once, since gettid() is typically not cached, and we thus want to minimize
4495          * syscalls. */
4496 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4497
4498 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4499 PROTECT_EVENT(e);
4500
4501 if (e->exit_requested)
4502 goto pending;
4503
4504 e->iteration++;
4505
4506 e->state = SD_EVENT_PREPARING;
4507 r = event_prepare(e);
4508 e->state = SD_EVENT_INITIAL;
4509 if (r < 0)
4510 return r;
4511
4512 r = event_memory_pressure_write_list(e);
4513 if (r < 0)
4514 return r;
4515
4516 r = event_arm_timer(e, &e->realtime);
4517 if (r < 0)
4518 return r;
4519
4520 r = event_arm_timer(e, &e->boottime);
4521 if (r < 0)
4522 return r;
4523
4524 r = event_arm_timer(e, &e->monotonic);
4525 if (r < 0)
4526 return r;
4527
4528 r = event_arm_timer(e, &e->realtime_alarm);
4529 if (r < 0)
4530 return r;
4531
4532 r = event_arm_timer(e, &e->boottime_alarm);
4533 if (r < 0)
4534 return r;
4535
4536 event_close_inode_data_fds(e);
4537
4538 if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
4539 goto pending;
4540
4541 e->state = SD_EVENT_ARMED;
4542
4543 return 0;
4544
4545 pending:
4546 e->state = SD_EVENT_ARMED;
4547 r = sd_event_wait(e, 0);
4548 if (r == 0)
4549 e->state = SD_EVENT_ARMED;
4550
4551 return r;
4552 }
4553
4554 static int epoll_wait_usec(
4555 int fd,
4556 struct epoll_event *events,
4557 int maxevents,
4558 usec_t timeout) {
4559
4560 int msec;
4561 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4562
4563 #if HAVE_EPOLL_PWAIT2
4564 static bool epoll_pwait2_absent = false;
4565 int r;
4566
4567 /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4568 * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4569 * is not that obvious to implement given the libc and kernel definitions differ in the last
4570 * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
4571          * biggie); let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
4572 * missing. */
4573
4574 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
4575 r = epoll_pwait2(fd,
4576 events,
4577 maxevents,
4578 TIMESPEC_STORE(timeout),
4579 NULL);
4580 if (r >= 0)
4581 return r;
4582 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
4583                         return -errno; /* Only fall back to the old epoll_wait() if the syscall is masked or not
4584 * supported. */
4585
4586 epoll_pwait2_absent = true;
4587 }
4588 #endif
4589
4590 if (timeout == USEC_INFINITY)
4591 msec = -1;
4592 else {
4593 usec_t k;
4594
4595 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4596 if (k >= INT_MAX)
4597 msec = INT_MAX; /* Saturate */
4598 else
4599 msec = (int) k;
4600 }
4601
4602 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
4603 }
4604
4605 static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
4606 size_t n_event_queue, m, n_event_max;
4607 int64_t min_priority = threshold;
4608 bool something_new = false;
4609 int r;
4610
4611 assert(e);
4612 assert(ret_min_priority);
4613
4614 n_event_queue = MAX(e->n_sources, 1u);
4615 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
4616 return -ENOMEM;
4617
4618 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4619
4620         /* If we still have inotify data buffered, then query the other fds, but don't wait on them */
4621 if (e->buffered_inotify_data_list)
4622 timeout = 0;
4623
4624 for (;;) {
4625 r = epoll_wait_usec(
4626 e->epoll_fd,
4627 e->event_queue,
4628 n_event_max,
4629 timeout);
4630 if (r < 0)
4631 return r;
4632
4633 m = (size_t) r;
4634
4635 if (m < n_event_max)
4636 break;
4637
4638 if (n_event_max >= n_event_queue * 10)
4639 break;
4640
4641 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
4642 return -ENOMEM;
4643
4644 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4645 timeout = 0;
4646 }
4647
4648         /* Set the timestamp only when this is called for the first time. */
4649 if (threshold == INT64_MAX)
4650 triple_timestamp_now(&e->timestamp);
4651
4652 for (size_t i = 0; i < m; i++) {
4653
4654 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4655 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
4656 else {
4657 WakeupType *t = e->event_queue[i].data.ptr;
4658
4659 switch (*t) {
4660
4661 case WAKEUP_EVENT_SOURCE: {
4662 sd_event_source *s = e->event_queue[i].data.ptr;
4663
4664 assert(s);
4665
4666 if (s->priority > threshold)
4667 continue;
4668
4669 min_priority = MIN(min_priority, s->priority);
4670
4671 switch (s->type) {
4672
4673 case SOURCE_IO:
4674 r = process_io(e, s, e->event_queue[i].events);
4675 break;
4676
4677 case SOURCE_CHILD:
4678 r = process_pidfd(e, s, e->event_queue[i].events);
4679 break;
4680
4681 case SOURCE_MEMORY_PRESSURE:
4682 r = process_memory_pressure(s, e->event_queue[i].events);
4683 break;
4684
4685 default:
4686 assert_not_reached();
4687 }
4688
4689 break;
4690 }
4691
4692 case WAKEUP_CLOCK_DATA: {
4693 struct clock_data *d = e->event_queue[i].data.ptr;
4694
4695 assert(d);
4696
4697 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4698 break;
4699 }
4700
4701 case WAKEUP_SIGNAL_DATA:
4702 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4703 break;
4704
4705 case WAKEUP_INOTIFY_DATA:
4706 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4707 break;
4708
4709 default:
4710 assert_not_reached();
4711 }
4712 }
4713 if (r < 0)
4714 return r;
4715 if (r > 0)
4716 something_new = true;
4717 }
4718
4719 *ret_min_priority = min_priority;
4720 return something_new;
4721 }
4722
4723 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4724 int r;
4725
4726 assert_return(e, -EINVAL);
4727 assert_return(e = event_resolve(e), -ENOPKG);
4728 assert_return(!event_origin_changed(e), -ECHILD);
4729 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4730 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4731
4732 if (e->exit_requested) {
4733 e->state = SD_EVENT_PENDING;
4734 return 1;
4735 }
4736
4737 for (int64_t threshold = INT64_MAX; ; threshold--) {
4738 int64_t epoll_min_priority, child_min_priority;
4739
4740                 /* It is possible that new epoll (especially IO) and child events are triggered just
4741                  * after the process_epoll() call but before process_child(), and the new IO events may
4742                  * have a higher priority than the child events. To salvage these events, let's call
4743                  * epoll_wait() again, but accept only events with a higher priority than the previous
4744                  * ones. See issue https://github.com/systemd/systemd/issues/18190 and comments
4745 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4746 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4747
4748 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4749 if (r == -EINTR) {
4750 e->state = SD_EVENT_PENDING;
4751 return 1;
4752 }
4753 if (r < 0)
4754 goto finish;
4755 if (r == 0 && threshold < INT64_MAX)
4756 /* No new epoll event. */
4757 break;
4758
4759 r = process_child(e, threshold, &child_min_priority);
4760 if (r < 0)
4761 goto finish;
4762 if (r == 0)
4763 /* No new child event. */
4764 break;
4765
4766 threshold = MIN(epoll_min_priority, child_min_priority);
4767 if (threshold == INT64_MIN)
4768 break;
4769
4770 timeout = 0;
4771 }
4772
4773 r = process_watchdog(e);
4774 if (r < 0)
4775 goto finish;
4776
4777 r = process_inotify(e);
4778 if (r < 0)
4779 goto finish;
4780
4781 r = process_timer(e, e->timestamp.realtime, &e->realtime);
4782 if (r < 0)
4783 goto finish;
4784
4785 r = process_timer(e, e->timestamp.boottime, &e->boottime);
4786 if (r < 0)
4787 goto finish;
4788
4789 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4790 if (r < 0)
4791 goto finish;
4792
4793 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4794 if (r < 0)
4795 goto finish;
4796
4797 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4798 if (r < 0)
4799 goto finish;
4800 else if (r == 1) {
4801                 /* A ratelimit expiry callback was called. Let's postpone processing pending sources and
4802                  * put the loop in the initial state, in order to also evaluate (in the next iteration)
4803                  * sources that were potentially re-enabled by the callback.
4804                  *
4805                  * Wondering why we treat only this invocation of process_timer() differently? Once an event
4806                  * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4807                  * ratelimit expiry callback is never called for any other timer type. */
4808 r = 0;
4809 goto finish;
4810 }
4811
4812 if (event_next_pending(e)) {
4813 e->state = SD_EVENT_PENDING;
4814 return 1;
4815 }
4816
4817 r = 0;
4818
4819 finish:
4820 e->state = SD_EVENT_INITIAL;
4821
4822 return r;
4823 }
4824
4825 _public_ int sd_event_dispatch(sd_event *e) {
4826 sd_event_source *p;
4827 int r;
4828
4829 assert_return(e, -EINVAL);
4830 assert_return(e = event_resolve(e), -ENOPKG);
4831 assert_return(!event_origin_changed(e), -ECHILD);
4832 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4833 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4834
4835 if (e->exit_requested)
4836 return dispatch_exit(e);
4837
4838 p = event_next_pending(e);
4839 if (p) {
4840 PROTECT_EVENT(e);
4841
4842 e->state = SD_EVENT_RUNNING;
4843 r = source_dispatch(p);
4844 e->state = SD_EVENT_INITIAL;
4845 return r;
4846 }
4847
4848 e->state = SD_EVENT_INITIAL;
4849
4850 return 1;
4851 }
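
/*
 * Editorial usage sketch, not part of this file: how sd_event_prepare(), sd_event_wait() and
 * sd_event_dispatch() above fit together when driving the loop by hand, roughly what
 * sd_event_run() does internally. Assumes an already set-up loop `e`; error handling is
 * reduced to returning the failure.
 */
#include <systemd/sd-event.h>

static int manual_loop(sd_event *e) {
        int r;

        while (sd_event_get_state(e) != SD_EVENT_FINISHED) {
                r = sd_event_prepare(e);                   /* > 0: sources are already pending */
                if (r < 0)
                        return r;

                if (r == 0) {
                        r = sd_event_wait(e, UINT64_MAX);  /* block until something is pending */
                        if (r < 0)
                                return r;
                }

                if (r > 0) {
                        r = sd_event_dispatch(e);          /* run one pending source's callback */
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}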
4852
4853 static void event_log_delays(sd_event *e) {
4854 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4855 size_t l, i;
4856
4857 p = b;
4858 l = sizeof(b);
4859 for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4860 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4861 e->delays[i] = 0;
4862 }
4863 log_debug("Event loop iterations: %s", b);
4864 }
4865
4866 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4867 int r;
4868
4869 assert_return(e, -EINVAL);
4870 assert_return(e = event_resolve(e), -ENOPKG);
4871 assert_return(!event_origin_changed(e), -ECHILD);
4872 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4873 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4874
4875 if (e->profile_delays && e->last_run_usec != 0) {
4876 usec_t this_run;
4877 unsigned l;
4878
4879 this_run = now(CLOCK_MONOTONIC);
4880
4881 l = log2u64(this_run - e->last_run_usec);
4882 assert(l < ELEMENTSOF(e->delays));
4883 e->delays[l]++;
4884
4885 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4886 event_log_delays(e);
4887 e->last_log_usec = this_run;
4888 }
4889 }
4890
4891 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4892 PROTECT_EVENT(e);
4893
4894 r = sd_event_prepare(e);
4895 if (r == 0)
4896 /* There was nothing? Then wait... */
4897 r = sd_event_wait(e, timeout);
4898
4899 if (e->profile_delays)
4900 e->last_run_usec = now(CLOCK_MONOTONIC);
4901
4902 if (r > 0) {
4903                 /* There's something now, so let's dispatch it */
4904 r = sd_event_dispatch(e);
4905 if (r < 0)
4906 return r;
4907
4908 return 1;
4909 }
4910
4911 return r;
4912 }
4913
4914 _public_ int sd_event_loop(sd_event *e) {
4915 int r;
4916
4917 assert_return(e, -EINVAL);
4918 assert_return(e = event_resolve(e), -ENOPKG);
4919 assert_return(!event_origin_changed(e), -ECHILD);
4920 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4921
4922
4923 PROTECT_EVENT(e);
4924
4925 while (e->state != SD_EVENT_FINISHED) {
4926 r = sd_event_run(e, UINT64_MAX);
4927 if (r < 0)
4928 return r;
4929 }
4930
4931 return e->exit_code;
4932 }
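
/*
 * Editorial usage sketch, not part of this file: a minimal standalone program driving the loop
 * with sd_event_loop() above. It watches stdin and asks the loop to exit once input becomes
 * readable. Assumes linking against libsystemd; the handler name is illustrative only.
 */
#include <sys/epoll.h>
#include <systemd/sd-event.h>
#include <unistd.h>

static int on_stdin(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        /* Ask the loop to terminate; sd_event_loop() then returns this exit code. */
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

int main(void) {
        sd_event *e = NULL;
        int r;

        r = sd_event_default(&e);                 /* the calling thread's default event loop */
        if (r < 0)
                return 1;

        r = sd_event_add_io(e, NULL, STDIN_FILENO, EPOLLIN, on_stdin, NULL);
        if (r < 0) {
                sd_event_unref(e);
                return 1;
        }

        r = sd_event_loop(e);                     /* runs until sd_event_exit() is called */
        sd_event_unref(e);
        return r < 0 ? 1 : 0;
}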
4933
4934 _public_ int sd_event_get_fd(sd_event *e) {
4935 assert_return(e, -EINVAL);
4936 assert_return(e = event_resolve(e), -ENOPKG);
4937 assert_return(!event_origin_changed(e), -ECHILD);
4938
4939 return e->epoll_fd;
4940 }
4941
4942 _public_ int sd_event_get_state(sd_event *e) {
4943 assert_return(e, -EINVAL);
4944 assert_return(e = event_resolve(e), -ENOPKG);
4945 assert_return(!event_origin_changed(e), -ECHILD);
4946
4947 return e->state;
4948 }
4949
4950 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4951 assert_return(e, -EINVAL);
4952 assert_return(e = event_resolve(e), -ENOPKG);
4953 assert_return(!event_origin_changed(e), -ECHILD);
4954
4955 if (!e->exit_requested)
4956 return -ENODATA;
4957
4958 if (code)
4959 *code = e->exit_code;
4960 return 0;
4961 }
4962
4963 _public_ int sd_event_exit(sd_event *e, int code) {
4964 assert_return(e, -EINVAL);
4965 assert_return(e = event_resolve(e), -ENOPKG);
4966 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4967 assert_return(!event_origin_changed(e), -ECHILD);
4968
4969 e->exit_requested = true;
4970 e->exit_code = code;
4971
4972 return 0;
4973 }
4974
4975 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4976 assert_return(e, -EINVAL);
4977 assert_return(e = event_resolve(e), -ENOPKG);
4978 assert_return(usec, -EINVAL);
4979 assert_return(!event_origin_changed(e), -ECHILD);
4980
4981 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4982 return -EOPNOTSUPP;
4983
4984 if (!triple_timestamp_is_set(&e->timestamp)) {
4985 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4986 *usec = now(clock);
4987 return 1;
4988 }
4989
4990 *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4991 return 0;
4992 }
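
/*
 * Editorial usage sketch, not part of this file: sd_event_now() above is typically used to
 * schedule a timer relative to the loop's cached wakeup timestamp instead of querying the
 * clock directly. Hypothetical 5-second one-shot timer; error handling is trimmed.
 */
#include <systemd/sd-event.h>

static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

static int add_timer_in_5s(sd_event *e) {
        uint64_t now_usec;
        int r;

        r = sd_event_now(e, CLOCK_MONOTONIC, &now_usec);  /* cached if the loop already ran */
        if (r < 0)
                return r;

        /* Passing 0 as the accuracy argument selects the library's default accuracy. */
        return sd_event_add_time(e, NULL, CLOCK_MONOTONIC,
                                 now_usec + 5 * 1000000ULL, 0,
                                 on_timer, NULL);
}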
4993
4994 _public_ int sd_event_default(sd_event **ret) {
4995 sd_event *e = NULL;
4996 int r;
4997
4998 if (!ret)
4999 return !!default_event;
5000
5001 if (default_event) {
5002 *ret = sd_event_ref(default_event);
5003 return 0;
5004 }
5005
5006 r = sd_event_new(&e);
5007 if (r < 0)
5008 return r;
5009
5010 e->default_event_ptr = &default_event;
5011 e->tid = gettid();
5012 default_event = e;
5013
5014 *ret = e;
5015 return 1;
5016 }
5017
5018 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
5019 assert_return(e, -EINVAL);
5020 assert_return(e = event_resolve(e), -ENOPKG);
5021 assert_return(tid, -EINVAL);
5022 assert_return(!event_origin_changed(e), -ECHILD);
5023
5024 if (e->tid != 0) {
5025 *tid = e->tid;
5026 return 0;
5027 }
5028
5029 return -ENXIO;
5030 }
5031
5032 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
5033 int r;
5034
5035 assert_return(e, -EINVAL);
5036 assert_return(e = event_resolve(e), -ENOPKG);
5037 assert_return(!event_origin_changed(e), -ECHILD);
5038
5039 if (e->watchdog == !!b)
5040 return e->watchdog;
5041
5042 if (b) {
5043 r = sd_watchdog_enabled(false, &e->watchdog_period);
5044 if (r <= 0)
5045 return r;
5046
5047 /* Issue first ping immediately */
5048 sd_notify(false, "WATCHDOG=1");
5049 e->watchdog_last = now(CLOCK_MONOTONIC);
5050
5051 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
5052 if (e->watchdog_fd < 0)
5053 return -errno;
5054
5055 r = arm_watchdog(e);
5056 if (r < 0)
5057 goto fail;
5058
5059 struct epoll_event ev = {
5060 .events = EPOLLIN,
5061 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
5062 };
5063
5064 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
5065 r = -errno;
5066 goto fail;
5067 }
5068
5069 } else {
5070 if (e->watchdog_fd >= 0) {
5071 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
5072 e->watchdog_fd = safe_close(e->watchdog_fd);
5073 }
5074 }
5075
5076 e->watchdog = b;
5077 return e->watchdog;
5078
5079 fail:
5080 e->watchdog_fd = safe_close(e->watchdog_fd);
5081 return r;
5082 }
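
/*
 * Editorial usage sketch, not part of this file: with sd_event_set_watchdog() above enabled,
 * the loop itself sends "WATCHDOG=1" notifications (between 1/2 and 3/4 of the period, see
 * arm_watchdog() above), so a Type=notify service with WatchdogSec= needs no manual pings.
 * The helper name is illustrative only.
 */
#include <systemd/sd-event.h>

static int setup_watchdog(sd_event *e) {
        int r;

        r = sd_event_set_watchdog(e, 1);
        if (r < 0)
                return r;   /* error */
        if (r == 0)
                return 0;   /* $WATCHDOG_USEC not set by the service manager; nothing to do */

        return 1;           /* watchdog pings are now handled by the event loop */
}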
5083
5084 _public_ int sd_event_get_watchdog(sd_event *e) {
5085 assert_return(e, -EINVAL);
5086 assert_return(e = event_resolve(e), -ENOPKG);
5087 assert_return(!event_origin_changed(e), -ECHILD);
5088
5089 return e->watchdog;
5090 }
5091
5092 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5093 assert_return(e, -EINVAL);
5094 assert_return(e = event_resolve(e), -ENOPKG);
5095 assert_return(!event_origin_changed(e), -ECHILD);
5096
5097 *ret = e->iteration;
5098 return 0;
5099 }
5100
5101 _public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5102 assert_return(s, -EINVAL);
5103 assert_return(s->event, -EINVAL);
5104 assert_return(!event_origin_changed(s->event), -ECHILD);
5105
5106 s->destroy_callback = callback;
5107 return 0;
5108 }
5109
5110 _public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5111 assert_return(s, -EINVAL);
5112 assert_return(!event_origin_changed(s->event), -ECHILD);
5113
5114 if (ret)
5115 *ret = s->destroy_callback;
5116
5117 return !!s->destroy_callback;
5118 }
5119
5120 _public_ int sd_event_source_get_floating(sd_event_source *s) {
5121 assert_return(s, -EINVAL);
5122 assert_return(!event_origin_changed(s->event), -ECHILD);
5123
5124 return s->floating;
5125 }
5126
5127 _public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5128 assert_return(s, -EINVAL);
5129 assert_return(!event_origin_changed(s->event), -ECHILD);
5130
5131 if (s->floating == !!b)
5132 return 0;
5133
5134 if (!s->event) /* Already disconnected */
5135 return -ESTALE;
5136
5137 s->floating = b;
5138
5139 if (b) {
5140 sd_event_source_ref(s);
5141 sd_event_unref(s->event);
5142 } else {
5143 sd_event_ref(s->event);
5144 sd_event_source_unref(s);
5145 }
5146
5147 return 1;
5148 }
5149
5150 _public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5151 assert_return(s, -EINVAL);
5152 assert_return(s->type != SOURCE_EXIT, -EDOM);
5153 assert_return(!event_origin_changed(s->event), -ECHILD);
5154
5155 return s->exit_on_failure;
5156 }
5157
5158 _public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5159 assert_return(s, -EINVAL);
5160 assert_return(s->type != SOURCE_EXIT, -EDOM);
5161 assert_return(!event_origin_changed(s->event), -ECHILD);
5162
5163 if (s->exit_on_failure == !!b)
5164 return 0;
5165
5166 s->exit_on_failure = b;
5167 return 1;
5168 }
5169
5170 _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5171 int r;
5172
5173 assert_return(s, -EINVAL);
5174 assert_return(!event_origin_changed(s->event), -ECHILD);
5175
5176         /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
5177 * so is a programming error. */
5178 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5179
5180 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5181 * non-ratelimited. */
5182 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5183 if (r < 0)
5184 return r;
5185
5186 s->rate_limit = (RateLimit) { interval, burst };
5187 return 0;
5188 }
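
/*
 * Editorial usage sketch, not part of this file: capping how often an IO event source may fire
 * via sd_event_source_set_ratelimit() above. The numbers are hypothetical: at most 10 dispatches
 * per 1-second interval; once exceeded the source is taken offline until the interval elapses.
 * An optional expiry callback can be installed with sd_event_source_set_ratelimit_expire_callback()
 * below.
 */
#include <systemd/sd-event.h>

static int cap_io_source(sd_event_source *s) {
        /* interval in microseconds, burst as a plain dispatch count */
        return sd_event_source_set_ratelimit(s, 1 * 1000000ULL, 10);
}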
5189
5190 _public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5191 assert_return(s, -EINVAL);
5192 assert_return(!event_origin_changed(s->event), -ECHILD);
5193
5194 s->ratelimit_expire_callback = callback;
5195 return 0;
5196 }
5197
5198 _public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5199 assert_return(s, -EINVAL);
5200 assert_return(!event_origin_changed(s->event), -ECHILD);
5201
5202 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5203 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
5204 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5205 return -EDOM;
5206
5207 if (!ratelimit_configured(&s->rate_limit))
5208 return -ENOEXEC;
5209
5210 if (ret_interval)
5211 *ret_interval = s->rate_limit.interval;
5212 if (ret_burst)
5213 *ret_burst = s->rate_limit.burst;
5214
5215 return 0;
5216 }
5217
5218 _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5219 assert_return(s, -EINVAL);
5220 assert_return(!event_origin_changed(s->event), -ECHILD);
5221
5222 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5223 return false;
5224
5225 if (!ratelimit_configured(&s->rate_limit))
5226 return false;
5227
5228 return s->ratelimited;
5229 }
5230
5231 _public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
5232 int r;
5233
5234 assert_return(s, -EINVAL);
5235
5236 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5237 return 0;
5238
5239 if (!ratelimit_configured(&s->rate_limit))
5240 return 0;
5241
5242 if (!s->ratelimited)
5243 return 0;
5244
5245 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5246 if (r < 0)
5247 return r;
5248
5249 return 1; /* tell caller that we indeed just left the ratelimit state */
5250 }
5251
5252 _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5253 bool change = false;
5254 int r;
5255
5256 assert_return(e, -EINVAL);
5257
5258 if (b) {
5259                 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5260                  * to. But we also don't want them to pin the event loop itself. Hence we mark them as
5261 * floating after creation (and undo this before deleting them again). */
5262
5263 if (!e->sigint_event_source) {
5264 r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5265 if (r < 0)
5266 return r;
5267
5268 assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5269 change = true;
5270 }
5271
5272 if (!e->sigterm_event_source) {
5273 r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5274 if (r < 0) {
5275 if (change) {
5276 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5277 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5278 }
5279
5280 return r;
5281 }
5282
5283 assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5284 change = true;
5285 }
5286
5287 } else {
5288 if (e->sigint_event_source) {
5289 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5290 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5291 change = true;
5292 }
5293
5294 if (e->sigterm_event_source) {
5295 assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5296 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5297 change = true;
5298 }
5299 }
5300
5301 return change;
5302 }
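
/*
 * Editorial usage sketch, not part of this file: sd_event_set_signal_exit() above installs
 * floating SIGINT/SIGTERM sources whose default handlers terminate the loop, which is usually
 * all a daemon needs for a clean shutdown. The helper name is illustrative only.
 */
#include <systemd/sd-event.h>

static int run_daemon_loop(sd_event *e) {
        int r;

        r = sd_event_set_signal_exit(e, 1);   /* ^C or SIGTERM now ends sd_event_loop() */
        if (r < 0)
                return r;

        return sd_event_loop(e);
}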
5303
5304 _public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5305 _cleanup_free_ char *b = NULL;
5306 _cleanup_free_ void *w = NULL;
5307
5308 assert_return(s, -EINVAL);
5309 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5310 assert_return(ty, -EINVAL);
5311 assert_return(!event_origin_changed(s->event), -ECHILD);
5312
5313 if (!STR_IN_SET(ty, "some", "full"))
5314 return -EINVAL;
5315
5316         if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5317 return -EBUSY;
5318
5319 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5320 if (!space)
5321 return -EINVAL;
5322
5323 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5324 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5325 if (!b)
5326 return -ENOMEM;
5327 if (!STR_IN_SET(b, "some", "full"))
5328 return -EINVAL;
5329
5330 if (streq(b, ty))
5331 return 0;
5332
5333 size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5334 w = new(char, nl);
5335 if (!w)
5336 return -ENOMEM;
5337
5338 memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5339
5340 free_and_replace(s->memory_pressure.write_buffer, w);
5341 s->memory_pressure.write_buffer_size = nl;
5342 s->memory_pressure.locked = false;
5343
5344 return 1;
5345 }
5346
5347 _public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5348 _cleanup_free_ char *b = NULL;
5349 _cleanup_free_ void *w = NULL;
5350
5351 assert_return(s, -EINVAL);
5352 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5353 assert_return(!event_origin_changed(s->event), -ECHILD);
5354
5355 if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5356 return -ERANGE;
5357 if (window_usec <= 0 || window_usec >= UINT64_MAX)
5358 return -ERANGE;
5359 if (threshold_usec > window_usec)
5360 return -EINVAL;
5361
5362         if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5363 return -EBUSY;
5364
5365 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5366 if (!space)
5367 return -EINVAL;
5368
5369 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5370 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5371 if (!b)
5372 return -ENOMEM;
5373 if (!STR_IN_SET(b, "some", "full"))
5374 return -EINVAL;
5375
5376 if (asprintf((char**) &w,
5377 "%s " USEC_FMT " " USEC_FMT "",
5378 b,
5379 threshold_usec,
5380 window_usec) < 0)
5381 return -EINVAL;
5382
5383 l = strlen(w) + 1;
5384 if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5385 return 0;
5386
5387 free_and_replace(s->memory_pressure.write_buffer, w);
5388 s->memory_pressure.write_buffer_size = l;
5389 s->memory_pressure.locked = false;
5390
5391 return 1;
5392 }
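
/*
 * Editorial usage sketch, not part of this file: combining sd_event_add_memory_pressure() with
 * the two setters above. The thresholds are hypothetical: watch for "full" stalls of at least
 * 100 ms within a 1 s window. The parameters must be adjusted before the loop first writes the
 * configuration to the kernel, since that locks them. Cleanup of the source on error is omitted.
 */
#include <systemd/sd-event.h>

static int on_pressure(sd_event_source *s, void *userdata) {
        /* e.g. drop caches, shrink memory pools, ... */
        return 0;
}

static int add_pressure_source(sd_event *e) {
        sd_event_source *s = NULL;
        int r;

        r = sd_event_add_memory_pressure(e, &s, on_pressure, NULL);
        if (r < 0)
                return r;

        r = sd_event_source_set_memory_pressure_type(s, "full");
        if (r < 0)
                return r;

        return sd_event_source_set_memory_pressure_period(s,
                                                          100 * 1000ULL,    /* threshold: 100 ms */
                                                          1 * 1000000ULL);  /* window: 1 s */
}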