sd-event: always initialize sd_event.perturb
[thirdparty/systemd.git] / src / libsystemd / sd-event / sd-event.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/epoll.h>
4 #include <sys/timerfd.h>
5 #include <sys/wait.h>
6
7 #include "sd-daemon.h"
8 #include "sd-event.h"
9 #include "sd-id128.h"
10 #include "sd-messages.h"
11
12 #include "alloc-util.h"
13 #include "env-util.h"
14 #include "event-source.h"
15 #include "fd-util.h"
16 #include "fs-util.h"
17 #include "glyph-util.h"
18 #include "hashmap.h"
19 #include "hexdecoct.h"
20 #include "list.h"
21 #include "logarithm.h"
22 #include "macro.h"
23 #include "mallinfo-util.h"
24 #include "memory-util.h"
25 #include "missing_magic.h"
26 #include "missing_syscall.h"
27 #include "path-util.h"
28 #include "prioq.h"
29 #include "process-util.h"
30 #include "psi-util.h"
31 #include "set.h"
32 #include "signal-util.h"
33 #include "socket-util.h"
34 #include "stat-util.h"
35 #include "string-table.h"
36 #include "string-util.h"
37 #include "strxcpyx.h"
38 #include "time-util.h"
39
40 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
41
42 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
43 /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
44 return s &&
45 s->type == SOURCE_CHILD &&
46 s->child.pidfd >= 0 &&
47 s->child.options == WEXITED;
48 }
49
50 static bool event_source_is_online(sd_event_source *s) {
51 assert(s);
52 return s->enabled != SD_EVENT_OFF && !s->ratelimited;
53 }
54
55 static bool event_source_is_offline(sd_event_source *s) {
56 assert(s);
57 return s->enabled == SD_EVENT_OFF || s->ratelimited;
58 }
59
60 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
61 [SOURCE_IO] = "io",
62 [SOURCE_TIME_REALTIME] = "realtime",
63                 [SOURCE_TIME_BOOTTIME] = "boottime",
64 [SOURCE_TIME_MONOTONIC] = "monotonic",
65 [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
66 [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
67 [SOURCE_SIGNAL] = "signal",
68 [SOURCE_CHILD] = "child",
69 [SOURCE_DEFER] = "defer",
70 [SOURCE_POST] = "post",
71 [SOURCE_EXIT] = "exit",
72 [SOURCE_WATCHDOG] = "watchdog",
73 [SOURCE_INOTIFY] = "inotify",
74 [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
75 };
76
77 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
78
79 #define EVENT_SOURCE_IS_TIME(t) \
80 IN_SET((t), \
81 SOURCE_TIME_REALTIME, \
82 SOURCE_TIME_BOOTTIME, \
83 SOURCE_TIME_MONOTONIC, \
84 SOURCE_TIME_REALTIME_ALARM, \
85 SOURCE_TIME_BOOTTIME_ALARM)
86
87 #define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
88 IN_SET((t), \
89 SOURCE_IO, \
90 SOURCE_TIME_REALTIME, \
91 SOURCE_TIME_BOOTTIME, \
92 SOURCE_TIME_MONOTONIC, \
93 SOURCE_TIME_REALTIME_ALARM, \
94 SOURCE_TIME_BOOTTIME_ALARM, \
95 SOURCE_SIGNAL, \
96 SOURCE_DEFER, \
97 SOURCE_INOTIFY, \
98 SOURCE_MEMORY_PRESSURE)
99
100 /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
101 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
102 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
103 #define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
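
/* A rough usage sketch of the rate limit feature these macros gate (illustrative only; "s" stands for any
 * already allocated event source whose type is listed in EVENT_SOURCE_CAN_RATE_LIMIT()):
 *
 *     // Allow at most 10 dispatches per 1s interval; once exceeded the source is taken offline
 *     // until the interval elapses, after which it is automatically brought back online.
 *     assert_se(sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10) >= 0);
 */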
104
105 struct sd_event {
106 unsigned n_ref;
107
108 int epoll_fd;
109 int watchdog_fd;
110
111 Prioq *pending;
112 Prioq *prepare;
113
114 /* timerfd_create() only supports these five clocks so far. We
115 * can add support for more clocks when the kernel learns to
116 * deal with them, too. */
117 struct clock_data realtime;
118 struct clock_data boottime;
119 struct clock_data monotonic;
120 struct clock_data realtime_alarm;
121 struct clock_data boottime_alarm;
122
123 usec_t perturb;
124
125 sd_event_source **signal_sources; /* indexed by signal number */
126 Hashmap *signal_data; /* indexed by priority */
127
128 Hashmap *child_sources;
129 unsigned n_online_child_sources;
130
131 Set *post_sources;
132
133 Prioq *exit;
134
135 Hashmap *inotify_data; /* indexed by priority */
136
137 /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
138 LIST_HEAD(struct inode_data, inode_data_to_close_list);
139
140 /* A list of inotify objects that already have events buffered which aren't processed yet */
141 LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
142
143 /* A list of memory pressure event sources that still need their subscription string written */
144 LIST_HEAD(sd_event_source, memory_pressure_write_list);
145
146 pid_t original_pid;
147
148 uint64_t iteration;
149 triple_timestamp timestamp;
150 int state;
151
152 bool exit_requested:1;
153 bool need_process_child:1;
154 bool watchdog:1;
155 bool profile_delays:1;
156
157 int exit_code;
158
159 pid_t tid;
160 sd_event **default_event_ptr;
161
162 usec_t watchdog_last, watchdog_period;
163
164 unsigned n_sources;
165
166 struct epoll_event *event_queue;
167
168 LIST_HEAD(sd_event_source, sources);
169
170 sd_event_source *sigint_event_source, *sigterm_event_source;
171
172 usec_t last_run_usec, last_log_usec;
173 unsigned delays[sizeof(usec_t) * 8];
174 };
175
176 static thread_local sd_event *default_event = NULL;
177
178 static void source_disconnect(sd_event_source *s);
179 static void event_gc_inode_data(sd_event *e, struct inode_data *d);
180
181 static sd_event *event_resolve(sd_event *e) {
182 return e == SD_EVENT_DEFAULT ? default_event : e;
183 }
184
185 static int pending_prioq_compare(const void *a, const void *b) {
186 const sd_event_source *x = a, *y = b;
187 int r;
188
189 assert(x->pending);
190 assert(y->pending);
191
192 /* Enabled ones first */
193 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
194 if (r != 0)
195 return r;
196
197 /* Non rate-limited ones first. */
198 r = CMP(!!x->ratelimited, !!y->ratelimited);
199 if (r != 0)
200 return r;
201
202 /* Lower priority values first */
203 r = CMP(x->priority, y->priority);
204 if (r != 0)
205 return r;
206
207 /* Older entries first */
208 return CMP(x->pending_iteration, y->pending_iteration);
209 }
210
211 static int prepare_prioq_compare(const void *a, const void *b) {
212 const sd_event_source *x = a, *y = b;
213 int r;
214
215 assert(x->prepare);
216 assert(y->prepare);
217
218 /* Enabled ones first */
219 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
220 if (r != 0)
221 return r;
222
223 /* Non rate-limited ones first. */
224 r = CMP(!!x->ratelimited, !!y->ratelimited);
225 if (r != 0)
226 return r;
227
228 /* Move most recently prepared ones last, so that we can stop
229 * preparing as soon as we hit one that has already been
230 * prepared in the current iteration */
231 r = CMP(x->prepare_iteration, y->prepare_iteration);
232 if (r != 0)
233 return r;
234
235 /* Lower priority values first */
236 return CMP(x->priority, y->priority);
237 }
238
239 static usec_t time_event_source_next(const sd_event_source *s) {
240 assert(s);
241
242 /* We have two kinds of event sources that have elapsation times associated with them: the actual
243 * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
244 * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
245 * looking at here. */
246
247 if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
248 assert(s->rate_limit.begin != 0);
249 assert(s->rate_limit.interval != 0);
250 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
251 }
252
253 /* Otherwise this must be a time event source, if not ratelimited */
254 if (EVENT_SOURCE_IS_TIME(s->type))
255 return s->time.next;
256
257 return USEC_INFINITY;
258 }
259
260 static usec_t time_event_source_latest(const sd_event_source *s) {
261 assert(s);
262
263 if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
264                                * same, as we should avoid adding additional inaccuracy on top of an already
265                                * inaccurate time window */
266 assert(s->rate_limit.begin != 0);
267 assert(s->rate_limit.interval != 0);
268 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
269 }
270
271 /* Must be a time event source, if not ratelimited */
272 if (EVENT_SOURCE_IS_TIME(s->type))
273 return usec_add(s->time.next, s->time.accuracy);
274
275 return USEC_INFINITY;
276 }
277
278 static bool event_source_timer_candidate(const sd_event_source *s) {
279 assert(s);
280
281         /* Returns true for event sources that either are not pending yet (i.e. where it's worth marking them pending)
282 * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
283 return !s->pending || s->ratelimited;
284 }
285
286 static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
287 const sd_event_source *x = a, *y = b;
288 int r;
289
290 /* Enabled ones first */
291 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
292 if (r != 0)
293 return r;
294
295 /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
296 r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
297 if (r != 0)
298 return r;
299
300 /* Order by time */
301 return CMP(time_func(x), time_func(y));
302 }
303
304 static int earliest_time_prioq_compare(const void *a, const void *b) {
305 return time_prioq_compare(a, b, time_event_source_next);
306 }
307
308 static int latest_time_prioq_compare(const void *a, const void *b) {
309 return time_prioq_compare(a, b, time_event_source_latest);
310 }
311
312 static int exit_prioq_compare(const void *a, const void *b) {
313 const sd_event_source *x = a, *y = b;
314 int r;
315
316 assert(x->type == SOURCE_EXIT);
317 assert(y->type == SOURCE_EXIT);
318
319 /* Enabled ones first */
320 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
321 if (r != 0)
322 return r;
323
324 /* Lower priority values first */
325 return CMP(x->priority, y->priority);
326 }
327
328 static void free_clock_data(struct clock_data *d) {
329 assert(d);
330 assert(d->wakeup == WAKEUP_CLOCK_DATA);
331
332 safe_close(d->fd);
333 prioq_free(d->earliest);
334 prioq_free(d->latest);
335 }
336
337 static sd_event *event_free(sd_event *e) {
338 sd_event_source *s;
339
340 assert(e);
341
342 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
343 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
344
345 while ((s = e->sources)) {
346 assert(s->floating);
347 source_disconnect(s);
348 sd_event_source_unref(s);
349 }
350
351 assert(e->n_sources == 0);
352
353 if (e->default_event_ptr)
354 *(e->default_event_ptr) = NULL;
355
356 safe_close(e->epoll_fd);
357 safe_close(e->watchdog_fd);
358
359 free_clock_data(&e->realtime);
360 free_clock_data(&e->boottime);
361 free_clock_data(&e->monotonic);
362 free_clock_data(&e->realtime_alarm);
363 free_clock_data(&e->boottime_alarm);
364
365 prioq_free(e->pending);
366 prioq_free(e->prepare);
367 prioq_free(e->exit);
368
369 free(e->signal_sources);
370 hashmap_free(e->signal_data);
371
372 hashmap_free(e->inotify_data);
373
374 hashmap_free(e->child_sources);
375 set_free(e->post_sources);
376
377 free(e->event_queue);
378
379 return mfree(e);
380 }
381
382 _public_ int sd_event_new(sd_event** ret) {
383 sd_event *e;
384 int r;
385
386 assert_return(ret, -EINVAL);
387
388 e = new(sd_event, 1);
389 if (!e)
390 return -ENOMEM;
391
392 *e = (sd_event) {
393 .n_ref = 1,
394 .epoll_fd = -EBADF,
395 .watchdog_fd = -EBADF,
396 .realtime.wakeup = WAKEUP_CLOCK_DATA,
397 .realtime.fd = -EBADF,
398 .realtime.next = USEC_INFINITY,
399 .boottime.wakeup = WAKEUP_CLOCK_DATA,
400 .boottime.fd = -EBADF,
401 .boottime.next = USEC_INFINITY,
402 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
403 .monotonic.fd = -EBADF,
404 .monotonic.next = USEC_INFINITY,
405 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
406 .realtime_alarm.fd = -EBADF,
407 .realtime_alarm.next = USEC_INFINITY,
408 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
409 .boottime_alarm.fd = -EBADF,
410 .boottime_alarm.next = USEC_INFINITY,
411 .perturb = USEC_INFINITY,
412 .original_pid = getpid_cached(),
413 };
414
415 r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
416 if (r < 0)
417 goto fail;
418
419 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
420 if (e->epoll_fd < 0) {
421 r = -errno;
422 goto fail;
423 }
424
425 e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
426
427 if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
428 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
429 special_glyph(SPECIAL_GLYPH_ELLIPSIS));
430 e->profile_delays = true;
431 }
432
433 *ret = e;
434 return 0;
435
436 fail:
437 event_free(e);
438 return r;
439 }
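
/* Minimal lifecycle sketch for the constructor above (illustrative only, error handling abridged):
 *
 *     _cleanup_(sd_event_unrefp) sd_event *e = NULL;
 *
 *     assert_se(sd_event_new(&e) >= 0);
 *     // ... attach sources via sd_event_add_io(), sd_event_add_time(), etc. ...
 *     assert_se(sd_event_loop(e) >= 0);   // runs until sd_event_exit() is called
 */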
440
441 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event, sd_event, event_free);
442 #define PROTECT_EVENT(e) \
443 _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
444
445 _public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
446 if (s)
447 (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
448 return sd_event_source_unref(s);
449 }
450
451 static bool event_pid_changed(sd_event *e) {
452 assert(e);
453
454 /* We don't support people creating an event loop and keeping
455 * it around over a fork(). Let's complain. */
456
457 return e->original_pid != getpid_cached();
458 }
459
460 static void source_io_unregister(sd_event_source *s) {
461 assert(s);
462 assert(s->type == SOURCE_IO);
463
464 if (event_pid_changed(s->event))
465 return;
466
467 if (!s->io.registered)
468 return;
469
470 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
471 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
472 strna(s->description), event_source_type_to_string(s->type));
473
474 s->io.registered = false;
475 }
476
477 static int source_io_register(
478 sd_event_source *s,
479 int enabled,
480 uint32_t events) {
481
482 assert(s);
483 assert(s->type == SOURCE_IO);
484 assert(enabled != SD_EVENT_OFF);
485
486 struct epoll_event ev = {
487 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
488 .data.ptr = s,
489 };
490
491 if (epoll_ctl(s->event->epoll_fd,
492 s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
493 s->io.fd, &ev) < 0)
494 return -errno;
495
496 s->io.registered = true;
497
498 return 0;
499 }
500
501 static void source_child_pidfd_unregister(sd_event_source *s) {
502 assert(s);
503 assert(s->type == SOURCE_CHILD);
504
505 if (event_pid_changed(s->event))
506 return;
507
508 if (!s->child.registered)
509 return;
510
511 if (EVENT_SOURCE_WATCH_PIDFD(s))
512 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
513 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
514 strna(s->description), event_source_type_to_string(s->type));
515
516 s->child.registered = false;
517 }
518
519 static int source_child_pidfd_register(sd_event_source *s, int enabled) {
520 assert(s);
521 assert(s->type == SOURCE_CHILD);
522 assert(enabled != SD_EVENT_OFF);
523
524 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
525 struct epoll_event ev = {
526 .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
527 .data.ptr = s,
528 };
529
530 if (epoll_ctl(s->event->epoll_fd,
531 s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
532 s->child.pidfd, &ev) < 0)
533 return -errno;
534 }
535
536 s->child.registered = true;
537 return 0;
538 }
539
540 static void source_memory_pressure_unregister(sd_event_source *s) {
541 assert(s);
542 assert(s->type == SOURCE_MEMORY_PRESSURE);
543
544 if (event_pid_changed(s->event))
545 return;
546
547 if (!s->memory_pressure.registered)
548 return;
549
550 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
551 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
552 strna(s->description), event_source_type_to_string(s->type));
553
554 s->memory_pressure.registered = false;
555 }
556
557 static int source_memory_pressure_register(sd_event_source *s, int enabled) {
558 assert(s);
559 assert(s->type == SOURCE_MEMORY_PRESSURE);
560 assert(enabled != SD_EVENT_OFF);
561
562 struct epoll_event ev = {
563 .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
564 (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
565 .data.ptr = s,
566 };
567
568 if (epoll_ctl(s->event->epoll_fd,
569 s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
570 s->memory_pressure.fd, &ev) < 0)
571 return -errno;
572
573 s->memory_pressure.registered = true;
574 return 0;
575 }
576
577 static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
578 assert(s);
579 assert(s->type == SOURCE_MEMORY_PRESSURE);
580
581 if (s->memory_pressure.in_write_list)
582 return;
583
584 LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
585 s->memory_pressure.in_write_list = true;
586 }
587
588 static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
589 assert(s);
590 assert(s->type == SOURCE_MEMORY_PRESSURE);
591
592 if (!s->memory_pressure.in_write_list)
593 return;
594
595 LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
596 s->memory_pressure.in_write_list = false;
597 }
598
599 static clockid_t event_source_type_to_clock(EventSourceType t) {
600
601 switch (t) {
602
603 case SOURCE_TIME_REALTIME:
604 return CLOCK_REALTIME;
605
606 case SOURCE_TIME_BOOTTIME:
607 return CLOCK_BOOTTIME;
608
609 case SOURCE_TIME_MONOTONIC:
610 return CLOCK_MONOTONIC;
611
612 case SOURCE_TIME_REALTIME_ALARM:
613 return CLOCK_REALTIME_ALARM;
614
615 case SOURCE_TIME_BOOTTIME_ALARM:
616 return CLOCK_BOOTTIME_ALARM;
617
618 default:
619 return (clockid_t) -1;
620 }
621 }
622
623 static EventSourceType clock_to_event_source_type(clockid_t clock) {
624
625 switch (clock) {
626
627 case CLOCK_REALTIME:
628 return SOURCE_TIME_REALTIME;
629
630 case CLOCK_BOOTTIME:
631 return SOURCE_TIME_BOOTTIME;
632
633 case CLOCK_MONOTONIC:
634 return SOURCE_TIME_MONOTONIC;
635
636 case CLOCK_REALTIME_ALARM:
637 return SOURCE_TIME_REALTIME_ALARM;
638
639 case CLOCK_BOOTTIME_ALARM:
640 return SOURCE_TIME_BOOTTIME_ALARM;
641
642 default:
643 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
644 }
645 }
646
647 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
648 assert(e);
649
650 switch (t) {
651
652 case SOURCE_TIME_REALTIME:
653 return &e->realtime;
654
655 case SOURCE_TIME_BOOTTIME:
656 return &e->boottime;
657
658 case SOURCE_TIME_MONOTONIC:
659 return &e->monotonic;
660
661 case SOURCE_TIME_REALTIME_ALARM:
662 return &e->realtime_alarm;
663
664 case SOURCE_TIME_BOOTTIME_ALARM:
665 return &e->boottime_alarm;
666
667 default:
668 return NULL;
669 }
670 }
671
672 static void event_free_signal_data(sd_event *e, struct signal_data *d) {
673 assert(e);
674
675 if (!d)
676 return;
677
678 hashmap_remove(e->signal_data, &d->priority);
679 safe_close(d->fd);
680 free(d);
681 }
682
683 static int event_make_signal_data(
684 sd_event *e,
685 int sig,
686 struct signal_data **ret) {
687
688 struct signal_data *d;
689 bool added = false;
690 sigset_t ss_copy;
691 int64_t priority;
692 int r;
693
694 assert(e);
695
696 if (event_pid_changed(e))
697 return -ECHILD;
698
699 if (e->signal_sources && e->signal_sources[sig])
700 priority = e->signal_sources[sig]->priority;
701 else
702 priority = SD_EVENT_PRIORITY_NORMAL;
703
704 d = hashmap_get(e->signal_data, &priority);
705 if (d) {
706 if (sigismember(&d->sigset, sig) > 0) {
707 if (ret)
708 *ret = d;
709 return 0;
710 }
711 } else {
712 d = new(struct signal_data, 1);
713 if (!d)
714 return -ENOMEM;
715
716 *d = (struct signal_data) {
717 .wakeup = WAKEUP_SIGNAL_DATA,
718 .fd = -EBADF,
719 .priority = priority,
720 };
721
722 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
723 if (r < 0) {
724 free(d);
725 return r;
726 }
727
728 added = true;
729 }
730
731 ss_copy = d->sigset;
732 assert_se(sigaddset(&ss_copy, sig) >= 0);
733
734 r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
735 &ss_copy,
736 SFD_NONBLOCK|SFD_CLOEXEC);
737 if (r < 0) {
738 r = -errno;
739 goto fail;
740 }
741
742 d->sigset = ss_copy;
743
744 if (d->fd >= 0) {
745 if (ret)
746 *ret = d;
747 return 0;
748 }
749
750 d->fd = fd_move_above_stdio(r);
751
752 struct epoll_event ev = {
753 .events = EPOLLIN,
754 .data.ptr = d,
755 };
756
757 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
758 r = -errno;
759 goto fail;
760 }
761
762 if (ret)
763 *ret = d;
764
765 return 0;
766
767 fail:
768 if (added)
769 event_free_signal_data(e, d);
770
771 return r;
772 }
773
774 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
775 assert(e);
776 assert(d);
777
778 /* Turns off the specified signal in the signal data
779 * object. If the signal mask of the object becomes empty that
780          * way, the object is removed. */
781
782 if (sigismember(&d->sigset, sig) == 0)
783 return;
784
785 assert_se(sigdelset(&d->sigset, sig) >= 0);
786
787 if (sigisemptyset(&d->sigset)) {
788                 /* If the mask is now all-zero we can get rid of the structure */
789 event_free_signal_data(e, d);
790 return;
791 }
792
793 if (event_pid_changed(e))
794 return;
795
796 assert(d->fd >= 0);
797
798 if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
799 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
800 }
801
802 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
803 struct signal_data *d;
804 static const int64_t zero_priority = 0;
805
806 assert(e);
807
808 /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
809 * and possibly drop the signalfd for it. */
810
811 if (sig == SIGCHLD &&
812 e->n_online_child_sources > 0)
813 return;
814
815 if (e->signal_sources &&
816 e->signal_sources[sig] &&
817 event_source_is_online(e->signal_sources[sig]))
818 return;
819
820 /*
821 * The specified signal might be enabled in three different queues:
822 *
823 * 1) the one that belongs to the priority passed (if it is non-NULL)
824 * 2) the one that belongs to the priority of the event source of the signal (if there is one)
825 * 3) the 0 priority (to cover the SIGCHLD case)
826 *
827 * Hence, let's remove it from all three here.
828 */
829
830 if (priority) {
831 d = hashmap_get(e->signal_data, priority);
832 if (d)
833 event_unmask_signal_data(e, d, sig);
834 }
835
836 if (e->signal_sources && e->signal_sources[sig]) {
837 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
838 if (d)
839 event_unmask_signal_data(e, d, sig);
840 }
841
842 d = hashmap_get(e->signal_data, &zero_priority);
843 if (d)
844 event_unmask_signal_data(e, d, sig);
845 }
846
847 static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
848 assert(s);
849
850 /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
851 * they are enabled/disabled or marked pending and such. */
852
853 if (s->pending)
854 prioq_reshuffle(s->event->pending, s, &s->pending_index);
855
856 if (s->prepare)
857 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
858 }
859
860 static void event_source_time_prioq_reshuffle(sd_event_source *s) {
861 struct clock_data *d;
862
863 assert(s);
864
865 /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
866 * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
867 * properly again. */
868
869 if (s->ratelimited)
870 d = &s->event->monotonic;
871 else if (EVENT_SOURCE_IS_TIME(s->type))
872 assert_se(d = event_get_clock_data(s->event, s->type));
873 else
874 return; /* no-op for an event source which is neither a timer nor ratelimited. */
875
876 prioq_reshuffle(d->earliest, s, &s->earliest_index);
877 prioq_reshuffle(d->latest, s, &s->latest_index);
878 d->needs_rearm = true;
879 }
880
881 static void event_source_time_prioq_remove(
882 sd_event_source *s,
883 struct clock_data *d) {
884
885 assert(s);
886 assert(d);
887
888 prioq_remove(d->earliest, s, &s->earliest_index);
889 prioq_remove(d->latest, s, &s->latest_index);
890 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
891 d->needs_rearm = true;
892 }
893
894 static void source_disconnect(sd_event_source *s) {
895 sd_event *event;
896 int r;
897
898 assert(s);
899
900 if (!s->event)
901 return;
902
903 assert(s->event->n_sources > 0);
904
905 switch (s->type) {
906
907 case SOURCE_IO:
908 if (s->io.fd >= 0)
909 source_io_unregister(s);
910
911 break;
912
913 case SOURCE_TIME_REALTIME:
914 case SOURCE_TIME_BOOTTIME:
915 case SOURCE_TIME_MONOTONIC:
916 case SOURCE_TIME_REALTIME_ALARM:
917 case SOURCE_TIME_BOOTTIME_ALARM:
918 /* Only remove this event source from the time event source here if it is not ratelimited. If
919 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
920 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
921
922 if (!s->ratelimited) {
923 struct clock_data *d;
924 assert_se(d = event_get_clock_data(s->event, s->type));
925 event_source_time_prioq_remove(s, d);
926 }
927
928 break;
929
930 case SOURCE_SIGNAL:
931 if (s->signal.sig > 0) {
932
933 if (s->event->signal_sources)
934 s->event->signal_sources[s->signal.sig] = NULL;
935
936 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
937
938 if (s->signal.unblock) {
939 sigset_t new_ss;
940
941 if (sigemptyset(&new_ss) < 0)
942 log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
943 else if (sigaddset(&new_ss, s->signal.sig) < 0)
944 log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
945 else {
946 r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
947 if (r != 0)
948 log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
949 }
950 }
951 }
952
953 break;
954
955 case SOURCE_CHILD:
956 if (event_pid_changed(s->event))
957 s->child.process_owned = false;
958
959 if (s->child.pid > 0) {
960 if (event_source_is_online(s)) {
961 assert(s->event->n_online_child_sources > 0);
962 s->event->n_online_child_sources--;
963 }
964
965 (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
966 }
967
968 if (EVENT_SOURCE_WATCH_PIDFD(s))
969 source_child_pidfd_unregister(s);
970 else
971 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
972
973 break;
974
975 case SOURCE_DEFER:
976 /* nothing */
977 break;
978
979 case SOURCE_POST:
980 set_remove(s->event->post_sources, s);
981 break;
982
983 case SOURCE_EXIT:
984 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
985 break;
986
987 case SOURCE_INOTIFY: {
988 struct inode_data *inode_data;
989
990 inode_data = s->inotify.inode_data;
991 if (inode_data) {
992 struct inotify_data *inotify_data;
993 assert_se(inotify_data = inode_data->inotify_data);
994
995 /* Detach this event source from the inode object */
996 LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
997 s->inotify.inode_data = NULL;
998
999 if (s->pending) {
1000 assert(inotify_data->n_pending > 0);
1001 inotify_data->n_pending--;
1002 }
1003
1004 /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is
1005 * continued to being watched. That's because inotify doesn't really have an API for that: we
1006 * can only change watch masks with access to the original inode either by fd or by path. But
1007 * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
1008 * continuously and keeping the mount busy which we can't really do. We could reconstruct the
1009 * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
1010 * there), but given the need for open_by_handle_at() which is privileged and not universally
1011 * available this would be quite an incomplete solution. Hence we go the other way, leave the
1012 * mask set, even if it is not minimized now, and ignore all events we aren't interested in
1013 * anymore after reception. Yes, this sucks, but … Linux … */
1014
1015 /* Maybe release the inode data (and its inotify) */
1016 event_gc_inode_data(s->event, inode_data);
1017 }
1018
1019 break;
1020 }
1021
1022 case SOURCE_MEMORY_PRESSURE:
1023 source_memory_pressure_remove_from_write_list(s);
1024 source_memory_pressure_unregister(s);
1025 break;
1026
1027 default:
1028 assert_not_reached();
1029 }
1030
1031 if (s->pending)
1032 prioq_remove(s->event->pending, s, &s->pending_index);
1033
1034 if (s->prepare)
1035 prioq_remove(s->event->prepare, s, &s->prepare_index);
1036
1037 if (s->ratelimited)
1038 event_source_time_prioq_remove(s, &s->event->monotonic);
1039
1040 event = TAKE_PTR(s->event);
1041 LIST_REMOVE(sources, event->sources, s);
1042 event->n_sources--;
1043
1044 /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
1045 * pidfd associated with this event source, which we'll do only on source_free(). */
1046
1047 if (!s->floating)
1048 sd_event_unref(event);
1049 }
1050
1051 static sd_event_source* source_free(sd_event_source *s) {
1052 assert(s);
1053
1054 source_disconnect(s);
1055
1056 if (s->type == SOURCE_IO && s->io.owned)
1057 s->io.fd = safe_close(s->io.fd);
1058
1059 if (s->type == SOURCE_CHILD) {
1060 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
1061
1062 if (s->child.process_owned) {
1063
1064 if (!s->child.exited) {
1065 bool sent = false;
1066
1067 if (s->child.pidfd >= 0) {
1068 if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
1069 if (errno == ESRCH) /* Already dead */
1070 sent = true;
1071 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
1072 log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
1073 s->child.pid);
1074 } else
1075 sent = true;
1076 }
1077
1078 if (!sent)
1079 if (kill(s->child.pid, SIGKILL) < 0)
1080 if (errno != ESRCH) /* Already dead */
1081 log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
1082 s->child.pid);
1083 }
1084
1085 if (!s->child.waited) {
1086 siginfo_t si = {};
1087
1088 /* Reap the child if we can */
1089 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
1090 }
1091 }
1092
1093 if (s->child.pidfd_owned)
1094 s->child.pidfd = safe_close(s->child.pidfd);
1095 }
1096
1097 if (s->type == SOURCE_MEMORY_PRESSURE) {
1098 s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
1099 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
1100 }
1101
1102 if (s->destroy_callback)
1103 s->destroy_callback(s->userdata);
1104
1105 free(s->description);
1106 return mfree(s);
1107 }
1108 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
1109
1110 static int source_set_pending(sd_event_source *s, bool b) {
1111 int r;
1112
1113 assert(s);
1114 assert(s->type != SOURCE_EXIT);
1115
1116 if (s->pending == b)
1117 return 0;
1118
1119 s->pending = b;
1120
1121 if (b) {
1122 s->pending_iteration = s->event->iteration;
1123
1124 r = prioq_put(s->event->pending, s, &s->pending_index);
1125 if (r < 0) {
1126 s->pending = false;
1127 return r;
1128 }
1129 } else
1130 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1131
1132 if (EVENT_SOURCE_IS_TIME(s->type))
1133 event_source_time_prioq_reshuffle(s);
1134
1135 if (s->type == SOURCE_SIGNAL && !b) {
1136 struct signal_data *d;
1137
1138 d = hashmap_get(s->event->signal_data, &s->priority);
1139 if (d && d->current == s)
1140 d->current = NULL;
1141 }
1142
1143 if (s->type == SOURCE_INOTIFY) {
1144
1145 assert(s->inotify.inode_data);
1146 assert(s->inotify.inode_data->inotify_data);
1147
1148 if (b)
1149 s->inotify.inode_data->inotify_data->n_pending ++;
1150 else {
1151 assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1152 s->inotify.inode_data->inotify_data->n_pending --;
1153 }
1154 }
1155
1156 return 1;
1157 }
1158
1159 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1160
1161 /* Let's allocate exactly what we need. Note that the difference of the smallest event source
1162 * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
1163 * lines. */
1164 static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
1165 [SOURCE_IO] = endoffsetof_field(sd_event_source, io),
1166 [SOURCE_TIME_REALTIME] = endoffsetof_field(sd_event_source, time),
1167 [SOURCE_TIME_BOOTTIME] = endoffsetof_field(sd_event_source, time),
1168 [SOURCE_TIME_MONOTONIC] = endoffsetof_field(sd_event_source, time),
1169 [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1170 [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1171 [SOURCE_SIGNAL] = endoffsetof_field(sd_event_source, signal),
1172 [SOURCE_CHILD] = endoffsetof_field(sd_event_source, child),
1173 [SOURCE_DEFER] = endoffsetof_field(sd_event_source, defer),
1174 [SOURCE_POST] = endoffsetof_field(sd_event_source, post),
1175 [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit),
1176 [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify),
1177 [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure),
1178 };
1179
1180 sd_event_source *s;
1181
1182 assert(e);
1183 assert(type >= 0);
1184 assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
1185 assert(size_table[type] > 0);
1186
1187 /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
1188 * size, even if we only allocate the initial part we need. */
1189 s = expand_to_usable(malloc0(size_table[type]), sizeof(sd_event_source));
1190 if (!s)
1191 return NULL;
1192
1193 /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
1194 * than what we allocated here. */
1195 s->n_ref = 1;
1196 s->event = e;
1197 s->floating = floating;
1198 s->type = type;
1199 s->pending_index = PRIOQ_IDX_NULL;
1200 s->prepare_index = PRIOQ_IDX_NULL;
1201
1202 if (!floating)
1203 sd_event_ref(e);
1204
1205 LIST_PREPEND(sources, e->sources, s);
1206 e->n_sources++;
1207
1208 return s;
1209 }
1210
1211 static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1212 assert(s);
1213
1214 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1215 }
1216
1217 _public_ int sd_event_add_io(
1218 sd_event *e,
1219 sd_event_source **ret,
1220 int fd,
1221 uint32_t events,
1222 sd_event_io_handler_t callback,
1223 void *userdata) {
1224
1225 _cleanup_(source_freep) sd_event_source *s = NULL;
1226 int r;
1227
1228 assert_return(e, -EINVAL);
1229 assert_return(e = event_resolve(e), -ENOPKG);
1230 assert_return(fd >= 0, -EBADF);
1231 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1232 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1233 assert_return(!event_pid_changed(e), -ECHILD);
1234
1235 if (!callback)
1236 callback = io_exit_callback;
1237
1238 s = source_new(e, !ret, SOURCE_IO);
1239 if (!s)
1240 return -ENOMEM;
1241
1242 s->wakeup = WAKEUP_EVENT_SOURCE;
1243 s->io.fd = fd;
1244 s->io.events = events;
1245 s->io.callback = callback;
1246 s->userdata = userdata;
1247 s->enabled = SD_EVENT_ON;
1248
1249 r = source_io_register(s, s->enabled, events);
1250 if (r < 0)
1251 return r;
1252
1253 if (ret)
1254 *ret = s;
1255 TAKE_PTR(s);
1256
1257 return 0;
1258 }
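
/* Illustrative sketch for the call above (the fd and helper are made up, error handling abridged): watch a
 * socket for input; passing NULL for ret makes the source "floating", i.e. owned by the event loop.
 *
 *     static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             if (revents & EPOLLIN)
 *                     (void) read_and_process(fd);   // hypothetical helper
 *             return 0;
 *     }
 *
 *     assert_se(sd_event_add_io(e, NULL, sock_fd, EPOLLIN, on_io, NULL) >= 0);
 */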
1259
1260 static void initialize_perturb(sd_event *e) {
1261 sd_id128_t id = {};
1262
1263 /* When we sleep for longer, we try to realign the wakeup to the same time within each
1264 * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
1265 * wakeup. However, let's take some system-specific randomness for this value, so that in a network
1266 * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
1267 * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */
1268
1269 if (_likely_(e->perturb != USEC_INFINITY))
1270 return;
1271
1272         if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
1273 e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
1274 else
1275 e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
1276 }
1277
1278 static int event_setup_timer_fd(
1279 sd_event *e,
1280 struct clock_data *d,
1281 clockid_t clock) {
1282
1283 assert(e);
1284 assert(d);
1285
1286 if (_likely_(d->fd >= 0))
1287 return 0;
1288
1289 _cleanup_close_ int fd = -EBADF;
1290
1291 fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1292 if (fd < 0)
1293 return -errno;
1294
1295 fd = fd_move_above_stdio(fd);
1296
1297 struct epoll_event ev = {
1298 .events = EPOLLIN,
1299 .data.ptr = d,
1300 };
1301
1302 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1303 return -errno;
1304
1305 d->fd = TAKE_FD(fd);
1306 return 0;
1307 }
1308
1309 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1310 assert(s);
1311
1312 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1313 }
1314
1315 static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1316 int r;
1317
1318 assert(d);
1319
1320 if (d->fd < 0) {
1321 r = event_setup_timer_fd(e, d, clock);
1322 if (r < 0)
1323 return r;
1324 }
1325
1326 r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1327 if (r < 0)
1328 return r;
1329
1330 r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1331 if (r < 0)
1332 return r;
1333
1334 return 0;
1335 }
1336
1337 static int event_source_time_prioq_put(
1338 sd_event_source *s,
1339 struct clock_data *d) {
1340
1341 int r;
1342
1343 assert(s);
1344 assert(d);
1345 assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1346
1347 r = prioq_put(d->earliest, s, &s->earliest_index);
1348 if (r < 0)
1349 return r;
1350
1351 r = prioq_put(d->latest, s, &s->latest_index);
1352 if (r < 0) {
1353 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1354 s->earliest_index = PRIOQ_IDX_NULL;
1355 return r;
1356 }
1357
1358 d->needs_rearm = true;
1359 return 0;
1360 }
1361
1362 _public_ int sd_event_add_time(
1363 sd_event *e,
1364 sd_event_source **ret,
1365 clockid_t clock,
1366 uint64_t usec,
1367 uint64_t accuracy,
1368 sd_event_time_handler_t callback,
1369 void *userdata) {
1370
1371 EventSourceType type;
1372 _cleanup_(source_freep) sd_event_source *s = NULL;
1373 struct clock_data *d;
1374 int r;
1375
1376 assert_return(e, -EINVAL);
1377 assert_return(e = event_resolve(e), -ENOPKG);
1378 assert_return(accuracy != UINT64_MAX, -EINVAL);
1379 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1380 assert_return(!event_pid_changed(e), -ECHILD);
1381
1382 if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1383 return -EOPNOTSUPP;
1384
1385 type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1386 if (type < 0)
1387 return -EOPNOTSUPP;
1388
1389 if (!callback)
1390 callback = time_exit_callback;
1391
1392 assert_se(d = event_get_clock_data(e, type));
1393
1394 r = setup_clock_data(e, d, clock);
1395 if (r < 0)
1396 return r;
1397
1398 s = source_new(e, !ret, type);
1399 if (!s)
1400 return -ENOMEM;
1401
1402 s->time.next = usec;
1403 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1404 s->time.callback = callback;
1405 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1406 s->userdata = userdata;
1407 s->enabled = SD_EVENT_ONESHOT;
1408
1409 r = event_source_time_prioq_put(s, d);
1410 if (r < 0)
1411 return r;
1412
1413 if (ret)
1414 *ret = s;
1415 TAKE_PTR(s);
1416
1417 return 0;
1418 }
1419
1420 _public_ int sd_event_add_time_relative(
1421 sd_event *e,
1422 sd_event_source **ret,
1423 clockid_t clock,
1424 uint64_t usec,
1425 uint64_t accuracy,
1426 sd_event_time_handler_t callback,
1427 void *userdata) {
1428
1429 usec_t t;
1430 int r;
1431
1432 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1433 * checks for overflow. */
1434
1435 r = sd_event_now(e, clock, &t);
1436 if (r < 0)
1437 return r;
1438
1439 if (usec >= USEC_INFINITY - t)
1440 return -EOVERFLOW;
1441
1442 return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1443 }
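
/* Illustrative sketch (values made up, error handling abridged): fire once, five seconds from now, with one
 * second of coalescing accuracy. An accuracy of 0 would select DEFAULT_ACCURACY_USEC instead.
 *
 *     static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *             log_info("timer elapsed");
 *             return 0;
 *     }
 *
 *     assert_se(sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC,
 *                                          5 * USEC_PER_SEC, USEC_PER_SEC,
 *                                          on_timer, NULL) >= 0);
 */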
1444
1445 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1446 assert(s);
1447
1448 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1449 }
1450
1451 _public_ int sd_event_add_signal(
1452 sd_event *e,
1453 sd_event_source **ret,
1454 int sig,
1455 sd_event_signal_handler_t callback,
1456 void *userdata) {
1457
1458 _cleanup_(source_freep) sd_event_source *s = NULL;
1459 struct signal_data *d;
1460 sigset_t new_ss;
1461 bool block_it;
1462 int r;
1463
1464 assert_return(e, -EINVAL);
1465 assert_return(e = event_resolve(e), -ENOPKG);
1466 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1467 assert_return(!event_pid_changed(e), -ECHILD);
1468
1469 /* Let's make sure our special flag stays outside of the valid signal range */
1470 assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);
1471
1472 if (sig & SD_EVENT_SIGNAL_PROCMASK) {
1473 sig &= ~SD_EVENT_SIGNAL_PROCMASK;
1474 assert_return(SIGNAL_VALID(sig), -EINVAL);
1475
1476 block_it = true;
1477 } else {
1478 assert_return(SIGNAL_VALID(sig), -EINVAL);
1479
1480 r = signal_is_blocked(sig);
1481 if (r < 0)
1482 return r;
1483 if (r == 0)
1484 return -EBUSY;
1485
1486 block_it = false;
1487 }
1488
1489 if (!callback)
1490 callback = signal_exit_callback;
1491
1492 if (!e->signal_sources) {
1493 e->signal_sources = new0(sd_event_source*, _NSIG);
1494 if (!e->signal_sources)
1495 return -ENOMEM;
1496 } else if (e->signal_sources[sig])
1497 return -EBUSY;
1498
1499 s = source_new(e, !ret, SOURCE_SIGNAL);
1500 if (!s)
1501 return -ENOMEM;
1502
1503 s->signal.sig = sig;
1504 s->signal.callback = callback;
1505 s->userdata = userdata;
1506 s->enabled = SD_EVENT_ON;
1507
1508 e->signal_sources[sig] = s;
1509
1510 if (block_it) {
1511 sigset_t old_ss;
1512
1513 if (sigemptyset(&new_ss) < 0)
1514 return -errno;
1515
1516 if (sigaddset(&new_ss, sig) < 0)
1517 return -errno;
1518
1519 r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
1520 if (r != 0)
1521 return -r;
1522
1523 r = sigismember(&old_ss, sig);
1524 if (r < 0)
1525 return -errno;
1526
1527 s->signal.unblock = !r;
1528 } else
1529 s->signal.unblock = false;
1530
1531 r = event_make_signal_data(e, sig, &d);
1532 if (r < 0) {
1533 if (s->signal.unblock)
1534 (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
1535
1536 return r;
1537 }
1538
1539 /* Use the signal name as description for the event source by default */
1540 (void) sd_event_source_set_description(s, signal_to_string(sig));
1541
1542 if (ret)
1543 *ret = s;
1544 TAKE_PTR(s);
1545
1546 return 0;
1547 }
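
/* Illustrative sketch (error handling abridged): handle SIGTERM and let sd-event block the signal for us via
 * the SD_EVENT_SIGNAL_PROCMASK flag, instead of adjusting the signal mask manually beforehand. With a NULL
 * callback the signal would simply exit the loop (see signal_exit_callback() above).
 *
 *     static int on_sigterm(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     assert_se(sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, on_sigterm, NULL) >= 0);
 */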
1548
1549 static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1550 assert(s);
1551
1552 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1553 }
1554
1555 static bool shall_use_pidfd(void) {
1556 /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
1557 return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
1558 }
1559
1560 _public_ int sd_event_add_child(
1561 sd_event *e,
1562 sd_event_source **ret,
1563 pid_t pid,
1564 int options,
1565 sd_event_child_handler_t callback,
1566 void *userdata) {
1567
1568 _cleanup_(source_freep) sd_event_source *s = NULL;
1569 int r;
1570
1571 assert_return(e, -EINVAL);
1572 assert_return(e = event_resolve(e), -ENOPKG);
1573 assert_return(pid > 1, -EINVAL);
1574 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1575 assert_return(options != 0, -EINVAL);
1576 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1577 assert_return(!event_pid_changed(e), -ECHILD);
1578
1579 if (!callback)
1580 callback = child_exit_callback;
1581
1582 if (e->n_online_child_sources == 0) {
1583 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1584                  * for compatibility with pre-pidfd and because we don't want to reap the child processes
1585 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1586 * take effect.
1587 *
1588 * (As an optimization we only do this check on the first child event source created.) */
1589 r = signal_is_blocked(SIGCHLD);
1590 if (r < 0)
1591 return r;
1592 if (r == 0)
1593 return -EBUSY;
1594 }
1595
1596 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1597 if (r < 0)
1598 return r;
1599
1600 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1601 return -EBUSY;
1602
1603 s = source_new(e, !ret, SOURCE_CHILD);
1604 if (!s)
1605 return -ENOMEM;
1606
1607 s->wakeup = WAKEUP_EVENT_SOURCE;
1608 s->child.options = options;
1609 s->child.callback = callback;
1610 s->userdata = userdata;
1611 s->enabled = SD_EVENT_ONESHOT;
1612
1613 /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
1614 * pin the PID, and make regular waitid() handling race-free. */
1615
1616 if (shall_use_pidfd()) {
1617 s->child.pidfd = pidfd_open(pid, 0);
1618 if (s->child.pidfd < 0) {
1619 /* Propagate errors unless the syscall is not supported or blocked */
1620 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
1621 return -errno;
1622 } else
1623 s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
1624 } else
1625 s->child.pidfd = -EBADF;
1626
1627 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1628 /* We have a pidfd and we only want to watch for exit */
1629 r = source_child_pidfd_register(s, s->enabled);
1630 if (r < 0)
1631 return r;
1632
1633 } else {
1634 /* We have no pidfd or we shall wait for some other event than WEXITED */
1635 r = event_make_signal_data(e, SIGCHLD, NULL);
1636 if (r < 0)
1637 return r;
1638
1639 e->need_process_child = true;
1640 }
1641
1642 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1643 if (r < 0)
1644 return r;
1645
1646 /* These must be done after everything succeeds. */
1647 s->child.pid = pid;
1648 e->n_online_child_sources++;
1649
1650 if (ret)
1651 *ret = s;
1652 TAKE_PTR(s);
1653 return 0;
1654 }
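
/* Illustrative sketch (the pid is made up, error handling abridged): SIGCHLD must already be blocked before
 * the first child source is added, even on pidfd-capable kernels (see the check above).
 *
 *     static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *             log_info("child " PID_FMT " exited with status %i", si->si_pid, si->si_status);
 *             return 0;
 *     }
 *
 *     sigset_t ss;
 *     assert_se(sigemptyset(&ss) >= 0 && sigaddset(&ss, SIGCHLD) >= 0);
 *     assert_se(sigprocmask(SIG_BLOCK, &ss, NULL) >= 0);
 *     assert_se(sd_event_add_child(e, NULL, child_pid, WEXITED, on_child, NULL) >= 0);
 */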
1655
1656 _public_ int sd_event_add_child_pidfd(
1657 sd_event *e,
1658 sd_event_source **ret,
1659 int pidfd,
1660 int options,
1661 sd_event_child_handler_t callback,
1662 void *userdata) {
1663
1664
1665 _cleanup_(source_freep) sd_event_source *s = NULL;
1666 pid_t pid;
1667 int r;
1668
1669 assert_return(e, -EINVAL);
1670 assert_return(e = event_resolve(e), -ENOPKG);
1671 assert_return(pidfd >= 0, -EBADF);
1672 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1673 assert_return(options != 0, -EINVAL);
1674 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1675 assert_return(!event_pid_changed(e), -ECHILD);
1676
1677 if (!callback)
1678 callback = child_exit_callback;
1679
1680 if (e->n_online_child_sources == 0) {
1681 r = signal_is_blocked(SIGCHLD);
1682 if (r < 0)
1683 return r;
1684 if (r == 0)
1685 return -EBUSY;
1686 }
1687
1688 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1689 if (r < 0)
1690 return r;
1691
1692 r = pidfd_get_pid(pidfd, &pid);
1693 if (r < 0)
1694 return r;
1695
1696 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1697 return -EBUSY;
1698
1699 s = source_new(e, !ret, SOURCE_CHILD);
1700 if (!s)
1701 return -ENOMEM;
1702
1703 s->wakeup = WAKEUP_EVENT_SOURCE;
1704 s->child.pidfd = pidfd;
1705 s->child.pid = pid;
1706 s->child.options = options;
1707 s->child.callback = callback;
1708 s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1709 s->userdata = userdata;
1710 s->enabled = SD_EVENT_ONESHOT;
1711
1712 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1713 if (r < 0)
1714 return r;
1715
1716 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1717 /* We only want to watch for WEXITED */
1718 r = source_child_pidfd_register(s, s->enabled);
1719 if (r < 0)
1720 return r;
1721 } else {
1722 /* We shall wait for some other event than WEXITED */
1723 r = event_make_signal_data(e, SIGCHLD, NULL);
1724 if (r < 0)
1725 return r;
1726
1727 e->need_process_child = true;
1728 }
1729
1730 e->n_online_child_sources++;
1731
1732 if (ret)
1733 *ret = s;
1734 TAKE_PTR(s);
1735 return 0;
1736 }
1737
1738 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1739 assert(s);
1740
1741 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1742 }
1743
1744 _public_ int sd_event_add_defer(
1745 sd_event *e,
1746 sd_event_source **ret,
1747 sd_event_handler_t callback,
1748 void *userdata) {
1749
1750 _cleanup_(source_freep) sd_event_source *s = NULL;
1751 int r;
1752
1753 assert_return(e, -EINVAL);
1754 assert_return(e = event_resolve(e), -ENOPKG);
1755 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1756 assert_return(!event_pid_changed(e), -ECHILD);
1757
1758 if (!callback)
1759 callback = generic_exit_callback;
1760
1761 s = source_new(e, !ret, SOURCE_DEFER);
1762 if (!s)
1763 return -ENOMEM;
1764
1765 s->defer.callback = callback;
1766 s->userdata = userdata;
1767 s->enabled = SD_EVENT_ONESHOT;
1768
1769 r = source_set_pending(s, true);
1770 if (r < 0)
1771 return r;
1772
1773 if (ret)
1774 *ret = s;
1775 TAKE_PTR(s);
1776
1777 return 0;
1778 }
1779
1780 _public_ int sd_event_add_post(
1781 sd_event *e,
1782 sd_event_source **ret,
1783 sd_event_handler_t callback,
1784 void *userdata) {
1785
1786 _cleanup_(source_freep) sd_event_source *s = NULL;
1787 int r;
1788
1789 assert_return(e, -EINVAL);
1790 assert_return(e = event_resolve(e), -ENOPKG);
1791 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1792 assert_return(!event_pid_changed(e), -ECHILD);
1793
1794 if (!callback)
1795 callback = generic_exit_callback;
1796
1797 s = source_new(e, !ret, SOURCE_POST);
1798 if (!s)
1799 return -ENOMEM;
1800
1801 s->post.callback = callback;
1802 s->userdata = userdata;
1803 s->enabled = SD_EVENT_ON;
1804
1805 r = set_ensure_put(&e->post_sources, NULL, s);
1806 if (r < 0)
1807 return r;
1808 assert(r > 0);
1809
1810 if (ret)
1811 *ret = s;
1812 TAKE_PTR(s);
1813
1814 return 0;
1815 }
1816
1817 _public_ int sd_event_add_exit(
1818 sd_event *e,
1819 sd_event_source **ret,
1820 sd_event_handler_t callback,
1821 void *userdata) {
1822
1823 _cleanup_(source_freep) sd_event_source *s = NULL;
1824 int r;
1825
1826 assert_return(e, -EINVAL);
1827 assert_return(e = event_resolve(e), -ENOPKG);
1828 assert_return(callback, -EINVAL);
1829 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1830 assert_return(!event_pid_changed(e), -ECHILD);
1831
1832 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1833 if (r < 0)
1834 return r;
1835
1836 s = source_new(e, !ret, SOURCE_EXIT);
1837 if (!s)
1838 return -ENOMEM;
1839
1840 s->exit.callback = callback;
1841 s->userdata = userdata;
1842 s->exit.prioq_index = PRIOQ_IDX_NULL;
1843 s->enabled = SD_EVENT_ONESHOT;
1844
1845 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1846 if (r < 0)
1847 return r;
1848
1849 if (ret)
1850 *ret = s;
1851 TAKE_PTR(s);
1852
1853 return 0;
1854 }
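
/* Illustrative sketch (the helper is made up, error handling abridged): exit sources are dispatched once
 * sd_event_exit() has been called, lowest priority value first, which makes them a convenient hook for
 * ordered cleanup work.
 *
 *     static int on_exit_cleanup(sd_event_source *s, void *userdata) {
 *             flush_caches(userdata);   // hypothetical cleanup helper
 *             return 0;
 *     }
 *
 *     assert_se(sd_event_add_exit(e, NULL, on_exit_cleanup, my_state) >= 0);
 */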
1855
1856 int sd_event_trim_memory(void) {
1857 int r;
1858
1859 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1860 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1861 * NULL callback parameter. */
1862
1863 log_debug("Memory pressure event, trimming malloc() memory.");
1864
1865 #if HAVE_GENERIC_MALLINFO
1866 generic_mallinfo before_mallinfo = generic_mallinfo_get();
1867 #endif
1868
1869 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1870 hashmap_trim_pools();
1871 r = malloc_trim(0);
1872 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1873
1874 if (r > 0)
1875 log_debug("Successfully trimmed some memory.");
1876 else
1877 log_debug("Couldn't trim any memory.");
1878
1879 usec_t period = after_timestamp - before_timestamp;
1880
1881 #if HAVE_GENERIC_MALLINFO
1882 generic_mallinfo after_mallinfo = generic_mallinfo_get();
1883 size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1884 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1885 log_struct(LOG_DEBUG,
1886 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1887 FORMAT_TIMESPAN(period, 0),
1888 FORMAT_BYTES(l)),
1889 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1890 "TRIMMED_BYTES=%zu", l,
1891 "TRIMMED_USEC=" USEC_FMT, period);
1892 #else
1893 log_struct(LOG_DEBUG,
1894 LOG_MESSAGE("Memory trimming took %s.",
1895 FORMAT_TIMESPAN(period, 0)),
1896 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1897 "TRIMMED_USEC=" USEC_FMT, period);
1898 #endif
1899
1900 return 0;
1901 }
1902
1903 static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1904 assert(s);
1905
1906 sd_event_trim_memory();
1907 return 0;
1908 }
1909
1910 _public_ int sd_event_add_memory_pressure(
1911 sd_event *e,
1912 sd_event_source **ret,
1913 sd_event_handler_t callback,
1914 void *userdata) {
1915
1916 _cleanup_free_ char *w = NULL;
1917 _cleanup_(source_freep) sd_event_source *s = NULL;
1918 _cleanup_close_ int path_fd = -1, fd = -1;
1919 _cleanup_free_ void *write_buffer = NULL;
1920         const char *watch, *watch_fallback = NULL, *env;
1921 size_t write_buffer_size = 0;
1922 struct stat st;
1923 uint32_t events;
1924 bool locked;
1925 int r;
1926
1927 assert_return(e, -EINVAL);
1928 assert_return(e = event_resolve(e), -ENOPKG);
1929 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1930 assert_return(!event_pid_changed(e), -ECHILD);
1931
1932 if (!callback)
1933 callback = memory_pressure_callback;
1934
1935 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1936 if (!s)
1937 return -ENOMEM;
1938
1939 s->wakeup = WAKEUP_EVENT_SOURCE;
1940 s->memory_pressure.callback = callback;
1941 s->userdata = userdata;
1942 s->enabled = SD_EVENT_ON;
1943 s->memory_pressure.fd = -EBADF;
1944
1945 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1946 if (env) {
1947 if (isempty(env) || path_equal(env, "/dev/null"))
1948 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1949 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1950
1951 if (!path_is_absolute(env) || !path_is_normalized(env))
1952 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1953 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1954
1955 watch = env;
1956
1957 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1958 if (env) {
1959 r = unbase64mem(env, SIZE_MAX, &write_buffer, &write_buffer_size);
1960 if (r < 0)
1961 return r;
1962 }
1963
1964 locked = true;
1965 } else {
1966
1967 r = is_pressure_supported();
1968 if (r < 0)
1969 return r;
1970 if (r == 0)
1971 return -EOPNOTSUPP;
1972
1973 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1974 * the system wide pressure if for some reason we cannot (which could be: memory controller
1975 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1976 * only use the system-wide logic. */
1977 r = cg_all_unified();
1978 if (r < 0)
1979 return r;
1980 if (r == 0)
1981 watch = "/proc/pressure/memory";
1982 else {
1983 _cleanup_free_ char *cg = NULL;
1984
1985 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
1986 if (r < 0)
1987 return r;
1988
1989 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
1990 if (!w)
1991 return -ENOMEM;
1992
1993 watch = w;
1994 watch_fallback = "/proc/pressure/memory";
1995 }
1996
1997 /* Android uses three levels in its userspace low memory killer logic:
1998 * some 70000 1000000
1999 * some 100000 1000000
2000 * full 70000 1000000
2001 *
2002 * GNOME's low memory monitor uses:
2003 * some 70000 1000000
2004 * some 100000 1000000
2005 * full 100000 1000000
2006 *
2007 * We'll default to the middle level that both agree on */
2008 if (asprintf((char**) &write_buffer,
2009 "%s " USEC_FMT " " USEC_FMT,
2010 MEMORY_PRESSURE_DEFAULT_TYPE,
2011 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2012 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2013 return -ENOMEM;
2014
2015 write_buffer_size = strlen(write_buffer) + 1;
2016 locked = false;
2017 }
2018
2019 path_fd = open(watch, O_PATH|O_CLOEXEC);
2020 if (path_fd < 0) {
2021 if (errno != ENOENT)
2022 return -errno;
2023
2024 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2025 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2026 * the PSI service apparently is not supported) */
2027 if (!watch_fallback)
2028 return locked ? -ENOENT : -EOPNOTSUPP;
2029
2030 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
2031 if (path_fd < 0 && errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2032 return -EOPNOTSUPP;
2033 if (path_fd < 0)
2034 return -errno;
2035 }
2036
2037 if (fstat(path_fd, &st) < 0)
2038 return -errno;
2039
2040 if (S_ISSOCK(st.st_mode)) {
2041 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2042 if (fd < 0)
2043 return -errno;
2044
2045 r = connect_unix_path(fd, path_fd, NULL);
2046 if (r < 0)
2047 return r;
2048
2049 events = EPOLLIN;
2050
2051 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2052 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2053 if (fd < 0)
2054 return fd;
2055
2056 if (S_ISREG(st.st_mode)) {
2057 struct statfs sfs;
2058
2059 /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
2060
2061 if (fstatfs(fd, &sfs) < 0)
2062 return -errno;
2063
2064 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2065 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2066 return -ENOTTY;
2067
2068 events = EPOLLPRI;
2069 } else
2070 /* For fifos and char devices just watch for EPOLLIN */
2071 events = EPOLLIN;
2072
2073 } else if (S_ISDIR(st.st_mode))
2074 return -EISDIR;
2075 else
2076 return -EBADF;
2077
2078 s->memory_pressure.fd = TAKE_FD(fd);
2079 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2080 s->memory_pressure.write_buffer_size = write_buffer_size;
2081 s->memory_pressure.events = events;
2082 s->memory_pressure.locked = locked;
2083
2084 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2085 * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2086 * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
2087 * event sources on which writes must be executed before the first event loop iteration is
2088 * executed. (We could also write the data here, right away, but we want to give the caller the
2089 * freedom to call sd_event_source_set_memory_pressure_type() and
2090 * sd_event_source_set_memory_pressure_period() before we write it.) */
2091
2092 if (s->memory_pressure.write_buffer_size > 0)
2093 source_memory_pressure_add_to_write_list(s);
2094 else {
2095 r = source_memory_pressure_register(s, s->enabled);
2096 if (r < 0)
2097 return r;
2098 }
2099
2100 if (ret)
2101 *ret = s;
2102 TAKE_PTR(s);
2103
2104 return 0;
2105 }
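/*
 * Illustrative usage sketch (not part of this translation unit): how a caller might hook up the
 * memory pressure logic implemented above. on_pressure() and attach_pressure() are made-up names
 * for the example; only the sd-event calls are real API.
 *
 *     static int on_pressure(sd_event_source *s, void *userdata) {
 *             sd_event_trim_memory();
 *             return 0;
 *     }
 *
 *     static int attach_pressure(sd_event *e) {
 *             sd_event_source *ps = NULL;
 *             int r;
 *
 *             r = sd_event_add_memory_pressure(e, &ps, on_pressure, NULL);
 *             if (r == -EHOSTDOWN || r == -EOPNOTSUPP)
 *                     return 0;  // explicitly disabled via the env var, or PSI not available
 *             if (r < 0)
 *                     return r;
 *
 *             // Optionally adjust the trigger before the first loop iteration, while the
 *             // write buffer is still unlocked (see the write-list handling above).
 *             return sd_event_source_set_memory_pressure_type(ps, "full");
 *     }
 */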
2106
2107 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2108 assert(e);
2109
2110 if (!d)
2111 return;
2112
2113 assert(hashmap_isempty(d->inodes));
2114 assert(hashmap_isempty(d->wd));
2115
2116 if (d->buffer_filled > 0)
2117 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
2118
2119 hashmap_free(d->inodes);
2120 hashmap_free(d->wd);
2121
2122 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2123
2124 if (d->fd >= 0) {
2125 if (!event_pid_changed(e) &&
2126 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
2127 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2128
2129 safe_close(d->fd);
2130 }
2131 free(d);
2132 }
2133
2134 static int event_make_inotify_data(
2135 sd_event *e,
2136 int64_t priority,
2137 struct inotify_data **ret) {
2138
2139 _cleanup_close_ int fd = -EBADF;
2140 struct inotify_data *d;
2141 int r;
2142
2143 assert(e);
2144
2145 d = hashmap_get(e->inotify_data, &priority);
2146 if (d) {
2147 if (ret)
2148 *ret = d;
2149 return 0;
2150 }
2151
2152 fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2153 if (fd < 0)
2154 return -errno;
2155
2156 fd = fd_move_above_stdio(fd);
2157
2158 d = new(struct inotify_data, 1);
2159 if (!d)
2160 return -ENOMEM;
2161
2162 *d = (struct inotify_data) {
2163 .wakeup = WAKEUP_INOTIFY_DATA,
2164 .fd = TAKE_FD(fd),
2165 .priority = priority,
2166 };
2167
2168 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
2169 if (r < 0) {
2170 d->fd = safe_close(d->fd);
2171 free(d);
2172 return r;
2173 }
2174
2175 struct epoll_event ev = {
2176 .events = EPOLLIN,
2177 .data.ptr = d,
2178 };
2179
2180 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2181 r = -errno;
2182 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2183 * remove the fd from the epoll first, which we don't want as we couldn't
2184 * add it in the first place. */
2185 event_free_inotify_data(e, d);
2186 return r;
2187 }
2188
2189 if (ret)
2190 *ret = d;
2191
2192 return 1;
2193 }
2194
2195 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
2196 int r;
2197
2198 assert(x);
2199 assert(y);
2200
2201 r = CMP(x->dev, y->dev);
2202 if (r != 0)
2203 return r;
2204
2205 return CMP(x->ino, y->ino);
2206 }
2207
2208 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2209 assert(d);
2210
2211 siphash24_compress(&d->dev, sizeof(d->dev), state);
2212 siphash24_compress(&d->ino, sizeof(d->ino), state);
2213 }
2214
2215 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
2216
2217 static void event_free_inode_data(
2218 sd_event *e,
2219 struct inode_data *d) {
2220
2221 assert(e);
2222
2223 if (!d)
2224 return;
2225
2226 assert(!d->event_sources);
2227
2228 if (d->fd >= 0) {
2229 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
2230 safe_close(d->fd);
2231 }
2232
2233 if (d->inotify_data) {
2234
2235 if (d->wd >= 0) {
2236 if (d->inotify_data->fd >= 0 && !event_pid_changed(e)) {
2237 /* So here's a problem. At the time this runs the watch descriptor might already be
2238 * invalidated, because an IN_IGNORED event might be queued right the moment we enter
2239 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
2240 * likely case to happen. */
2241
2242 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2243 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2244 }
2245
2246 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2247 }
2248
2249 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2250 }
2251
2252 free(d);
2253 }
2254
2255 static void event_gc_inotify_data(
2256 sd_event *e,
2257 struct inotify_data *d) {
2258
2259 assert(e);
2260
2261 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2262 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2263 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2264 * (under the expectation that the GC is called again once the counter is decremented). */
2265
2266 if (!d)
2267 return;
2268
2269 if (!hashmap_isempty(d->inodes))
2270 return;
2271
2272 if (d->n_busy > 0)
2273 return;
2274
2275 event_free_inotify_data(e, d);
2276 }
2277
2278 static void event_gc_inode_data(
2279 sd_event *e,
2280 struct inode_data *d) {
2281
2282 struct inotify_data *inotify_data;
2283
2284 assert(e);
2285
2286 if (!d)
2287 return;
2288
2289 if (d->event_sources)
2290 return;
2291
2292 inotify_data = d->inotify_data;
2293 event_free_inode_data(e, d);
2294
2295 event_gc_inotify_data(e, inotify_data);
2296 }
2297
2298 static int event_make_inode_data(
2299 sd_event *e,
2300 struct inotify_data *inotify_data,
2301 dev_t dev,
2302 ino_t ino,
2303 struct inode_data **ret) {
2304
2305 struct inode_data *d, key;
2306 int r;
2307
2308 assert(e);
2309 assert(inotify_data);
2310
2311 key = (struct inode_data) {
2312 .ino = ino,
2313 .dev = dev,
2314 };
2315
2316 d = hashmap_get(inotify_data->inodes, &key);
2317 if (d) {
2318 if (ret)
2319 *ret = d;
2320
2321 return 0;
2322 }
2323
2324 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2325 if (r < 0)
2326 return r;
2327
2328 d = new(struct inode_data, 1);
2329 if (!d)
2330 return -ENOMEM;
2331
2332 *d = (struct inode_data) {
2333 .dev = dev,
2334 .ino = ino,
2335 .wd = -1,
2336 .fd = -EBADF,
2337 .inotify_data = inotify_data,
2338 };
2339
2340 r = hashmap_put(inotify_data->inodes, d, d);
2341 if (r < 0) {
2342 free(d);
2343 return r;
2344 }
2345
2346 if (ret)
2347 *ret = d;
2348
2349 return 1;
2350 }
2351
2352 static uint32_t inode_data_determine_mask(struct inode_data *d) {
2353 bool excl_unlink = true;
2354 uint32_t combined = 0;
2355
2356 assert(d);
2357
2358 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2359 * the IN_EXCL_UNLINK flag is ANDed instead.
2360 *
2361 * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or oneshot. That's
2362 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
2363 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
2364 * events we don't care for client-side. */
2365
2366 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2367
2368 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2369 excl_unlink = false;
2370
2371 combined |= s->inotify.mask;
2372 }
2373
2374 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2375 }
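/*
 * Worked example for inode_data_determine_mask(), for illustration only: assume two event sources
 * watch the same inode, one with IN_MODIFY|IN_EXCL_UNLINK and one with IN_CREATE|IN_ONESHOT. Then:
 *
 *     excl_unlink = false                      // the second source lacks IN_EXCL_UNLINK
 *     combined    = IN_MODIFY|IN_EXCL_UNLINK|IN_CREATE|IN_ONESHOT
 *     result      = combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)
 *                 = IN_MODIFY|IN_CREATE
 *
 * i.e. the kernel watch is as wide as the union of all masks, IN_EXCL_UNLINK is only kept if every
 * source requested it, and the remaining modifier flags are handled client-side instead (IN_ONESHOT
 * via SD_EVENT_ONESHOT, IN_ONLYDIR/IN_DONT_FOLLOW at open() time in sd_event_add_inotify() below).
 */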
2376
2377 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2378 uint32_t combined_mask;
2379 int wd, r;
2380
2381 assert(d);
2382 assert(d->fd >= 0);
2383
2384 combined_mask = inode_data_determine_mask(d);
2385
2386 if (d->wd >= 0 && combined_mask == d->combined_mask)
2387 return 0;
2388
2389 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2390 if (r < 0)
2391 return r;
2392
2393 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2394 if (wd < 0)
2395 return wd;
2396
2397 if (d->wd < 0) {
2398 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2399 if (r < 0) {
2400 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2401 return r;
2402 }
2403
2404 d->wd = wd;
2405
2406 } else if (d->wd != wd) {
2407
2408 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2409 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2410 return -EINVAL;
2411 }
2412
2413 d->combined_mask = combined_mask;
2414 return 1;
2415 }
2416
2417 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2418 assert(s);
2419
2420 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2421 }
2422
2423 static int event_add_inotify_fd_internal(
2424 sd_event *e,
2425 sd_event_source **ret,
2426 int fd,
2427 bool donate,
2428 uint32_t mask,
2429 sd_event_inotify_handler_t callback,
2430 void *userdata) {
2431
2432 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
2433 _cleanup_(source_freep) sd_event_source *s = NULL;
2434 struct inotify_data *inotify_data = NULL;
2435 struct inode_data *inode_data = NULL;
2436 struct stat st;
2437 int r;
2438
2439 assert_return(e, -EINVAL);
2440 assert_return(e = event_resolve(e), -ENOPKG);
2441 assert_return(fd >= 0, -EBADF);
2442 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2443 assert_return(!event_pid_changed(e), -ECHILD);
2444
2445 if (!callback)
2446 callback = inotify_exit_callback;
2447
2448 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2449 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
2450 * callers may not request it themselves. */
2451 if (mask & IN_MASK_ADD)
2452 return -EINVAL;
2453
2454 if (fstat(fd, &st) < 0)
2455 return -errno;
2456
2457 s = source_new(e, !ret, SOURCE_INOTIFY);
2458 if (!s)
2459 return -ENOMEM;
2460
2461 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2462 s->inotify.mask = mask;
2463 s->inotify.callback = callback;
2464 s->userdata = userdata;
2465
2466 /* Allocate an inotify object for this priority, and an inode object within it */
2467 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2468 if (r < 0)
2469 return r;
2470
2471 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2472 if (r < 0) {
2473 event_gc_inotify_data(e, inotify_data);
2474 return r;
2475 }
2476
2477 /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2478 * the event source until then; for that we need the original inode. */
2479 if (inode_data->fd < 0) {
2480 if (donated_fd >= 0)
2481 inode_data->fd = TAKE_FD(donated_fd);
2482 else {
2483 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2484 if (inode_data->fd < 0) {
2485 r = -errno;
2486 event_gc_inode_data(e, inode_data);
2487 return r;
2488 }
2489 }
2490
2491 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
2492 }
2493
2494 /* Link our event source to the inode data object */
2495 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2496 s->inotify.inode_data = inode_data;
2497
2498 /* Actually realize the watch now */
2499 r = inode_data_realize_watch(e, inode_data);
2500 if (r < 0)
2501 return r;
2502
2503 if (ret)
2504 *ret = s;
2505 TAKE_PTR(s);
2506
2507 return 0;
2508 }
2509
2510 _public_ int sd_event_add_inotify_fd(
2511 sd_event *e,
2512 sd_event_source **ret,
2513 int fd,
2514 uint32_t mask,
2515 sd_event_inotify_handler_t callback,
2516 void *userdata) {
2517
2518 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2519 }
2520
2521 _public_ int sd_event_add_inotify(
2522 sd_event *e,
2523 sd_event_source **ret,
2524 const char *path,
2525 uint32_t mask,
2526 sd_event_inotify_handler_t callback,
2527 void *userdata) {
2528
2529 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2530 int fd, r;
2531
2532 assert_return(path, -EINVAL);
2533
2534 fd = open(path, O_PATH | O_CLOEXEC |
2535 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2536 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2537 if (fd < 0)
2538 return -errno;
2539
2540 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2541 if (r < 0)
2542 return r;
2543
2544 (void) sd_event_source_set_description(s, path);
2545
2546 if (ret)
2547 *ret = s;
2548
2549 return r;
2550 }
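/*
 * Illustrative usage sketch (not part of this file): watching a directory for new entries.
 * on_dir_event() and the path are made up for the example.
 *
 *     static int on_dir_event(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
 *             if ((ev->mask & (IN_CREATE|IN_MOVED_TO)) && ev->len > 0)
 *                     log_debug("New directory entry: %s", ev->name);
 *             return 0;
 *     }
 *
 *     r = sd_event_add_inotify(e, &s, "/run/mydir", IN_CREATE|IN_MOVED_TO|IN_ONLYDIR,
 *                              on_dir_event, NULL);
 *
 * Multiple such sources on the same inode share a single kernel watch, see
 * inode_data_determine_mask() above.
 */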
2551
2552 static sd_event_source* event_source_free(sd_event_source *s) {
2553 if (!s)
2554 return NULL;
2555
2556 /* Here's a special hack: when we are called from a
2557 * dispatch handler we won't free the event source
2558 * immediately, but we will detach the fd from the
2559 * epoll. This way it is safe for the caller to unref
2560 * the event source and immediately close the fd, but
2561 * we still retain a valid event source object after
2562 * the callback. */
2563
2564 if (s->dispatching)
2565 source_disconnect(s);
2566 else
2567 source_free(s);
2568
2569 return NULL;
2570 }
2571
2572 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2573
2574 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2575 assert_return(s, -EINVAL);
2576 assert_return(!event_pid_changed(s->event), -ECHILD);
2577
2578 return free_and_strdup(&s->description, description);
2579 }
2580
2581 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2582 assert_return(s, -EINVAL);
2583 assert_return(description, -EINVAL);
2584 assert_return(!event_pid_changed(s->event), -ECHILD);
2585
2586 if (!s->description)
2587 return -ENXIO;
2588
2589 *description = s->description;
2590 return 0;
2591 }
2592
2593 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2594 assert_return(s, NULL);
2595
2596 return s->event;
2597 }
2598
2599 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2600 assert_return(s, -EINVAL);
2601 assert_return(s->type != SOURCE_EXIT, -EDOM);
2602 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2603 assert_return(!event_pid_changed(s->event), -ECHILD);
2604
2605 return s->pending;
2606 }
2607
2608 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2609 assert_return(s, -EINVAL);
2610 assert_return(s->type == SOURCE_IO, -EDOM);
2611 assert_return(!event_pid_changed(s->event), -ECHILD);
2612
2613 return s->io.fd;
2614 }
2615
2616 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2617 int r;
2618
2619 assert_return(s, -EINVAL);
2620 assert_return(fd >= 0, -EBADF);
2621 assert_return(s->type == SOURCE_IO, -EDOM);
2622 assert_return(!event_pid_changed(s->event), -ECHILD);
2623
2624 if (s->io.fd == fd)
2625 return 0;
2626
2627 if (event_source_is_offline(s)) {
2628 s->io.fd = fd;
2629 s->io.registered = false;
2630 } else {
2631 int saved_fd;
2632
2633 saved_fd = s->io.fd;
2634 assert(s->io.registered);
2635
2636 s->io.fd = fd;
2637 s->io.registered = false;
2638
2639 r = source_io_register(s, s->enabled, s->io.events);
2640 if (r < 0) {
2641 s->io.fd = saved_fd;
2642 s->io.registered = true;
2643 return r;
2644 }
2645
2646 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2647 }
2648
2649 return 0;
2650 }
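/*
 * Illustrative sketch (not part of this file): reusing an IO event source across a reconnect by
 * swapping in the new fd. Note that, as implemented above, the old fd is merely removed from the
 * epoll; it is not closed here, so the caller still has to close it. swap_connection() is a
 * made-up helper name.
 *
 *     static int swap_connection(sd_event_source *s, int new_fd) {
 *             int old_fd, r;
 *
 *             old_fd = sd_event_source_get_io_fd(s);
 *
 *             r = sd_event_source_set_io_fd(s, new_fd);
 *             if (r < 0)
 *                     return r;
 *
 *             safe_close(old_fd);  // the source now references new_fd only
 *             return 0;
 *     }
 */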
2651
2652 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2653 assert_return(s, -EINVAL);
2654 assert_return(s->type == SOURCE_IO, -EDOM);
2655
2656 return s->io.owned;
2657 }
2658
2659 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2660 assert_return(s, -EINVAL);
2661 assert_return(s->type == SOURCE_IO, -EDOM);
2662
2663 s->io.owned = own;
2664 return 0;
2665 }
2666
2667 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2668 assert_return(s, -EINVAL);
2669 assert_return(events, -EINVAL);
2670 assert_return(s->type == SOURCE_IO, -EDOM);
2671 assert_return(!event_pid_changed(s->event), -ECHILD);
2672
2673 *events = s->io.events;
2674 return 0;
2675 }
2676
2677 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2678 int r;
2679
2680 assert_return(s, -EINVAL);
2681 assert_return(s->type == SOURCE_IO, -EDOM);
2682 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2683 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2684 assert_return(!event_pid_changed(s->event), -ECHILD);
2685
2686 /* edge-triggered updates are never skipped, so we can reset edges */
2687 if (s->io.events == events && !(events & EPOLLET))
2688 return 0;
2689
2690 r = source_set_pending(s, false);
2691 if (r < 0)
2692 return r;
2693
2694 if (event_source_is_online(s)) {
2695 r = source_io_register(s, s->enabled, events);
2696 if (r < 0)
2697 return r;
2698 }
2699
2700 s->io.events = events;
2701
2702 return 0;
2703 }
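/*
 * Typical usage pattern (illustrative, not part of this file): only subscribe to EPOLLOUT while
 * there is buffered output pending, and drop it again once the buffer drained, so that the loop
 * doesn't spin on an always-writable socket. have_output is a made-up parameter.
 *
 *     static int update_io_events(sd_event_source *s, bool have_output) {
 *             return sd_event_source_set_io_events(s, EPOLLIN | (have_output ? EPOLLOUT : 0));
 *     }
 */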
2704
2705 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2706 assert_return(s, -EINVAL);
2707 assert_return(revents, -EINVAL);
2708 assert_return(s->type == SOURCE_IO, -EDOM);
2709 assert_return(s->pending, -ENODATA);
2710 assert_return(!event_pid_changed(s->event), -ECHILD);
2711
2712 *revents = s->io.revents;
2713 return 0;
2714 }
2715
2716 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2717 assert_return(s, -EINVAL);
2718 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2719 assert_return(!event_pid_changed(s->event), -ECHILD);
2720
2721 return s->signal.sig;
2722 }
2723
2724 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2725 assert_return(s, -EINVAL);
2726 assert_return(!event_pid_changed(s->event), -ECHILD);
2727
2728 *priority = s->priority;
2729 return 0;
2730 }
2731
2732 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2733 bool rm_inotify = false, rm_inode = false;
2734 struct inotify_data *new_inotify_data = NULL;
2735 struct inode_data *new_inode_data = NULL;
2736 int r;
2737
2738 assert_return(s, -EINVAL);
2739 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2740 assert_return(!event_pid_changed(s->event), -ECHILD);
2741
2742 if (s->priority == priority)
2743 return 0;
2744
2745 if (s->type == SOURCE_INOTIFY) {
2746 struct inode_data *old_inode_data;
2747
2748 assert(s->inotify.inode_data);
2749 old_inode_data = s->inotify.inode_data;
2750
2751 /* We need the original fd to change the priority. If we don't have it, we can't change the priority
2752 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2753 * events we allow priority changes only until the first following iteration. */
2754 if (old_inode_data->fd < 0)
2755 return -EOPNOTSUPP;
2756
2757 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2758 if (r < 0)
2759 return r;
2760 rm_inotify = r > 0;
2761
2762 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2763 if (r < 0)
2764 goto fail;
2765 rm_inode = r > 0;
2766
2767 if (new_inode_data->fd < 0) {
2768 /* Duplicate the fd for the new inode object if we don't have any yet */
2769 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2770 if (new_inode_data->fd < 0) {
2771 r = -errno;
2772 goto fail;
2773 }
2774
2775 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
2776 }
2777
2778 /* Move the event source to the new inode data structure */
2779 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2780 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2781 s->inotify.inode_data = new_inode_data;
2782
2783 /* Now create the new watch */
2784 r = inode_data_realize_watch(s->event, new_inode_data);
2785 if (r < 0) {
2786 /* Move it back */
2787 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2788 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2789 s->inotify.inode_data = old_inode_data;
2790 goto fail;
2791 }
2792
2793 s->priority = priority;
2794
2795 event_gc_inode_data(s->event, old_inode_data);
2796
2797 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2798 struct signal_data *old, *d;
2799
2800 /* Move us from the signalfd belonging to the old
2801 * priority to the signalfd of the new priority */
2802
2803 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2804
2805 s->priority = priority;
2806
2807 r = event_make_signal_data(s->event, s->signal.sig, &d);
2808 if (r < 0) {
2809 s->priority = old->priority;
2810 return r;
2811 }
2812
2813 event_unmask_signal_data(s->event, old, s->signal.sig);
2814 } else
2815 s->priority = priority;
2816
2817 event_source_pp_prioq_reshuffle(s);
2818
2819 if (s->type == SOURCE_EXIT)
2820 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2821
2822 return 0;
2823
2824 fail:
2825 if (rm_inode)
2826 event_free_inode_data(s->event, new_inode_data);
2827
2828 if (rm_inotify)
2829 event_free_inotify_data(s->event, new_inotify_data);
2830
2831 return r;
2832 }
2833
2834 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2835 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2836 if (!s && !ret)
2837 return false;
2838
2839 assert_return(s, -EINVAL);
2840 assert_return(!event_pid_changed(s->event), -ECHILD);
2841
2842 if (ret)
2843 *ret = s->enabled;
2844
2845 return s->enabled != SD_EVENT_OFF;
2846 }
2847
2848 static int event_source_offline(
2849 sd_event_source *s,
2850 int enabled,
2851 bool ratelimited) {
2852
2853 bool was_offline;
2854 int r;
2855
2856 assert(s);
2857 assert(enabled == SD_EVENT_OFF || ratelimited);
2858
2859 /* Unset the pending flag when this event source is disabled */
2860 if (s->enabled != SD_EVENT_OFF &&
2861 enabled == SD_EVENT_OFF &&
2862 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2863 r = source_set_pending(s, false);
2864 if (r < 0)
2865 return r;
2866 }
2867
2868 was_offline = event_source_is_offline(s);
2869 s->enabled = enabled;
2870 s->ratelimited = ratelimited;
2871
2872 switch (s->type) {
2873
2874 case SOURCE_IO:
2875 source_io_unregister(s);
2876 break;
2877
2878 case SOURCE_SIGNAL:
2879 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2880 break;
2881
2882 case SOURCE_CHILD:
2883 if (!was_offline) {
2884 assert(s->event->n_online_child_sources > 0);
2885 s->event->n_online_child_sources--;
2886 }
2887
2888 if (EVENT_SOURCE_WATCH_PIDFD(s))
2889 source_child_pidfd_unregister(s);
2890 else
2891 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2892 break;
2893
2894 case SOURCE_EXIT:
2895 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2896 break;
2897
2898 case SOURCE_MEMORY_PRESSURE:
2899 source_memory_pressure_unregister(s);
2900 break;
2901
2902 case SOURCE_TIME_REALTIME:
2903 case SOURCE_TIME_BOOTTIME:
2904 case SOURCE_TIME_MONOTONIC:
2905 case SOURCE_TIME_REALTIME_ALARM:
2906 case SOURCE_TIME_BOOTTIME_ALARM:
2907 case SOURCE_DEFER:
2908 case SOURCE_POST:
2909 case SOURCE_INOTIFY:
2910 break;
2911
2912 default:
2913 assert_not_reached();
2914 }
2915
2916 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2917 event_source_time_prioq_reshuffle(s);
2918
2919 return 1;
2920 }
2921
2922 static int event_source_online(
2923 sd_event_source *s,
2924 int enabled,
2925 bool ratelimited) {
2926
2927 bool was_online;
2928 int r;
2929
2930 assert(s);
2931 assert(enabled != SD_EVENT_OFF || !ratelimited);
2932
2933 /* Unset the pending flag when this event source is enabled */
2934 if (s->enabled == SD_EVENT_OFF &&
2935 enabled != SD_EVENT_OFF &&
2936 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2937 r = source_set_pending(s, false);
2938 if (r < 0)
2939 return r;
2940 }
2941
2942 /* Are we really ready for onlining? */
2943 if (enabled == SD_EVENT_OFF || ratelimited) {
2944 /* Nope, we are not ready for onlining; just update the precise state and exit */
2945 s->enabled = enabled;
2946 s->ratelimited = ratelimited;
2947 return 0;
2948 }
2949
2950 was_online = event_source_is_online(s);
2951
2952 switch (s->type) {
2953 case SOURCE_IO:
2954 r = source_io_register(s, enabled, s->io.events);
2955 if (r < 0)
2956 return r;
2957 break;
2958
2959 case SOURCE_SIGNAL:
2960 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2961 if (r < 0) {
2962 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2963 return r;
2964 }
2965
2966 break;
2967
2968 case SOURCE_CHILD:
2969 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2970 /* yes, we have pidfd */
2971
2972 r = source_child_pidfd_register(s, enabled);
2973 if (r < 0)
2974 return r;
2975 } else {
2976 /* no pidfd, or something other to watch for than WEXITED */
2977
2978 r = event_make_signal_data(s->event, SIGCHLD, NULL);
2979 if (r < 0) {
2980 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2981 return r;
2982 }
2983 }
2984
2985 if (!was_online)
2986 s->event->n_online_child_sources++;
2987 break;
2988
2989 case SOURCE_MEMORY_PRESSURE:
2990 r = source_memory_pressure_register(s, enabled);
2991 if (r < 0)
2992 return r;
2993
2994 break;
2995
2996 case SOURCE_TIME_REALTIME:
2997 case SOURCE_TIME_BOOTTIME:
2998 case SOURCE_TIME_MONOTONIC:
2999 case SOURCE_TIME_REALTIME_ALARM:
3000 case SOURCE_TIME_BOOTTIME_ALARM:
3001 case SOURCE_EXIT:
3002 case SOURCE_DEFER:
3003 case SOURCE_POST:
3004 case SOURCE_INOTIFY:
3005 break;
3006
3007 default:
3008 assert_not_reached();
3009 }
3010
3011 s->enabled = enabled;
3012 s->ratelimited = ratelimited;
3013
3014 /* Non-failing operations below */
3015 if (s->type == SOURCE_EXIT)
3016 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
3017
3018 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3019 event_source_time_prioq_reshuffle(s);
3020
3021 return 1;
3022 }
3023
3024 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3025 int r;
3026
3027 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
3028
3029 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3030 if (m == SD_EVENT_OFF && !s)
3031 return 0;
3032
3033 assert_return(s, -EINVAL);
3034 assert_return(!event_pid_changed(s->event), -ECHILD);
3035
3036 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3037 if (s->event->state == SD_EVENT_FINISHED)
3038 return m == SD_EVENT_OFF ? 0 : -ESTALE;
3039
3040 if (s->enabled == m) /* No change? */
3041 return 0;
3042
3043 if (m == SD_EVENT_OFF)
3044 r = event_source_offline(s, m, s->ratelimited);
3045 else {
3046 if (s->enabled != SD_EVENT_OFF) {
3047 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3048 * event source is already enabled after all. */
3049 s->enabled = m;
3050 return 0;
3051 }
3052
3053 r = event_source_online(s, m, s->ratelimited);
3054 }
3055 if (r < 0)
3056 return r;
3057
3058 event_source_pp_prioq_reshuffle(s);
3059 return 0;
3060 }
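/*
 * Illustrative sketch (not part of this file): SD_EVENT_ONESHOT sources are switched to
 * SD_EVENT_OFF right before their callback runs (see source_dispatch() below), hence a handler
 * that wants to keep going simply re-arms itself. on_ready() is a made-up handler name.
 *
 *     static int on_ready(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             // ... handle one unit of work ...
 *
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);  // re-arm
 *     }
 */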
3061
3062 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
3063 assert_return(s, -EINVAL);
3064 assert_return(usec, -EINVAL);
3065 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3066 assert_return(!event_pid_changed(s->event), -ECHILD);
3067
3068 *usec = s->time.next;
3069 return 0;
3070 }
3071
3072 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
3073 int r;
3074
3075 assert_return(s, -EINVAL);
3076 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3077 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3078 assert_return(!event_pid_changed(s->event), -ECHILD);
3079
3080 r = source_set_pending(s, false);
3081 if (r < 0)
3082 return r;
3083
3084 s->time.next = usec;
3085
3086 event_source_time_prioq_reshuffle(s);
3087 return 0;
3088 }
3089
3090 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3091 usec_t t;
3092 int r;
3093
3094 assert_return(s, -EINVAL);
3095 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3096
3097 if (usec == USEC_INFINITY)
3098 return sd_event_source_set_time(s, USEC_INFINITY);
3099
3100 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3101 if (r < 0)
3102 return r;
3103
3104 usec = usec_add(t, usec);
3105 if (usec == USEC_INFINITY)
3106 return -EOVERFLOW;
3107
3108 return sd_event_source_set_time(s, usec);
3109 }
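/*
 * Illustrative sketch (not part of this file): a simple periodic timer built from a oneshot time
 * source that re-arms itself relative to "now" from within its own callback. on_tick() is a
 * made-up handler name.
 *
 *     static int on_tick(sd_event_source *s, uint64_t usec, void *userdata) {
 *             int r;
 *
 *             // ... periodic work ...
 *
 *             r = sd_event_source_set_time_relative(s, 5 * USEC_PER_SEC);
 *             if (r < 0)
 *                     return r;
 *
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
 *     }
 */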
3110
3111 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
3112 assert_return(s, -EINVAL);
3113 assert_return(usec, -EINVAL);
3114 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3115 assert_return(!event_pid_changed(s->event), -ECHILD);
3116
3117 *usec = s->time.accuracy;
3118 return 0;
3119 }
3120
3121 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
3122 int r;
3123
3124 assert_return(s, -EINVAL);
3125 assert_return(usec != UINT64_MAX, -EINVAL);
3126 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3127 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3128 assert_return(!event_pid_changed(s->event), -ECHILD);
3129
3130 r = source_set_pending(s, false);
3131 if (r < 0)
3132 return r;
3133
3134 if (usec == 0)
3135 usec = DEFAULT_ACCURACY_USEC;
3136
3137 s->time.accuracy = usec;
3138
3139 event_source_time_prioq_reshuffle(s);
3140 return 0;
3141 }
3142
3143 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3144 assert_return(s, -EINVAL);
3145 assert_return(clock, -EINVAL);
3146 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3147 assert_return(!event_pid_changed(s->event), -ECHILD);
3148
3149 *clock = event_source_type_to_clock(s->type);
3150 return 0;
3151 }
3152
3153 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
3154 assert_return(s, -EINVAL);
3155 assert_return(pid, -EINVAL);
3156 assert_return(s->type == SOURCE_CHILD, -EDOM);
3157 assert_return(!event_pid_changed(s->event), -ECHILD);
3158
3159 *pid = s->child.pid;
3160 return 0;
3161 }
3162
3163 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3164 assert_return(s, -EINVAL);
3165 assert_return(s->type == SOURCE_CHILD, -EDOM);
3166 assert_return(!event_pid_changed(s->event), -ECHILD);
3167
3168 if (s->child.pidfd < 0)
3169 return -EOPNOTSUPP;
3170
3171 return s->child.pidfd;
3172 }
3173
3174 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3175 assert_return(s, -EINVAL);
3176 assert_return(s->type == SOURCE_CHILD, -EDOM);
3177 assert_return(!event_pid_changed(s->event), -ECHILD);
3178 assert_return(SIGNAL_VALID(sig), -EINVAL);
3179
3180 /* If we have already seen an indication that the process exited, refuse sending a signal early. This way we
3181 * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3182 * available. */
3183 if (s->child.exited)
3184 return -ESRCH;
3185
3186 if (s->child.pidfd >= 0) {
3187 siginfo_t copy;
3188
3189 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
3190 * structure here */
3191 if (si)
3192 copy = *si;
3193
3194 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3195 /* Let's propagate the error only if the system call is not implemented or prohibited */
3196 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3197 return -errno;
3198 } else
3199 return 0;
3200 }
3201
3202 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3203 * this here. */
3204 if (flags != 0)
3205 return -EOPNOTSUPP;
3206
3207 if (si) {
3208 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3209 siginfo_t copy = *si;
3210
3211 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3212 return -errno;
3213 } else if (kill(s->child.pid, sig) < 0)
3214 return -errno;
3215
3216 return 0;
3217 }
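/*
 * Illustrative sketch (not part of this file): asking a tracked child to terminate. The call
 * prefers pidfd_send_signal() and falls back to kill(), as implemented above, so it is safe
 * against PID reuse where pidfds are available. child_source stands for a source previously
 * created with sd_event_add_child().
 *
 *     r = sd_event_source_send_child_signal(child_source, SIGTERM, NULL, 0);
 *     if (r == -ESRCH)
 *             log_debug("Child already exited, nothing to do.");
 *     else if (r < 0)
 *             return r;
 */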
3218
3219 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3220 assert_return(s, -EINVAL);
3221 assert_return(s->type == SOURCE_CHILD, -EDOM);
3222
3223 if (s->child.pidfd < 0)
3224 return -EOPNOTSUPP;
3225
3226 return s->child.pidfd_owned;
3227 }
3228
3229 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3230 assert_return(s, -EINVAL);
3231 assert_return(s->type == SOURCE_CHILD, -EDOM);
3232
3233 if (s->child.pidfd < 0)
3234 return -EOPNOTSUPP;
3235
3236 s->child.pidfd_owned = own;
3237 return 0;
3238 }
3239
3240 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3241 assert_return(s, -EINVAL);
3242 assert_return(s->type == SOURCE_CHILD, -EDOM);
3243
3244 return s->child.process_owned;
3245 }
3246
3247 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3248 assert_return(s, -EINVAL);
3249 assert_return(s->type == SOURCE_CHILD, -EDOM);
3250
3251 s->child.process_owned = own;
3252 return 0;
3253 }
3254
3255 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
3256 assert_return(s, -EINVAL);
3257 assert_return(mask, -EINVAL);
3258 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3259 assert_return(!event_pid_changed(s->event), -ECHILD);
3260
3261 *mask = s->inotify.mask;
3262 return 0;
3263 }
3264
3265 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
3266 int r;
3267
3268 assert_return(s, -EINVAL);
3269 assert_return(s->type != SOURCE_EXIT, -EDOM);
3270 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3271 assert_return(!event_pid_changed(s->event), -ECHILD);
3272
3273 if (s->prepare == callback)
3274 return 0;
3275
3276 if (callback && s->prepare) {
3277 s->prepare = callback;
3278 return 0;
3279 }
3280
3281 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3282 if (r < 0)
3283 return r;
3284
3285 s->prepare = callback;
3286
3287 if (callback) {
3288 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3289 if (r < 0)
3290 return r;
3291 } else
3292 prioq_remove(s->event->prepare, s, &s->prepare_index);
3293
3294 return 0;
3295 }
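/*
 * Illustrative sketch (not part of this file): a prepare callback runs right before the loop
 * polls and is a convenient place to adjust the watched IO events to the current buffer state.
 * struct connection, its outbuf_size field and io_source are made up for the example.
 *
 *     static int prepare_io(sd_event_source *s, void *userdata) {
 *             struct connection *c = userdata;
 *
 *             return sd_event_source_set_io_events(s, EPOLLIN | (c->outbuf_size > 0 ? EPOLLOUT : 0));
 *     }
 *
 *     r = sd_event_source_set_prepare(io_source, prepare_io);
 */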
3296
3297 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
3298 assert_return(s, NULL);
3299
3300 return s->userdata;
3301 }
3302
3303 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3304 void *ret;
3305
3306 assert_return(s, NULL);
3307
3308 ret = s->userdata;
3309 s->userdata = userdata;
3310
3311 return ret;
3312 }
3313
3314 static int event_source_enter_ratelimited(sd_event_source *s) {
3315 int r;
3316
3317 assert(s);
3318
3319 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
3320 * the end of the rate limit time window, much as if it was a timer event source. */
3321
3322 if (s->ratelimited)
3323 return 0; /* Already ratelimited, hence this is a NOP */
3324
3325 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3326 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3327 if (r < 0)
3328 return r;
3329
3330 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
3331 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
3332 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
3333 if (EVENT_SOURCE_IS_TIME(s->type))
3334 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3335
3336 /* Now, let's add the event source to the monotonic clock instead */
3337 r = event_source_time_prioq_put(s, &s->event->monotonic);
3338 if (r < 0)
3339 goto fail;
3340
3341 /* And let's take the event source officially offline */
3342 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3343 if (r < 0) {
3344 event_source_time_prioq_remove(s, &s->event->monotonic);
3345 goto fail;
3346 }
3347
3348 event_source_pp_prioq_reshuffle(s);
3349
3350 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3351 return 0;
3352
3353 fail:
3354 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
3355 * space for it should already be allocated. */
3356 if (EVENT_SOURCE_IS_TIME(s->type))
3357 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3358
3359 return r;
3360 }
3361
3362 static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
3363 int r;
3364
3365 assert(s);
3366
3367 if (!s->ratelimited)
3368 return 0;
3369
3370 /* Let's take the event source out of the monotonic prioq first. */
3371 event_source_time_prioq_remove(s, &s->event->monotonic);
3372
3373 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3374 if (EVENT_SOURCE_IS_TIME(s->type)) {
3375 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3376 if (r < 0)
3377 goto fail;
3378 }
3379
3380 /* Let's try to take it online again. */
3381 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3382 if (r < 0) {
3383 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3384 if (EVENT_SOURCE_IS_TIME(s->type))
3385 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3386
3387 goto fail;
3388 }
3389
3390 event_source_pp_prioq_reshuffle(s);
3391 ratelimit_reset(&s->rate_limit);
3392
3393 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3394
3395 if (run_callback && s->ratelimit_expire_callback) {
3396 s->dispatching = true;
3397 r = s->ratelimit_expire_callback(s, s->userdata);
3398 s->dispatching = false;
3399
3400 if (r < 0) {
3401 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3402 strna(s->description),
3403 event_source_type_to_string(s->type),
3404 s->exit_on_failure ? "exiting" : "disabling");
3405
3406 if (s->exit_on_failure)
3407 (void) sd_event_exit(s->event, r);
3408 }
3409
3410 if (s->n_ref == 0)
3411 source_free(s);
3412 else if (r < 0)
3413 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3414
3415 return 1;
3416 }
3417
3418 return 0;
3419
3420 fail:
3421 /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
3422 * simply put it back in it, maybe we can then process it more successfully next iteration. */
3423 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3424
3425 return r;
3426 }
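/*
 * Illustrative sketch (not part of this file): how the rate limit machinery above is driven from
 * the public API. A source limited to 10 dispatches per second goes offline once the budget is
 * used up and is brought back by process_timer() when the window ends; the optional expiry
 * callback is what event_source_leave_ratelimit() invokes above. on_ratelimit_expired is a
 * made-up sd_event_handler_t.
 *
 *     r = sd_event_source_set_ratelimit(s, USEC_PER_SEC, 10);
 *     if (r < 0)
 *             return r;
 *
 *     r = sd_event_source_set_ratelimit_expire_callback(s, on_ratelimit_expired);
 *     if (r < 0)
 *             return r;
 */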
3427
3428 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3429 usec_t c;
3430 assert(e);
3431 assert(a <= b);
3432
3433 if (a <= 0)
3434 return 0;
3435 if (a >= USEC_INFINITY)
3436 return USEC_INFINITY;
3437
3438 if (b <= a + 1)
3439 return a;
3440
3441 initialize_perturb(e);
3442
3443 /*
3444 Find a good time to wake up again between times a and b. We
3445 have two goals here:
3446
3447 a) We want to wake up as seldom as possible, hence prefer
3448 later times over earlier times.
3449
3450 b) But if we have to wake up, then let's make sure to
3451 dispatch as much as possible on the entire system.
3452
3453 We implement this by waking up everywhere at the same time
3454 within any given minute if we can, synchronised via the
3455 perturbation value determined from the boot ID. If we can't,
3456 then we try to find the same spot in every 10s, then 1s and
3457 then 250ms step. Otherwise, we pick the last possible time
3458 to wake up.
3459 */
3460
3461 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3462 if (c >= b) {
3463 if (_unlikely_(c < USEC_PER_MINUTE))
3464 return b;
3465
3466 c -= USEC_PER_MINUTE;
3467 }
3468
3469 if (c >= a)
3470 return c;
3471
3472 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3473 if (c >= b) {
3474 if (_unlikely_(c < USEC_PER_SEC*10))
3475 return b;
3476
3477 c -= USEC_PER_SEC*10;
3478 }
3479
3480 if (c >= a)
3481 return c;
3482
3483 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3484 if (c >= b) {
3485 if (_unlikely_(c < USEC_PER_SEC))
3486 return b;
3487
3488 c -= USEC_PER_SEC;
3489 }
3490
3491 if (c >= a)
3492 return c;
3493
3494 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3495 if (c >= b) {
3496 if (_unlikely_(c < USEC_PER_MSEC*250))
3497 return b;
3498
3499 c -= USEC_PER_MSEC*250;
3500 }
3501
3502 if (c >= a)
3503 return c;
3504
3505 return b;
3506 }
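/*
 * Worked example for sleep_between(), for illustration only: assume perturb = 13.5s (the per-boot
 * value is always below one minute) and a window of a = 1005s, b = 1012s on the relevant clock:
 *
 *     minute step: c = 960s  + 13.5s = 973.5s   -> below a, rejected
 *     10s step:    c = 1010s + 3.5s  = 1013.5s  -> >= b, minus 10s = 1003.5s, still below a
 *     1s step:     c = 1012s + 0.5s  = 1012.5s  -> >= b, minus 1s  = 1011.5s, within [a, b]
 *
 * so we'd wake at 1011.5s: other loops on the same boot resolve to the same ".5s" offset within
 * their second whenever their window allows it, which is what coalesces wakeups system-wide.
 */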
3507
3508 static int event_arm_timer(
3509 sd_event *e,
3510 struct clock_data *d) {
3511
3512 struct itimerspec its = {};
3513 sd_event_source *a, *b;
3514 usec_t t;
3515
3516 assert(e);
3517 assert(d);
3518
3519 if (!d->needs_rearm)
3520 return 0;
3521
3522 d->needs_rearm = false;
3523
3524 a = prioq_peek(d->earliest);
3525 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3526 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3527
3528 if (d->fd < 0)
3529 return 0;
3530
3531 if (d->next == USEC_INFINITY)
3532 return 0;
3533
3534 /* disarm */
3535 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3536 return -errno;
3537
3538 d->next = USEC_INFINITY;
3539 return 0;
3540 }
3541
3542 b = prioq_peek(d->latest);
3543 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3544 assert(b && b->enabled != SD_EVENT_OFF);
3545
3546 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3547 if (d->next == t)
3548 return 0;
3549
3550 assert_se(d->fd >= 0);
3551
3552 if (t == 0) {
3553 /* We don't want to disarm here, just mean some time looooong ago. */
3554 its.it_value.tv_sec = 0;
3555 its.it_value.tv_nsec = 1;
3556 } else
3557 timespec_store(&its.it_value, t);
3558
3559 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3560 return -errno;
3561
3562 d->next = t;
3563 return 0;
3564 }
3565
3566 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3567 assert(e);
3568 assert(s);
3569 assert(s->type == SOURCE_IO);
3570
3571 /* If the event source was already pending, we just OR in the
3572 * new revents, otherwise we reset the value. The ORing is
3573 * necessary to handle EPOLLONESHOT events properly where
3574 * readability might happen independently of writability, and
3575 * we need to keep track of both */
3576
3577 if (s->pending)
3578 s->io.revents |= revents;
3579 else
3580 s->io.revents = revents;
3581
3582 return source_set_pending(s, true);
3583 }
3584
3585 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3586 uint64_t x;
3587 ssize_t ss;
3588
3589 assert(e);
3590 assert(fd >= 0);
3591
3592 assert_return(events == EPOLLIN, -EIO);
3593
3594 ss = read(fd, &x, sizeof(x));
3595 if (ss < 0) {
3596 if (ERRNO_IS_TRANSIENT(errno))
3597 return 0;
3598
3599 return -errno;
3600 }
3601
3602 if (_unlikely_(ss != sizeof(x)))
3603 return -EIO;
3604
3605 if (next)
3606 *next = USEC_INFINITY;
3607
3608 return 0;
3609 }
3610
3611 static int process_timer(
3612 sd_event *e,
3613 usec_t n,
3614 struct clock_data *d) {
3615
3616 sd_event_source *s;
3617 bool callback_invoked = false;
3618 int r;
3619
3620 assert(e);
3621 assert(d);
3622
3623 for (;;) {
3624 s = prioq_peek(d->earliest);
3625 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3626
3627 if (!s || time_event_source_next(s) > n)
3628 break;
3629
3630 if (s->ratelimited) {
3631 /* This is an event source whose ratelimit window has ended. Let's turn it on
3632 * again. */
3633 assert(s->ratelimited);
3634
3635 r = event_source_leave_ratelimit(s, /* run_callback */ true);
3636 if (r < 0)
3637 return r;
3638 else if (r == 1)
3639 callback_invoked = true;
3640
3641 continue;
3642 }
3643
3644 if (s->enabled == SD_EVENT_OFF || s->pending)
3645 break;
3646
3647 r = source_set_pending(s, true);
3648 if (r < 0)
3649 return r;
3650
3651 event_source_time_prioq_reshuffle(s);
3652 }
3653
3654 return callback_invoked;
3655 }
3656
3657 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3658 int64_t min_priority = threshold;
3659 bool something_new = false;
3660 sd_event_source *s;
3661 int r;
3662
3663 assert(e);
3664 assert(ret_min_priority);
3665
3666 if (!e->need_process_child) {
3667 *ret_min_priority = min_priority;
3668 return 0;
3669 }
3670
3671 e->need_process_child = false;
3672
3673 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3674 * for, instead of using P_ALL. This is because we only want to get child information of very
3675 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3676 * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3677 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3678 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3679 * to handle SIGCHLD yourself.
3680 *
3681 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3682 * source is dispatched so that the callback still sees the process as a zombie. */
3683
3684 HASHMAP_FOREACH(s, e->child_sources) {
3685 assert(s->type == SOURCE_CHILD);
3686
3687 if (s->priority > threshold)
3688 continue;
3689
3690 if (s->pending)
3691 continue;
3692
3693 if (event_source_is_offline(s))
3694 continue;
3695
3696 if (s->child.exited)
3697 continue;
3698
3699 if (EVENT_SOURCE_WATCH_PIDFD(s))
3700 /* There's a usable pidfd known for this event source? Then don't waitid() for
3701 * it here */
3702 continue;
3703
3704 zero(s->child.siginfo);
3705 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3706 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3707 return negative_errno();
3708
3709 if (s->child.siginfo.si_pid != 0) {
3710 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3711
3712 if (zombie)
3713 s->child.exited = true;
3714
3715 if (!zombie && (s->child.options & WEXITED)) {
3716 /* If the child isn't dead then let's immediately remove the state
3717 * change from the queue, since there's no benefit in leaving it
3718 * queued. */
3719
3720 assert(s->child.options & (WSTOPPED|WCONTINUED));
3721 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3722 }
3723
3724 r = source_set_pending(s, true);
3725 if (r < 0)
3726 return r;
3727 if (r > 0) {
3728 something_new = true;
3729 min_priority = MIN(min_priority, s->priority);
3730 }
3731 }
3732 }
3733
3734 *ret_min_priority = min_priority;
3735 return something_new;
3736 }
3737
3738 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3739 assert(e);
3740 assert(s);
3741 assert(s->type == SOURCE_CHILD);
3742
3743 if (s->pending)
3744 return 0;
3745
3746 if (event_source_is_offline(s))
3747 return 0;
3748
3749 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3750 return 0;
3751
3752 zero(s->child.siginfo);
3753 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3754 return -errno;
3755
3756 if (s->child.siginfo.si_pid == 0)
3757 return 0;
3758
3759 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3760 s->child.exited = true;
3761
3762 return source_set_pending(s, true);
3763 }
3764
3765 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3766 int r;
3767
3768 assert(e);
3769 assert(d);
3770 assert_return(events == EPOLLIN, -EIO);
3771 assert(min_priority);
3772
3773 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3774 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3775 * per priority, and if we dequeue one, SIGCHLD might be enqueued later and we wouldn't know,
3776 * but we might have higher priority children we care about, hence we need to check that
3777 * explicitly. */
3778
3779 if (sigismember(&d->sigset, SIGCHLD))
3780 e->need_process_child = true;
3781
3782 /* If there's already an event source pending for this priority we don't read another */
3783 if (d->current)
3784 return 0;
3785
3786 for (;;) {
3787 struct signalfd_siginfo si;
3788 ssize_t n;
3789 sd_event_source *s = NULL;
3790
3791 n = read(d->fd, &si, sizeof(si));
3792 if (n < 0) {
3793 if (ERRNO_IS_TRANSIENT(errno))
3794 return 0;
3795
3796 return -errno;
3797 }
3798
3799 if (_unlikely_(n != sizeof(si)))
3800 return -EIO;
3801
3802 assert(SIGNAL_VALID(si.ssi_signo));
3803
3804 if (e->signal_sources)
3805 s = e->signal_sources[si.ssi_signo];
3806 if (!s)
3807 continue;
3808 if (s->pending)
3809 continue;
3810
3811 s->signal.siginfo = si;
3812 d->current = s;
3813
3814 r = source_set_pending(s, true);
3815 if (r < 0)
3816 return r;
3817 if (r > 0 && *min_priority >= s->priority) {
3818 *min_priority = s->priority;
3819 return 1; /* an event source with smaller priority is queued. */
3820 }
3821
3822 return 0;
3823 }
3824 }
3825
3826 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3827 ssize_t n;
3828
3829 assert(e);
3830 assert(d);
3831
3832 assert_return(revents == EPOLLIN, -EIO);
3833
3834 /* If there's already an event source pending for this priority, don't read another */
3835 if (d->n_pending > 0)
3836 return 0;
3837
3838 /* Is the read buffer non-empty? If so, let's not read more */
3839 if (d->buffer_filled > 0)
3840 return 0;
3841
3842 if (d->priority > threshold)
3843 return 0;
3844
3845 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3846 if (n < 0) {
3847 if (ERRNO_IS_TRANSIENT(errno))
3848 return 0;
3849
3850 return -errno;
3851 }
3852
3853 assert(n > 0);
3854 d->buffer_filled = (size_t) n;
3855 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
3856
3857 return 1;
3858 }
3859
3860 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3861 assert(e);
3862 assert(d);
3863 assert(sz <= d->buffer_filled);
3864
3865 if (sz == 0)
3866 return;
3867
3868 /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3869 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3870 d->buffer_filled -= sz;
3871
3872 if (d->buffer_filled == 0)
3873 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
3874 }
3875
3876 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3877 int r;
3878
3879 assert(e);
3880 assert(d);
3881
3882 /* If there's already an event source pending for this priority, don't read another */
3883 if (d->n_pending > 0)
3884 return 0;
3885
3886 while (d->buffer_filled > 0) {
3887 size_t sz;
3888
3889 /* Let's validate that the event structures are complete */
3890 if (d->buffer_filled < offsetof(struct inotify_event, name))
3891 return -EIO;
3892
3893 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3894 if (d->buffer_filled < sz)
3895 return -EIO;
3896
3897 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3898 struct inode_data *inode_data;
3899
3900 /* The queue overran, let's pass this event to all event sources connected to this inotify
3901 * object */
3902
3903 HASHMAP_FOREACH(inode_data, d->inodes)
3904 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3905
3906 if (event_source_is_offline(s))
3907 continue;
3908
3909 r = source_set_pending(s, true);
3910 if (r < 0)
3911 return r;
3912 }
3913 } else {
3914 struct inode_data *inode_data;
3915
3916 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3917 * our watch descriptor table. */
3918 if (d->buffer.ev.mask & IN_IGNORED) {
3919
3920 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3921 if (!inode_data) {
3922 event_inotify_data_drop(e, d, sz);
3923 continue;
3924 }
3925
3926 /* The watch descriptor was removed by the kernel, let's drop it here too */
3927 inode_data->wd = -1;
3928 } else {
3929 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3930 if (!inode_data) {
3931 event_inotify_data_drop(e, d, sz);
3932 continue;
3933 }
3934 }
3935
3936 /* Trigger all event sources that are interested in these events. Also trigger all event
3937 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3938 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3939
3940 if (event_source_is_offline(s))
3941 continue;
3942
3943 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3944 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3945 continue;
3946
3947 r = source_set_pending(s, true);
3948 if (r < 0)
3949 return r;
3950 }
3951 }
3952
3953 /* Something pending now? If so, let's finish, otherwise let's read more. */
3954 if (d->n_pending > 0)
3955 return 1;
3956 }
3957
3958 return 0;
3959 }
3960
3961 static int process_inotify(sd_event *e) {
3962 int r, done = 0;
3963
3964 assert(e);
3965
3966 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
3967 r = event_inotify_data_process(e, d);
3968 if (r < 0)
3969 return r;
3970 if (r > 0)
3971 done++;
3972 }
3973
3974 return done;
3975 }
3976
3977 static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
3978 assert(s);
3979 assert(s->type == SOURCE_MEMORY_PRESSURE);
3980
3981 if (s->pending)
3982 s->memory_pressure.revents |= revents;
3983 else
3984 s->memory_pressure.revents = revents;
3985
3986 return source_set_pending(s, true);
3987 }
3988
3989 static int source_memory_pressure_write(sd_event_source *s) {
3990 ssize_t n;
3991 int r;
3992
3993 assert(s);
3994 assert(s->type == SOURCE_MEMORY_PRESSURE);
3995
3996 /* Once we start writing, the buffer is locked; we allow no further changes. */
3997 s->memory_pressure.locked = true;
3998
3999 if (s->memory_pressure.write_buffer_size > 0) {
4000 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4001 if (n < 0) {
4002 if (!ERRNO_IS_TRANSIENT(errno))
4003 return -errno;
4004
4005 n = 0;
4006 }
4007 } else
4008 n = 0;
4009
4010 assert(n >= 0);
4011
4012 if ((size_t) n == s->memory_pressure.write_buffer_size) {
4013 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4014
4015 if (n > 0) {
4016 s->memory_pressure.write_buffer_size = 0;
4017
4018 /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4019 r = source_memory_pressure_register(s, s->enabled);
4020 if (r < 0)
4021 return r;
4022 }
4023 } else if (n > 0) {
4024 _cleanup_free_ void *c = NULL;
4025
4026 assert((size_t) n < s->memory_pressure.write_buffer_size);
4027
4028 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4029 if (!c)
4030 return -ENOMEM;
4031
4032 free_and_replace(s->memory_pressure.write_buffer, c);
4033 s->memory_pressure.write_buffer_size -= n;
4034 return 1;
4035 }
4036
4037 return 0;
4038 }
4039
4040 static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4041 int r;
4042
4043 assert(s);
4044 assert(s->type == SOURCE_MEMORY_PRESSURE);
4045
4046 r = source_memory_pressure_write(s);
4047 if (r < 0)
4048 return r;
4049 if (r > 0)
4050 return 1; /* If we wrote something, don't continue with dispatching the user callback.
4051 * Instead, shortcut it so that we wait for the next EPOLLOUT immediately. */
4052
4053 /* No pending incoming IO? Then let's not continue further */
4054 if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4055
4056 /* Treat IO errors on the notifier the same way as errors returned from a callback */
4057 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4058 return -EIO;
4059
4060 return 1; /* leave dispatch, we already processed everything */
4061 }
4062
4063 if (s->memory_pressure.revents & EPOLLIN) {
4064 uint8_t pipe_buf[PIPE_BUF];
4065 ssize_t n;
4066
4067 /* If the fd is readable, then flush out anything that might be queued */
4068
4069 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4070 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4071 return -errno;
4072 }
4073
4074 return 0; /* go on, dispatch to user callback */
4075 }
4076
4077 static int source_dispatch(sd_event_source *s) {
4078 EventSourceType saved_type;
4079 sd_event *saved_event;
4080 int r = 0;
4081
4082 assert(s);
4083 assert(s->pending || s->type == SOURCE_EXIT);
4084
4085 /* Save the event source type here, so that we still know it after the event callback, which might
4086 * invalidate the event source. */
4087 saved_type = s->type;
4088
4089 /* Similarly, store a reference to the event loop object, so that we can still access it after the
4090 * callback might have invalidated/disconnected the event source. */
4091 saved_event = s->event;
4092 PROTECT_EVENT(saved_event);
4093
4094 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
4095 assert(!s->ratelimited);
4096 if (!ratelimit_below(&s->rate_limit)) {
4097 r = event_source_enter_ratelimited(s);
4098 if (r < 0)
4099 return r;
4100
4101 return 1;
4102 }
4103
4104 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
4105 r = source_set_pending(s, false);
4106 if (r < 0)
4107 return r;
4108 }
4109
4110 if (s->type != SOURCE_POST) {
4111 sd_event_source *z;
4112
4113 /* If we execute a non-post source, let's mark all post sources as pending. */
4114
4115 SET_FOREACH(z, s->event->post_sources) {
4116 if (event_source_is_offline(z))
4117 continue;
4118
4119 r = source_set_pending(z, true);
4120 if (r < 0)
4121 return r;
4122 }
4123 }
4124
4125 if (s->type == SOURCE_MEMORY_PRESSURE) {
4126 r = source_memory_pressure_initiate_dispatch(s);
4127 if (r == -EIO) /* handle EIO errors similar to callback errors */
4128 goto finish;
4129 if (r < 0)
4130 return r;
4131 if (r > 0) /* already handled */
4132 return 1;
4133 }
4134
4135 if (s->enabled == SD_EVENT_ONESHOT) {
4136 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
4137 if (r < 0)
4138 return r;
4139 }
4140
4141 s->dispatching = true;
4142
4143 switch (s->type) {
4144
4145 case SOURCE_IO:
4146 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4147 break;
4148
4149 case SOURCE_TIME_REALTIME:
4150 case SOURCE_TIME_BOOTTIME:
4151 case SOURCE_TIME_MONOTONIC:
4152 case SOURCE_TIME_REALTIME_ALARM:
4153 case SOURCE_TIME_BOOTTIME_ALARM:
4154 r = s->time.callback(s, s->time.next, s->userdata);
4155 break;
4156
4157 case SOURCE_SIGNAL:
4158 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4159 break;
4160
4161 case SOURCE_CHILD: {
4162 bool zombie;
4163
4164 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
4165
4166 r = s->child.callback(s, &s->child.siginfo, s->userdata);
4167
4168 /* Now, reap the PID for good. */
4169 if (zombie) {
4170 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
4171 s->child.waited = true;
4172 }
4173
4174 break;
4175 }
4176
4177 case SOURCE_DEFER:
4178 r = s->defer.callback(s, s->userdata);
4179 break;
4180
4181 case SOURCE_POST:
4182 r = s->post.callback(s, s->userdata);
4183 break;
4184
4185 case SOURCE_EXIT:
4186 r = s->exit.callback(s, s->userdata);
4187 break;
4188
4189 case SOURCE_INOTIFY: {
4190 struct sd_event *e = s->event;
4191 struct inotify_data *d;
4192 size_t sz;
4193
4194 assert(s->inotify.inode_data);
4195 assert_se(d = s->inotify.inode_data->inotify_data);
4196
4197 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4198 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4199 assert(d->buffer_filled >= sz);
4200
4201 /* If the inotify callback destroys the event source then this likely means we don't need to
4202 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4203 * free it immediately, then we couldn't drop the event from the inotify event queue without
4204 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
4205 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4206 * explicitly GC it after we are done dropping the inotify event from the buffer. */
4207 d->n_busy++;
4208 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
4209 d->n_busy--;
4210
4211 /* When no event is pending anymore on this inotify object, then let's drop the event from
4212 * the inotify event queue buffer. */
4213 if (d->n_pending == 0)
4214 event_inotify_data_drop(e, d, sz);
4215
4216 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4217 event_gc_inotify_data(e, d);
4218 break;
4219 }
4220
4221 case SOURCE_MEMORY_PRESSURE:
4222 r = s->memory_pressure.callback(s, s->userdata);
4223 break;
4224
4225 case SOURCE_WATCHDOG:
4226 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
4227 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
4228 assert_not_reached();
4229 }
4230
4231 s->dispatching = false;
4232
4233 finish:
4234 if (r < 0) {
4235 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4236 strna(s->description),
4237 event_source_type_to_string(saved_type),
4238 s->exit_on_failure ? "exiting" : "disabling");
4239
4240 if (s->exit_on_failure)
4241 (void) sd_event_exit(saved_event, r);
4242 }
4243
4244 if (s->n_ref == 0)
4245 source_free(s);
4246 else if (r < 0)
4247 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4248
4249 return 1;
4250 }
4251
4252 static int event_prepare(sd_event *e) {
4253 int r;
4254
4255 assert(e);
4256
4257 for (;;) {
4258 sd_event_source *s;
4259
4260 s = prioq_peek(e->prepare);
4261 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
4262 break;
4263
4264 s->prepare_iteration = e->iteration;
4265 prioq_reshuffle(e->prepare, s, &s->prepare_index);
4266
4267 assert(s->prepare);
4268 s->dispatching = true;
4269 r = s->prepare(s, s->userdata);
4270 s->dispatching = false;
4271
4272 if (r < 0) {
4273 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4274 strna(s->description),
4275 event_source_type_to_string(s->type),
4276 s->exit_on_failure ? "exiting" : "disabling");
4277
4278 if (s->exit_on_failure)
4279 (void) sd_event_exit(e, r);
4280 }
4281
4282 if (s->n_ref == 0)
4283 source_free(s);
4284 else if (r < 0)
4285 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4286 }
4287
4288 return 0;
4289 }
4290
4291 static int dispatch_exit(sd_event *e) {
4292 sd_event_source *p;
4293 int r;
4294
4295 assert(e);
4296
4297 p = prioq_peek(e->exit);
4298 assert(!p || p->type == SOURCE_EXIT);
4299
4300 if (!p || event_source_is_offline(p)) {
4301 e->state = SD_EVENT_FINISHED;
4302 return 0;
4303 }
4304
4305 PROTECT_EVENT(e);
4306 e->iteration++;
4307 e->state = SD_EVENT_EXITING;
4308 r = source_dispatch(p);
4309 e->state = SD_EVENT_INITIAL;
4310 return r;
4311 }
4312
4313 static sd_event_source* event_next_pending(sd_event *e) {
4314 sd_event_source *p;
4315
4316 assert(e);
4317
4318 p = prioq_peek(e->pending);
4319 if (!p)
4320 return NULL;
4321
4322 if (event_source_is_offline(p))
4323 return NULL;
4324
4325 return p;
4326 }
4327
4328 static int arm_watchdog(sd_event *e) {
4329 struct itimerspec its = {};
4330 usec_t t;
4331
4332 assert(e);
4333 assert(e->watchdog_fd >= 0);
4334
4335 t = sleep_between(e,
4336 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4337 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
4338
4339 timespec_store(&its.it_value, t);
4340
4341 /* Make sure we never set the watchdog to 0, which tells the
4342 * kernel to disable it. */
4343 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4344 its.it_value.tv_nsec = 1;
4345
4346 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
4347 }
4348
4349 static int process_watchdog(sd_event *e) {
4350 assert(e);
4351
4352 if (!e->watchdog)
4353 return 0;
4354
4355 /* Don't notify watchdog too often */
4356 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4357 return 0;
4358
4359 sd_notify(false, "WATCHDOG=1");
4360 e->watchdog_last = e->timestamp.monotonic;
4361
4362 return arm_watchdog(e);
4363 }
4364
4365 static void event_close_inode_data_fds(sd_event *e) {
4366 struct inode_data *d;
4367
4368 assert(e);
4369
4370 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4371 * filesystems. But we can't close them right away as we need them as long as the user still wants to make
4372 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
4373 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4374 * compromise. */
4375
4376 while ((d = e->inode_data_to_close_list)) {
4377 assert(d->fd >= 0);
4378 d->fd = safe_close(d->fd);
4379
4380 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
4381 }
4382 }
4383
4384 static int event_memory_pressure_write_list(sd_event *e) {
4385 int r;
4386
4387 assert(e);
4388
4389 for (;;) {
4390 sd_event_source *s;
4391
4392 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4393 if (!s)
4394 break;
4395
4396 assert(s->type == SOURCE_MEMORY_PRESSURE);
4397 assert(s->memory_pressure.write_buffer_size > 0);
4398 s->memory_pressure.in_write_list = false;
4399
4400 r = source_memory_pressure_write(s);
4401 if (r < 0)
4402 return r;
4403 }
4404
4405 return 0;
4406 }
4407
4408 _public_ int sd_event_prepare(sd_event *e) {
4409 int r;
4410
4411 assert_return(e, -EINVAL);
4412 assert_return(e = event_resolve(e), -ENOPKG);
4413 assert_return(!event_pid_changed(e), -ECHILD);
4414 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4415 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4416
4417 /* Let's check that, if we are a default event loop, we are executed in the correct thread. We only
4418 * do this check here once, since gettid() is typically not cached, and we thus want to minimize
4419 * syscalls. */
4420 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4421
4422 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4423 PROTECT_EVENT(e);
4424
4425 if (e->exit_requested)
4426 goto pending;
4427
4428 e->iteration++;
4429
4430 e->state = SD_EVENT_PREPARING;
4431 r = event_prepare(e);
4432 e->state = SD_EVENT_INITIAL;
4433 if (r < 0)
4434 return r;
4435
4436 r = event_memory_pressure_write_list(e);
4437 if (r < 0)
4438 return r;
4439
4440 r = event_arm_timer(e, &e->realtime);
4441 if (r < 0)
4442 return r;
4443
4444 r = event_arm_timer(e, &e->boottime);
4445 if (r < 0)
4446 return r;
4447
4448 r = event_arm_timer(e, &e->monotonic);
4449 if (r < 0)
4450 return r;
4451
4452 r = event_arm_timer(e, &e->realtime_alarm);
4453 if (r < 0)
4454 return r;
4455
4456 r = event_arm_timer(e, &e->boottime_alarm);
4457 if (r < 0)
4458 return r;
4459
4460 event_close_inode_data_fds(e);
4461
4462 if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
4463 goto pending;
4464
4465 e->state = SD_EVENT_ARMED;
4466
4467 return 0;
4468
4469 pending:
4470 e->state = SD_EVENT_ARMED;
4471 r = sd_event_wait(e, 0);
4472 if (r == 0)
4473 e->state = SD_EVENT_ARMED;
4474
4475 return r;
4476 }
4477
4478 static int epoll_wait_usec(
4479 int fd,
4480 struct epoll_event *events,
4481 int maxevents,
4482 usec_t timeout) {
4483
4484 int msec;
4485 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4486
4487 #if HAVE_EPOLL_PWAIT2
4488 static bool epoll_pwait2_absent = false;
4489 int r;
4490
4491 /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4492 * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4493 * is not that obvious to implement given the libc and kernel definitions differ in the last
4494 * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
4495 * biggie); let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
4496 * missing. */
4497
4498 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
4499 r = epoll_pwait2(fd,
4500 events,
4501 maxevents,
4502 TIMESPEC_STORE(timeout),
4503 NULL);
4504 if (r >= 0)
4505 return r;
4506 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
4507 return -errno; /* Only fall back to the old epoll_wait() if the syscall is masked or not
4508 * supported. */
4509
4510 epoll_pwait2_absent = true;
4511 }
4512 #endif
4513
4514 if (timeout == USEC_INFINITY)
4515 msec = -1;
4516 else {
4517 usec_t k;
4518
4519 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4520 if (k >= INT_MAX)
4521 msec = INT_MAX; /* Saturate */
4522 else
4523 msec = (int) k;
4524 }
4525
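        /* For example, a timeout of 2500 usec rounds up to 3 msec here: rounding up ensures we never wake
         * up earlier than requested, and anything at or above INT_MAX msec saturates instead of
         * overflowing the int that epoll_wait() expects. */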
4526 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
4527 }
4528
4529 static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
4530 size_t n_event_queue, m, n_event_max;
4531 int64_t min_priority = threshold;
4532 bool something_new = false;
4533 int r;
4534
4535 assert(e);
4536 assert(ret_min_priority);
4537
4538 n_event_queue = MAX(e->n_sources, 1u);
4539 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
4540 return -ENOMEM;
4541
4542 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4543
4544 /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
4545 if (e->buffered_inotify_data_list)
4546 timeout = 0;
4547
4548 for (;;) {
4549 r = epoll_wait_usec(
4550 e->epoll_fd,
4551 e->event_queue,
4552 n_event_max,
4553 timeout);
4554 if (r < 0)
4555 return r;
4556
4557 m = (size_t) r;
4558
4559 if (m < n_event_max)
4560 break;
4561
4562 if (n_event_max >= n_event_queue * 10)
4563 break;
4564
4565 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
4566 return -ENOMEM;
4567
4568 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4569 timeout = 0;
4570 }
4571
4572 /* Set the timestamp only when this is called for the first time. */
4573 if (threshold == INT64_MAX)
4574 triple_timestamp_get(&e->timestamp);
4575
4576 for (size_t i = 0; i < m; i++) {
4577
4578 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4579 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
4580 else {
4581 WakeupType *t = e->event_queue[i].data.ptr;
4582
4583 switch (*t) {
4584
4585 case WAKEUP_EVENT_SOURCE: {
4586 sd_event_source *s = e->event_queue[i].data.ptr;
4587
4588 assert(s);
4589
4590 if (s->priority > threshold)
4591 continue;
4592
4593 min_priority = MIN(min_priority, s->priority);
4594
4595 switch (s->type) {
4596
4597 case SOURCE_IO:
4598 r = process_io(e, s, e->event_queue[i].events);
4599 break;
4600
4601 case SOURCE_CHILD:
4602 r = process_pidfd(e, s, e->event_queue[i].events);
4603 break;
4604
4605 case SOURCE_MEMORY_PRESSURE:
4606 r = process_memory_pressure(s, e->event_queue[i].events);
4607 break;
4608
4609 default:
4610 assert_not_reached();
4611 }
4612
4613 break;
4614 }
4615
4616 case WAKEUP_CLOCK_DATA: {
4617 struct clock_data *d = e->event_queue[i].data.ptr;
4618
4619 assert(d);
4620
4621 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4622 break;
4623 }
4624
4625 case WAKEUP_SIGNAL_DATA:
4626 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4627 break;
4628
4629 case WAKEUP_INOTIFY_DATA:
4630 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4631 break;
4632
4633 default:
4634 assert_not_reached();
4635 }
4636 }
4637 if (r < 0)
4638 return r;
4639 if (r > 0)
4640 something_new = true;
4641 }
4642
4643 *ret_min_priority = min_priority;
4644 return something_new;
4645 }
4646
4647 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4648 int r;
4649
4650 assert_return(e, -EINVAL);
4651 assert_return(e = event_resolve(e), -ENOPKG);
4652 assert_return(!event_pid_changed(e), -ECHILD);
4653 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4654 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4655
4656 if (e->exit_requested) {
4657 e->state = SD_EVENT_PENDING;
4658 return 1;
4659 }
4660
4661 for (int64_t threshold = INT64_MAX; ; threshold--) {
4662 int64_t epoll_min_priority, child_min_priority;
4663
4664 /* New epoll (especially IO) and child events may be triggered just after the
4665 * process_epoll() call but before process_child(), and the new IO events may have a
4666 * higher priority than the child events. To salvage these events, let's call
4667 * epoll_wait() again, but accept only events with a higher priority than the
4668 * previous ones. See issue https://github.com/systemd/systemd/issues/18190 and comments
4669 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4670 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4671
4672 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4673 if (r == -EINTR) {
4674 e->state = SD_EVENT_PENDING;
4675 return 1;
4676 }
4677 if (r < 0)
4678 goto finish;
4679 if (r == 0 && threshold < INT64_MAX)
4680 /* No new epoll event. */
4681 break;
4682
4683 r = process_child(e, threshold, &child_min_priority);
4684 if (r < 0)
4685 goto finish;
4686 if (r == 0)
4687 /* No new child event. */
4688 break;
4689
4690 threshold = MIN(epoll_min_priority, child_min_priority);
4691 if (threshold == INT64_MIN)
4692 break;
4693
4694 timeout = 0;
4695 }
4696
4697 r = process_watchdog(e);
4698 if (r < 0)
4699 goto finish;
4700
4701 r = process_inotify(e);
4702 if (r < 0)
4703 goto finish;
4704
4705 r = process_timer(e, e->timestamp.realtime, &e->realtime);
4706 if (r < 0)
4707 goto finish;
4708
4709 r = process_timer(e, e->timestamp.boottime, &e->boottime);
4710 if (r < 0)
4711 goto finish;
4712
4713 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4714 if (r < 0)
4715 goto finish;
4716
4717 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4718 if (r < 0)
4719 goto finish;
4720
4721 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4722 if (r < 0)
4723 goto finish;
4724 else if (r == 1) {
4725 /* A ratelimit expiry callback was called. Let's postpone processing pending sources and
4726 * put the loop into the initial state, in order to also evaluate (in the next iteration)
4727 * sources that were potentially re-enabled by the callback.
4728 *
4729 * Wondering why we treat only this invocation of process_timer() differently? Once an event
4730 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4731 * ratelimit expiry callback is never called for any other timer type. */
4732 r = 0;
4733 goto finish;
4734 }
4735
4736 if (event_next_pending(e)) {
4737 e->state = SD_EVENT_PENDING;
4738 return 1;
4739 }
4740
4741 r = 0;
4742
4743 finish:
4744 e->state = SD_EVENT_INITIAL;
4745
4746 return r;
4747 }
4748
4749 _public_ int sd_event_dispatch(sd_event *e) {
4750 sd_event_source *p;
4751 int r;
4752
4753 assert_return(e, -EINVAL);
4754 assert_return(e = event_resolve(e), -ENOPKG);
4755 assert_return(!event_pid_changed(e), -ECHILD);
4756 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4757 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4758
4759 if (e->exit_requested)
4760 return dispatch_exit(e);
4761
4762 p = event_next_pending(e);
4763 if (p) {
4764 PROTECT_EVENT(e);
4765
4766 e->state = SD_EVENT_RUNNING;
4767 r = source_dispatch(p);
4768 e->state = SD_EVENT_INITIAL;
4769 return r;
4770 }
4771
4772 e->state = SD_EVENT_INITIAL;
4773
4774 return 1;
4775 }
4776
4777 static void event_log_delays(sd_event *e) {
4778 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4779 size_t l, i;
4780
4781 p = b;
4782 l = sizeof(b);
4783 for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4784 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4785 e->delays[i] = 0;
4786 }
4787 log_debug("Event loop iterations: %s", b);
4788 }
4789
4790 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4791 int r;
4792
4793 assert_return(e, -EINVAL);
4794 assert_return(e = event_resolve(e), -ENOPKG);
4795 assert_return(!event_pid_changed(e), -ECHILD);
4796 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4797 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4798
4799 if (e->profile_delays && e->last_run_usec != 0) {
4800 usec_t this_run;
4801 unsigned l;
4802
4803 this_run = now(CLOCK_MONOTONIC);
4804
4805 l = log2u64(this_run - e->last_run_usec);
4806 assert(l < ELEMENTSOF(e->delays));
4807 e->delays[l]++;
4808
4809 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4810 event_log_delays(e);
4811 e->last_log_usec = this_run;
4812 }
4813 }
4814
4815 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4816 PROTECT_EVENT(e);
4817
4818 r = sd_event_prepare(e);
4819 if (r == 0)
4820 /* There was nothing? Then wait... */
4821 r = sd_event_wait(e, timeout);
4822
4823 if (e->profile_delays)
4824 e->last_run_usec = now(CLOCK_MONOTONIC);
4825
4826 if (r > 0) {
4827 /* There's something now, then let's dispatch it */
4828 r = sd_event_dispatch(e);
4829 if (r < 0)
4830 return r;
4831
4832 return 1;
4833 }
4834
4835 return r;
4836 }
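
/* Illustrative sketch, kept out of the build: sd_event_run() above is equivalent to driving the loop by
 * hand with sd_event_prepare(), sd_event_wait() and sd_event_dispatch(). The helper name run_manually()
 * is made up for the example. */
#if 0
static int run_manually(sd_event *e, uint64_t timeout) {
        int r;

        r = sd_event_prepare(e);               /* > 0: something is already pending, no need to wait */
        if (r == 0)
                r = sd_event_wait(e, timeout); /* 0 on timeout, > 0 if an event source became pending */
        if (r <= 0)
                return r;

        return sd_event_dispatch(e);           /* run the callback of the highest-priority pending source */
}
#endif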
4837
4838 _public_ int sd_event_loop(sd_event *e) {
4839 int r;
4840
4841 assert_return(e, -EINVAL);
4842 assert_return(e = event_resolve(e), -ENOPKG);
4843 assert_return(!event_pid_changed(e), -ECHILD);
4844 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4845
4846 PROTECT_EVENT(e);
4847
4848 while (e->state != SD_EVENT_FINISHED) {
4849 r = sd_event_run(e, UINT64_MAX);
4850 if (r < 0)
4851 return r;
4852 }
4853
4854 return e->exit_code;
4855 }
4856
4857 _public_ int sd_event_get_fd(sd_event *e) {
4858 assert_return(e, -EINVAL);
4859 assert_return(e = event_resolve(e), -ENOPKG);
4860 assert_return(!event_pid_changed(e), -ECHILD);
4861
4862 return e->epoll_fd;
4863 }
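
/* Illustrative sketch, kept out of the build: the fd returned by sd_event_get_fd() can be plugged into a
 * foreign poll()/epoll main loop; whenever it polls readable, dispatch without blocking by running the
 * loop with a zero timeout. Error handling is trimmed and the function name is made up. */
#if 0
#include <errno.h>
#include <poll.h>

static int drive_from_foreign_loop(sd_event *e) {
        struct pollfd pfd = {
                .fd = sd_event_get_fd(e),
                .events = POLLIN,
        };
        int code = 0, r;

        for (;;) {
                if (poll(&pfd, 1, -1) < 0)      /* block in the foreign loop, not in sd-event */
                        return -errno;

                r = sd_event_run(e, 0);         /* dispatch whatever is ready, but don't block */
                if (r < 0)
                        return r;

                if (sd_event_get_state(e) == SD_EVENT_FINISHED)
                        break;
        }

        (void) sd_event_get_exit_code(e, &code);
        return code;
}
#endif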
4864
4865 _public_ int sd_event_get_state(sd_event *e) {
4866 assert_return(e, -EINVAL);
4867 assert_return(e = event_resolve(e), -ENOPKG);
4868 assert_return(!event_pid_changed(e), -ECHILD);
4869
4870 return e->state;
4871 }
4872
4873 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4874 assert_return(e, -EINVAL);
4875 assert_return(e = event_resolve(e), -ENOPKG);
4876 assert_return(code, -EINVAL);
4877 assert_return(!event_pid_changed(e), -ECHILD);
4878
4879 if (!e->exit_requested)
4880 return -ENODATA;
4881
4882 *code = e->exit_code;
4883 return 0;
4884 }
4885
4886 _public_ int sd_event_exit(sd_event *e, int code) {
4887 assert_return(e, -EINVAL);
4888 assert_return(e = event_resolve(e), -ENOPKG);
4889 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4890 assert_return(!event_pid_changed(e), -ECHILD);
4891
4892 e->exit_requested = true;
4893 e->exit_code = code;
4894
4895 return 0;
4896 }
4897
4898 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4899 assert_return(e, -EINVAL);
4900 assert_return(e = event_resolve(e), -ENOPKG);
4901 assert_return(usec, -EINVAL);
4902 assert_return(!event_pid_changed(e), -ECHILD);
4903
4904 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4905 return -EOPNOTSUPP;
4906
4907 if (!triple_timestamp_is_set(&e->timestamp)) {
4908 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4909 *usec = now(clock);
4910 return 1;
4911 }
4912
4913 *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4914 return 0;
4915 }
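
/* Illustrative sketch, kept out of the build: the usual pattern for a relative timer is to take the
 * cached loop time from sd_event_now() and add an offset when calling sd_event_add_time(). The callback
 * name on_timer() and the five second delay are made up; an accuracy of 0 selects the default. */
#if 0
static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
        /* ... */
        return 0;
}

static int arm_relative_timer(sd_event *e) {
        uint64_t now_usec;
        int r;

        r = sd_event_now(e, CLOCK_MONOTONIC, &now_usec);
        if (r < 0)
                return r;

        return sd_event_add_time(e,
                                 NULL,                         /* no return pointer: the loop owns the source */
                                 CLOCK_MONOTONIC,
                                 now_usec + 5 * USEC_PER_SEC,  /* fire in roughly five seconds */
                                 0,                            /* default accuracy */
                                 on_timer, NULL);
}
#endif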
4916
4917 _public_ int sd_event_default(sd_event **ret) {
4918 sd_event *e = NULL;
4919 int r;
4920
4921 if (!ret)
4922 return !!default_event;
4923
4924 if (default_event) {
4925 *ret = sd_event_ref(default_event);
4926 return 0;
4927 }
4928
4929 r = sd_event_new(&e);
4930 if (r < 0)
4931 return r;
4932
4933 e->default_event_ptr = &default_event;
4934 e->tid = gettid();
4935 default_event = e;
4936
4937 *ret = e;
4938 return 1;
4939 }
4940
4941 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
4942 assert_return(e, -EINVAL);
4943 assert_return(e = event_resolve(e), -ENOPKG);
4944 assert_return(tid, -EINVAL);
4945 assert_return(!event_pid_changed(e), -ECHILD);
4946
4947 if (e->tid != 0) {
4948 *tid = e->tid;
4949 return 0;
4950 }
4951
4952 return -ENXIO;
4953 }
4954
4955 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
4956 int r;
4957
4958 assert_return(e, -EINVAL);
4959 assert_return(e = event_resolve(e), -ENOPKG);
4960 assert_return(!event_pid_changed(e), -ECHILD);
4961
4962 if (e->watchdog == !!b)
4963 return e->watchdog;
4964
4965 if (b) {
4966 r = sd_watchdog_enabled(false, &e->watchdog_period);
4967 if (r <= 0)
4968 return r;
4969
4970 /* Issue first ping immediately */
4971 sd_notify(false, "WATCHDOG=1");
4972 e->watchdog_last = now(CLOCK_MONOTONIC);
4973
4974 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
4975 if (e->watchdog_fd < 0)
4976 return -errno;
4977
4978 r = arm_watchdog(e);
4979 if (r < 0)
4980 goto fail;
4981
4982 struct epoll_event ev = {
4983 .events = EPOLLIN,
4984 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
4985 };
4986
4987 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
4988 r = -errno;
4989 goto fail;
4990 }
4991
4992 } else {
4993 if (e->watchdog_fd >= 0) {
4994 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
4995 e->watchdog_fd = safe_close(e->watchdog_fd);
4996 }
4997 }
4998
4999 e->watchdog = !!b;
5000 return e->watchdog;
5001
5002 fail:
5003 e->watchdog_fd = safe_close(e->watchdog_fd);
5004 return r;
5005 }
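
/* Illustrative sketch, kept out of the build: with WatchdogSec= set in the unit file a single call is
 * enough to let the loop send WATCHDOG=1 keep-alives on its own, as implemented above. A return value of
 * 0 merely means the service manager didn't ask for watchdog notifications. */
#if 0
static int enable_watchdog_keepalive(sd_event *e) {
        int r;

        r = sd_event_set_watchdog(e, true);
        if (r < 0)
                return r;
        if (r == 0)
                log_debug("Watchdog keep-alives not requested by the service manager, continuing without.");

        return 0;
}
#endif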
5006
5007 _public_ int sd_event_get_watchdog(sd_event *e) {
5008 assert_return(e, -EINVAL);
5009 assert_return(e = event_resolve(e), -ENOPKG);
5010 assert_return(!event_pid_changed(e), -ECHILD);
5011
5012 return e->watchdog;
5013 }
5014
5015 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5016 assert_return(e, -EINVAL);
5017 assert_return(e = event_resolve(e), -ENOPKG);
5018 assert_return(!event_pid_changed(e), -ECHILD);
5019
5020 *ret = e->iteration;
5021 return 0;
5022 }
5023
5024 _public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5025 assert_return(s, -EINVAL);
5026
5027 s->destroy_callback = callback;
5028 return 0;
5029 }
5030
5031 _public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5032 assert_return(s, -EINVAL);
5033
5034 if (ret)
5035 *ret = s->destroy_callback;
5036
5037 return !!s->destroy_callback;
5038 }
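
/* Illustrative sketch, kept out of the build: a destroy callback is the usual way to tie a userdata
 * object's lifetime to the event source, so it is freed exactly once when the source goes away. The
 * context type and names are made up. */
#if 0
struct my_context {
        char *name;
};

static void my_context_destroy(void *userdata) {
        struct my_context *c = userdata;

        if (!c)
                return;

        free(c->name);
        free(c);
}

/* After creating a source 's' with a struct my_context as userdata:
 *
 *     r = sd_event_source_set_destroy_callback(s, my_context_destroy);
 */
#endif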
5039
5040 _public_ int sd_event_source_get_floating(sd_event_source *s) {
5041 assert_return(s, -EINVAL);
5042
5043 return s->floating;
5044 }
5045
5046 _public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5047 assert_return(s, -EINVAL);
5048
5049 if (s->floating == !!b)
5050 return 0;
5051
5052 if (!s->event) /* Already disconnected */
5053 return -ESTALE;
5054
5055 s->floating = b;
5056
5057 if (b) {
5058 sd_event_source_ref(s);
5059 sd_event_unref(s->event);
5060 } else {
5061 sd_event_ref(s->event);
5062 sd_event_source_unref(s);
5063 }
5064
5065 return 1;
5066 }
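
/* Illustrative sketch, kept out of the build: marking a source floating hands its lifetime over to the
 * event loop, so the caller can drop its own reference right away, mirroring the ref/unref dance above.
 * Passing a NULL return pointer to the sd_event_add_xyz() constructors has the same effect. The helper
 * name is made up. */
#if 0
static int add_fire_and_forget_defer(sd_event *e, sd_event_handler_t callback, void *userdata) {
        sd_event_source *s = NULL;
        int r;

        r = sd_event_add_defer(e, &s, callback, userdata);
        if (r < 0)
                return r;

        r = sd_event_source_set_floating(s, true);   /* the loop now keeps the source alive */
        sd_event_source_unref(s);                    /* drop our own reference */

        return r;
}
#endif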
5067
5068 _public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5069 assert_return(s, -EINVAL);
5070 assert_return(s->type != SOURCE_EXIT, -EDOM);
5071
5072 return s->exit_on_failure;
5073 }
5074
5075 _public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5076 assert_return(s, -EINVAL);
5077 assert_return(s->type != SOURCE_EXIT, -EDOM);
5078
5079 if (s->exit_on_failure == !!b)
5080 return 0;
5081
5082 s->exit_on_failure = b;
5083 return 1;
5084 }
5085
5086 _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5087 int r;
5088
5089 assert_return(s, -EINVAL);
5090
5091 /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
5092 * so is a programming error. */
5093 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5094
5095 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5096 * non-ratelimited. */
5097 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5098 if (r < 0)
5099 return r;
5100
5101 s->rate_limit = (RateLimit) { interval, burst };
5102 return 0;
5103 }
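
/* Illustrative sketch, kept out of the build: rate limiting caps how often a source may be dispatched,
 * here to at most 10 dispatches per 1s interval for a made-up IO source; once the interval elapses the
 * source is brought back online automatically. */
#if 0
static int add_ratelimited_io(sd_event *e, sd_event_source **ret, int fd,
                              sd_event_io_handler_t callback, void *userdata) {
        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
        int r;

        r = sd_event_add_io(e, &s, fd, EPOLLIN, callback, userdata);
        if (r < 0)
                return r;

        /* At most 10 dispatches per second; excess wake-ups are paused until the interval is over. */
        r = sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10);
        if (r < 0)
                return r;

        *ret = TAKE_PTR(s);
        return 0;
}
#endif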
5104
5105 _public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5106 assert_return(s, -EINVAL);
5107
5108 s->ratelimit_expire_callback = callback;
5109 return 0;
5110 }
5111
5112 _public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5113 assert_return(s, -EINVAL);
5114
5115 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5116 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
5117 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5118 return -EDOM;
5119
5120 if (!ratelimit_configured(&s->rate_limit))
5121 return -ENOEXEC;
5122
5123 if (ret_interval)
5124 *ret_interval = s->rate_limit.interval;
5125 if (ret_burst)
5126 *ret_burst = s->rate_limit.burst;
5127
5128 return 0;
5129 }
5130
5131 _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5132 assert_return(s, -EINVAL);
5133
5134 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5135 return false;
5136
5137 if (!ratelimit_configured(&s->rate_limit))
5138 return false;
5139
5140 return s->ratelimited;
5141 }
5142
5143 _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5144 bool change = false;
5145 int r;
5146
5147 assert_return(e, -EINVAL);
5148
5149 if (b) {
5150 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5151 * to. But we also don't want them to pin the event loop itself. Hence we mark them as
5152 * floating after creation (and undo this before deleting them again). */
5153
5154 if (!e->sigint_event_source) {
5155 r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5156 if (r < 0)
5157 return r;
5158
5159 assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5160 change = true;
5161 }
5162
5163 if (!e->sigterm_event_source) {
5164 r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5165 if (r < 0) {
5166 if (change) {
5167 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5168 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5169 }
5170
5171 return r;
5172 }
5173
5174 assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5175 change = true;
5176 }
5177
5178 } else {
5179 if (e->sigint_event_source) {
5180 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5181 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5182 change = true;
5183 }
5184
5185 if (e->sigterm_event_source) {
5186 assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5187 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5188 change = true;
5189 }
5190 }
5191
5192 return change;
5193 }
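
/* Illustrative sketch, kept out of the build: sd_event_set_signal_exit() is the one-call way to make
 * SIGINT/SIGTERM terminate the loop cleanly; together with sd_event_default() and sd_event_loop() this
 * is the skeleton of a small daemon. Source setup is elided. */
#if 0
static int run_small_daemon(void) {
        _cleanup_(sd_event_unrefp) sd_event *e = NULL;
        int r;

        r = sd_event_default(&e);
        if (r < 0)
                return r;

        r = sd_event_set_signal_exit(e, true);   /* also blocks the signals, via SD_EVENT_SIGNAL_PROCMASK */
        if (r < 0)
                return r;

        /* ... add IO/timer/child sources here ... */

        return sd_event_loop(e);                 /* returns the code passed to sd_event_exit() */
}
#endif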
5194
5195 _public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5196 _cleanup_free_ char *b = NULL;
5197 _cleanup_free_ void *w = NULL;
5198
5199 assert_return(s, -EINVAL);
5200 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5201 assert_return(ty, -EINVAL);
5202
5203 if (!STR_IN_SET(ty, "some", "full"))
5204 return -EINVAL;
5205
5206 if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5207 return -EBUSY;
5208
5209 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5210 if (!space)
5211 return -EINVAL;
5212
5213 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5214 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5215 if (!b)
5216 return -ENOMEM;
5217 if (!STR_IN_SET(b, "some", "full"))
5218 return -EINVAL;
5219
5220 if (streq(b, ty))
5221 return 0;
5222
5223 size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5224 w = new(char, nl);
5225 if (!w)
5226 return -ENOMEM;
5227
5228 memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5229
5230 free_and_replace(s->memory_pressure.write_buffer, w);
5231 s->memory_pressure.write_buffer_size = nl;
5232 s->memory_pressure.locked = false;
5233
5234 return 1;
5235 }
5236
5237 _public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5238 _cleanup_free_ char *b = NULL;
5239 _cleanup_free_ void *w = NULL;
5240
5241 assert_return(s, -EINVAL);
5242 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5243
5244 if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5245 return -ERANGE;
5246 if (window_usec <= 0 || window_usec >= UINT64_MAX)
5247 return -ERANGE;
5248 if (threshold_usec > window_usec)
5249 return -EINVAL;
5250
5251 if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5252 return -EBUSY;
5253
5254 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5255 if (!space)
5256 return -EINVAL;
5257
5258 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5259 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5260 if (!b)
5261 return -ENOMEM;
5262 if (!STR_IN_SET(b, "some", "full"))
5263 return -EINVAL;
5264
5265 if (asprintf((char**) &w,
5266 "%s " USEC_FMT " " USEC_FMT "",
5267 b,
5268 threshold_usec,
5269 window_usec) < 0)
5270 return -ENOMEM;
5271
5272 l = strlen(w) + 1;
5273 if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5274 return 0;
5275
5276 free_and_replace(s->memory_pressure.write_buffer, w);
5277 s->memory_pressure.write_buffer_size = l;
5278 s->memory_pressure.locked = false;
5279
5280 return 1;
5281 }
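
/* Illustrative sketch, kept out of the build: a memory pressure source is typically configured right
 * after creation, before the write buffer is locked by the first loop iteration; here a made-up callback
 * is woken once "some" stall time exceeds 70ms within a 1s window. */
#if 0
static int on_memory_pressure(sd_event_source *s, void *userdata) {
        /* ... release caches, trim buffers, ... */
        return 0;
}

static int add_pressure_source(sd_event *e) {
        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
        int r;

        r = sd_event_add_memory_pressure(e, &s, on_memory_pressure, NULL);
        if (r < 0)
                return r;   /* e.g. PSI not available on this system */

        r = sd_event_source_set_memory_pressure_type(s, "some");
        if (r < 0)
                return r;

        r = sd_event_source_set_memory_pressure_period(s,
                                                       70 * USEC_PER_MSEC,  /* threshold */
                                                       1 * USEC_PER_SEC);   /* window */
        if (r < 0)
                return r;

        r = sd_event_source_set_floating(s, true);   /* let the loop own the source */
        if (r < 0)
                return r;

        return 0;
}
#endif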