[thirdparty/systemd.git] src/libsystemd/sd-event/sd-event.c (blob @ 338609b186905d81519c382e5070e793377f67b8)
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/epoll.h>
4 #include <sys/timerfd.h>
5 #include <sys/wait.h>
6
7 #include "sd-daemon.h"
8 #include "sd-event.h"
9 #include "sd-id128.h"
10 #include "sd-messages.h"
11
12 #include "alloc-util.h"
13 #include "env-util.h"
14 #include "event-source.h"
15 #include "fd-util.h"
16 #include "fs-util.h"
17 #include "glyph-util.h"
18 #include "hashmap.h"
19 #include "hexdecoct.h"
20 #include "list.h"
21 #include "logarithm.h"
22 #include "macro.h"
23 #include "mallinfo-util.h"
24 #include "memory-util.h"
25 #include "missing_magic.h"
26 #include "missing_syscall.h"
27 #include "missing_threads.h"
28 #include "origin-id.h"
29 #include "path-util.h"
30 #include "prioq.h"
31 #include "process-util.h"
32 #include "psi-util.h"
33 #include "set.h"
34 #include "signal-util.h"
35 #include "socket-util.h"
36 #include "stat-util.h"
37 #include "string-table.h"
38 #include "string-util.h"
39 #include "strxcpyx.h"
40 #include "time-util.h"
41
42 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
43
44 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
45 /* Returns true if this is a PID event source that can be implemented by watching EPOLLIN on its pidfd */
46 return s &&
47 s->type == SOURCE_CHILD &&
48 s->child.pidfd >= 0 &&
49 s->child.options == WEXITED;
50 }
51
52 static bool event_source_is_online(sd_event_source *s) {
53 assert(s);
54 return s->enabled != SD_EVENT_OFF && !s->ratelimited;
55 }
56
57 static bool event_source_is_offline(sd_event_source *s) {
58 assert(s);
59 return s->enabled == SD_EVENT_OFF || s->ratelimited;
60 }
61
62 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
63 [SOURCE_IO] = "io",
64 [SOURCE_TIME_REALTIME] = "realtime",
65 [SOURCE_TIME_BOOTTIME] = "boottime",
66 [SOURCE_TIME_MONOTONIC] = "monotonic",
67 [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
68 [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
69 [SOURCE_SIGNAL] = "signal",
70 [SOURCE_CHILD] = "child",
71 [SOURCE_DEFER] = "defer",
72 [SOURCE_POST] = "post",
73 [SOURCE_EXIT] = "exit",
74 [SOURCE_WATCHDOG] = "watchdog",
75 [SOURCE_INOTIFY] = "inotify",
76 [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
77 };
78
79 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
80
81 #define EVENT_SOURCE_IS_TIME(t) \
82 IN_SET((t), \
83 SOURCE_TIME_REALTIME, \
84 SOURCE_TIME_BOOTTIME, \
85 SOURCE_TIME_MONOTONIC, \
86 SOURCE_TIME_REALTIME_ALARM, \
87 SOURCE_TIME_BOOTTIME_ALARM)
88
89 #define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
90 IN_SET((t), \
91 SOURCE_IO, \
92 SOURCE_TIME_REALTIME, \
93 SOURCE_TIME_BOOTTIME, \
94 SOURCE_TIME_MONOTONIC, \
95 SOURCE_TIME_REALTIME_ALARM, \
96 SOURCE_TIME_BOOTTIME_ALARM, \
97 SOURCE_SIGNAL, \
98 SOURCE_DEFER, \
99 SOURCE_INOTIFY, \
100 SOURCE_MEMORY_PRESSURE)
101
102 /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
103 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
104 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
105 #define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
106
107 struct sd_event {
108 unsigned n_ref;
109
110 int epoll_fd;
111 int watchdog_fd;
112
113 Prioq *pending;
114 Prioq *prepare;
115
116 /* timerfd_create() only supports these five clocks so far. We
117 * can add support for more clocks when the kernel learns to
118 * deal with them, too. */
119 struct clock_data realtime;
120 struct clock_data boottime;
121 struct clock_data monotonic;
122 struct clock_data realtime_alarm;
123 struct clock_data boottime_alarm;
124
125 usec_t perturb;
126
127 sd_event_source **signal_sources; /* indexed by signal number */
128 Hashmap *signal_data; /* indexed by priority */
129
130 Hashmap *child_sources;
131 unsigned n_online_child_sources;
132
133 Set *post_sources;
134
135 Prioq *exit;
136
137 Hashmap *inotify_data; /* indexed by priority */
138
139 /* A list of inode structures that still have an fd open, which we need to close before the next loop iteration */
140 LIST_HEAD(struct inode_data, inode_data_to_close_list);
141
142 /* A list of inotify objects that already have events buffered which aren't processed yet */
143 LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
144
145 /* A list of memory pressure event sources that still need their subscription string written */
146 LIST_HEAD(sd_event_source, memory_pressure_write_list);
147
148 uint64_t origin_id;
149
150 uint64_t iteration;
151 triple_timestamp timestamp;
152 int state;
153
154 bool exit_requested:1;
155 bool need_process_child:1;
156 bool watchdog:1;
157 bool profile_delays:1;
158
159 int exit_code;
160
161 pid_t tid;
162 sd_event **default_event_ptr;
163
164 usec_t watchdog_last, watchdog_period;
165
166 unsigned n_sources;
167
168 struct epoll_event *event_queue;
169
170 LIST_HEAD(sd_event_source, sources);
171
172 sd_event_source *sigint_event_source, *sigterm_event_source;
173
174 usec_t last_run_usec, last_log_usec;
175 unsigned delays[sizeof(usec_t) * 8];
176 };
177
178 DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);
179
180 static thread_local sd_event *default_event = NULL;
181
182 static void source_disconnect(sd_event_source *s);
183 static void event_gc_inode_data(sd_event *e, struct inode_data *d);
184
185 static sd_event *event_resolve(sd_event *e) {
186 return e == SD_EVENT_DEFAULT ? default_event : e;
187 }
188
189 static int pending_prioq_compare(const void *a, const void *b) {
190 const sd_event_source *x = a, *y = b;
191 int r;
192
193 assert(x->pending);
194 assert(y->pending);
195
196 /* Enabled ones first */
197 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
198 if (r != 0)
199 return r;
200
201 /* Non rate-limited ones first. */
202 r = CMP(!!x->ratelimited, !!y->ratelimited);
203 if (r != 0)
204 return r;
205
206 /* Lower priority values first */
207 r = CMP(x->priority, y->priority);
208 if (r != 0)
209 return r;
210
211 /* Older entries first */
212 return CMP(x->pending_iteration, y->pending_iteration);
213 }
214
215 static int prepare_prioq_compare(const void *a, const void *b) {
216 const sd_event_source *x = a, *y = b;
217 int r;
218
219 assert(x->prepare);
220 assert(y->prepare);
221
222 /* Enabled ones first */
223 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
224 if (r != 0)
225 return r;
226
227 /* Non rate-limited ones first. */
228 r = CMP(!!x->ratelimited, !!y->ratelimited);
229 if (r != 0)
230 return r;
231
232 /* Move most recently prepared ones last, so that we can stop
233 * preparing as soon as we hit one that has already been
234 * prepared in the current iteration */
235 r = CMP(x->prepare_iteration, y->prepare_iteration);
236 if (r != 0)
237 return r;
238
239 /* Lower priority values first */
240 return CMP(x->priority, y->priority);
241 }
242
243 static usec_t time_event_source_next(const sd_event_source *s) {
244 assert(s);
245
246 /* We have two kinds of event sources that have expiration times associated with them: the actual
247 * time-based ones and the ones for which a ratelimit can be in effect (where we want to be notified
248 * once the ratelimit time window ends). Let's return the next expiration time depending on what we are
249 * looking at here. */
250
251 if (s->ratelimited) { /* If rate-limited, the next expiration is when the ratelimit time window ends */
252 assert(s->rate_limit.begin != 0);
253 assert(s->rate_limit.interval != 0);
254 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
255 }
256
257 /* Otherwise this must be a time event source, if not ratelimited */
258 if (EVENT_SOURCE_IS_TIME(s->type))
259 return s->time.next;
260
261 return USEC_INFINITY;
262 }
263
264 static usec_t time_event_source_latest(const sd_event_source *s) {
265 assert(s);
266
267 if (s->ratelimited) { /* For rate-limited event sources the earliest and the latest time shall be the
268 * same, since we should avoid stacking additional inaccuracy on top of a time
269 * window that is itself an inaccuracy */
270 assert(s->rate_limit.begin != 0);
271 assert(s->rate_limit.interval != 0);
272 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
273 }
274
275 /* Must be a time event source, if not ratelimited */
276 if (EVENT_SOURCE_IS_TIME(s->type))
277 return usec_add(s->time.next, s->time.accuracy);
278
279 return USEC_INFINITY;
280 }
281
282 static bool event_source_timer_candidate(const sd_event_source *s) {
283 assert(s);
284
285 /* Returns true for event sources that either are not pending yet (i.e. for which it's worth marking them pending)
286 * or which are currently ratelimited (i.e. for which it's worth leaving the ratelimited state) */
287 return !s->pending || s->ratelimited;
288 }
289
290 static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
291 const sd_event_source *x = a, *y = b;
292 int r;
293
294 /* Enabled ones first */
295 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
296 if (r != 0)
297 return r;
298
299 /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
300 r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
301 if (r != 0)
302 return r;
303
304 /* Order by time */
305 return CMP(time_func(x), time_func(y));
306 }
307
308 static int earliest_time_prioq_compare(const void *a, const void *b) {
309 return time_prioq_compare(a, b, time_event_source_next);
310 }
311
312 static int latest_time_prioq_compare(const void *a, const void *b) {
313 return time_prioq_compare(a, b, time_event_source_latest);
314 }
315
316 static int exit_prioq_compare(const void *a, const void *b) {
317 const sd_event_source *x = a, *y = b;
318 int r;
319
320 assert(x->type == SOURCE_EXIT);
321 assert(y->type == SOURCE_EXIT);
322
323 /* Enabled ones first */
324 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
325 if (r != 0)
326 return r;
327
328 /* Lower priority values first */
329 return CMP(x->priority, y->priority);
330 }
331
332 static void free_clock_data(struct clock_data *d) {
333 assert(d);
334 assert(d->wakeup == WAKEUP_CLOCK_DATA);
335
336 safe_close(d->fd);
337 prioq_free(d->earliest);
338 prioq_free(d->latest);
339 }
340
341 static sd_event *event_free(sd_event *e) {
342 sd_event_source *s;
343
344 assert(e);
345
346 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
347 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
348
349 while ((s = e->sources)) {
350 assert(s->floating);
351 source_disconnect(s);
352 sd_event_source_unref(s);
353 }
354
355 assert(e->n_sources == 0);
356
357 if (e->default_event_ptr)
358 *(e->default_event_ptr) = NULL;
359
360 safe_close(e->epoll_fd);
361 safe_close(e->watchdog_fd);
362
363 free_clock_data(&e->realtime);
364 free_clock_data(&e->boottime);
365 free_clock_data(&e->monotonic);
366 free_clock_data(&e->realtime_alarm);
367 free_clock_data(&e->boottime_alarm);
368
369 prioq_free(e->pending);
370 prioq_free(e->prepare);
371 prioq_free(e->exit);
372
373 free(e->signal_sources);
374 hashmap_free(e->signal_data);
375
376 hashmap_free(e->inotify_data);
377
378 hashmap_free(e->child_sources);
379 set_free(e->post_sources);
380
381 free(e->event_queue);
382
383 return mfree(e);
384 }
385
386 _public_ int sd_event_new(sd_event** ret) {
387 sd_event *e;
388 int r;
389
390 assert_return(ret, -EINVAL);
391
392 e = new(sd_event, 1);
393 if (!e)
394 return -ENOMEM;
395
396 *e = (sd_event) {
397 .n_ref = 1,
398 .epoll_fd = -EBADF,
399 .watchdog_fd = -EBADF,
400 .realtime.wakeup = WAKEUP_CLOCK_DATA,
401 .realtime.fd = -EBADF,
402 .realtime.next = USEC_INFINITY,
403 .boottime.wakeup = WAKEUP_CLOCK_DATA,
404 .boottime.fd = -EBADF,
405 .boottime.next = USEC_INFINITY,
406 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
407 .monotonic.fd = -EBADF,
408 .monotonic.next = USEC_INFINITY,
409 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
410 .realtime_alarm.fd = -EBADF,
411 .realtime_alarm.next = USEC_INFINITY,
412 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
413 .boottime_alarm.fd = -EBADF,
414 .boottime_alarm.next = USEC_INFINITY,
415 .perturb = USEC_INFINITY,
416 .origin_id = origin_id_query(),
417 };
418
419 r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
420 if (r < 0)
421 goto fail;
422
423 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
424 if (e->epoll_fd < 0) {
425 r = -errno;
426 goto fail;
427 }
428
429 e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
430
431 if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
432 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
433 special_glyph(SPECIAL_GLYPH_ELLIPSIS));
434 e->profile_delays = true;
435 }
436
437 *ret = e;
438 return 0;
439
440 fail:
441 event_free(e);
442 return r;
443 }
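
/* Illustrative usage sketch: a minimal consumer of the public API as installed in
 * <systemd/sd-event.h>, creating a loop, running it and cleaning up. Error handling is
 * abbreviated; "run_loop" is a hypothetical name.
 *
 *     #include <systemd/sd-event.h>
 *
 *     int run_loop(void) {
 *             sd_event *e = NULL;
 *             int r;
 *
 *             r = sd_event_new(&e);        // or sd_event_default(&e) for the per-thread instance
 *             if (r < 0)
 *                     return r;
 *
 *             r = sd_event_loop(e);        // dispatches sources until sd_event_exit() is called
 *             sd_event_unref(e);
 *             return r;
 *     }
 */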
444
445 /* Define manually so we can add the origin check */
446 _public_ sd_event *sd_event_ref(sd_event *e) {
447 if (!e)
448 return NULL;
449 if (event_origin_changed(e))
450 return NULL;
451
452 e->n_ref++;
453
454 return e;
455 }
456
457 _public_ sd_event* sd_event_unref(sd_event *e) {
458 if (!e)
459 return NULL;
460 if (event_origin_changed(e))
461 return NULL;
462
463 assert(e->n_ref > 0);
464 if (--e->n_ref > 0)
465 return NULL;
466
467 return event_free(e);
468 }
469
470 #define PROTECT_EVENT(e) \
471 _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
472
473 _public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
474 if (s)
475 (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
476 return sd_event_source_unref(s);
477 }
478
479 static void source_io_unregister(sd_event_source *s) {
480 assert(s);
481 assert(s->type == SOURCE_IO);
482
483 if (event_origin_changed(s->event))
484 return;
485
486 if (!s->io.registered)
487 return;
488
489 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
490 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
491 strna(s->description), event_source_type_to_string(s->type));
492
493 s->io.registered = false;
494 }
495
496 static int source_io_register(
497 sd_event_source *s,
498 int enabled,
499 uint32_t events) {
500
501 assert(s);
502 assert(s->type == SOURCE_IO);
503 assert(enabled != SD_EVENT_OFF);
504
505 struct epoll_event ev = {
506 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
507 .data.ptr = s,
508 };
509
510 if (epoll_ctl(s->event->epoll_fd,
511 s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
512 s->io.fd, &ev) < 0)
513 return -errno;
514
515 s->io.registered = true;
516
517 return 0;
518 }
519
520 static void source_child_pidfd_unregister(sd_event_source *s) {
521 assert(s);
522 assert(s->type == SOURCE_CHILD);
523
524 if (event_origin_changed(s->event))
525 return;
526
527 if (!s->child.registered)
528 return;
529
530 if (EVENT_SOURCE_WATCH_PIDFD(s))
531 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
532 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
533 strna(s->description), event_source_type_to_string(s->type));
534
535 s->child.registered = false;
536 }
537
538 static int source_child_pidfd_register(sd_event_source *s, int enabled) {
539 assert(s);
540 assert(s->type == SOURCE_CHILD);
541 assert(enabled != SD_EVENT_OFF);
542
543 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
544 struct epoll_event ev = {
545 .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
546 .data.ptr = s,
547 };
548
549 if (epoll_ctl(s->event->epoll_fd,
550 s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
551 s->child.pidfd, &ev) < 0)
552 return -errno;
553 }
554
555 s->child.registered = true;
556 return 0;
557 }
558
559 static void source_memory_pressure_unregister(sd_event_source *s) {
560 assert(s);
561 assert(s->type == SOURCE_MEMORY_PRESSURE);
562
563 if (event_origin_changed(s->event))
564 return;
565
566 if (!s->memory_pressure.registered)
567 return;
568
569 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
570 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
571 strna(s->description), event_source_type_to_string(s->type));
572
573 s->memory_pressure.registered = false;
574 }
575
576 static int source_memory_pressure_register(sd_event_source *s, int enabled) {
577 assert(s);
578 assert(s->type == SOURCE_MEMORY_PRESSURE);
579 assert(enabled != SD_EVENT_OFF);
580
581 struct epoll_event ev = {
582 .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
583 (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
584 .data.ptr = s,
585 };
586
587 if (epoll_ctl(s->event->epoll_fd,
588 s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
589 s->memory_pressure.fd, &ev) < 0)
590 return -errno;
591
592 s->memory_pressure.registered = true;
593 return 0;
594 }
595
596 static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
597 assert(s);
598 assert(s->type == SOURCE_MEMORY_PRESSURE);
599
600 if (s->memory_pressure.in_write_list)
601 return;
602
603 LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
604 s->memory_pressure.in_write_list = true;
605 }
606
607 static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
608 assert(s);
609 assert(s->type == SOURCE_MEMORY_PRESSURE);
610
611 if (!s->memory_pressure.in_write_list)
612 return;
613
614 LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
615 s->memory_pressure.in_write_list = false;
616 }
617
618 static clockid_t event_source_type_to_clock(EventSourceType t) {
619
620 switch (t) {
621
622 case SOURCE_TIME_REALTIME:
623 return CLOCK_REALTIME;
624
625 case SOURCE_TIME_BOOTTIME:
626 return CLOCK_BOOTTIME;
627
628 case SOURCE_TIME_MONOTONIC:
629 return CLOCK_MONOTONIC;
630
631 case SOURCE_TIME_REALTIME_ALARM:
632 return CLOCK_REALTIME_ALARM;
633
634 case SOURCE_TIME_BOOTTIME_ALARM:
635 return CLOCK_BOOTTIME_ALARM;
636
637 default:
638 return (clockid_t) -1;
639 }
640 }
641
642 static EventSourceType clock_to_event_source_type(clockid_t clock) {
643
644 switch (clock) {
645
646 case CLOCK_REALTIME:
647 return SOURCE_TIME_REALTIME;
648
649 case CLOCK_BOOTTIME:
650 return SOURCE_TIME_BOOTTIME;
651
652 case CLOCK_MONOTONIC:
653 return SOURCE_TIME_MONOTONIC;
654
655 case CLOCK_REALTIME_ALARM:
656 return SOURCE_TIME_REALTIME_ALARM;
657
658 case CLOCK_BOOTTIME_ALARM:
659 return SOURCE_TIME_BOOTTIME_ALARM;
660
661 default:
662 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
663 }
664 }
665
666 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
667 assert(e);
668
669 switch (t) {
670
671 case SOURCE_TIME_REALTIME:
672 return &e->realtime;
673
674 case SOURCE_TIME_BOOTTIME:
675 return &e->boottime;
676
677 case SOURCE_TIME_MONOTONIC:
678 return &e->monotonic;
679
680 case SOURCE_TIME_REALTIME_ALARM:
681 return &e->realtime_alarm;
682
683 case SOURCE_TIME_BOOTTIME_ALARM:
684 return &e->boottime_alarm;
685
686 default:
687 return NULL;
688 }
689 }
690
691 static void event_free_signal_data(sd_event *e, struct signal_data *d) {
692 assert(e);
693
694 if (!d)
695 return;
696
697 hashmap_remove(e->signal_data, &d->priority);
698 safe_close(d->fd);
699 free(d);
700 }
701
702 static int event_make_signal_data(
703 sd_event *e,
704 int sig,
705 struct signal_data **ret) {
706
707 struct signal_data *d;
708 bool added = false;
709 sigset_t ss_copy;
710 int64_t priority;
711 int r;
712
713 assert(e);
714
715 if (event_origin_changed(e))
716 return -ECHILD;
717
718 if (e->signal_sources && e->signal_sources[sig])
719 priority = e->signal_sources[sig]->priority;
720 else
721 priority = SD_EVENT_PRIORITY_NORMAL;
722
723 d = hashmap_get(e->signal_data, &priority);
724 if (d) {
725 if (sigismember(&d->sigset, sig) > 0) {
726 if (ret)
727 *ret = d;
728 return 0;
729 }
730 } else {
731 d = new(struct signal_data, 1);
732 if (!d)
733 return -ENOMEM;
734
735 *d = (struct signal_data) {
736 .wakeup = WAKEUP_SIGNAL_DATA,
737 .fd = -EBADF,
738 .priority = priority,
739 };
740
741 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
742 if (r < 0) {
743 free(d);
744 return r;
745 }
746
747 added = true;
748 }
749
750 ss_copy = d->sigset;
751 assert_se(sigaddset(&ss_copy, sig) >= 0);
752
753 r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
754 &ss_copy,
755 SFD_NONBLOCK|SFD_CLOEXEC);
756 if (r < 0) {
757 r = -errno;
758 goto fail;
759 }
760
761 d->sigset = ss_copy;
762
763 if (d->fd >= 0) {
764 if (ret)
765 *ret = d;
766 return 0;
767 }
768
769 d->fd = fd_move_above_stdio(r);
770
771 struct epoll_event ev = {
772 .events = EPOLLIN,
773 .data.ptr = d,
774 };
775
776 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
777 r = -errno;
778 goto fail;
779 }
780
781 if (ret)
782 *ret = d;
783
784 return 0;
785
786 fail:
787 if (added)
788 event_free_signal_data(e, d);
789
790 return r;
791 }
792
793 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
794 assert(e);
795 assert(d);
796
797 /* Turns off the specified signal in the signal data
798 * object. If the signal mask of the object becomes empty that
799 * way, the object is removed altogether. */
800
801 if (sigismember(&d->sigset, sig) == 0)
802 return;
803
804 assert_se(sigdelset(&d->sigset, sig) >= 0);
805
806 if (sigisemptyset(&d->sigset)) {
807 /* If the mask is now all-zero we can get rid of the structure */
808 event_free_signal_data(e, d);
809 return;
810 }
811
812 if (event_origin_changed(e))
813 return;
814
815 assert(d->fd >= 0);
816
817 if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
818 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
819 }
820
821 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
822 struct signal_data *d;
823 static const int64_t zero_priority = 0;
824
825 assert(e);
826
827 /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
828 * and possibly drop the signalfd for it. */
829
830 if (sig == SIGCHLD &&
831 e->n_online_child_sources > 0)
832 return;
833
834 if (e->signal_sources &&
835 e->signal_sources[sig] &&
836 event_source_is_online(e->signal_sources[sig]))
837 return;
838
839 /*
840 * The specified signal might be enabled in three different queues:
841 *
842 * 1) the one that belongs to the priority passed (if it is non-NULL)
843 * 2) the one that belongs to the priority of the event source of the signal (if there is one)
844 * 3) the 0 priority (to cover the SIGCHLD case)
845 *
846 * Hence, let's remove it from all three here.
847 */
848
849 if (priority) {
850 d = hashmap_get(e->signal_data, priority);
851 if (d)
852 event_unmask_signal_data(e, d, sig);
853 }
854
855 if (e->signal_sources && e->signal_sources[sig]) {
856 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
857 if (d)
858 event_unmask_signal_data(e, d, sig);
859 }
860
861 d = hashmap_get(e->signal_data, &zero_priority);
862 if (d)
863 event_unmask_signal_data(e, d, sig);
864 }
865
866 static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
867 assert(s);
868
869 /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
870 * they are enabled/disabled or marked pending and such. */
871
872 if (s->pending)
873 prioq_reshuffle(s->event->pending, s, &s->pending_index);
874
875 if (s->prepare)
876 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
877 }
878
879 static void event_source_time_prioq_reshuffle(sd_event_source *s) {
880 struct clock_data *d;
881
882 assert(s);
883
884 /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
885 * pending, enable state, and ratelimiting state. Makes sure the two prioqs are ordered
886 * properly again. */
887
888 if (s->ratelimited)
889 d = &s->event->monotonic;
890 else if (EVENT_SOURCE_IS_TIME(s->type))
891 assert_se(d = event_get_clock_data(s->event, s->type));
892 else
893 return; /* no-op for an event source which is neither a timer nor ratelimited. */
894
895 prioq_reshuffle(d->earliest, s, &s->earliest_index);
896 prioq_reshuffle(d->latest, s, &s->latest_index);
897 d->needs_rearm = true;
898 }
899
900 static void event_source_time_prioq_remove(
901 sd_event_source *s,
902 struct clock_data *d) {
903
904 assert(s);
905 assert(d);
906
907 prioq_remove(d->earliest, s, &s->earliest_index);
908 prioq_remove(d->latest, s, &s->latest_index);
909 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
910 d->needs_rearm = true;
911 }
912
913 static void source_disconnect(sd_event_source *s) {
914 sd_event *event;
915 int r;
916
917 assert(s);
918
919 if (!s->event)
920 return;
921
922 assert(s->event->n_sources > 0);
923
924 switch (s->type) {
925
926 case SOURCE_IO:
927 if (s->io.fd >= 0)
928 source_io_unregister(s);
929
930 break;
931
932 case SOURCE_TIME_REALTIME:
933 case SOURCE_TIME_BOOTTIME:
934 case SOURCE_TIME_MONOTONIC:
935 case SOURCE_TIME_REALTIME_ALARM:
936 case SOURCE_TIME_BOOTTIME_ALARM:
937 /* Only remove this event source from its clock's time prioqs here if it is not ratelimited. If
938 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
939 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
940
941 if (!s->ratelimited) {
942 struct clock_data *d;
943 assert_se(d = event_get_clock_data(s->event, s->type));
944 event_source_time_prioq_remove(s, d);
945 }
946
947 break;
948
949 case SOURCE_SIGNAL:
950 if (s->signal.sig > 0) {
951
952 if (s->event->signal_sources)
953 s->event->signal_sources[s->signal.sig] = NULL;
954
955 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
956
957 if (s->signal.unblock) {
958 sigset_t new_ss;
959
960 if (sigemptyset(&new_ss) < 0)
961 log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
962 else if (sigaddset(&new_ss, s->signal.sig) < 0)
963 log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
964 else {
965 r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
966 if (r != 0)
967 log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
968 }
969 }
970 }
971
972 break;
973
974 case SOURCE_CHILD:
975 if (event_origin_changed(s->event))
976 s->child.process_owned = false;
977
978 if (s->child.pid > 0) {
979 if (event_source_is_online(s)) {
980 assert(s->event->n_online_child_sources > 0);
981 s->event->n_online_child_sources--;
982 }
983
984 (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
985 }
986
987 if (EVENT_SOURCE_WATCH_PIDFD(s))
988 source_child_pidfd_unregister(s);
989 else
990 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
991
992 break;
993
994 case SOURCE_DEFER:
995 /* nothing */
996 break;
997
998 case SOURCE_POST:
999 set_remove(s->event->post_sources, s);
1000 break;
1001
1002 case SOURCE_EXIT:
1003 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
1004 break;
1005
1006 case SOURCE_INOTIFY: {
1007 struct inode_data *inode_data;
1008
1009 inode_data = s->inotify.inode_data;
1010 if (inode_data) {
1011 struct inotify_data *inotify_data;
1012 assert_se(inotify_data = inode_data->inotify_data);
1013
1014 /* Detach this event source from the inode object */
1015 LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
1016 s->inotify.inode_data = NULL;
1017
1018 if (s->pending) {
1019 assert(inotify_data->n_pending > 0);
1020 inotify_data->n_pending--;
1021 }
1022
1023 /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
1024 * continues to be watched. That's because inotify doesn't really have an API for that: we
1025 * can only change watch masks with access to the original inode either by fd or by path. But
1026 * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
1027 * continuously and keeping the mount busy which we can't really do. We could reconstruct the
1028 * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
1029 * there), but given the need for open_by_handle_at() which is privileged and not universally
1030 * available this would be quite an incomplete solution. Hence we go the other way, leave the
1031 * mask set, even if it is not minimized now, and ignore all events we aren't interested in
1032 * anymore after reception. Yes, this sucks, but … Linux … */
1033
1034 /* Maybe release the inode data (and its inotify) */
1035 event_gc_inode_data(s->event, inode_data);
1036 }
1037
1038 break;
1039 }
1040
1041 case SOURCE_MEMORY_PRESSURE:
1042 source_memory_pressure_remove_from_write_list(s);
1043 source_memory_pressure_unregister(s);
1044 break;
1045
1046 default:
1047 assert_not_reached();
1048 }
1049
1050 if (s->pending)
1051 prioq_remove(s->event->pending, s, &s->pending_index);
1052
1053 if (s->prepare)
1054 prioq_remove(s->event->prepare, s, &s->prepare_index);
1055
1056 if (s->ratelimited)
1057 event_source_time_prioq_remove(s, &s->event->monotonic);
1058
1059 event = TAKE_PTR(s->event);
1060 LIST_REMOVE(sources, event->sources, s);
1061 event->n_sources--;
1062
1063 /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
1064 * pidfd associated with this event source, which we'll do only on source_free(). */
1065
1066 if (!s->floating)
1067 sd_event_unref(event);
1068 }
1069
1070 static sd_event_source* source_free(sd_event_source *s) {
1071 assert(s);
1072
1073 source_disconnect(s);
1074
1075 if (s->type == SOURCE_IO && s->io.owned)
1076 s->io.fd = safe_close(s->io.fd);
1077
1078 if (s->type == SOURCE_CHILD) {
1079 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
1080
1081 if (s->child.process_owned) {
1082
1083 if (!s->child.exited) {
1084 bool sent = false;
1085
1086 if (s->child.pidfd >= 0) {
1087 if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
1088 if (errno == ESRCH) /* Already dead */
1089 sent = true;
1090 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
1091 log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
1092 s->child.pid);
1093 } else
1094 sent = true;
1095 }
1096
1097 if (!sent)
1098 if (kill(s->child.pid, SIGKILL) < 0)
1099 if (errno != ESRCH) /* Already dead */
1100 log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
1101 s->child.pid);
1102 }
1103
1104 if (!s->child.waited) {
1105 siginfo_t si = {};
1106
1107 /* Reap the child if we can */
1108 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
1109 }
1110 }
1111
1112 if (s->child.pidfd_owned)
1113 s->child.pidfd = safe_close(s->child.pidfd);
1114 }
1115
1116 if (s->type == SOURCE_MEMORY_PRESSURE) {
1117 s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
1118 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
1119 }
1120
1121 if (s->destroy_callback)
1122 s->destroy_callback(s->userdata);
1123
1124 free(s->description);
1125 return mfree(s);
1126 }
1127 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
1128
1129 static int source_set_pending(sd_event_source *s, bool b) {
1130 int r;
1131
1132 assert(s);
1133 assert(s->type != SOURCE_EXIT);
1134
1135 if (s->pending == b)
1136 return 0;
1137
1138 s->pending = b;
1139
1140 if (b) {
1141 s->pending_iteration = s->event->iteration;
1142
1143 r = prioq_put(s->event->pending, s, &s->pending_index);
1144 if (r < 0) {
1145 s->pending = false;
1146 return r;
1147 }
1148 } else
1149 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1150
1151 if (EVENT_SOURCE_IS_TIME(s->type))
1152 event_source_time_prioq_reshuffle(s);
1153
1154 if (s->type == SOURCE_SIGNAL && !b) {
1155 struct signal_data *d;
1156
1157 d = hashmap_get(s->event->signal_data, &s->priority);
1158 if (d && d->current == s)
1159 d->current = NULL;
1160 }
1161
1162 if (s->type == SOURCE_INOTIFY) {
1163
1164 assert(s->inotify.inode_data);
1165 assert(s->inotify.inode_data->inotify_data);
1166
1167 if (b)
1168 s->inotify.inode_data->inotify_data->n_pending++;
1169 else {
1170 assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1171 s->inotify.inode_data->inotify_data->n_pending--;
1172 }
1173 }
1174
1175 return 1;
1176 }
1177
1178 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1179
1180 /* Let's allocate exactly what we need. Note that the difference between the smallest event source
1181 * structure and the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
1182 * lines. */
1183 static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
1184 [SOURCE_IO] = endoffsetof_field(sd_event_source, io),
1185 [SOURCE_TIME_REALTIME] = endoffsetof_field(sd_event_source, time),
1186 [SOURCE_TIME_BOOTTIME] = endoffsetof_field(sd_event_source, time),
1187 [SOURCE_TIME_MONOTONIC] = endoffsetof_field(sd_event_source, time),
1188 [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1189 [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1190 [SOURCE_SIGNAL] = endoffsetof_field(sd_event_source, signal),
1191 [SOURCE_CHILD] = endoffsetof_field(sd_event_source, child),
1192 [SOURCE_DEFER] = endoffsetof_field(sd_event_source, defer),
1193 [SOURCE_POST] = endoffsetof_field(sd_event_source, post),
1194 [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit),
1195 [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify),
1196 [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure),
1197 };
1198
1199 sd_event_source *s;
1200
1201 assert(e);
1202 assert(type >= 0);
1203 assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
1204 assert(size_table[type] > 0);
1205
1206 s = malloc0(size_table[type]);
1207 if (!s)
1208 return NULL;
1209 /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
1210 * size, even if we only allocate the initial part we need. */
1211 s = expand_to_usable(s, sizeof(sd_event_source));
1212
1213 /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
1214 * than what we allocated here. */
1215 s->n_ref = 1;
1216 s->event = e;
1217 s->floating = floating;
1218 s->type = type;
1219 s->pending_index = PRIOQ_IDX_NULL;
1220 s->prepare_index = PRIOQ_IDX_NULL;
1221
1222 if (!floating)
1223 sd_event_ref(e);
1224
1225 LIST_PREPEND(sources, e->sources, s);
1226 e->n_sources++;
1227
1228 return s;
1229 }
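
/* Annotation on the allocation scheme above: for a given source type only the leading part of
 * sd_event_source up to and including the matching union member is allocated, e.g. for
 * SOURCE_DEFER roughly offsetof(sd_event_source, defer) + sizeof(s->defer) bytes.
 * expand_to_usable() only widens the object size the compiler is allowed to assume, so that
 * accesses to the common header fields stay well-defined while the tail a larger source type
 * would need is never allocated nor touched. */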
1230
1231 static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1232 assert(s);
1233
1234 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1235 }
1236
1237 _public_ int sd_event_add_io(
1238 sd_event *e,
1239 sd_event_source **ret,
1240 int fd,
1241 uint32_t events,
1242 sd_event_io_handler_t callback,
1243 void *userdata) {
1244
1245 _cleanup_(source_freep) sd_event_source *s = NULL;
1246 int r;
1247
1248 assert_return(e, -EINVAL);
1249 assert_return(e = event_resolve(e), -ENOPKG);
1250 assert_return(fd >= 0, -EBADF);
1251 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1252 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1253 assert_return(!event_origin_changed(e), -ECHILD);
1254
1255 if (!callback)
1256 callback = io_exit_callback;
1257
1258 s = source_new(e, !ret, SOURCE_IO);
1259 if (!s)
1260 return -ENOMEM;
1261
1262 s->wakeup = WAKEUP_EVENT_SOURCE;
1263 s->io.fd = fd;
1264 s->io.events = events;
1265 s->io.callback = callback;
1266 s->userdata = userdata;
1267 s->enabled = SD_EVENT_ON;
1268
1269 r = source_io_register(s, s->enabled, events);
1270 if (r < 0)
1271 return r;
1272
1273 if (ret)
1274 *ret = s;
1275 TAKE_PTR(s);
1276
1277 return 0;
1278 }
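
/* Illustrative usage sketch: watching a file descriptor for input with sd_event_add_io(). The
 * callback signature matches sd_event_io_handler_t; "my_fd" and "on_io" are hypothetical names.
 *
 *     static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             if (revents & EPOLLIN) {
 *                     // read from fd here; returning a negative errno aborts the event loop
 *             }
 *             return 0;
 *     }
 *
 *     ...
 *     sd_event_source *src = NULL;
 *     r = sd_event_add_io(e, &src, my_fd, EPOLLIN, on_io, NULL);
 *
 * Passing a NULL callback (see io_exit_callback() above) turns the source into an exit trigger:
 * once the fd becomes ready the loop exits with PTR_TO_INT(userdata) as its return code.
 */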
1279
1280 static void initialize_perturb(sd_event *e) {
1281 sd_id128_t id = {};
1282
1283 /* When we sleep for longer, we try to realign the wakeup to the same time within each
1284 * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
1285 * wakeup. However, let's take some system-specific randomness for this value, so that in a network
1286 * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
1287 * perturbation usec offset from the boot ID (or the machine ID if that fails, e.g. because /proc is not mounted). */
1288
1289 if (_likely_(e->perturb != USEC_INFINITY))
1290 return;
1291
1292 if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
1293 e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
1294 else
1295 e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
1296 }
1297
1298 static int event_setup_timer_fd(
1299 sd_event *e,
1300 struct clock_data *d,
1301 clockid_t clock) {
1302
1303 assert(e);
1304 assert(d);
1305
1306 if (_likely_(d->fd >= 0))
1307 return 0;
1308
1309 _cleanup_close_ int fd = -EBADF;
1310
1311 fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1312 if (fd < 0)
1313 return -errno;
1314
1315 fd = fd_move_above_stdio(fd);
1316
1317 struct epoll_event ev = {
1318 .events = EPOLLIN,
1319 .data.ptr = d,
1320 };
1321
1322 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1323 return -errno;
1324
1325 d->fd = TAKE_FD(fd);
1326 return 0;
1327 }
1328
1329 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1330 assert(s);
1331
1332 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1333 }
1334
1335 static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1336 int r;
1337
1338 assert(d);
1339
1340 if (d->fd < 0) {
1341 r = event_setup_timer_fd(e, d, clock);
1342 if (r < 0)
1343 return r;
1344 }
1345
1346 r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1347 if (r < 0)
1348 return r;
1349
1350 r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1351 if (r < 0)
1352 return r;
1353
1354 return 0;
1355 }
1356
1357 static int event_source_time_prioq_put(
1358 sd_event_source *s,
1359 struct clock_data *d) {
1360
1361 int r;
1362
1363 assert(s);
1364 assert(d);
1365 assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1366
1367 r = prioq_put(d->earliest, s, &s->earliest_index);
1368 if (r < 0)
1369 return r;
1370
1371 r = prioq_put(d->latest, s, &s->latest_index);
1372 if (r < 0) {
1373 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1374 s->earliest_index = PRIOQ_IDX_NULL;
1375 return r;
1376 }
1377
1378 d->needs_rearm = true;
1379 return 0;
1380 }
1381
1382 _public_ int sd_event_add_time(
1383 sd_event *e,
1384 sd_event_source **ret,
1385 clockid_t clock,
1386 uint64_t usec,
1387 uint64_t accuracy,
1388 sd_event_time_handler_t callback,
1389 void *userdata) {
1390
1391 EventSourceType type;
1392 _cleanup_(source_freep) sd_event_source *s = NULL;
1393 struct clock_data *d;
1394 int r;
1395
1396 assert_return(e, -EINVAL);
1397 assert_return(e = event_resolve(e), -ENOPKG);
1398 assert_return(accuracy != UINT64_MAX, -EINVAL);
1399 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1400 assert_return(!event_origin_changed(e), -ECHILD);
1401
1402 if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1403 return -EOPNOTSUPP;
1404
1405 type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1406 if (type < 0)
1407 return -EOPNOTSUPP;
1408
1409 if (!callback)
1410 callback = time_exit_callback;
1411
1412 assert_se(d = event_get_clock_data(e, type));
1413
1414 r = setup_clock_data(e, d, clock);
1415 if (r < 0)
1416 return r;
1417
1418 s = source_new(e, !ret, type);
1419 if (!s)
1420 return -ENOMEM;
1421
1422 s->time.next = usec;
1423 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1424 s->time.callback = callback;
1425 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1426 s->userdata = userdata;
1427 s->enabled = SD_EVENT_ONESHOT;
1428
1429 r = event_source_time_prioq_put(s, d);
1430 if (r < 0)
1431 return r;
1432
1433 if (ret)
1434 *ret = s;
1435 TAKE_PTR(s);
1436
1437 return 0;
1438 }
1439
1440 _public_ int sd_event_add_time_relative(
1441 sd_event *e,
1442 sd_event_source **ret,
1443 clockid_t clock,
1444 uint64_t usec,
1445 uint64_t accuracy,
1446 sd_event_time_handler_t callback,
1447 void *userdata) {
1448
1449 usec_t t;
1450 int r;
1451
1452 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1453 * checks for overflow. */
1454
1455 r = sd_event_now(e, clock, &t);
1456 if (r < 0)
1457 return r;
1458
1459 if (usec >= USEC_INFINITY - t)
1460 return -EOVERFLOW;
1461
1462 return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1463 }
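
/* Illustrative usage sketch: a one-shot timer elapsing 5s from now on CLOCK_MONOTONIC, using the
 * default accuracy (pass 0). "on_timer" is a hypothetical handler name.
 *
 *     static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *             // usec is the absolute CLOCK_MONOTONIC time the timer elapsed for
 *             return 0;
 *     }
 *
 *     r = sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC,
 *                                    5 * 1000000ULL, 0,   // 5s in usec, default accuracy
 *                                    on_timer, NULL);
 *
 * Timer sources are created in SD_EVENT_ONESHOT mode; a retained source (ret != NULL) can later
 * be re-armed via sd_event_source_set_time() plus sd_event_source_set_enabled().
 */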
1464
1465 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1466 assert(s);
1467
1468 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1469 }
1470
1471 _public_ int sd_event_add_signal(
1472 sd_event *e,
1473 sd_event_source **ret,
1474 int sig,
1475 sd_event_signal_handler_t callback,
1476 void *userdata) {
1477
1478 _cleanup_(source_freep) sd_event_source *s = NULL;
1479 struct signal_data *d;
1480 sigset_t new_ss;
1481 bool block_it;
1482 int r;
1483
1484 assert_return(e, -EINVAL);
1485 assert_return(e = event_resolve(e), -ENOPKG);
1486 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1487 assert_return(!event_origin_changed(e), -ECHILD);
1488
1489 /* Let's make sure our special flag stays outside of the valid signal range */
1490 assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);
1491
1492 if (sig & SD_EVENT_SIGNAL_PROCMASK) {
1493 sig &= ~SD_EVENT_SIGNAL_PROCMASK;
1494 assert_return(SIGNAL_VALID(sig), -EINVAL);
1495
1496 block_it = true;
1497 } else {
1498 assert_return(SIGNAL_VALID(sig), -EINVAL);
1499
1500 r = signal_is_blocked(sig);
1501 if (r < 0)
1502 return r;
1503 if (r == 0)
1504 return -EBUSY;
1505
1506 block_it = false;
1507 }
1508
1509 if (!callback)
1510 callback = signal_exit_callback;
1511
1512 if (!e->signal_sources) {
1513 e->signal_sources = new0(sd_event_source*, _NSIG);
1514 if (!e->signal_sources)
1515 return -ENOMEM;
1516 } else if (e->signal_sources[sig])
1517 return -EBUSY;
1518
1519 s = source_new(e, !ret, SOURCE_SIGNAL);
1520 if (!s)
1521 return -ENOMEM;
1522
1523 s->signal.sig = sig;
1524 s->signal.callback = callback;
1525 s->userdata = userdata;
1526 s->enabled = SD_EVENT_ON;
1527
1528 e->signal_sources[sig] = s;
1529
1530 if (block_it) {
1531 sigset_t old_ss;
1532
1533 if (sigemptyset(&new_ss) < 0)
1534 return -errno;
1535
1536 if (sigaddset(&new_ss, sig) < 0)
1537 return -errno;
1538
1539 r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
1540 if (r != 0)
1541 return -r;
1542
1543 r = sigismember(&old_ss, sig);
1544 if (r < 0)
1545 return -errno;
1546
1547 s->signal.unblock = !r;
1548 } else
1549 s->signal.unblock = false;
1550
1551 r = event_make_signal_data(e, sig, &d);
1552 if (r < 0) {
1553 if (s->signal.unblock)
1554 (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
1555
1556 return r;
1557 }
1558
1559 /* Use the signal name as description for the event source by default */
1560 (void) sd_event_source_set_description(s, signal_to_string(sig));
1561
1562 if (ret)
1563 *ret = s;
1564 TAKE_PTR(s);
1565
1566 return 0;
1567 }
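
/* Illustrative usage sketch: handling SIGTERM. Either block the signal in the calling thread
 * before adding the source, or OR the signal number with SD_EVENT_SIGNAL_PROCMASK to let the
 * event loop adjust the signal mask itself. "on_sigterm" is a hypothetical handler name.
 *
 *     static int on_sigterm(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     r = sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, on_sigterm, NULL);
 *
 * With a NULL callback the default signal_exit_callback() above is installed, which just exits
 * the loop, so sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL) is a
 * common idiom for clean shutdown.
 */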
1568
1569 static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1570 assert(s);
1571
1572 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1573 }
1574
1575 static bool shall_use_pidfd(void) {
1576 /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
1577 return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
1578 }
1579
1580 _public_ int sd_event_add_child(
1581 sd_event *e,
1582 sd_event_source **ret,
1583 pid_t pid,
1584 int options,
1585 sd_event_child_handler_t callback,
1586 void *userdata) {
1587
1588 _cleanup_(source_freep) sd_event_source *s = NULL;
1589 int r;
1590
1591 assert_return(e, -EINVAL);
1592 assert_return(e = event_resolve(e), -ENOPKG);
1593 assert_return(pid > 1, -EINVAL);
1594 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1595 assert_return(options != 0, -EINVAL);
1596 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1597 assert_return(!event_origin_changed(e), -ECHILD);
1598
1599 if (!callback)
1600 callback = child_exit_callback;
1601
1602 if (e->n_online_child_sources == 0) {
1603 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1604 * for compatibility with pre-pidfd and because we don't want to reap the child processes
1605 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1606 * take effect.
1607 *
1608 * (As an optimization we only do this check on the first child event source created.) */
1609 r = signal_is_blocked(SIGCHLD);
1610 if (r < 0)
1611 return r;
1612 if (r == 0)
1613 return -EBUSY;
1614 }
1615
1616 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1617 if (r < 0)
1618 return r;
1619
1620 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1621 return -EBUSY;
1622
1623 s = source_new(e, !ret, SOURCE_CHILD);
1624 if (!s)
1625 return -ENOMEM;
1626
1627 s->wakeup = WAKEUP_EVENT_SOURCE;
1628 s->child.options = options;
1629 s->child.callback = callback;
1630 s->userdata = userdata;
1631 s->enabled = SD_EVENT_ONESHOT;
1632
1633 /* We always take a pidfd here if we can, even if we wait for anything other than WEXITED, so that we
1634 * pin the PID, and make regular waitid() handling race-free. */
1635
1636 if (shall_use_pidfd()) {
1637 s->child.pidfd = pidfd_open(pid, 0);
1638 if (s->child.pidfd < 0) {
1639 /* Propagate errors unless the syscall is not supported or blocked */
1640 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
1641 return -errno;
1642 } else
1643 s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
1644 } else
1645 s->child.pidfd = -EBADF;
1646
1647 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1648 /* We have a pidfd and we only want to watch for exit */
1649 r = source_child_pidfd_register(s, s->enabled);
1650 if (r < 0)
1651 return r;
1652
1653 } else {
1654 /* We have no pidfd, or we shall wait for some event other than WEXITED */
1655 r = event_make_signal_data(e, SIGCHLD, NULL);
1656 if (r < 0)
1657 return r;
1658
1659 e->need_process_child = true;
1660 }
1661
1662 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1663 if (r < 0)
1664 return r;
1665
1666 /* These must be done after everything succeeds. */
1667 s->child.pid = pid;
1668 e->n_online_child_sources++;
1669
1670 if (ret)
1671 *ret = s;
1672 TAKE_PTR(s);
1673 return 0;
1674 }
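
/* Illustrative usage sketch: watching a forked child for exit. As checked above, SIGCHLD must be
 * blocked before the first child source is added. "on_child" is a hypothetical handler name;
 * error handling is abbreviated.
 *
 *     static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *             // si->si_pid, si->si_code (CLD_EXITED, ...) and si->si_status describe the result
 *             return 0;
 *     }
 *
 *     sigset_t ss;
 *     sigemptyset(&ss);
 *     sigaddset(&ss, SIGCHLD);
 *     (void) pthread_sigmask(SIG_BLOCK, &ss, NULL);
 *
 *     pid_t pid = fork();
 *     if (pid == 0)
 *             _exit(0);                            // child branch
 *
 *     r = sd_event_add_child(e, NULL, pid, WEXITED, on_child, NULL);
 */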
1675
1676 _public_ int sd_event_add_child_pidfd(
1677 sd_event *e,
1678 sd_event_source **ret,
1679 int pidfd,
1680 int options,
1681 sd_event_child_handler_t callback,
1682 void *userdata) {
1683
1684
1685 _cleanup_(source_freep) sd_event_source *s = NULL;
1686 pid_t pid;
1687 int r;
1688
1689 assert_return(e, -EINVAL);
1690 assert_return(e = event_resolve(e), -ENOPKG);
1691 assert_return(pidfd >= 0, -EBADF);
1692 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1693 assert_return(options != 0, -EINVAL);
1694 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1695 assert_return(!event_origin_changed(e), -ECHILD);
1696
1697 if (!callback)
1698 callback = child_exit_callback;
1699
1700 if (e->n_online_child_sources == 0) {
1701 r = signal_is_blocked(SIGCHLD);
1702 if (r < 0)
1703 return r;
1704 if (r == 0)
1705 return -EBUSY;
1706 }
1707
1708 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1709 if (r < 0)
1710 return r;
1711
1712 r = pidfd_get_pid(pidfd, &pid);
1713 if (r < 0)
1714 return r;
1715
1716 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1717 return -EBUSY;
1718
1719 s = source_new(e, !ret, SOURCE_CHILD);
1720 if (!s)
1721 return -ENOMEM;
1722
1723 s->wakeup = WAKEUP_EVENT_SOURCE;
1724 s->child.pidfd = pidfd;
1725 s->child.pid = pid;
1726 s->child.options = options;
1727 s->child.callback = callback;
1728 s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1729 s->userdata = userdata;
1730 s->enabled = SD_EVENT_ONESHOT;
1731
1732 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1733 if (r < 0)
1734 return r;
1735
1736 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1737 /* We only want to watch for WEXITED */
1738 r = source_child_pidfd_register(s, s->enabled);
1739 if (r < 0)
1740 return r;
1741 } else {
1742 /* We shall wait for some event other than WEXITED */
1743 r = event_make_signal_data(e, SIGCHLD, NULL);
1744 if (r < 0)
1745 return r;
1746
1747 e->need_process_child = true;
1748 }
1749
1750 e->n_online_child_sources++;
1751
1752 if (ret)
1753 *ret = s;
1754 TAKE_PTR(s);
1755 return 0;
1756 }
1757
1758 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1759 assert(s);
1760
1761 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1762 }
1763
1764 _public_ int sd_event_add_defer(
1765 sd_event *e,
1766 sd_event_source **ret,
1767 sd_event_handler_t callback,
1768 void *userdata) {
1769
1770 _cleanup_(source_freep) sd_event_source *s = NULL;
1771 int r;
1772
1773 assert_return(e, -EINVAL);
1774 assert_return(e = event_resolve(e), -ENOPKG);
1775 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1776 assert_return(!event_origin_changed(e), -ECHILD);
1777
1778 if (!callback)
1779 callback = generic_exit_callback;
1780
1781 s = source_new(e, !ret, SOURCE_DEFER);
1782 if (!s)
1783 return -ENOMEM;
1784
1785 s->defer.callback = callback;
1786 s->userdata = userdata;
1787 s->enabled = SD_EVENT_ONESHOT;
1788
1789 r = source_set_pending(s, true);
1790 if (r < 0)
1791 return r;
1792
1793 if (ret)
1794 *ret = s;
1795 TAKE_PTR(s);
1796
1797 return 0;
1798 }
1799
1800 _public_ int sd_event_add_post(
1801 sd_event *e,
1802 sd_event_source **ret,
1803 sd_event_handler_t callback,
1804 void *userdata) {
1805
1806 _cleanup_(source_freep) sd_event_source *s = NULL;
1807 int r;
1808
1809 assert_return(e, -EINVAL);
1810 assert_return(e = event_resolve(e), -ENOPKG);
1811 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1812 assert_return(!event_origin_changed(e), -ECHILD);
1813
1814 if (!callback)
1815 callback = generic_exit_callback;
1816
1817 s = source_new(e, !ret, SOURCE_POST);
1818 if (!s)
1819 return -ENOMEM;
1820
1821 s->post.callback = callback;
1822 s->userdata = userdata;
1823 s->enabled = SD_EVENT_ON;
1824
1825 r = set_ensure_put(&e->post_sources, NULL, s);
1826 if (r < 0)
1827 return r;
1828 assert(r > 0);
1829
1830 if (ret)
1831 *ret = s;
1832 TAKE_PTR(s);
1833
1834 return 0;
1835 }
1836
1837 _public_ int sd_event_add_exit(
1838 sd_event *e,
1839 sd_event_source **ret,
1840 sd_event_handler_t callback,
1841 void *userdata) {
1842
1843 _cleanup_(source_freep) sd_event_source *s = NULL;
1844 int r;
1845
1846 assert_return(e, -EINVAL);
1847 assert_return(e = event_resolve(e), -ENOPKG);
1848 assert_return(callback, -EINVAL);
1849 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1850 assert_return(!event_origin_changed(e), -ECHILD);
1851
1852 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1853 if (r < 0)
1854 return r;
1855
1856 s = source_new(e, !ret, SOURCE_EXIT);
1857 if (!s)
1858 return -ENOMEM;
1859
1860 s->exit.callback = callback;
1861 s->userdata = userdata;
1862 s->exit.prioq_index = PRIOQ_IDX_NULL;
1863 s->enabled = SD_EVENT_ONESHOT;
1864
1865 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1866 if (r < 0)
1867 return r;
1868
1869 if (ret)
1870 *ret = s;
1871 TAKE_PTR(s);
1872
1873 return 0;
1874 }
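
/* Illustrative usage sketch: defer and exit sources. A defer source is created already pending
 * and thus runs on the next loop iteration; an exit source runs once sd_event_exit() has been
 * called and the loop is shutting down, in priority order. "cleanup_handler" is a hypothetical
 * name.
 *
 *     static int cleanup_handler(sd_event_source *s, void *userdata) {
 *             // runs during loop shutdown
 *             return 0;
 *     }
 *
 *     r = sd_event_add_exit(e, NULL, cleanup_handler, NULL);
 */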
1875
1876 _public_ int sd_event_trim_memory(void) {
1877 int r;
1878
1879 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1880 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1881 * NULL callback parameter. */
1882
1883 log_debug("Memory pressure event, trimming malloc() memory.");
1884
1885 #if HAVE_GENERIC_MALLINFO
1886 generic_mallinfo before_mallinfo = generic_mallinfo_get();
1887 #endif
1888
1889 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1890 hashmap_trim_pools();
1891 r = malloc_trim(0);
1892 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1893
1894 if (r > 0)
1895 log_debug("Successfully trimmed some memory.");
1896 else
1897 log_debug("Couldn't trim any memory.");
1898
1899 usec_t period = after_timestamp - before_timestamp;
1900
1901 #if HAVE_GENERIC_MALLINFO
1902 generic_mallinfo after_mallinfo = generic_mallinfo_get();
1903 size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1904 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1905 log_struct(LOG_DEBUG,
1906 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1907 FORMAT_TIMESPAN(period, 0),
1908 FORMAT_BYTES(l)),
1909 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1910 "TRIMMED_BYTES=%zu", l,
1911 "TRIMMED_USEC=" USEC_FMT, period);
1912 #else
1913 log_struct(LOG_DEBUG,
1914 LOG_MESSAGE("Memory trimming took %s.",
1915 FORMAT_TIMESPAN(period, 0)),
1916 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1917 "TRIMMED_USEC=" USEC_FMT, period);
1918 #endif
1919
1920 return 0;
1921 }
1922
1923 static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1924 assert(s);
1925
1926 sd_event_trim_memory();
1927 return 0;
1928 }
1929
1930 _public_ int sd_event_add_memory_pressure(
1931 sd_event *e,
1932 sd_event_source **ret,
1933 sd_event_handler_t callback,
1934 void *userdata) {
1935
1936 _cleanup_free_ char *w = NULL;
1937 _cleanup_(source_freep) sd_event_source *s = NULL;
1938 _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
1939 _cleanup_free_ void *write_buffer = NULL;
1940 const char *watch, *watch_fallback = NULL, *env;
1941 size_t write_buffer_size = 0;
1942 struct stat st;
1943 uint32_t events;
1944 bool locked;
1945 int r;
1946
1947 assert_return(e, -EINVAL);
1948 assert_return(e = event_resolve(e), -ENOPKG);
1949 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1950 assert_return(!event_origin_changed(e), -ECHILD);
1951
1952 if (!callback)
1953 callback = memory_pressure_callback;
1954
1955 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1956 if (!s)
1957 return -ENOMEM;
1958
1959 s->wakeup = WAKEUP_EVENT_SOURCE;
1960 s->memory_pressure.callback = callback;
1961 s->userdata = userdata;
1962 s->enabled = SD_EVENT_ON;
1963 s->memory_pressure.fd = -EBADF;
1964
1965 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1966 if (env) {
1967 if (isempty(env) || path_equal(env, "/dev/null"))
1968 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1969 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1970
1971 if (!path_is_absolute(env) || !path_is_normalized(env))
1972 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1973 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1974
1975 watch = env;
1976
1977 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1978 if (env) {
1979 r = unbase64mem(env, &write_buffer, &write_buffer_size);
1980 if (r < 0)
1981 return r;
1982 }
1983
1984 locked = true;
1985 } else {
1986
1987 r = is_pressure_supported();
1988 if (r < 0)
1989 return r;
1990 if (r == 0)
1991 return -EOPNOTSUPP;
1992
1993 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1994 * the system wide pressure if for some reason we cannot (which could be: memory controller
1995 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1996 * only use the system-wide logic. */
1997 r = cg_all_unified();
1998 if (r < 0)
1999 return r;
2000 if (r == 0)
2001 watch = "/proc/pressure/memory";
2002 else {
2003 _cleanup_free_ char *cg = NULL;
2004
2005 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
2006 if (r < 0)
2007 return r;
2008
2009 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
2010 if (!w)
2011 return -ENOMEM;
2012
2013 watch = w;
2014 watch_fallback = "/proc/pressure/memory";
2015 }
2016
2017 /* Android uses three levels in its userspace low memory killer logic:
2018 * some 70000 1000000
2019 * some 100000 1000000
2020 * full 70000 1000000
2021 *
2022 * GNOME's low memory monitor uses:
2023 * some 70000 1000000
2024 * some 100000 1000000
2025 * full 100000 1000000
2026 *
2027                          * We'll default to the middle level that both agree on, i.e. "some 100000 1000000". Except we do
2028                          * it on a 2s window (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window
2029                          * duration the kernel permits for unprivileged use, now and in the future. */
2030 if (asprintf((char**) &write_buffer,
2031 "%s " USEC_FMT " " USEC_FMT,
2032 MEMORY_PRESSURE_DEFAULT_TYPE,
2033 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2034 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2035 return -ENOMEM;
2036
2037 write_buffer_size = strlen(write_buffer) + 1;
2038 locked = false;
2039 }
2040
2041 path_fd = open(watch, O_PATH|O_CLOEXEC);
2042 if (path_fd < 0) {
2043 if (errno != ENOENT)
2044 return -errno;
2045
2046 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2047 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2048 * the PSI service apparently is not supported) */
2049 if (!watch_fallback)
2050 return locked ? -ENOENT : -EOPNOTSUPP;
2051
2052 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
2053 if (path_fd < 0) {
2054 if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2055 return -EOPNOTSUPP;
2056 return -errno;
2057 }
2058 }
2059
2060 if (fstat(path_fd, &st) < 0)
2061 return -errno;
2062
2063 if (S_ISSOCK(st.st_mode)) {
2064 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2065 if (fd < 0)
2066 return -errno;
2067
2068 r = connect_unix_path(fd, path_fd, NULL);
2069 if (r < 0)
2070 return r;
2071
2072 events = EPOLLIN;
2073
2074 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2075 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2076 if (fd < 0)
2077 return fd;
2078
2079 if (S_ISREG(st.st_mode)) {
2080 struct statfs sfs;
2081
2082                         /* If this is a regular file, validate that it is a procfs or cgroupfs file, for which we watch for EPOLLPRI */
2083
2084 if (fstatfs(fd, &sfs) < 0)
2085 return -errno;
2086
2087 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2088 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2089 return -ENOTTY;
2090
2091 events = EPOLLPRI;
2092 } else
2093 /* For fifos and char devices just watch for EPOLLIN */
2094 events = EPOLLIN;
2095
2096 } else if (S_ISDIR(st.st_mode))
2097 return -EISDIR;
2098 else
2099 return -EBADF;
2100
2101 s->memory_pressure.fd = TAKE_FD(fd);
2102 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2103 s->memory_pressure.write_buffer_size = write_buffer_size;
2104 s->memory_pressure.events = events;
2105 s->memory_pressure.locked = locked;
2106
2107                 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2108                  * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2109                  * fd with the epoll right away. Instead, we just add the event source to a list of memory pressure
2110                  * event sources on which writes must be executed before the first event loop iteration is
2111                  * executed. (We could also write the data here, right away, but we want to give the caller the
2112                  * freedom to call sd_event_source_set_memory_pressure_type() and
2113                  * sd_event_source_set_memory_pressure_rate() before we write it.) */
2114
2115 if (s->memory_pressure.write_buffer_size > 0)
2116 source_memory_pressure_add_to_write_list(s);
2117 else {
2118 r = source_memory_pressure_register(s, s->enabled);
2119 if (r < 0)
2120 return r;
2121 }
2122
2123 if (ret)
2124 *ret = s;
2125 TAKE_PTR(s);
2126
2127 return 0;
2128 }
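
/* Illustrative sketch, not used by this file: how a caller might hook up the memory pressure logic
 * above. Passing a NULL callback selects memory_pressure_callback(), i.e. sd_event_trim_memory(), and
 * the built-in defaults (per the comment above, the shared "some" level on a 200ms/2s window) are
 * written to the PSI file on the first event loop iteration. Error handling is abbreviated; the loop
 * is assumed to be set up with the public sd_event_default()/sd_event_loop() entry points. */
#if 0
static int example_memory_pressure_setup(void) {
        _cleanup_(sd_event_unrefp) sd_event *e = NULL;
        int r;

        r = sd_event_default(&e);
        if (r < 0)
                return r;

        /* With ret=NULL the event source is "floating", i.e. owned by the event loop itself. */
        r = sd_event_add_memory_pressure(e, /* ret= */ NULL, /* callback= */ NULL, /* userdata= */ NULL);
        if (r < 0 && !IN_SET(r, -EOPNOTSUPP, -EHOSTDOWN)) /* no PSI support / explicitly disabled: ignore */
                return r;

        return sd_event_loop(e);
}
#endif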
2129
2130 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2131 assert(e);
2132
2133 if (!d)
2134 return;
2135
2136 assert(hashmap_isempty(d->inodes));
2137 assert(hashmap_isempty(d->wd));
2138
2139 if (d->buffer_filled > 0)
2140 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
2141
2142 hashmap_free(d->inodes);
2143 hashmap_free(d->wd);
2144
2145 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2146
2147 if (d->fd >= 0) {
2148 if (!event_origin_changed(e) &&
2149 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
2150 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2151
2152 safe_close(d->fd);
2153 }
2154 free(d);
2155 }
2156
2157 static int event_make_inotify_data(
2158 sd_event *e,
2159 int64_t priority,
2160 struct inotify_data **ret) {
2161
2162 _cleanup_close_ int fd = -EBADF;
2163 struct inotify_data *d;
2164 int r;
2165
2166 assert(e);
2167
2168 d = hashmap_get(e->inotify_data, &priority);
2169 if (d) {
2170 if (ret)
2171 *ret = d;
2172 return 0;
2173 }
2174
2175         fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2176 if (fd < 0)
2177 return -errno;
2178
2179 fd = fd_move_above_stdio(fd);
2180
2181 d = new(struct inotify_data, 1);
2182 if (!d)
2183 return -ENOMEM;
2184
2185 *d = (struct inotify_data) {
2186 .wakeup = WAKEUP_INOTIFY_DATA,
2187 .fd = TAKE_FD(fd),
2188 .priority = priority,
2189 };
2190
2191 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
2192 if (r < 0) {
2193 d->fd = safe_close(d->fd);
2194 free(d);
2195 return r;
2196 }
2197
2198 struct epoll_event ev = {
2199 .events = EPOLLIN,
2200 .data.ptr = d,
2201 };
2202
2203 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2204 r = -errno;
2205 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2206 * remove the fd from the epoll first, which we don't want as we couldn't
2207 * add it in the first place. */
2208 event_free_inotify_data(e, d);
2209 return r;
2210 }
2211
2212 if (ret)
2213 *ret = d;
2214
2215 return 1;
2216 }
2217
2218 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
2219 int r;
2220
2221 assert(x);
2222 assert(y);
2223
2224 r = CMP(x->dev, y->dev);
2225 if (r != 0)
2226 return r;
2227
2228 return CMP(x->ino, y->ino);
2229 }
2230
2231 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2232 assert(d);
2233
2234 siphash24_compress_typesafe(d->dev, state);
2235 siphash24_compress_typesafe(d->ino, state);
2236 }
2237
2238 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
2239
2240 static void event_free_inode_data(
2241 sd_event *e,
2242 struct inode_data *d) {
2243
2244 assert(e);
2245
2246 if (!d)
2247 return;
2248
2249 assert(!d->event_sources);
2250
2251 if (d->fd >= 0) {
2252 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
2253 safe_close(d->fd);
2254 }
2255
2256 if (d->inotify_data) {
2257
2258 if (d->wd >= 0) {
2259 if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
2260                                 /* So here's a problem. At the time this runs the watch descriptor might already be
2261                                  * invalidated, because an IN_IGNORED event might be queued right at the moment we enter
2262                                  * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's quite
2263                                  * likely to happen. */
2264
2265 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2266 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2267 }
2268
2269 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2270 }
2271
2272 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2273 }
2274
2275 free(d);
2276 }
2277
2278 static void event_gc_inotify_data(
2279 sd_event *e,
2280 struct inotify_data *d) {
2281
2282 assert(e);
2283
2284 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2285 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2286 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2287 * (under the expectation that the GC is called again once the counter is decremented). */
2288
2289 if (!d)
2290 return;
2291
2292 if (!hashmap_isempty(d->inodes))
2293 return;
2294
2295 if (d->n_busy > 0)
2296 return;
2297
2298 event_free_inotify_data(e, d);
2299 }
2300
2301 static void event_gc_inode_data(
2302 sd_event *e,
2303 struct inode_data *d) {
2304
2305 struct inotify_data *inotify_data;
2306
2307 assert(e);
2308
2309 if (!d)
2310 return;
2311
2312 if (d->event_sources)
2313 return;
2314
2315 inotify_data = d->inotify_data;
2316 event_free_inode_data(e, d);
2317
2318 event_gc_inotify_data(e, inotify_data);
2319 }
2320
2321 static int event_make_inode_data(
2322 sd_event *e,
2323 struct inotify_data *inotify_data,
2324 dev_t dev,
2325 ino_t ino,
2326 struct inode_data **ret) {
2327
2328 struct inode_data *d, key;
2329 int r;
2330
2331 assert(e);
2332 assert(inotify_data);
2333
2334 key = (struct inode_data) {
2335 .ino = ino,
2336 .dev = dev,
2337 };
2338
2339 d = hashmap_get(inotify_data->inodes, &key);
2340 if (d) {
2341 if (ret)
2342 *ret = d;
2343
2344 return 0;
2345 }
2346
2347 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2348 if (r < 0)
2349 return r;
2350
2351 d = new(struct inode_data, 1);
2352 if (!d)
2353 return -ENOMEM;
2354
2355 *d = (struct inode_data) {
2356 .dev = dev,
2357 .ino = ino,
2358 .wd = -1,
2359 .fd = -EBADF,
2360 .inotify_data = inotify_data,
2361 };
2362
2363 r = hashmap_put(inotify_data->inodes, d, d);
2364 if (r < 0) {
2365 free(d);
2366 return r;
2367 }
2368
2369 if (ret)
2370 *ret = d;
2371
2372 return 1;
2373 }
2374
2375 static uint32_t inode_data_determine_mask(struct inode_data *d) {
2376 bool excl_unlink = true;
2377 uint32_t combined = 0;
2378
2379 assert(d);
2380
2381 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2382 * the IN_EXCL_UNLINK flag is ANDed instead.
2383 *
2384 * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
2385 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
2386 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
2387 * events we don't care for client-side. */
2388
2389 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2390
2391 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2392 excl_unlink = false;
2393
2394 combined |= s->inotify.mask;
2395 }
2396
2397 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2398 }
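
/* Worked example of the combining rule above, not used by this file: if one source on an inode asks
 * for IN_CLOSE_WRITE|IN_EXCL_UNLINK and a second asks for IN_MOVED_TO without IN_EXCL_UNLINK, the
 * mask handed to the kernel is IN_CLOSE_WRITE|IN_MOVED_TO: the event bits are ORed, but
 * IN_EXCL_UNLINK is dropped because not all sources requested it. */
#if 0
static void example_combined_mask(void) {
        uint32_t a = IN_CLOSE_WRITE|IN_EXCL_UNLINK, b = IN_MOVED_TO;

        /* OR the event bits, AND the IN_EXCL_UNLINK bit, strip the per-watch control flags. */
        uint32_t combined =
                ((a|b) & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) |
                (FLAGS_SET(a, IN_EXCL_UNLINK) && FLAGS_SET(b, IN_EXCL_UNLINK) ? IN_EXCL_UNLINK : 0);

        assert(combined == (IN_CLOSE_WRITE|IN_MOVED_TO));
}
#endif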
2399
2400 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2401 uint32_t combined_mask;
2402 int wd, r;
2403
2404 assert(d);
2405 assert(d->fd >= 0);
2406
2407 combined_mask = inode_data_determine_mask(d);
2408
2409 if (d->wd >= 0 && combined_mask == d->combined_mask)
2410 return 0;
2411
2412 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2413 if (r < 0)
2414 return r;
2415
2416 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2417 if (wd < 0)
2418 return -errno;
2419
2420 if (d->wd < 0) {
2421 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2422 if (r < 0) {
2423 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2424 return r;
2425 }
2426
2427 d->wd = wd;
2428
2429 } else if (d->wd != wd) {
2430
2431 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2432                         (void) inotify_rm_watch(d->inotify_data->fd, wd); /* the watch lives on the inotify fd, not the inode fd */
2433 return -EINVAL;
2434 }
2435
2436 d->combined_mask = combined_mask;
2437 return 1;
2438 }
2439
2440 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2441 assert(s);
2442
2443 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2444 }
2445
2446 static int event_add_inotify_fd_internal(
2447 sd_event *e,
2448 sd_event_source **ret,
2449 int fd,
2450 bool donate,
2451 uint32_t mask,
2452 sd_event_inotify_handler_t callback,
2453 void *userdata) {
2454
2455 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
2456 _cleanup_(source_freep) sd_event_source *s = NULL;
2457 struct inotify_data *inotify_data = NULL;
2458 struct inode_data *inode_data = NULL;
2459 struct stat st;
2460 int r;
2461
2462 assert_return(e, -EINVAL);
2463 assert_return(e = event_resolve(e), -ENOPKG);
2464 assert_return(fd >= 0, -EBADF);
2465 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2466 assert_return(!event_origin_changed(e), -ECHILD);
2467
2468 if (!callback)
2469 callback = inotify_exit_callback;
2470
2471         /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2472          * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you
2473          * internally, hence the flag is not available to callers. */
2474 if (mask & IN_MASK_ADD)
2475 return -EINVAL;
2476
2477 if (fstat(fd, &st) < 0)
2478 return -errno;
2479
2480 s = source_new(e, !ret, SOURCE_INOTIFY);
2481 if (!s)
2482 return -ENOMEM;
2483
2484 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2485 s->inotify.mask = mask;
2486 s->inotify.callback = callback;
2487 s->userdata = userdata;
2488
2489 /* Allocate an inotify object for this priority, and an inode object within it */
2490 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2491 if (r < 0)
2492 return r;
2493
2494 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2495 if (r < 0) {
2496 event_gc_inotify_data(e, inotify_data);
2497 return r;
2498 }
2499
2500         /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2501          * the event source until then, for which we need the original inode fd. */
2502 if (inode_data->fd < 0) {
2503 if (donated_fd >= 0)
2504 inode_data->fd = TAKE_FD(donated_fd);
2505 else {
2506 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2507 if (inode_data->fd < 0) {
2508 r = -errno;
2509 event_gc_inode_data(e, inode_data);
2510 return r;
2511 }
2512 }
2513
2514 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
2515 }
2516
2517 /* Link our event source to the inode data object */
2518 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2519 s->inotify.inode_data = inode_data;
2520
2521 /* Actually realize the watch now */
2522 r = inode_data_realize_watch(e, inode_data);
2523 if (r < 0)
2524 return r;
2525
2526 if (ret)
2527 *ret = s;
2528 TAKE_PTR(s);
2529
2530 return 0;
2531 }
2532
2533 _public_ int sd_event_add_inotify_fd(
2534 sd_event *e,
2535 sd_event_source **ret,
2536 int fd,
2537 uint32_t mask,
2538 sd_event_inotify_handler_t callback,
2539 void *userdata) {
2540
2541 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2542 }
2543
2544 _public_ int sd_event_add_inotify(
2545 sd_event *e,
2546 sd_event_source **ret,
2547 const char *path,
2548 uint32_t mask,
2549 sd_event_inotify_handler_t callback,
2550 void *userdata) {
2551
2552 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2553 int fd, r;
2554
2555 assert_return(path, -EINVAL);
2556
2557 fd = open(path, O_PATH | O_CLOEXEC |
2558 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2559 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2560 if (fd < 0)
2561 return -errno;
2562
2563 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2564 if (r < 0)
2565 return r;
2566
2567 (void) sd_event_source_set_description(s, path);
2568
2569 if (ret)
2570 *ret = s;
2571
2572 return r;
2573 }
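
/* Illustrative sketch, not used by this file: watching a (hypothetical) directory for new entries via
 * the public sd_event_add_inotify() defined above. Note that IN_ONLYDIR additionally makes the open()
 * above use O_DIRECTORY, and IN_MASK_ADD would be refused with -EINVAL. Error handling is abbreviated. */
#if 0
static int example_inotify_handler(sd_event_source *s, const struct inotify_event *event, void *userdata) {
        /* event->name is only meaningful if event->len > 0, i.e. for events on directory entries. */
        log_debug("inotify event: mask=%" PRIx32 " name=%s", event->mask, event->len > 0 ? event->name : "(none)");
        return 0;
}

static int example_watch_directory(sd_event *e) {
        return sd_event_add_inotify(e, /* ret= */ NULL, "/tmp/example-dir",
                                    IN_CREATE|IN_MOVED_TO|IN_ONLYDIR,
                                    example_inotify_handler, /* userdata= */ NULL);
}
#endif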
2574
2575 static sd_event_source* event_source_free(sd_event_source *s) {
2576 if (!s)
2577 return NULL;
2578
2579 /* Here's a special hack: when we are called from a
2580 * dispatch handler we won't free the event source
2581 * immediately, but we will detach the fd from the
2582 * epoll. This way it is safe for the caller to unref
2583 * the event source and immediately close the fd, but
2584 * we still retain a valid event source object after
2585 * the callback. */
2586
2587 if (s->dispatching)
2588 source_disconnect(s);
2589 else
2590 source_free(s);
2591
2592 return NULL;
2593 }
2594
2595 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2596
2597 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2598 assert_return(s, -EINVAL);
2599 assert_return(!event_origin_changed(s->event), -ECHILD);
2600
2601 return free_and_strdup(&s->description, description);
2602 }
2603
2604 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2605 assert_return(s, -EINVAL);
2606 assert_return(description, -EINVAL);
2607
2608 if (!s->description)
2609 return -ENXIO;
2610
2611 *description = s->description;
2612 return 0;
2613 }
2614
2615 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2616 assert_return(s, NULL);
2617 assert_return(!event_origin_changed(s->event), NULL);
2618
2619 return s->event;
2620 }
2621
2622 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2623 assert_return(s, -EINVAL);
2624 assert_return(s->type != SOURCE_EXIT, -EDOM);
2625 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2626 assert_return(!event_origin_changed(s->event), -ECHILD);
2627
2628 return s->pending;
2629 }
2630
2631 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2632 assert_return(s, -EINVAL);
2633 assert_return(s->type == SOURCE_IO, -EDOM);
2634 assert_return(!event_origin_changed(s->event), -ECHILD);
2635
2636 return s->io.fd;
2637 }
2638
2639 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2640 int r;
2641
2642 assert_return(s, -EINVAL);
2643 assert_return(fd >= 0, -EBADF);
2644 assert_return(s->type == SOURCE_IO, -EDOM);
2645 assert_return(!event_origin_changed(s->event), -ECHILD);
2646
2647 if (s->io.fd == fd)
2648 return 0;
2649
2650 if (event_source_is_offline(s)) {
2651 s->io.fd = fd;
2652 s->io.registered = false;
2653 } else {
2654 int saved_fd;
2655
2656 saved_fd = s->io.fd;
2657 assert(s->io.registered);
2658
2659 s->io.fd = fd;
2660 s->io.registered = false;
2661
2662 r = source_io_register(s, s->enabled, s->io.events);
2663 if (r < 0) {
2664 s->io.fd = saved_fd;
2665 s->io.registered = true;
2666 return r;
2667 }
2668
2669 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2670 }
2671
2672 return 0;
2673 }
2674
2675 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2676 assert_return(s, -EINVAL);
2677 assert_return(s->type == SOURCE_IO, -EDOM);
2678 assert_return(!event_origin_changed(s->event), -ECHILD);
2679
2680 return s->io.owned;
2681 }
2682
2683 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2684 assert_return(s, -EINVAL);
2685 assert_return(s->type == SOURCE_IO, -EDOM);
2686 assert_return(!event_origin_changed(s->event), -ECHILD);
2687
2688 s->io.owned = own;
2689 return 0;
2690 }
2691
2692 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2693 assert_return(s, -EINVAL);
2694 assert_return(events, -EINVAL);
2695 assert_return(s->type == SOURCE_IO, -EDOM);
2696 assert_return(!event_origin_changed(s->event), -ECHILD);
2697
2698 *events = s->io.events;
2699 return 0;
2700 }
2701
2702 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2703 int r;
2704
2705 assert_return(s, -EINVAL);
2706 assert_return(s->type == SOURCE_IO, -EDOM);
2707 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2708 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2709 assert_return(!event_origin_changed(s->event), -ECHILD);
2710
2711 /* edge-triggered updates are never skipped, so we can reset edges */
2712 if (s->io.events == events && !(events & EPOLLET))
2713 return 0;
2714
2715 r = source_set_pending(s, false);
2716 if (r < 0)
2717 return r;
2718
2719 if (event_source_is_online(s)) {
2720 r = source_io_register(s, s->enabled, events);
2721 if (r < 0)
2722 return r;
2723 }
2724
2725 s->io.events = events;
2726
2727 return 0;
2728 }
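
/* Illustrative sketch, not used by this file: a common pattern with the setter above is to subscribe
 * to EPOLLOUT only while output is queued, and to drop it again once the buffer has drained, so the
 * loop doesn't spin on an always-writable fd. */
#if 0
static int example_update_io_events(sd_event_source *io_source, bool have_pending_output) {
        return sd_event_source_set_io_events(io_source, EPOLLIN | (have_pending_output ? EPOLLOUT : 0));
}
#endif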
2729
2730 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2731 assert_return(s, -EINVAL);
2732 assert_return(revents, -EINVAL);
2733 assert_return(s->type == SOURCE_IO, -EDOM);
2734 assert_return(s->pending, -ENODATA);
2735 assert_return(!event_origin_changed(s->event), -ECHILD);
2736
2737 *revents = s->io.revents;
2738 return 0;
2739 }
2740
2741 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2742 assert_return(s, -EINVAL);
2743 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2744 assert_return(!event_origin_changed(s->event), -ECHILD);
2745
2746 return s->signal.sig;
2747 }
2748
2749 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2750 assert_return(s, -EINVAL);
2751 assert_return(!event_origin_changed(s->event), -ECHILD);
2752
2753 *priority = s->priority;
2754 return 0;
2755 }
2756
2757 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2758 bool rm_inotify = false, rm_inode = false;
2759 struct inotify_data *new_inotify_data = NULL;
2760 struct inode_data *new_inode_data = NULL;
2761 int r;
2762
2763 assert_return(s, -EINVAL);
2764 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2765 assert_return(!event_origin_changed(s->event), -ECHILD);
2766
2767 if (s->priority == priority)
2768 return 0;
2769
2770 if (s->type == SOURCE_INOTIFY) {
2771 struct inode_data *old_inode_data;
2772
2773 assert(s->inotify.inode_data);
2774 old_inode_data = s->inotify.inode_data;
2775
2776                 /* We need the original fd to change the priority. If we don't have it we can't change the priority
2777                  * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2778                  * events we allow priority changes only until the first following iteration. */
2779 if (old_inode_data->fd < 0)
2780 return -EOPNOTSUPP;
2781
2782 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2783 if (r < 0)
2784 return r;
2785 rm_inotify = r > 0;
2786
2787 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2788 if (r < 0)
2789 goto fail;
2790 rm_inode = r > 0;
2791
2792 if (new_inode_data->fd < 0) {
2793 /* Duplicate the fd for the new inode object if we don't have any yet */
2794 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2795 if (new_inode_data->fd < 0) {
2796 r = -errno;
2797 goto fail;
2798 }
2799
2800 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
2801 }
2802
2803 /* Move the event source to the new inode data structure */
2804 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2805 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2806 s->inotify.inode_data = new_inode_data;
2807
2808 /* Now create the new watch */
2809 r = inode_data_realize_watch(s->event, new_inode_data);
2810 if (r < 0) {
2811 /* Move it back */
2812 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2813 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2814 s->inotify.inode_data = old_inode_data;
2815 goto fail;
2816 }
2817
2818 s->priority = priority;
2819
2820 event_gc_inode_data(s->event, old_inode_data);
2821
2822 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2823 struct signal_data *old, *d;
2824
2825 /* Move us from the signalfd belonging to the old
2826 * priority to the signalfd of the new priority */
2827
2828 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2829
2830 s->priority = priority;
2831
2832 r = event_make_signal_data(s->event, s->signal.sig, &d);
2833 if (r < 0) {
2834 s->priority = old->priority;
2835 return r;
2836 }
2837
2838 event_unmask_signal_data(s->event, old, s->signal.sig);
2839 } else
2840 s->priority = priority;
2841
2842 event_source_pp_prioq_reshuffle(s);
2843
2844 if (s->type == SOURCE_EXIT)
2845 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2846
2847 return 0;
2848
2849 fail:
2850 if (rm_inode)
2851 event_free_inode_data(s->event, new_inode_data);
2852
2853 if (rm_inotify)
2854 event_free_inotify_data(s->event, new_inotify_data);
2855
2856 return r;
2857 }
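
/* Illustrative sketch, not used by this file: as the -EOPNOTSUPP branch above documents, the priority
 * of an inotify event source can only be changed while the O_PATH inode fd is still around, i.e.
 * before the first event loop iteration after sd_event_add_inotify(). The path is hypothetical. */
#if 0
static int example_inotify_priority(sd_event *e, sd_event_inotify_handler_t handler) {
        sd_event_source *s;
        int r;

        r = sd_event_add_inotify(e, &s, "/tmp/example-dir", IN_CLOSE_WRITE, handler, /* userdata= */ NULL);
        if (r < 0)
                return r;

        /* Must happen before sd_event_run()/sd_event_loop() is first invoked, otherwise -EOPNOTSUPP. */
        return sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE);
}
#endif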
2858
2859 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2860 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2861 if (!s && !ret)
2862 return false;
2863
2864 assert_return(s, -EINVAL);
2865 assert_return(!event_origin_changed(s->event), -ECHILD);
2866
2867 if (ret)
2868 *ret = s->enabled;
2869
2870 return s->enabled != SD_EVENT_OFF;
2871 }
2872
2873 static int event_source_offline(
2874 sd_event_source *s,
2875 int enabled,
2876 bool ratelimited) {
2877
2878 bool was_offline;
2879 int r;
2880
2881 assert(s);
2882 assert(enabled == SD_EVENT_OFF || ratelimited);
2883
2884 /* Unset the pending flag when this event source is disabled */
2885 if (s->enabled != SD_EVENT_OFF &&
2886 enabled == SD_EVENT_OFF &&
2887 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2888 r = source_set_pending(s, false);
2889 if (r < 0)
2890 return r;
2891 }
2892
2893 was_offline = event_source_is_offline(s);
2894 s->enabled = enabled;
2895 s->ratelimited = ratelimited;
2896
2897 switch (s->type) {
2898
2899 case SOURCE_IO:
2900 source_io_unregister(s);
2901 break;
2902
2903 case SOURCE_SIGNAL:
2904 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2905 break;
2906
2907 case SOURCE_CHILD:
2908 if (!was_offline) {
2909 assert(s->event->n_online_child_sources > 0);
2910 s->event->n_online_child_sources--;
2911 }
2912
2913 if (EVENT_SOURCE_WATCH_PIDFD(s))
2914 source_child_pidfd_unregister(s);
2915 else
2916 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2917 break;
2918
2919 case SOURCE_EXIT:
2920 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2921 break;
2922
2923 case SOURCE_MEMORY_PRESSURE:
2924 source_memory_pressure_unregister(s);
2925 break;
2926
2927 case SOURCE_TIME_REALTIME:
2928 case SOURCE_TIME_BOOTTIME:
2929 case SOURCE_TIME_MONOTONIC:
2930 case SOURCE_TIME_REALTIME_ALARM:
2931 case SOURCE_TIME_BOOTTIME_ALARM:
2932 case SOURCE_DEFER:
2933 case SOURCE_POST:
2934 case SOURCE_INOTIFY:
2935 break;
2936
2937 default:
2938 assert_not_reached();
2939 }
2940
2941 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2942 event_source_time_prioq_reshuffle(s);
2943
2944 return 1;
2945 }
2946
2947 static int event_source_online(
2948 sd_event_source *s,
2949 int enabled,
2950 bool ratelimited) {
2951
2952 bool was_online;
2953 int r;
2954
2955 assert(s);
2956 assert(enabled != SD_EVENT_OFF || !ratelimited);
2957
2958 /* Unset the pending flag when this event source is enabled */
2959 if (s->enabled == SD_EVENT_OFF &&
2960 enabled != SD_EVENT_OFF &&
2961 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2962 r = source_set_pending(s, false);
2963 if (r < 0)
2964 return r;
2965 }
2966
2967 /* Are we really ready for onlining? */
2968 if (enabled == SD_EVENT_OFF || ratelimited) {
2969 /* Nope, we are not ready for onlining, then just update the precise state and exit */
2970 s->enabled = enabled;
2971 s->ratelimited = ratelimited;
2972 return 0;
2973 }
2974
2975 was_online = event_source_is_online(s);
2976
2977 switch (s->type) {
2978 case SOURCE_IO:
2979 r = source_io_register(s, enabled, s->io.events);
2980 if (r < 0)
2981 return r;
2982 break;
2983
2984 case SOURCE_SIGNAL:
2985 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2986 if (r < 0) {
2987 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2988 return r;
2989 }
2990
2991 break;
2992
2993 case SOURCE_CHILD:
2994 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2995 /* yes, we have pidfd */
2996
2997 r = source_child_pidfd_register(s, enabled);
2998 if (r < 0)
2999 return r;
3000 } else {
3001 /* no pidfd, or something other to watch for than WEXITED */
3002
3003 r = event_make_signal_data(s->event, SIGCHLD, NULL);
3004 if (r < 0) {
3005 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
3006 return r;
3007 }
3008 }
3009
3010 if (!was_online)
3011 s->event->n_online_child_sources++;
3012 break;
3013
3014 case SOURCE_MEMORY_PRESSURE:
3015 r = source_memory_pressure_register(s, enabled);
3016 if (r < 0)
3017 return r;
3018
3019 break;
3020
3021 case SOURCE_TIME_REALTIME:
3022 case SOURCE_TIME_BOOTTIME:
3023 case SOURCE_TIME_MONOTONIC:
3024 case SOURCE_TIME_REALTIME_ALARM:
3025 case SOURCE_TIME_BOOTTIME_ALARM:
3026 case SOURCE_EXIT:
3027 case SOURCE_DEFER:
3028 case SOURCE_POST:
3029 case SOURCE_INOTIFY:
3030 break;
3031
3032 default:
3033 assert_not_reached();
3034 }
3035
3036 s->enabled = enabled;
3037 s->ratelimited = ratelimited;
3038
3039 /* Non-failing operations below */
3040 if (s->type == SOURCE_EXIT)
3041 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
3042
3043 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3044 event_source_time_prioq_reshuffle(s);
3045
3046 return 1;
3047 }
3048
3049 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3050 int r;
3051
3052 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
3053
3054 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3055 if (m == SD_EVENT_OFF && !s)
3056 return 0;
3057
3058 assert_return(s, -EINVAL);
3059 assert_return(!event_origin_changed(s->event), -ECHILD);
3060
3061 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3062 if (s->event->state == SD_EVENT_FINISHED)
3063 return m == SD_EVENT_OFF ? 0 : -ESTALE;
3064
3065 if (s->enabled == m) /* No change? */
3066 return 0;
3067
3068 if (m == SD_EVENT_OFF)
3069 r = event_source_offline(s, m, s->ratelimited);
3070 else {
3071 if (s->enabled != SD_EVENT_OFF) {
3072 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3073 * event source is already enabled after all. */
3074 s->enabled = m;
3075 return 0;
3076 }
3077
3078 r = event_source_online(s, m, s->ratelimited);
3079 }
3080 if (r < 0)
3081 return r;
3082
3083 event_source_pp_prioq_reshuffle(s);
3084 return 0;
3085 }
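
/* Illustrative sketch, not used by this file: thanks to the NULL shortcut above, cleanup paths may
 * unconditionally switch off an event source that may or may not have been created. */
#if 0
static void example_disable_maybe(sd_event_source *maybe_source) {
        /* Safe even if maybe_source is NULL or the event loop has already finished. */
        (void) sd_event_source_set_enabled(maybe_source, SD_EVENT_OFF);
}
#endif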
3086
3087 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
3088 assert_return(s, -EINVAL);
3089 assert_return(usec, -EINVAL);
3090 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3091 assert_return(!event_origin_changed(s->event), -ECHILD);
3092
3093 *usec = s->time.next;
3094 return 0;
3095 }
3096
3097 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
3098 int r;
3099
3100 assert_return(s, -EINVAL);
3101 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3102 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3103 assert_return(!event_origin_changed(s->event), -ECHILD);
3104
3105 r = source_set_pending(s, false);
3106 if (r < 0)
3107 return r;
3108
3109 s->time.next = usec;
3110
3111 event_source_time_prioq_reshuffle(s);
3112 return 0;
3113 }
3114
3115 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3116 usec_t t;
3117 int r;
3118
3119 assert_return(s, -EINVAL);
3120 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3121 assert_return(!event_origin_changed(s->event), -ECHILD);
3122
3123 if (usec == USEC_INFINITY)
3124 return sd_event_source_set_time(s, USEC_INFINITY);
3125
3126 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3127 if (r < 0)
3128 return r;
3129
3130 usec = usec_add(t, usec);
3131 if (usec == USEC_INFINITY)
3132 return -EOVERFLOW;
3133
3134 return sd_event_source_set_time(s, usec);
3135 }
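
/* Illustrative sketch, not used by this file: re-arming an existing timer event source five seconds
 * from now with the relative helper above, and enabling it for a single shot. The source is assumed
 * to have been created earlier with sd_event_add_time() (defined elsewhere in this file). */
#if 0
static int example_rearm_timer(sd_event_source *timer_source) {
        int r;

        r = sd_event_source_set_time_relative(timer_source, 5 * USEC_PER_SEC);
        if (r < 0)
                return r;

        return sd_event_source_set_enabled(timer_source, SD_EVENT_ONESHOT);
}
#endif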
3136
3137 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
3138 assert_return(s, -EINVAL);
3139 assert_return(usec, -EINVAL);
3140 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3141 assert_return(!event_origin_changed(s->event), -ECHILD);
3142
3143 *usec = s->time.accuracy;
3144 return 0;
3145 }
3146
3147 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
3148 int r;
3149
3150 assert_return(s, -EINVAL);
3151 assert_return(usec != UINT64_MAX, -EINVAL);
3152 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3153 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3154 assert_return(!event_origin_changed(s->event), -ECHILD);
3155
3156 r = source_set_pending(s, false);
3157 if (r < 0)
3158 return r;
3159
3160 if (usec == 0)
3161 usec = DEFAULT_ACCURACY_USEC;
3162
3163 s->time.accuracy = usec;
3164
3165 event_source_time_prioq_reshuffle(s);
3166 return 0;
3167 }
3168
3169 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3170 assert_return(s, -EINVAL);
3171 assert_return(clock, -EINVAL);
3172 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3173 assert_return(!event_origin_changed(s->event), -ECHILD);
3174
3175 *clock = event_source_type_to_clock(s->type);
3176 return 0;
3177 }
3178
3179 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
3180 assert_return(s, -EINVAL);
3181 assert_return(pid, -EINVAL);
3182 assert_return(s->type == SOURCE_CHILD, -EDOM);
3183 assert_return(!event_origin_changed(s->event), -ECHILD);
3184
3185 *pid = s->child.pid;
3186 return 0;
3187 }
3188
3189 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3190 assert_return(s, -EINVAL);
3191 assert_return(s->type == SOURCE_CHILD, -EDOM);
3192 assert_return(!event_origin_changed(s->event), -ECHILD);
3193
3194 if (s->child.pidfd < 0)
3195 return -EOPNOTSUPP;
3196
3197 return s->child.pidfd;
3198 }
3199
3200 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3201 assert_return(s, -EINVAL);
3202 assert_return(s->type == SOURCE_CHILD, -EDOM);
3203 assert_return(!event_origin_changed(s->event), -ECHILD);
3204 assert_return(SIGNAL_VALID(sig), -EINVAL);
3205
3206         /* If we have already seen an indication that the process exited, refuse sending a signal early. This
3207          * way we can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3208          * available. */
3209 if (s->child.exited)
3210 return -ESRCH;
3211
3212 if (s->child.pidfd >= 0) {
3213 siginfo_t copy;
3214
3215                 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, hence let's copy the
3216                  * structure here */
3217 if (si)
3218 copy = *si;
3219
3220 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3221 /* Let's propagate the error only if the system call is not implemented or prohibited */
3222 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3223 return -errno;
3224 } else
3225 return 0;
3226 }
3227
3228 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3229 * this here. */
3230 if (flags != 0)
3231 return -EOPNOTSUPP;
3232
3233 if (si) {
3234 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3235 siginfo_t copy = *si;
3236
3237 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3238 return -errno;
3239 } else if (kill(s->child.pid, sig) < 0)
3240 return -errno;
3241
3242 return 0;
3243 }
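
/* Illustrative sketch, not used by this file: asking a watched child to terminate via the helper
 * above, which prefers pidfd_send_signal() and falls back to rt_sigqueueinfo()/kill() as described.
 * Non-zero flags are only supported on the pidfd path. */
#if 0
static int example_terminate_child(sd_event_source *child_source) {
        return sd_event_source_send_child_signal(child_source, SIGTERM, /* si= */ NULL, /* flags= */ 0);
}
#endif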
3244
3245 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3246 assert_return(s, -EINVAL);
3247 assert_return(s->type == SOURCE_CHILD, -EDOM);
3248 assert_return(!event_origin_changed(s->event), -ECHILD);
3249
3250 if (s->child.pidfd < 0)
3251 return -EOPNOTSUPP;
3252
3253 return s->child.pidfd_owned;
3254 }
3255
3256 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3257 assert_return(s, -EINVAL);
3258 assert_return(s->type == SOURCE_CHILD, -EDOM);
3259 assert_return(!event_origin_changed(s->event), -ECHILD);
3260
3261 if (s->child.pidfd < 0)
3262 return -EOPNOTSUPP;
3263
3264 s->child.pidfd_owned = own;
3265 return 0;
3266 }
3267
3268 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3269 assert_return(s, -EINVAL);
3270 assert_return(s->type == SOURCE_CHILD, -EDOM);
3271 assert_return(!event_origin_changed(s->event), -ECHILD);
3272
3273 return s->child.process_owned;
3274 }
3275
3276 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3277 assert_return(s, -EINVAL);
3278 assert_return(s->type == SOURCE_CHILD, -EDOM);
3279 assert_return(!event_origin_changed(s->event), -ECHILD);
3280
3281 s->child.process_owned = own;
3282 return 0;
3283 }
3284
3285 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
3286 assert_return(s, -EINVAL);
3287 assert_return(mask, -EINVAL);
3288 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3289 assert_return(!event_origin_changed(s->event), -ECHILD);
3290
3291 *mask = s->inotify.mask;
3292 return 0;
3293 }
3294
3295 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
3296 int r;
3297
3298 assert_return(s, -EINVAL);
3299 assert_return(s->type != SOURCE_EXIT, -EDOM);
3300 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3301 assert_return(!event_origin_changed(s->event), -ECHILD);
3302
3303 if (s->prepare == callback)
3304 return 0;
3305
3306 if (callback && s->prepare) {
3307 s->prepare = callback;
3308 return 0;
3309 }
3310
3311 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3312 if (r < 0)
3313 return r;
3314
3315 s->prepare = callback;
3316
3317 if (callback) {
3318 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3319 if (r < 0)
3320 return r;
3321 } else
3322 prioq_remove(s->event->prepare, s, &s->prepare_index);
3323
3324 return 0;
3325 }
3326
3327 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
3328 assert_return(s, NULL);
3329 assert_return(!event_origin_changed(s->event), NULL);
3330
3331 return s->userdata;
3332 }
3333
3334 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3335 void *ret;
3336
3337 assert_return(s, NULL);
3338 assert_return(!event_origin_changed(s->event), NULL);
3339
3340 ret = s->userdata;
3341 s->userdata = userdata;
3342
3343 return ret;
3344 }
3345
3346 static int event_source_enter_ratelimited(sd_event_source *s) {
3347 int r;
3348
3349 assert(s);
3350
3351 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
3352 * the end of the rate limit time window, much as if it was a timer event source. */
3353
3354 if (s->ratelimited)
3355 return 0; /* Already ratelimited, this is a NOP hence */
3356
3357 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3358 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3359 if (r < 0)
3360 return r;
3361
3362 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
3363 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
3364 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
3365 if (EVENT_SOURCE_IS_TIME(s->type))
3366 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3367
3368 /* Now, let's add the event source to the monotonic clock instead */
3369 r = event_source_time_prioq_put(s, &s->event->monotonic);
3370 if (r < 0)
3371 goto fail;
3372
3373 /* And let's take the event source officially offline */
3374 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3375 if (r < 0) {
3376 event_source_time_prioq_remove(s, &s->event->monotonic);
3377 goto fail;
3378 }
3379
3380 event_source_pp_prioq_reshuffle(s);
3381
3382 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3383 return 0;
3384
3385 fail:
3386 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
3387 * space for it should already be allocated. */
3388 if (EVENT_SOURCE_IS_TIME(s->type))
3389 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3390
3391 return r;
3392 }
3393
3394 static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
3395 int r;
3396
3397 assert(s);
3398
3399 if (!s->ratelimited)
3400 return 0;
3401
3402 /* Let's take the event source out of the monotonic prioq first. */
3403 event_source_time_prioq_remove(s, &s->event->monotonic);
3404
3405 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3406 if (EVENT_SOURCE_IS_TIME(s->type)) {
3407 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3408 if (r < 0)
3409 goto fail;
3410 }
3411
3412 /* Let's try to take it online again. */
3413 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3414 if (r < 0) {
3415 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3416 if (EVENT_SOURCE_IS_TIME(s->type))
3417 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3418
3419 goto fail;
3420 }
3421
3422 event_source_pp_prioq_reshuffle(s);
3423 ratelimit_reset(&s->rate_limit);
3424
3425 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3426
3427 if (run_callback && s->ratelimit_expire_callback) {
3428 s->dispatching = true;
3429 r = s->ratelimit_expire_callback(s, s->userdata);
3430 s->dispatching = false;
3431
3432 if (r < 0) {
3433 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3434 strna(s->description),
3435 event_source_type_to_string(s->type),
3436 s->exit_on_failure ? "exiting" : "disabling");
3437
3438 if (s->exit_on_failure)
3439 (void) sd_event_exit(s->event, r);
3440 }
3441
3442 if (s->n_ref == 0)
3443 source_free(s);
3444 else if (r < 0)
3445 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3446
3447 return 1;
3448 }
3449
3450 return 0;
3451
3452 fail:
3453         /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
3454          * simply put it back into that state, maybe we can then process it more successfully next iteration. */
3455 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3456
3457 return r;
3458 }
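
/* Illustrative sketch, not used by this file: the enter/leave helpers above back the public
 * sd_event_source_set_ratelimit() API (defined elsewhere in this file). Here a source is limited to
 * at most 10 dispatches per second; when the window ends, event_source_leave_ratelimit() runs the
 * expiry callback installed via sd_event_source_set_ratelimit_expire_callback(), if any. */
#if 0
static int example_ratelimit(sd_event_source *s, sd_event_handler_t on_expire) {
        int r;

        r = sd_event_source_set_ratelimit(s, /* interval_usec= */ 1 * USEC_PER_SEC, /* burst= */ 10);
        if (r < 0)
                return r;

        return sd_event_source_set_ratelimit_expire_callback(s, on_expire);
}
#endif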
3459
3460 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3461 usec_t c;
3462 assert(e);
3463 assert(a <= b);
3464
3465 if (a <= 0)
3466 return 0;
3467 if (a >= USEC_INFINITY)
3468 return USEC_INFINITY;
3469
3470 if (b <= a + 1)
3471 return a;
3472
3473 initialize_perturb(e);
3474
3475 /*
3476 Find a good time to wake up again between times a and b. We
3477 have two goals here:
3478
3479 a) We want to wake up as seldom as possible, hence prefer
3480 later times over earlier times.
3481
3482 b) But if we have to wake up, then let's make sure to
3483 dispatch as much as possible on the entire system.
3484
3485 We implement this by waking up everywhere at the same time
3486 within any given minute if we can, synchronised via the
3487 perturbation value determined from the boot ID. If we can't,
3488           then we try to find the same spot within every 10s, then every
3489           1s and then every 250ms step. Otherwise, we pick the last
3490           possible time to wake up.
3491 */
3492
3493 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3494 if (c >= b) {
3495 if (_unlikely_(c < USEC_PER_MINUTE))
3496 return b;
3497
3498 c -= USEC_PER_MINUTE;
3499 }
3500
3501 if (c >= a)
3502 return c;
3503
3504 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3505 if (c >= b) {
3506 if (_unlikely_(c < USEC_PER_SEC*10))
3507 return b;
3508
3509 c -= USEC_PER_SEC*10;
3510 }
3511
3512 if (c >= a)
3513 return c;
3514
3515 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3516 if (c >= b) {
3517 if (_unlikely_(c < USEC_PER_SEC))
3518 return b;
3519
3520 c -= USEC_PER_SEC;
3521 }
3522
3523 if (c >= a)
3524 return c;
3525
3526 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3527 if (c >= b) {
3528 if (_unlikely_(c < USEC_PER_MSEC*250))
3529 return b;
3530
3531 c -= USEC_PER_MSEC*250;
3532 }
3533
3534 if (c >= a)
3535 return c;
3536
3537 return b;
3538 }
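
/* Worked example of the scheme above, not used by this file, with a hypothetical per-boot
 * perturbation of 47s: for a source that may fire anywhere in [130s, 145s] of CLOCK_MONOTONIC time,
 * the per-minute slot is 120s+47s=167s which lies past b, and 167s-60s=107s lies before a, so the
 * minute granularity is skipped; the per-10s slot is 140s+(47s%10s)=147s which again lies past b,
 * but 147s-10s=137s falls inside [a, b] and is returned. Every loop on the machine whose window
 * covers 137s will therefore wake up at the same instant. */
#if 0
static void example_sleep_between(sd_event *e) {
        /* Only holds if e->perturb happens to be 47 * USEC_PER_SEC, a made-up value for illustration. */
        assert(sleep_between(e, 130 * USEC_PER_SEC, 145 * USEC_PER_SEC) == 137 * USEC_PER_SEC);
}
#endif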
3539
3540 static int event_arm_timer(
3541 sd_event *e,
3542 struct clock_data *d) {
3543
3544 struct itimerspec its = {};
3545 sd_event_source *a, *b;
3546 usec_t t;
3547
3548 assert(e);
3549 assert(d);
3550
3551 if (!d->needs_rearm)
3552 return 0;
3553
3554 d->needs_rearm = false;
3555
3556 a = prioq_peek(d->earliest);
3557 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3558 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3559
3560 if (d->fd < 0)
3561 return 0;
3562
3563 if (d->next == USEC_INFINITY)
3564 return 0;
3565
3566 /* disarm */
3567 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3568 return -errno;
3569
3570 d->next = USEC_INFINITY;
3571 return 0;
3572 }
3573
3574 b = prioq_peek(d->latest);
3575 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3576 assert(b && b->enabled != SD_EVENT_OFF);
3577
3578 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3579 if (d->next == t)
3580 return 0;
3581
3582 assert_se(d->fd >= 0);
3583
3584 if (t == 0) {
3585                 /* We don't want to disarm here, just arm it for some time looooong ago. */
3586 its.it_value.tv_sec = 0;
3587 its.it_value.tv_nsec = 1;
3588 } else
3589 timespec_store(&its.it_value, t);
3590
3591 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3592 return -errno;
3593
3594 d->next = t;
3595 return 0;
3596 }
3597
3598 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3599 assert(e);
3600 assert(s);
3601 assert(s->type == SOURCE_IO);
3602
3603 /* If the event source was already pending, we just OR in the
3604 * new revents, otherwise we reset the value. The ORing is
3605 * necessary to handle EPOLLONESHOT events properly where
3606 * readability might happen independently of writability, and
3607 * we need to keep track of both */
3608
3609 if (s->pending)
3610 s->io.revents |= revents;
3611 else
3612 s->io.revents = revents;
3613
3614 return source_set_pending(s, true);
3615 }
3616
3617 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3618 uint64_t x;
3619 ssize_t ss;
3620
3621 assert(e);
3622 assert(fd >= 0);
3623
3624 assert_return(events == EPOLLIN, -EIO);
3625
3626 ss = read(fd, &x, sizeof(x));
3627 if (ss < 0) {
3628 if (ERRNO_IS_TRANSIENT(errno))
3629 return 0;
3630
3631 return -errno;
3632 }
3633
3634 if (_unlikely_(ss != sizeof(x)))
3635 return -EIO;
3636
3637 if (next)
3638 *next = USEC_INFINITY;
3639
3640 return 0;
3641 }
3642
3643 static int process_timer(
3644 sd_event *e,
3645 usec_t n,
3646 struct clock_data *d) {
3647
3648 sd_event_source *s;
3649 bool callback_invoked = false;
3650 int r;
3651
3652 assert(e);
3653 assert(d);
3654
3655 for (;;) {
3656 s = prioq_peek(d->earliest);
3657 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3658
3659 if (!s || time_event_source_next(s) > n)
3660 break;
3661
3662 if (s->ratelimited) {
3663                         /* This is an event source whose ratelimit window has ended. Let's turn it on
3664 * again. */
3665 assert(s->ratelimited);
3666
3667 r = event_source_leave_ratelimit(s, /* run_callback */ true);
3668 if (r < 0)
3669 return r;
3670 else if (r == 1)
3671 callback_invoked = true;
3672
3673 continue;
3674 }
3675
3676 if (s->enabled == SD_EVENT_OFF || s->pending)
3677 break;
3678
3679 r = source_set_pending(s, true);
3680 if (r < 0)
3681 return r;
3682
3683 event_source_time_prioq_reshuffle(s);
3684 }
3685
3686 return callback_invoked;
3687 }
3688
3689 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3690 int64_t min_priority = threshold;
3691 bool something_new = false;
3692 sd_event_source *s;
3693 int r;
3694
3695 assert(e);
3696 assert(ret_min_priority);
3697
3698 if (!e->need_process_child) {
3699 *ret_min_priority = min_priority;
3700 return 0;
3701 }
3702
3703 e->need_process_child = false;
3704
3705 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3706 * for, instead of using P_ALL. This is because we only want to get child information of very
3707 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3708          * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3709 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3710 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3711 * to handle SIGCHLD yourself.
3712 *
3713 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3714 * source is dispatched so that the callback still sees the process as a zombie. */
3715
3716 HASHMAP_FOREACH(s, e->child_sources) {
3717 assert(s->type == SOURCE_CHILD);
3718
3719 if (s->priority > threshold)
3720 continue;
3721
3722 if (s->pending)
3723 continue;
3724
3725 if (event_source_is_offline(s))
3726 continue;
3727
3728 if (s->child.exited)
3729 continue;
3730
3731 if (EVENT_SOURCE_WATCH_PIDFD(s))
3732 /* There's a usable pidfd known for this event source? Then don't waitid() for
3733 * it here */
3734 continue;
3735
3736 zero(s->child.siginfo);
3737 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3738 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3739 return negative_errno();
3740
3741 if (s->child.siginfo.si_pid != 0) {
3742 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3743
3744 if (zombie)
3745 s->child.exited = true;
3746
3747 if (!zombie && (s->child.options & WEXITED)) {
3748 /* If the child isn't dead then let's immediately remove the state
3749 * change from the queue, since there's no benefit in leaving it
3750 * queued. */
3751
3752 assert(s->child.options & (WSTOPPED|WCONTINUED));
3753 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3754 }
3755
3756 r = source_set_pending(s, true);
3757 if (r < 0)
3758 return r;
3759 if (r > 0) {
3760 something_new = true;
3761 min_priority = MIN(min_priority, s->priority);
3762 }
3763 }
3764 }
3765
3766 *ret_min_priority = min_priority;
3767 return something_new;
3768 }
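
/* Illustrative sketch, not used by this file: because process_child() above queries with WNOWAIT, a
 * child handler still observes the process as a zombie and gets the full siginfo_t. The handler
 * signature matches sd_event_child_handler_t; registration happens via sd_event_add_child(), defined
 * elsewhere in this file. */
#if 0
static int example_child_handler(sd_event_source *s, const siginfo_t *si, void *userdata) {
        if (si->si_code == CLD_EXITED)
                log_debug("Child " PID_FMT " exited with status %i.", si->si_pid, si->si_status);
        else
                log_debug("Child " PID_FMT " was terminated by signal %i.", si->si_pid, si->si_status);

        return 0;
}
#endif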
3769
3770 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3771 assert(e);
3772 assert(s);
3773 assert(s->type == SOURCE_CHILD);
3774
3775 if (s->pending)
3776 return 0;
3777
3778 if (event_source_is_offline(s))
3779 return 0;
3780
3781 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3782 return 0;
3783
3784 zero(s->child.siginfo);
3785 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3786 return -errno;
3787
3788 if (s->child.siginfo.si_pid == 0)
3789 return 0;
3790
3791 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3792 s->child.exited = true;
3793
3794 return source_set_pending(s, true);
3795 }
3796
3797 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3798 int r;
3799
3800 assert(e);
3801 assert(d);
3802 assert_return(events == EPOLLIN, -EIO);
3803 assert(min_priority);
3804
3805         /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3806          * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3807          * per priority, and if we dequeue one while SIGCHLD is enqueued behind it we wouldn't notice it,
3808          * even though we might have higher priority children we care about, hence we need to check that
3809          * explicitly. */
3810
3811 if (sigismember(&d->sigset, SIGCHLD))
3812 e->need_process_child = true;
3813
3814 /* If there's already an event source pending for this priority we don't read another */
3815 if (d->current)
3816 return 0;
3817
3818 for (;;) {
3819 struct signalfd_siginfo si;
3820 ssize_t n;
3821 sd_event_source *s = NULL;
3822
3823 n = read(d->fd, &si, sizeof(si));
3824 if (n < 0) {
3825 if (ERRNO_IS_TRANSIENT(errno))
3826 return 0;
3827
3828 return -errno;
3829 }
3830
3831 if (_unlikely_(n != sizeof(si)))
3832 return -EIO;
3833
3834 assert(SIGNAL_VALID(si.ssi_signo));
3835
3836 if (e->signal_sources)
3837 s = e->signal_sources[si.ssi_signo];
3838 if (!s)
3839 continue;
3840 if (s->pending)
3841 continue;
3842
3843 s->signal.siginfo = si;
3844 d->current = s;
3845
3846 r = source_set_pending(s, true);
3847 if (r < 0)
3848 return r;
3849 if (r > 0 && *min_priority >= s->priority) {
3850 *min_priority = s->priority;
3851 return 1; /* an event source with smaller priority is queued. */
3852 }
3853
3854 return 0;
3855 }
3856 }
3857
3858 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3859 ssize_t n;
3860
3861 assert(e);
3862 assert(d);
3863
3864 assert_return(revents == EPOLLIN, -EIO);
3865
3866 /* If there's already an event source pending for this priority, don't read another */
3867 if (d->n_pending > 0)
3868 return 0;
3869
3870 /* Is the read buffer non-empty? If so, let's not read more */
3871 if (d->buffer_filled > 0)
3872 return 0;
3873
3874 if (d->priority > threshold)
3875 return 0;
3876
3877 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3878 if (n < 0) {
3879 if (ERRNO_IS_TRANSIENT(errno))
3880 return 0;
3881
3882 return -errno;
3883 }
3884
3885 assert(n > 0);
3886 d->buffer_filled = (size_t) n;
3887 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
3888
3889 return 1;
3890 }
3891
3892 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3893 assert(e);
3894 assert(d);
3895 assert(sz <= d->buffer_filled);
3896
3897 if (sz == 0)
3898 return;
3899
3900         /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3901 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3902 d->buffer_filled -= sz;
3903
3904 if (d->buffer_filled == 0)
3905 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
3906 }
3907
3908 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3909 int r;
3910
3911 assert(e);
3912 assert(d);
3913
3914 /* If there's already an event source pending for this priority, don't read another */
3915 if (d->n_pending > 0)
3916 return 0;
3917
3918 while (d->buffer_filled > 0) {
3919 size_t sz;
3920
3921 /* Let's validate that the event structures are complete */
3922 if (d->buffer_filled < offsetof(struct inotify_event, name))
3923 return -EIO;
3924
3925 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3926 if (d->buffer_filled < sz)
3927 return -EIO;
3928
3929 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3930 struct inode_data *inode_data;
3931
3932 /* The queue overran, let's pass this event to all event sources connected to this inotify
3933 * object */
3934
3935 HASHMAP_FOREACH(inode_data, d->inodes)
3936 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3937
3938 if (event_source_is_offline(s))
3939 continue;
3940
3941 r = source_set_pending(s, true);
3942 if (r < 0)
3943 return r;
3944 }
3945 } else {
3946 struct inode_data *inode_data;
3947
3948 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3949 * our watch descriptor table. */
3950 if (d->buffer.ev.mask & IN_IGNORED) {
3951
3952 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3953 if (!inode_data) {
3954 event_inotify_data_drop(e, d, sz);
3955 continue;
3956 }
3957
3958 /* The watch descriptor was removed by the kernel, let's drop it here too */
3959 inode_data->wd = -1;
3960 } else {
3961 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3962 if (!inode_data) {
3963 event_inotify_data_drop(e, d, sz);
3964 continue;
3965 }
3966 }
3967
3968 /* Trigger all event sources that are interested in these events. Also trigger all event
3969 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3970 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3971
3972 if (event_source_is_offline(s))
3973 continue;
3974
3975 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3976 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3977 continue;
3978
3979 r = source_set_pending(s, true);
3980 if (r < 0)
3981 return r;
3982 }
3983 }
3984
3985 /* Something pending now? If so, let's finish, otherwise let's read more. */
3986 if (d->n_pending > 0)
3987 return 1;
3988 }
3989
3990 return 0;
3991 }
3992
3993 static int process_inotify(sd_event *e) {
3994 int r, done = 0;
3995
3996 assert(e);
3997
3998 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
3999 r = event_inotify_data_process(e, d);
4000 if (r < 0)
4001 return r;
4002 if (r > 0)
4003 done++;
4004 }
4005
4006 return done;
4007 }
4008
4009 static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
4010 assert(s);
4011 assert(s->type == SOURCE_MEMORY_PRESSURE);
4012
4013 if (s->pending)
4014 s->memory_pressure.revents |= revents;
4015 else
4016 s->memory_pressure.revents = revents;
4017
4018 return source_set_pending(s, true);
4019 }
4020
4021 static int source_memory_pressure_write(sd_event_source *s) {
4022 ssize_t n;
4023 int r;
4024
4025 assert(s);
4026 assert(s->type == SOURCE_MEMORY_PRESSURE);
4027
4028 /* Once we start writing, the buffer is locked and we allow no further changes. */
4029 s->memory_pressure.locked = true;
4030
4031 if (s->memory_pressure.write_buffer_size > 0) {
4032 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4033 if (n < 0) {
4034 if (!ERRNO_IS_TRANSIENT(errno)) {
4035 /* If the kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
4036 * files, but then generate EOPNOTSUPP on read() and write() (instead of on
4037 * open()!). This sucks hard, since we can only detect this kind of failure
4038 * so late. Let's make the best of it, and turn off the event source like we
4039 * do for failed event source handlers. */
4040
4041 log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
4042 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4043 return 0;
4044 }
4045
4046 n = 0;
4047 }
4048 } else
4049 n = 0;
4050
4051 assert(n >= 0);
4052
4053 if ((size_t) n == s->memory_pressure.write_buffer_size) {
4054 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4055
4056 if (n > 0) {
4057 s->memory_pressure.write_buffer_size = 0;
4058
4059 /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4060 r = source_memory_pressure_register(s, s->enabled);
4061 if (r < 0)
4062 return r;
4063 }
4064 } else if (n > 0) {
4065 _cleanup_free_ void *c = NULL;
4066
4067 assert((size_t) n < s->memory_pressure.write_buffer_size);
4068
4069 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4070 if (!c)
4071 return -ENOMEM;
4072
4073 free_and_replace(s->memory_pressure.write_buffer, c);
4074 s->memory_pressure.write_buffer_size -= n;
4075 return 1;
4076 }
4077
4078 return 0;
4079 }
4080
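/* For orientation (hedged sketch, independent of this file): the write buffer flushed above carries a
 * kernel PSI trigger in the documented "some|full <threshold-usec> <window-usec>" format, and trigger
 * events are delivered as POLLPRI. A hand-rolled equivalent without sd-event, with example numbers: */
#if 0
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

static int watch_memory_pressure_raw(void) {
        /* Ask for a notification when at least 150ms of "some" memory stall accumulates per 1s window. */
        const char trigger[] = "some 150000 1000000";

        int fd = open("/proc/pressure/memory", O_RDWR|O_NONBLOCK|O_CLOEXEC);
        if (fd < 0)
                return -errno;

        if (write(fd, trigger, strlen(trigger) + 1) < 0) {
                int r = -errno;
                close(fd);
                return r;
        }

        /* The kernel signals trigger events via POLLPRI on the same fd. */
        struct pollfd p = { .fd = fd, .events = POLLPRI };
        (void) poll(&p, 1, -1);

        close(fd);
        return 0;
}
#endif
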
4081 static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4082 int r;
4083
4084 assert(s);
4085 assert(s->type == SOURCE_MEMORY_PRESSURE);
4086
4087 r = source_memory_pressure_write(s);
4088 if (r < 0)
4089 return r;
4090 if (r > 0)
4091 return 1; /* If we wrote something, don't continue with dispatching the user callback.
4092 * Instead, shortcut it so that we wait for the next EPOLLOUT immediately. */
4093
4094 /* No pending incoming IO? Then let's not continue further */
4095 if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4096
4097 /* Treat IO errors on the notifier the same way as errors returned from a callback */
4098 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4099 return -EIO;
4100
4101 return 1; /* leave dispatch, we already processed everything */
4102 }
4103
4104 if (s->memory_pressure.revents & EPOLLIN) {
4105 uint8_t pipe_buf[PIPE_BUF];
4106 ssize_t n;
4107
4108 /* If the fd is readable, then flush out anything that might be queued */
4109
4110 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4111 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4112 return -errno;
4113 }
4114
4115 return 0; /* go on, dispatch to user callback */
4116 }
4117
4118 static int source_dispatch(sd_event_source *s) {
4119 EventSourceType saved_type;
4120 sd_event *saved_event;
4121 int r = 0;
4122
4123 assert(s);
4124 assert(s->pending || s->type == SOURCE_EXIT);
4125
4126 /* Save the event source type here, so that we still know it after the event callback, which might
4127 * invalidate the event. */
4128 saved_type = s->type;
4129
4130 /* Similarly, store a reference to the event loop object, so that we can still access it after the
4131 * callback might have invalidated/disconnected the event source. */
4132 saved_event = s->event;
4133 PROTECT_EVENT(saved_event);
4134
4135 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
4136 assert(!s->ratelimited);
4137 if (!ratelimit_below(&s->rate_limit)) {
4138 r = event_source_enter_ratelimited(s);
4139 if (r < 0)
4140 return r;
4141
4142 return 1;
4143 }
4144
4145 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
4146 r = source_set_pending(s, false);
4147 if (r < 0)
4148 return r;
4149 }
4150
4151 if (s->type != SOURCE_POST) {
4152 sd_event_source *z;
4153
4154 /* If we execute a non-post source, let's mark all post sources as pending. */
4155
4156 SET_FOREACH(z, s->event->post_sources) {
4157 if (event_source_is_offline(z))
4158 continue;
4159
4160 r = source_set_pending(z, true);
4161 if (r < 0)
4162 return r;
4163 }
4164 }
4165
4166 if (s->type == SOURCE_MEMORY_PRESSURE) {
4167 r = source_memory_pressure_initiate_dispatch(s);
4168 if (r == -EIO) /* handle EIO errors similar to callback errors */
4169 goto finish;
4170 if (r < 0)
4171 return r;
4172 if (r > 0) /* already handled */
4173 return 1;
4174 }
4175
4176 if (s->enabled == SD_EVENT_ONESHOT) {
4177 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
4178 if (r < 0)
4179 return r;
4180 }
4181
4182 s->dispatching = true;
4183
4184 switch (s->type) {
4185
4186 case SOURCE_IO:
4187 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4188 break;
4189
4190 case SOURCE_TIME_REALTIME:
4191 case SOURCE_TIME_BOOTTIME:
4192 case SOURCE_TIME_MONOTONIC:
4193 case SOURCE_TIME_REALTIME_ALARM:
4194 case SOURCE_TIME_BOOTTIME_ALARM:
4195 r = s->time.callback(s, s->time.next, s->userdata);
4196 break;
4197
4198 case SOURCE_SIGNAL:
4199 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4200 break;
4201
4202 case SOURCE_CHILD: {
4203 bool zombie;
4204
4205 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
4206
4207 r = s->child.callback(s, &s->child.siginfo, s->userdata);
4208
4209 /* Now, reap the PID for good. */
4210 if (zombie) {
4211 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
4212 s->child.waited = true;
4213 }
4214
4215 break;
4216 }
4217
4218 case SOURCE_DEFER:
4219 r = s->defer.callback(s, s->userdata);
4220 break;
4221
4222 case SOURCE_POST:
4223 r = s->post.callback(s, s->userdata);
4224 break;
4225
4226 case SOURCE_EXIT:
4227 r = s->exit.callback(s, s->userdata);
4228 break;
4229
4230 case SOURCE_INOTIFY: {
4231 struct sd_event *e = s->event;
4232 struct inotify_data *d;
4233 size_t sz;
4234
4235 assert(s->inotify.inode_data);
4236 assert_se(d = s->inotify.inode_data->inotify_data);
4237
4238 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4239 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4240 assert(d->buffer_filled >= sz);
4241
4242 /* If the inotify callback destroys the event source then this likely means we don't need to
4243 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4244 * free it immediately, then we couldn't drop the event from the inotify event queue without
4245 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
4246 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4247 * explicitly GC it after we are done dropping the inotify event from the buffer. */
4248 d->n_busy++;
4249 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
4250 d->n_busy--;
4251
4252 /* When no event is pending anymore on this inotify object, then let's drop the event from
4253 * the inotify event queue buffer. */
4254 if (d->n_pending == 0)
4255 event_inotify_data_drop(e, d, sz);
4256
4257 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4258 event_gc_inotify_data(e, d);
4259 break;
4260 }
4261
4262 case SOURCE_MEMORY_PRESSURE:
4263 r = s->memory_pressure.callback(s, s->userdata);
4264 break;
4265
4266 case SOURCE_WATCHDOG:
4267 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
4268 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
4269 assert_not_reached();
4270 }
4271
4272 s->dispatching = false;
4273
4274 finish:
4275 if (r < 0) {
4276 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4277 strna(s->description),
4278 event_source_type_to_string(saved_type),
4279 s->exit_on_failure ? "exiting" : "disabling");
4280
4281 if (s->exit_on_failure)
4282 (void) sd_event_exit(saved_event, r);
4283 }
4284
4285 if (s->n_ref == 0)
4286 source_free(s);
4287 else if (r < 0)
4288 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4289
4290 return 1;
4291 }
4292
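/* Hedged sketch (not part of this file): as the error path above shows, a negative return from a handler
 * is logged and either disables the source or, if exit_on_failure is set, ends the whole loop via
 * sd_event_exit(). A callback relying on that convention; the fd handling is invented for the example: */
#if 0
#include <errno.h>
#include <unistd.h>
#include <systemd/sd-event.h>

static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        char buf[4096];

        ssize_t n = read(fd, buf, sizeof(buf));
        if (n < 0)
                /* Returning a negative errno lets source_dispatch() disable this source, or exit the
                 * loop if sd_event_source_set_exit_on_failure() was enabled for it. */
                return -errno;

        return 0;
}
#endif
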
4293 static int event_prepare(sd_event *e) {
4294 int r;
4295
4296 assert(e);
4297
4298 for (;;) {
4299 sd_event_source *s;
4300
4301 s = prioq_peek(e->prepare);
4302 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
4303 break;
4304
4305 s->prepare_iteration = e->iteration;
4306 prioq_reshuffle(e->prepare, s, &s->prepare_index);
4307
4308 assert(s->prepare);
4309 s->dispatching = true;
4310 r = s->prepare(s, s->userdata);
4311 s->dispatching = false;
4312
4313 if (r < 0) {
4314 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4315 strna(s->description),
4316 event_source_type_to_string(s->type),
4317 s->exit_on_failure ? "exiting" : "disabling");
4318
4319 if (s->exit_on_failure)
4320 (void) sd_event_exit(e, r);
4321 }
4322
4323 if (s->n_ref == 0)
4324 source_free(s);
4325 else if (r < 0)
4326 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4327 }
4328
4329 return 0;
4330 }
4331
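/* Usage sketch (illustrative): event_prepare() runs, once per iteration and ordered by the prepare
 * priority queue, the callbacks registered with the public sd_event_source_set_prepare() call. One
 * common pattern is gating a defer source on application state; all names below are invented: */
#if 0
#include <stdbool.h>
#include <systemd/sd-event.h>

static bool have_work = false; /* hypothetical application state */

static int on_prepare(sd_event_source *s, void *userdata) {
        bool *flag = userdata;
        /* Before each poll, enable the defer source for one dispatch only if work is queued. */
        return sd_event_source_set_enabled(s, *flag ? SD_EVENT_ONESHOT : SD_EVENT_OFF);
}

static int on_defer(sd_event_source *s, void *userdata) {
        bool *flag = userdata;
        *flag = false; /* drain the queued work here */
        return 0;
}

static int add_prepared_source(sd_event *e, sd_event_source **ret) {
        int r = sd_event_add_defer(e, ret, on_defer, &have_work);
        if (r < 0)
                return r;
        return sd_event_source_set_prepare(*ret, on_prepare);
}
#endif
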
4332 static int dispatch_exit(sd_event *e) {
4333 sd_event_source *p;
4334 int r;
4335
4336 assert(e);
4337
4338 p = prioq_peek(e->exit);
4339 assert(!p || p->type == SOURCE_EXIT);
4340
4341 if (!p || event_source_is_offline(p)) {
4342 e->state = SD_EVENT_FINISHED;
4343 return 0;
4344 }
4345
4346 PROTECT_EVENT(e);
4347 e->iteration++;
4348 e->state = SD_EVENT_EXITING;
4349 r = source_dispatch(p);
4350 e->state = SD_EVENT_INITIAL;
4351 return r;
4352 }
4353
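/* Usage sketch (illustrative): dispatch_exit() is what runs sources registered with the public
 * sd_event_add_exit() call, in priority order, once sd_event_exit() has been requested. Names invented: */
#if 0
#include <stdio.h>
#include <systemd/sd-event.h>

static int on_exit_cleanup(sd_event_source *s, void *userdata) {
        /* Runs during the SD_EVENT_EXITING phase, i.e. after sd_event_exit() was called. */
        fputs("shutting down\n", stderr);
        return 0;
}

static int install_cleanup(sd_event *e) {
        return sd_event_add_exit(e, NULL, on_exit_cleanup, NULL);
}
#endif
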
4354 static sd_event_source* event_next_pending(sd_event *e) {
4355 sd_event_source *p;
4356
4357 assert(e);
4358
4359 p = prioq_peek(e->pending);
4360 if (!p)
4361 return NULL;
4362
4363 if (event_source_is_offline(p))
4364 return NULL;
4365
4366 return p;
4367 }
4368
4369 static int arm_watchdog(sd_event *e) {
4370 struct itimerspec its = {};
4371 usec_t t;
4372
4373 assert(e);
4374 assert(e->watchdog_fd >= 0);
4375
4376 t = sleep_between(e,
4377 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4378 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
4379
4380 timespec_store(&its.it_value, t);
4381
4382 /* Make sure we never set the watchdog to 0, which tells the
4383 * kernel to disable it. */
4384 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4385 its.it_value.tv_nsec = 1;
4386
4387 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
4388 }
4389
4390 static int process_watchdog(sd_event *e) {
4391 assert(e);
4392
4393 if (!e->watchdog)
4394 return 0;
4395
4396 /* Don't notify the watchdog too often */
4397 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4398 return 0;
4399
4400 sd_notify(false, "WATCHDOG=1");
4401 e->watchdog_last = e->timestamp.monotonic;
4402
4403 return arm_watchdog(e);
4404 }
4405
4406 static void event_close_inode_data_fds(sd_event *e) {
4407 struct inode_data *d;
4408
4409 assert(e);
4410
4411 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4412 * filesystems. But we can't close them right away as we need them as long as the user still wants to make
4413 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
4414 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4415 * compromise. */
4416
4417 while ((d = e->inode_data_to_close_list)) {
4418 assert(d->fd >= 0);
4419 d->fd = safe_close(d->fd);
4420
4421 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
4422 }
4423 }
4424
4425 static int event_memory_pressure_write_list(sd_event *e) {
4426 int r;
4427
4428 assert(e);
4429
4430 for (;;) {
4431 sd_event_source *s;
4432
4433 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4434 if (!s)
4435 break;
4436
4437 assert(s->type == SOURCE_MEMORY_PRESSURE);
4438 assert(s->memory_pressure.write_buffer_size > 0);
4439 s->memory_pressure.in_write_list = false;
4440
4441 r = source_memory_pressure_write(s);
4442 if (r < 0)
4443 return r;
4444 }
4445
4446 return 0;
4447 }
4448
4449 _public_ int sd_event_prepare(sd_event *e) {
4450 int r;
4451
4452 assert_return(e, -EINVAL);
4453 assert_return(e = event_resolve(e), -ENOPKG);
4454 assert_return(!event_origin_changed(e), -ECHILD);
4455 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4456 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4457
4458 /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
4459 * this check here once, since gettid() is typically not cached, and we thus want to minimize
4460 * syscalls. */
4461 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4462
4463 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4464 PROTECT_EVENT(e);
4465
4466 if (e->exit_requested)
4467 goto pending;
4468
4469 e->iteration++;
4470
4471 e->state = SD_EVENT_PREPARING;
4472 r = event_prepare(e);
4473 e->state = SD_EVENT_INITIAL;
4474 if (r < 0)
4475 return r;
4476
4477 r = event_memory_pressure_write_list(e);
4478 if (r < 0)
4479 return r;
4480
4481 r = event_arm_timer(e, &e->realtime);
4482 if (r < 0)
4483 return r;
4484
4485 r = event_arm_timer(e, &e->boottime);
4486 if (r < 0)
4487 return r;
4488
4489 r = event_arm_timer(e, &e->monotonic);
4490 if (r < 0)
4491 return r;
4492
4493 r = event_arm_timer(e, &e->realtime_alarm);
4494 if (r < 0)
4495 return r;
4496
4497 r = event_arm_timer(e, &e->boottime_alarm);
4498 if (r < 0)
4499 return r;
4500
4501 event_close_inode_data_fds(e);
4502
4503 if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
4504 goto pending;
4505
4506 e->state = SD_EVENT_ARMED;
4507
4508 return 0;
4509
4510 pending:
4511 e->state = SD_EVENT_ARMED;
4512 r = sd_event_wait(e, 0);
4513 if (r == 0)
4514 e->state = SD_EVENT_ARMED;
4515
4516 return r;
4517 }
4518
4519 static int epoll_wait_usec(
4520 int fd,
4521 struct epoll_event *events,
4522 int maxevents,
4523 usec_t timeout) {
4524
4525 int msec;
4526 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4527
4528 #if HAVE_EPOLL_PWAIT2
4529 static bool epoll_pwait2_absent = false;
4530 int r;
4531
4532 /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4533 * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4534 * is not that obvious to implement given the libc and kernel definitions differ in the last
4535 * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
4536 * biggie); let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
4537 * missing. */
4538
4539 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
4540 r = epoll_pwait2(fd,
4541 events,
4542 maxevents,
4543 TIMESPEC_STORE(timeout),
4544 NULL);
4545 if (r >= 0)
4546 return r;
4547 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
4548 return -errno; /* Only fall back to the old epoll_wait() if the syscall is masked or not
4549 * supported. */
4550
4551 epoll_pwait2_absent = true;
4552 }
4553 #endif
4554
4555 if (timeout == USEC_INFINITY)
4556 msec = -1;
4557 else {
4558 usec_t k;
4559
4560 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4561 if (k >= INT_MAX)
4562 msec = INT_MAX; /* Saturate */
4563 else
4564 msec = (int) k;
4565 }
4566
4567 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
4568 }
4569
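/* Worked example (hedged, standalone): when epoll_pwait2() is unavailable the microsecond timeout is
 * rounded up to whole milliseconds and saturated, so e.g. a 1 usec timeout becomes 1 ms rather than a
 * busy-polling 0 ms. The helper name below is invented; USEC_INFINITY is UINT64_MAX in systemd: */
#if 0
#include <limits.h>
#include <stdint.h>

static int usec_to_epoll_msec(uint64_t timeout_usec) {
        if (timeout_usec == UINT64_MAX) /* "infinite" maps to epoll's -1 */
                return -1;

        uint64_t msec = (timeout_usec + 999) / 1000; /* round up */
        return msec >= INT_MAX ? INT_MAX : (int) msec; /* saturate */
}
#endif
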
4570 static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
4571 size_t n_event_queue, m, n_event_max;
4572 int64_t min_priority = threshold;
4573 bool something_new = false;
4574 int r;
4575
4576 assert(e);
4577 assert(ret_min_priority);
4578
4579 n_event_queue = MAX(e->n_sources, 1u);
4580 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
4581 return -ENOMEM;
4582
4583 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4584
4585 /* If we still have inotify data buffered, then query the other fds, but don't block waiting */
4586 if (e->buffered_inotify_data_list)
4587 timeout = 0;
4588
4589 for (;;) {
4590 r = epoll_wait_usec(
4591 e->epoll_fd,
4592 e->event_queue,
4593 n_event_max,
4594 timeout);
4595 if (r < 0)
4596 return r;
4597
4598 m = (size_t) r;
4599
4600 if (m < n_event_max)
4601 break;
4602
4603 if (n_event_max >= n_event_queue * 10)
4604 break;
4605
4606 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
4607 return -ENOMEM;
4608
4609 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4610 timeout = 0;
4611 }
4612
4613 /* Set the timestamp only when this is called for the first time. */
4614 if (threshold == INT64_MAX)
4615 triple_timestamp_now(&e->timestamp);
4616
4617 for (size_t i = 0; i < m; i++) {
4618
4619 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4620 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
4621 else {
4622 WakeupType *t = e->event_queue[i].data.ptr;
4623
4624 switch (*t) {
4625
4626 case WAKEUP_EVENT_SOURCE: {
4627 sd_event_source *s = e->event_queue[i].data.ptr;
4628
4629 assert(s);
4630
4631 if (s->priority > threshold)
4632 continue;
4633
4634 min_priority = MIN(min_priority, s->priority);
4635
4636 switch (s->type) {
4637
4638 case SOURCE_IO:
4639 r = process_io(e, s, e->event_queue[i].events);
4640 break;
4641
4642 case SOURCE_CHILD:
4643 r = process_pidfd(e, s, e->event_queue[i].events);
4644 break;
4645
4646 case SOURCE_MEMORY_PRESSURE:
4647 r = process_memory_pressure(s, e->event_queue[i].events);
4648 break;
4649
4650 default:
4651 assert_not_reached();
4652 }
4653
4654 break;
4655 }
4656
4657 case WAKEUP_CLOCK_DATA: {
4658 struct clock_data *d = e->event_queue[i].data.ptr;
4659
4660 assert(d);
4661
4662 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4663 break;
4664 }
4665
4666 case WAKEUP_SIGNAL_DATA:
4667 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4668 break;
4669
4670 case WAKEUP_INOTIFY_DATA:
4671 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4672 break;
4673
4674 default:
4675 assert_not_reached();
4676 }
4677 }
4678 if (r < 0)
4679 return r;
4680 if (r > 0)
4681 something_new = true;
4682 }
4683
4684 *ret_min_priority = min_priority;
4685 return something_new;
4686 }
4687
4688 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4689 int r;
4690
4691 assert_return(e, -EINVAL);
4692 assert_return(e = event_resolve(e), -ENOPKG);
4693 assert_return(!event_origin_changed(e), -ECHILD);
4694 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4695 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4696
4697 if (e->exit_requested) {
4698 e->state = SD_EVENT_PENDING;
4699 return 1;
4700 }
4701
4702 for (int64_t threshold = INT64_MAX; ; threshold--) {
4703 int64_t epoll_min_priority, child_min_priority;
4704
4705 /* It is possible that new epoll (especially IO) and child events are
4706 * triggered just after the process_epoll() call but before process_child(), and the new IO
4707 * events may have higher priority than the child events. To salvage these events,
4708 * let's call epoll_wait() again, but accept only events with higher priority than the
4709 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
4710 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4711 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4712
4713 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4714 if (r == -EINTR) {
4715 e->state = SD_EVENT_PENDING;
4716 return 1;
4717 }
4718 if (r < 0)
4719 goto finish;
4720 if (r == 0 && threshold < INT64_MAX)
4721 /* No new epoll event. */
4722 break;
4723
4724 r = process_child(e, threshold, &child_min_priority);
4725 if (r < 0)
4726 goto finish;
4727 if (r == 0)
4728 /* No new child event. */
4729 break;
4730
4731 threshold = MIN(epoll_min_priority, child_min_priority);
4732 if (threshold == INT64_MIN)
4733 break;
4734
4735 timeout = 0;
4736 }
4737
4738 r = process_watchdog(e);
4739 if (r < 0)
4740 goto finish;
4741
4742 r = process_inotify(e);
4743 if (r < 0)
4744 goto finish;
4745
4746 r = process_timer(e, e->timestamp.realtime, &e->realtime);
4747 if (r < 0)
4748 goto finish;
4749
4750 r = process_timer(e, e->timestamp.boottime, &e->boottime);
4751 if (r < 0)
4752 goto finish;
4753
4754 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4755 if (r < 0)
4756 goto finish;
4757
4758 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4759 if (r < 0)
4760 goto finish;
4761
4762 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4763 if (r < 0)
4764 goto finish;
4765 else if (r == 1) {
4766 /* A ratelimit expiry callback was called. Let's postpone processing pending sources and
4767 * put the loop in the initial state in order to also evaluate (in the next iteration) sources
4768 * that were potentially re-enabled by the callback.
4769 *
4770 * Wondering why we treat only this invocation of process_timer() differently? Once an event
4771 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4772 * ratelimit expiry callback is never called for any other timer type. */
4773 r = 0;
4774 goto finish;
4775 }
4776
4777 if (event_next_pending(e)) {
4778 e->state = SD_EVENT_PENDING;
4779 return 1;
4780 }
4781
4782 r = 0;
4783
4784 finish:
4785 e->state = SD_EVENT_INITIAL;
4786
4787 return r;
4788 }
4789
4790 _public_ int sd_event_dispatch(sd_event *e) {
4791 sd_event_source *p;
4792 int r;
4793
4794 assert_return(e, -EINVAL);
4795 assert_return(e = event_resolve(e), -ENOPKG);
4796 assert_return(!event_origin_changed(e), -ECHILD);
4797 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4798 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4799
4800 if (e->exit_requested)
4801 return dispatch_exit(e);
4802
4803 p = event_next_pending(e);
4804 if (p) {
4805 PROTECT_EVENT(e);
4806
4807 e->state = SD_EVENT_RUNNING;
4808 r = source_dispatch(p);
4809 e->state = SD_EVENT_INITIAL;
4810 return r;
4811 }
4812
4813 e->state = SD_EVENT_INITIAL;
4814
4815 return 1;
4816 }
4817
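/* Usage sketch (illustrative): sd_event_prepare(), sd_event_wait() and sd_event_dispatch() are the
 * building blocks that sd_event_run() combines. Driving them by hand looks roughly like this: */
#if 0
#include <systemd/sd-event.h>

static int run_one_iteration(sd_event *e, uint64_t timeout_usec) {
        int r = sd_event_prepare(e);
        if (r < 0)
                return r;
        if (r == 0) {
                /* Nothing pending yet: wait for events, or until the timeout elapses. */
                r = sd_event_wait(e, timeout_usec);
                if (r <= 0)
                        return r;
        }

        /* Something is pending now: dispatch exactly one event source. */
        return sd_event_dispatch(e);
}
#endif
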
4818 static void event_log_delays(sd_event *e) {
4819 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4820 size_t l, i;
4821
4822 p = b;
4823 l = sizeof(b);
4824 for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4825 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4826 e->delays[i] = 0;
4827 }
4828 log_debug("Event loop iterations: %s", b);
4829 }
4830
4831 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4832 int r;
4833
4834 assert_return(e, -EINVAL);
4835 assert_return(e = event_resolve(e), -ENOPKG);
4836 assert_return(!event_origin_changed(e), -ECHILD);
4837 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4838 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4839
4840 if (e->profile_delays && e->last_run_usec != 0) {
4841 usec_t this_run;
4842 unsigned l;
4843
4844 this_run = now(CLOCK_MONOTONIC);
4845
4846 l = log2u64(this_run - e->last_run_usec);
4847 assert(l < ELEMENTSOF(e->delays));
4848 e->delays[l]++;
4849
4850 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4851 event_log_delays(e);
4852 e->last_log_usec = this_run;
4853 }
4854 }
4855
4856 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4857 PROTECT_EVENT(e);
4858
4859 r = sd_event_prepare(e);
4860 if (r == 0)
4861 /* There was nothing? Then wait... */
4862 r = sd_event_wait(e, timeout);
4863
4864 if (e->profile_delays)
4865 e->last_run_usec = now(CLOCK_MONOTONIC);
4866
4867 if (r > 0) {
4868 /* There's something now, then let's dispatch it */
4869 r = sd_event_dispatch(e);
4870 if (r < 0)
4871 return r;
4872
4873 return 1;
4874 }
4875
4876 return r;
4877 }
4878
4879 _public_ int sd_event_loop(sd_event *e) {
4880 int r;
4881
4882 assert_return(e, -EINVAL);
4883 assert_return(e = event_resolve(e), -ENOPKG);
4884 assert_return(!event_origin_changed(e), -ECHILD);
4885 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4886
4887
4888 PROTECT_EVENT(e);
4889
4890 while (e->state != SD_EVENT_FINISHED) {
4891 r = sd_event_run(e, UINT64_MAX);
4892 if (r < 0)
4893 return r;
4894 }
4895
4896 return e->exit_code;
4897 }
4898
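/* Usage sketch (illustrative): a minimal program built on the default event loop. */
#if 0
#include <systemd/sd-event.h>

int main(void) {
        sd_event *e = NULL;

        int r = sd_event_default(&e);
        if (r < 0)
                return 1;

        /* Let SIGINT/SIGTERM terminate the loop cleanly (see sd_event_set_signal_exit() below). */
        (void) sd_event_set_signal_exit(e, 1);

        r = sd_event_loop(e); /* returns the code passed to sd_event_exit(), or a negative error */
        sd_event_unref(e);
        return r < 0 ? 1 : 0;
}
#endif
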
4899 _public_ int sd_event_get_fd(sd_event *e) {
4900 assert_return(e, -EINVAL);
4901 assert_return(e = event_resolve(e), -ENOPKG);
4902 assert_return(!event_origin_changed(e), -ECHILD);
4903
4904 return e->epoll_fd;
4905 }
4906
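/* Usage sketch (illustrative): the epoll fd returned here is what allows embedding this loop into a
 * foreign one; poll it, and when it becomes readable run one iteration with a zero timeout: */
#if 0
#include <poll.h>
#include <systemd/sd-event.h>

static int pump_from_foreign_loop(sd_event *e) {
        int fd = sd_event_get_fd(e);
        if (fd < 0)
                return fd;

        struct pollfd p = { .fd = fd, .events = POLLIN };
        if (poll(&p, 1, -1) < 0)
                return -1;

        /* The loop fd is ready: process whatever is pending without blocking. */
        return sd_event_run(e, 0);
}
#endif
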
4907 _public_ int sd_event_get_state(sd_event *e) {
4908 assert_return(e, -EINVAL);
4909 assert_return(e = event_resolve(e), -ENOPKG);
4910 assert_return(!event_origin_changed(e), -ECHILD);
4911
4912 return e->state;
4913 }
4914
4915 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4916 assert_return(e, -EINVAL);
4917 assert_return(e = event_resolve(e), -ENOPKG);
4918 assert_return(code, -EINVAL);
4919 assert_return(!event_origin_changed(e), -ECHILD);
4920
4921 if (!e->exit_requested)
4922 return -ENODATA;
4923
4924 *code = e->exit_code;
4925 return 0;
4926 }
4927
4928 _public_ int sd_event_exit(sd_event *e, int code) {
4929 assert_return(e, -EINVAL);
4930 assert_return(e = event_resolve(e), -ENOPKG);
4931 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4932 assert_return(!event_origin_changed(e), -ECHILD);
4933
4934 e->exit_requested = true;
4935 e->exit_code = code;
4936
4937 return 0;
4938 }
4939
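/* Usage sketch (illustrative): sd_event_exit() is typically requested from within a handler; here from
 * a SIGTERM callback (names invented): */
#if 0
#include <signal.h>
#include <sys/signalfd.h>
#include <systemd/sd-event.h>

static int on_sigterm(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        /* Request loop termination; sd_event_loop() will eventually return this code. */
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

static int install_sigterm_exit(sd_event *e) {
        return sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, on_sigterm, NULL);
}
#endif
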
4940 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4941 assert_return(e, -EINVAL);
4942 assert_return(e = event_resolve(e), -ENOPKG);
4943 assert_return(usec, -EINVAL);
4944 assert_return(!event_origin_changed(e), -ECHILD);
4945
4946 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4947 return -EOPNOTSUPP;
4948
4949 if (!triple_timestamp_is_set(&e->timestamp)) {
4950 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4951 *usec = now(clock);
4952 return 1;
4953 }
4954
4955 *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4956 return 0;
4957 }
4958
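/* Usage sketch (illustrative): sd_event_now() returns the timestamp cached at the last wakeup (falling
 * back to now() before the first iteration); the usual way to arm a relative timer is to add an offset
 * to it. Handler and helper names are invented: */
#if 0
#include <time.h>
#include <systemd/sd-event.h>

static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
        return 0;
}

static int add_timer_in_5s(sd_event *e, sd_event_source **ret) {
        uint64_t now_usec;

        int r = sd_event_now(e, CLOCK_MONOTONIC, &now_usec);
        if (r < 0)
                return r;

        /* Fire once, 5s from the loop's current timestamp, with 100ms accuracy. */
        return sd_event_add_time(e, ret, CLOCK_MONOTONIC,
                                 now_usec + 5 * 1000000ULL, 100 * 1000ULL,
                                 on_timer, NULL);
}
#endif
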
4959 _public_ int sd_event_default(sd_event **ret) {
4960 sd_event *e = NULL;
4961 int r;
4962
4963 if (!ret)
4964 return !!default_event;
4965
4966 if (default_event) {
4967 *ret = sd_event_ref(default_event);
4968 return 0;
4969 }
4970
4971 r = sd_event_new(&e);
4972 if (r < 0)
4973 return r;
4974
4975 e->default_event_ptr = &default_event;
4976 e->tid = gettid();
4977 default_event = e;
4978
4979 *ret = e;
4980 return 1;
4981 }
4982
4983 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
4984 assert_return(e, -EINVAL);
4985 assert_return(e = event_resolve(e), -ENOPKG);
4986 assert_return(tid, -EINVAL);
4987 assert_return(!event_origin_changed(e), -ECHILD);
4988
4989 if (e->tid != 0) {
4990 *tid = e->tid;
4991 return 0;
4992 }
4993
4994 return -ENXIO;
4995 }
4996
4997 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
4998 int r;
4999
5000 assert_return(e, -EINVAL);
5001 assert_return(e = event_resolve(e), -ENOPKG);
5002 assert_return(!event_origin_changed(e), -ECHILD);
5003
5004 if (e->watchdog == !!b)
5005 return e->watchdog;
5006
5007 if (b) {
5008 r = sd_watchdog_enabled(false, &e->watchdog_period);
5009 if (r <= 0)
5010 return r;
5011
5012 /* Issue first ping immediately */
5013 sd_notify(false, "WATCHDOG=1");
5014 e->watchdog_last = now(CLOCK_MONOTONIC);
5015
5016 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
5017 if (e->watchdog_fd < 0)
5018 return -errno;
5019
5020 r = arm_watchdog(e);
5021 if (r < 0)
5022 goto fail;
5023
5024 struct epoll_event ev = {
5025 .events = EPOLLIN,
5026 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
5027 };
5028
5029 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
5030 r = -errno;
5031 goto fail;
5032 }
5033
5034 } else {
5035 if (e->watchdog_fd >= 0) {
5036 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
5037 e->watchdog_fd = safe_close(e->watchdog_fd);
5038 }
5039 }
5040
5041 e->watchdog = b;
5042 return e->watchdog;
5043
5044 fail:
5045 e->watchdog_fd = safe_close(e->watchdog_fd);
5046 return r;
5047 }
5048
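/* Usage sketch (illustrative): in a service whose unit sets WatchdogSec=, enabling the loop-level
 * watchdog makes arm_watchdog()/process_watchdog() above send the WATCHDOG=1 keep-alives automatically: */
#if 0
#include <systemd/sd-event.h>

static int enable_keepalive(sd_event *e) {
        /* Returns > 0 and starts pinging only if the service manager set WATCHDOG_USEC (i.e. the unit
         * configures WatchdogSec=); returns 0 if no watchdog was requested. */
        return sd_event_set_watchdog(e, 1);
}
#endif
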
5049 _public_ int sd_event_get_watchdog(sd_event *e) {
5050 assert_return(e, -EINVAL);
5051 assert_return(e = event_resolve(e), -ENOPKG);
5052 assert_return(!event_origin_changed(e), -ECHILD);
5053
5054 return e->watchdog;
5055 }
5056
5057 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5058 assert_return(e, -EINVAL);
5059 assert_return(e = event_resolve(e), -ENOPKG);
5060 assert_return(!event_origin_changed(e), -ECHILD);
5061
5062 *ret = e->iteration;
5063 return 0;
5064 }
5065
5066 _public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5067 assert_return(s, -EINVAL);
5068 assert_return(s->event, -EINVAL);
5069 assert_return(!event_origin_changed(s->event), -ECHILD);
5070
5071 s->destroy_callback = callback;
5072 return 0;
5073 }
5074
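/* Usage sketch (illustrative): a destroy callback is the usual way to tie the lifetime of heap-allocated
 * userdata to the source that uses it. Names invented: */
#if 0
#include <errno.h>
#include <stdlib.h>
#include <systemd/sd-event.h>

struct ctx {
        int counter;
};

static int on_defer_count(sd_event_source *s, void *userdata) {
        struct ctx *c = userdata;
        c->counter++;
        return 0;
}

static int add_source_with_owned_userdata(sd_event *e, sd_event_source **ret) {
        struct ctx *c = calloc(1, sizeof(*c));
        if (!c)
                return -ENOMEM;

        int r = sd_event_add_defer(e, ret, on_defer_count, c);
        if (r < 0) {
                free(c);
                return r;
        }

        /* free() the context automatically whenever the source is destroyed. */
        return sd_event_source_set_destroy_callback(*ret, free);
}
#endif
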
5075 _public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5076 assert_return(s, -EINVAL);
5077 assert_return(!event_origin_changed(s->event), -ECHILD);
5078
5079 if (ret)
5080 *ret = s->destroy_callback;
5081
5082 return !!s->destroy_callback;
5083 }
5084
5085 _public_ int sd_event_source_get_floating(sd_event_source *s) {
5086 assert_return(s, -EINVAL);
5087 assert_return(!event_origin_changed(s->event), -ECHILD);
5088
5089 return s->floating;
5090 }
5091
5092 _public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5093 assert_return(s, -EINVAL);
5094 assert_return(!event_origin_changed(s->event), -ECHILD);
5095
5096 if (s->floating == !!b)
5097 return 0;
5098
5099 if (!s->event) /* Already disconnected */
5100 return -ESTALE;
5101
5102 s->floating = b;
5103
5104 if (b) {
5105 sd_event_source_ref(s);
5106 sd_event_unref(s->event);
5107 } else {
5108 sd_event_ref(s->event);
5109 sd_event_source_unref(s);
5110 }
5111
5112 return 1;
5113 }
5114
5115 _public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5116 assert_return(s, -EINVAL);
5117 assert_return(s->type != SOURCE_EXIT, -EDOM);
5118 assert_return(!event_origin_changed(s->event), -ECHILD);
5119
5120 return s->exit_on_failure;
5121 }
5122
5123 _public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5124 assert_return(s, -EINVAL);
5125 assert_return(s->type != SOURCE_EXIT, -EDOM);
5126 assert_return(!event_origin_changed(s->event), -ECHILD);
5127
5128 if (s->exit_on_failure == !!b)
5129 return 0;
5130
5131 s->exit_on_failure = b;
5132 return 1;
5133 }
5134
5135 _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5136 int r;
5137
5138 assert_return(s, -EINVAL);
5139 assert_return(!event_origin_changed(s->event), -ECHILD);
5140
5141 /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
5142 * so is a programming error. */
5143 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5144
5145 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5146 * non-ratelimited. */
5147 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5148 if (r < 0)
5149 return r;
5150
5151 s->rate_limit = (RateLimit) { interval, burst };
5152 return 0;
5153 }
5154
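/* Usage sketch (illustrative): a typical use is capping how often a busy source may wake the loop; once
 * the burst is exceeded within the interval the source is taken offline until the interval passes, and
 * the optional expiry callback fires when it comes back. Names invented: */
#if 0
#include <systemd/sd-event.h>

static int on_ratelimit_expired(sd_event_source *s, void *userdata) {
        /* Called once when the source leaves the ratelimited state again. */
        return 0;
}

static int cap_source_rate(sd_event_source *s) {
        /* Allow at most 100 dispatches per 1s interval for this source. */
        int r = sd_event_source_set_ratelimit(s, 1000000ULL, 100);
        if (r < 0)
                return r;

        return sd_event_source_set_ratelimit_expire_callback(s, on_ratelimit_expired);
}
#endif
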
5155 _public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5156 assert_return(s, -EINVAL);
5157 assert_return(!event_origin_changed(s->event), -ECHILD);
5158
5159 s->ratelimit_expire_callback = callback;
5160 return 0;
5161 }
5162
5163 _public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5164 assert_return(s, -EINVAL);
5165 assert_return(!event_origin_changed(s->event), -ECHILD);
5166
5167 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5168 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
5169 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5170 return -EDOM;
5171
5172 if (!ratelimit_configured(&s->rate_limit))
5173 return -ENOEXEC;
5174
5175 if (ret_interval)
5176 *ret_interval = s->rate_limit.interval;
5177 if (ret_burst)
5178 *ret_burst = s->rate_limit.burst;
5179
5180 return 0;
5181 }
5182
5183 _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5184 assert_return(s, -EINVAL);
5185 assert_return(!event_origin_changed(s->event), -ECHILD);
5186
5187 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5188 return false;
5189
5190 if (!ratelimit_configured(&s->rate_limit))
5191 return false;
5192
5193 return s->ratelimited;
5194 }
5195
5196 _public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
5197 int r;
5198
5199 assert_return(s, -EINVAL);
5200
5201 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5202 return 0;
5203
5204 if (!ratelimit_configured(&s->rate_limit))
5205 return 0;
5206
5207 if (!s->ratelimited)
5208 return 0;
5209
5210 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5211 if (r < 0)
5212 return r;
5213
5214 return 1; /* tell caller that we indeed just left the ratelimit state */
5215 }
5216
5217 _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5218 bool change = false;
5219 int r;
5220
5221 assert_return(e, -EINVAL);
5222
5223 if (b) {
5224 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5225 * to. But we also don't want them to pin the event loop itself. Hence we mark them as
5226 * floating after creation (and undo this before deleting them again). */
5227
5228 if (!e->sigint_event_source) {
5229 r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5230 if (r < 0)
5231 return r;
5232
5233 assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5234 change = true;
5235 }
5236
5237 if (!e->sigterm_event_source) {
5238 r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5239 if (r < 0) {
5240 if (change) {
5241 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5242 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5243 }
5244
5245 return r;
5246 }
5247
5248 assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5249 change = true;
5250 }
5251
5252 } else {
5253 if (e->sigint_event_source) {
5254 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5255 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5256 change = true;
5257 }
5258
5259 if (e->sigterm_event_source) {
5260 assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5261 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5262 change = true;
5263 }
5264 }
5265
5266 return change;
5267 }
5268
5269 _public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5270 _cleanup_free_ char *b = NULL;
5271 _cleanup_free_ void *w = NULL;
5272
5273 assert_return(s, -EINVAL);
5274 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5275 assert_return(ty, -EINVAL);
5276 assert_return(!event_origin_changed(s->event), -ECHILD);
5277
5278 if (!STR_IN_SET(ty, "some", "full"))
5279 return -EINVAL;
5280
5281 if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5282 return -EBUSY;
5283
5284 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5285 if (!space)
5286 return -EINVAL;
5287
5288 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5289 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5290 if (!b)
5291 return -ENOMEM;
5292 if (!STR_IN_SET(b, "some", "full"))
5293 return -EINVAL;
5294
5295 if (streq(b, ty))
5296 return 0;
5297
5298 size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5299 w = new(char, nl);
5300 if (!w)
5301 return -ENOMEM;
5302
5303 memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5304
5305 free_and_replace(s->memory_pressure.write_buffer, w);
5306 s->memory_pressure.write_buffer_size = nl;
5307 s->memory_pressure.locked = false;
5308
5309 return 1;
5310 }
5311
5312 _public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5313 _cleanup_free_ char *b = NULL;
5314 _cleanup_free_ void *w = NULL;
5315
5316 assert_return(s, -EINVAL);
5317 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5318 assert_return(!event_origin_changed(s->event), -ECHILD);
5319
5320 if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5321 return -ERANGE;
5322 if (window_usec <= 0 || window_usec >= UINT64_MAX)
5323 return -ERANGE;
5324 if (threshold_usec > window_usec)
5325 return -EINVAL;
5326
5327 if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5328 return -EBUSY;
5329
5330 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5331 if (!space)
5332 return -EINVAL;
5333
5334 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5335 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5336 if (!b)
5337 return -ENOMEM;
5338 if (!STR_IN_SET(b, "some", "full"))
5339 return -EINVAL;
5340
5341 if (asprintf((char**) &w,
5342 "%s " USEC_FMT " " USEC_FMT "",
5343 b,
5344 threshold_usec,
5345 window_usec) < 0)
5346 return -ENOMEM;
5347
5348 l = strlen(w) + 1;
5349 if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5350 return 0;
5351
5352 free_and_replace(s->memory_pressure.write_buffer, w);
5353 s->memory_pressure.write_buffer_size = l;
5354 s->memory_pressure.locked = false;
5355
5356 return 1;
5357 }
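
/* Usage sketch (illustrative): putting the two setters together with sd_event_add_memory_pressure().
 * The sketch below registers a "full" trigger of 100ms per 1s window; the callback and helper names are
 * invented, and the parameters must be adjusted before the first write locks them: */
#if 0
#include <systemd/sd-event.h>

static int on_pressure(sd_event_source *s, void *userdata) {
        /* Invoked when the configured pressure trigger fires: shrink caches, trim buffers, etc. */
        return 0;
}

static int add_pressure_source(sd_event *e, sd_event_source **ret) {
        int r = sd_event_add_memory_pressure(e, ret, on_pressure, NULL);
        if (r < 0)
                return r;

        r = sd_event_source_set_memory_pressure_type(*ret, "full");
        if (r < 0)
                return r;

        /* "full" stalls of at least 100ms within each 1s window. */
        return sd_event_source_set_memory_pressure_period(*ret, 100 * 1000ULL, 1000 * 1000ULL);
}
#endif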