sd-event: always initialize sd_event.perturb
[thirdparty/systemd.git] / src / libsystemd / sd-event / sd-event.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/epoll.h>
4 #include <sys/timerfd.h>
5 #include <sys/wait.h>
6
7 #include "sd-daemon.h"
8 #include "sd-event.h"
9 #include "sd-id128.h"
10 #include "sd-messages.h"
11
12 #include "alloc-util.h"
13 #include "env-util.h"
14 #include "event-source.h"
15 #include "fd-util.h"
16 #include "fs-util.h"
17 #include "glyph-util.h"
18 #include "hashmap.h"
19 #include "hexdecoct.h"
20 #include "list.h"
21 #include "logarithm.h"
22 #include "macro.h"
23 #include "mallinfo-util.h"
24 #include "memory-util.h"
25 #include "missing_magic.h"
26 #include "missing_syscall.h"
27 #include "path-util.h"
28 #include "prioq.h"
29 #include "process-util.h"
30 #include "psi-util.h"
31 #include "set.h"
32 #include "signal-util.h"
33 #include "socket-util.h"
34 #include "stat-util.h"
35 #include "string-table.h"
36 #include "string-util.h"
37 #include "strxcpyx.h"
38 #include "time-util.h"
39
40 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
41
42 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
43 /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
44 return s &&
45 s->type == SOURCE_CHILD &&
46 s->child.pidfd >= 0 &&
47 s->child.options == WEXITED;
48 }
49
50 static bool event_source_is_online(sd_event_source *s) {
51 assert(s);
52 return s->enabled != SD_EVENT_OFF && !s->ratelimited;
53 }
54
55 static bool event_source_is_offline(sd_event_source *s) {
56 assert(s);
57 return s->enabled == SD_EVENT_OFF || s->ratelimited;
58 }
59
60 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
61 [SOURCE_IO] = "io",
62 [SOURCE_TIME_REALTIME] = "realtime",
63                 [SOURCE_TIME_BOOTTIME] = "boottime",
64 [SOURCE_TIME_MONOTONIC] = "monotonic",
65 [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
66 [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
67 [SOURCE_SIGNAL] = "signal",
68 [SOURCE_CHILD] = "child",
69 [SOURCE_DEFER] = "defer",
70 [SOURCE_POST] = "post",
71 [SOURCE_EXIT] = "exit",
72 [SOURCE_WATCHDOG] = "watchdog",
73 [SOURCE_INOTIFY] = "inotify",
74 [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
75 };
76
77 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
78
79 #define EVENT_SOURCE_IS_TIME(t) \
80 IN_SET((t), \
81 SOURCE_TIME_REALTIME, \
82 SOURCE_TIME_BOOTTIME, \
83 SOURCE_TIME_MONOTONIC, \
84 SOURCE_TIME_REALTIME_ALARM, \
85 SOURCE_TIME_BOOTTIME_ALARM)
86
87 #define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
88 IN_SET((t), \
89 SOURCE_IO, \
90 SOURCE_TIME_REALTIME, \
91 SOURCE_TIME_BOOTTIME, \
92 SOURCE_TIME_MONOTONIC, \
93 SOURCE_TIME_REALTIME_ALARM, \
94 SOURCE_TIME_BOOTTIME_ALARM, \
95 SOURCE_SIGNAL, \
96 SOURCE_DEFER, \
97 SOURCE_INOTIFY, \
98 SOURCE_MEMORY_PRESSURE)
99
100 /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
101 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
102 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
103 #define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
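
/* A rough usage sketch of the rate limit feature these macros gate (illustrative only; "s" stands for any
 * already allocated event source whose type is listed in EVENT_SOURCE_CAN_RATE_LIMIT()):
 *
 *     // Allow at most 10 dispatches per 1s interval; once exceeded the source is taken offline
 *     // until the interval elapses, after which it is automatically brought back online.
 *     assert_se(sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10) >= 0);
 */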
104
105 struct sd_event {
106 unsigned n_ref;
107
108 int epoll_fd;
109 int watchdog_fd;
110
111 Prioq *pending;
112 Prioq *prepare;
113
114 /* timerfd_create() only supports these five clocks so far. We
115 * can add support for more clocks when the kernel learns to
116 * deal with them, too. */
117 struct clock_data realtime;
118 struct clock_data boottime;
119 struct clock_data monotonic;
120 struct clock_data realtime_alarm;
121 struct clock_data boottime_alarm;
122
123 usec_t perturb;
124
125 sd_event_source **signal_sources; /* indexed by signal number */
126 Hashmap *signal_data; /* indexed by priority */
127
128 Hashmap *child_sources;
129 unsigned n_online_child_sources;
130
131 Set *post_sources;
132
133 Prioq *exit;
134
135 Hashmap *inotify_data; /* indexed by priority */
136
137 /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
138 LIST_HEAD(struct inode_data, inode_data_to_close_list);
139
140 /* A list of inotify objects that already have events buffered which aren't processed yet */
141 LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
142
143 /* A list of memory pressure event sources that still need their subscription string written */
144 LIST_HEAD(sd_event_source, memory_pressure_write_list);
145
146 pid_t original_pid;
147
148 uint64_t iteration;
149 triple_timestamp timestamp;
150 int state;
151
152 bool exit_requested:1;
153 bool need_process_child:1;
154 bool watchdog:1;
155 bool profile_delays:1;
156
157 int exit_code;
158
159 pid_t tid;
160 sd_event **default_event_ptr;
161
162 usec_t watchdog_last, watchdog_period;
163
164 unsigned n_sources;
165
166 struct epoll_event *event_queue;
167
168 LIST_HEAD(sd_event_source, sources);
169
170 sd_event_source *sigint_event_source, *sigterm_event_source;
171
172 usec_t last_run_usec, last_log_usec;
173 unsigned delays[sizeof(usec_t) * 8];
174 };
175
176 static thread_local sd_event *default_event = NULL;
177
178 static void source_disconnect(sd_event_source *s);
179 static void event_gc_inode_data(sd_event *e, struct inode_data *d);
180
181 static sd_event *event_resolve(sd_event *e) {
182 return e == SD_EVENT_DEFAULT ? default_event : e;
183 }
184
185 static int pending_prioq_compare(const void *a, const void *b) {
186 const sd_event_source *x = a, *y = b;
187 int r;
188
189 assert(x->pending);
190 assert(y->pending);
191
192 /* Enabled ones first */
193 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
194 if (r != 0)
195 return r;
196
197 /* Non rate-limited ones first. */
198 r = CMP(!!x->ratelimited, !!y->ratelimited);
199 if (r != 0)
200 return r;
201
202 /* Lower priority values first */
203 r = CMP(x->priority, y->priority);
204 if (r != 0)
205 return r;
206
207 /* Older entries first */
208 return CMP(x->pending_iteration, y->pending_iteration);
209 }
210
211 static int prepare_prioq_compare(const void *a, const void *b) {
212 const sd_event_source *x = a, *y = b;
213 int r;
214
215 assert(x->prepare);
216 assert(y->prepare);
217
218 /* Enabled ones first */
219 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
220 if (r != 0)
221 return r;
222
223 /* Non rate-limited ones first. */
224 r = CMP(!!x->ratelimited, !!y->ratelimited);
225 if (r != 0)
226 return r;
227
228 /* Move most recently prepared ones last, so that we can stop
229 * preparing as soon as we hit one that has already been
230 * prepared in the current iteration */
231 r = CMP(x->prepare_iteration, y->prepare_iteration);
232 if (r != 0)
233 return r;
234
235 /* Lower priority values first */
236 return CMP(x->priority, y->priority);
237 }
238
239 static usec_t time_event_source_next(const sd_event_source *s) {
240 assert(s);
241
242 /* We have two kinds of event sources that have elapsation times associated with them: the actual
243 * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
244 * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
245 * looking at here. */
246
247 if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
248 assert(s->rate_limit.begin != 0);
249 assert(s->rate_limit.interval != 0);
250 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
251 }
252
253 /* Otherwise this must be a time event source, if not ratelimited */
254 if (EVENT_SOURCE_IS_TIME(s->type))
255 return s->time.next;
256
257 return USEC_INFINITY;
258 }
259
260 static usec_t time_event_source_latest(const sd_event_source *s) {
261 assert(s);
262
263 if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
264                                * same, as we should avoid adding additional inaccuracy on top of an already
265                                * inaccurate time window */
266 assert(s->rate_limit.begin != 0);
267 assert(s->rate_limit.interval != 0);
268 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
269 }
270
271 /* Must be a time event source, if not ratelimited */
272 if (EVENT_SOURCE_IS_TIME(s->type))
273 return usec_add(s->time.next, s->time.accuracy);
274
275 return USEC_INFINITY;
276 }
277
278 static bool event_source_timer_candidate(const sd_event_source *s) {
279 assert(s);
280
281         /* Returns true for event sources that either are not pending yet (i.e. where it's worth marking them pending)
282 * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
283 return !s->pending || s->ratelimited;
284 }
285
286 static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
287 const sd_event_source *x = a, *y = b;
288 int r;
289
290 /* Enabled ones first */
291 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
292 if (r != 0)
293 return r;
294
295 /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
296 r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
297 if (r != 0)
298 return r;
299
300 /* Order by time */
301 return CMP(time_func(x), time_func(y));
302 }
303
304 static int earliest_time_prioq_compare(const void *a, const void *b) {
305 return time_prioq_compare(a, b, time_event_source_next);
306 }
307
308 static int latest_time_prioq_compare(const void *a, const void *b) {
309 return time_prioq_compare(a, b, time_event_source_latest);
310 }
311
312 static int exit_prioq_compare(const void *a, const void *b) {
313 const sd_event_source *x = a, *y = b;
314 int r;
315
316 assert(x->type == SOURCE_EXIT);
317 assert(y->type == SOURCE_EXIT);
318
319 /* Enabled ones first */
320 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
321 if (r != 0)
322 return r;
323
324 /* Lower priority values first */
325 return CMP(x->priority, y->priority);
326 }
327
328 static void free_clock_data(struct clock_data *d) {
329 assert(d);
330 assert(d->wakeup == WAKEUP_CLOCK_DATA);
331
332 safe_close(d->fd);
333 prioq_free(d->earliest);
334 prioq_free(d->latest);
335 }
336
337 static sd_event *event_free(sd_event *e) {
338 sd_event_source *s;
339
340 assert(e);
341
342 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
343 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
344
345 while ((s = e->sources)) {
346 assert(s->floating);
347 source_disconnect(s);
348 sd_event_source_unref(s);
349 }
350
351 assert(e->n_sources == 0);
352
353 if (e->default_event_ptr)
354 *(e->default_event_ptr) = NULL;
355
356 safe_close(e->epoll_fd);
357 safe_close(e->watchdog_fd);
358
359 free_clock_data(&e->realtime);
360 free_clock_data(&e->boottime);
361 free_clock_data(&e->monotonic);
362 free_clock_data(&e->realtime_alarm);
363 free_clock_data(&e->boottime_alarm);
364
365 prioq_free(e->pending);
366 prioq_free(e->prepare);
367 prioq_free(e->exit);
368
369 free(e->signal_sources);
370 hashmap_free(e->signal_data);
371
372 hashmap_free(e->inotify_data);
373
374 hashmap_free(e->child_sources);
375 set_free(e->post_sources);
376
377 free(e->event_queue);
378
379 return mfree(e);
380 }
381
382 _public_ int sd_event_new(sd_event** ret) {
383 sd_event *e;
384 int r;
385
386 assert_return(ret, -EINVAL);
387
388 e = new(sd_event, 1);
389 if (!e)
390 return -ENOMEM;
391
392 *e = (sd_event) {
393 .n_ref = 1,
394 .epoll_fd = -EBADF,
395 .watchdog_fd = -EBADF,
396 .realtime.wakeup = WAKEUP_CLOCK_DATA,
397 .realtime.fd = -EBADF,
398 .realtime.next = USEC_INFINITY,
399 .boottime.wakeup = WAKEUP_CLOCK_DATA,
400 .boottime.fd = -EBADF,
401 .boottime.next = USEC_INFINITY,
402 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
403 .monotonic.fd = -EBADF,
404 .monotonic.next = USEC_INFINITY,
405 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
406 .realtime_alarm.fd = -EBADF,
407 .realtime_alarm.next = USEC_INFINITY,
408 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
409 .boottime_alarm.fd = -EBADF,
410 .boottime_alarm.next = USEC_INFINITY,
411 .perturb = USEC_INFINITY,
412 .original_pid = getpid_cached(),
413 };
414
415 r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
416 if (r < 0)
417 goto fail;
418
419 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
420 if (e->epoll_fd < 0) {
421 r = -errno;
422 goto fail;
423 }
424
425 e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
426
427 if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
428 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
429 special_glyph(SPECIAL_GLYPH_ELLIPSIS));
430 e->profile_delays = true;
431 }
432
433 *ret = e;
434 return 0;
435
436 fail:
437 event_free(e);
438 return r;
439 }
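
/* Minimal lifecycle sketch for the constructor above (illustrative only, error handling abridged):
 *
 *     _cleanup_(sd_event_unrefp) sd_event *e = NULL;
 *
 *     assert_se(sd_event_new(&e) >= 0);
 *     // ... attach sources via sd_event_add_io(), sd_event_add_time(), etc. ...
 *     assert_se(sd_event_loop(e) >= 0);   // runs until sd_event_exit() is called
 */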
440
441 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event, sd_event, event_free);
442 #define PROTECT_EVENT(e) \
443 _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
444
445 _public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
446 if (s)
447 (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
448 return sd_event_source_unref(s);
449 }
450
451 static bool event_pid_changed(sd_event *e) {
452 assert(e);
453
454 /* We don't support people creating an event loop and keeping
455 * it around over a fork(). Let's complain. */
456
457 return e->original_pid != getpid_cached();
458 }
459
460 static void source_io_unregister(sd_event_source *s) {
461 assert(s);
462 assert(s->type == SOURCE_IO);
463
464 if (event_pid_changed(s->event))
465 return;
466
467 if (!s->io.registered)
468 return;
469
470 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
471 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
472 strna(s->description), event_source_type_to_string(s->type));
473
474 s->io.registered = false;
475 }
476
477 static int source_io_register(
478 sd_event_source *s,
479 int enabled,
480 uint32_t events) {
481
482 assert(s);
483 assert(s->type == SOURCE_IO);
484 assert(enabled != SD_EVENT_OFF);
485
486 struct epoll_event ev = {
487 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
488 .data.ptr = s,
489 };
490
491 if (epoll_ctl(s->event->epoll_fd,
492 s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
493 s->io.fd, &ev) < 0)
494 return -errno;
495
496 s->io.registered = true;
497
498 return 0;
499 }
500
501 static void source_child_pidfd_unregister(sd_event_source *s) {
502 assert(s);
503 assert(s->type == SOURCE_CHILD);
504
505 if (event_pid_changed(s->event))
506 return;
507
508 if (!s->child.registered)
509 return;
510
511 if (EVENT_SOURCE_WATCH_PIDFD(s))
512 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
513 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
514 strna(s->description), event_source_type_to_string(s->type));
515
516 s->child.registered = false;
517 }
518
519 static int source_child_pidfd_register(sd_event_source *s, int enabled) {
520 assert(s);
521 assert(s->type == SOURCE_CHILD);
522 assert(enabled != SD_EVENT_OFF);
523
524 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
525 struct epoll_event ev = {
526 .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
527 .data.ptr = s,
528 };
529
530 if (epoll_ctl(s->event->epoll_fd,
531 s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
532 s->child.pidfd, &ev) < 0)
533 return -errno;
534 }
535
536 s->child.registered = true;
537 return 0;
538 }
539
540 static void source_memory_pressure_unregister(sd_event_source *s) {
541 assert(s);
542 assert(s->type == SOURCE_MEMORY_PRESSURE);
543
544 if (event_pid_changed(s->event))
545 return;
546
547 if (!s->memory_pressure.registered)
548 return;
549
550 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
551 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
552 strna(s->description), event_source_type_to_string(s->type));
553
554 s->memory_pressure.registered = false;
555 }
556
557 static int source_memory_pressure_register(sd_event_source *s, int enabled) {
558 assert(s);
559 assert(s->type == SOURCE_MEMORY_PRESSURE);
560 assert(enabled != SD_EVENT_OFF);
561
562 struct epoll_event ev = {
563 .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
564 (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
565 .data.ptr = s,
566 };
567
568 if (epoll_ctl(s->event->epoll_fd,
569 s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
570 s->memory_pressure.fd, &ev) < 0)
571 return -errno;
572
573 s->memory_pressure.registered = true;
574 return 0;
575 }
576
577 static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
578 assert(s);
579 assert(s->type == SOURCE_MEMORY_PRESSURE);
580
581 if (s->memory_pressure.in_write_list)
582 return;
583
584 LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
585 s->memory_pressure.in_write_list = true;
586 }
587
588 static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
589 assert(s);
590 assert(s->type == SOURCE_MEMORY_PRESSURE);
591
592 if (!s->memory_pressure.in_write_list)
593 return;
594
595 LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
596 s->memory_pressure.in_write_list = false;
597 }
598
599 static clockid_t event_source_type_to_clock(EventSourceType t) {
600
601 switch (t) {
602
603 case SOURCE_TIME_REALTIME:
604 return CLOCK_REALTIME;
605
606 case SOURCE_TIME_BOOTTIME:
607 return CLOCK_BOOTTIME;
608
609 case SOURCE_TIME_MONOTONIC:
610 return CLOCK_MONOTONIC;
611
612 case SOURCE_TIME_REALTIME_ALARM:
613 return CLOCK_REALTIME_ALARM;
614
615 case SOURCE_TIME_BOOTTIME_ALARM:
616 return CLOCK_BOOTTIME_ALARM;
617
618 default:
619 return (clockid_t) -1;
620 }
621 }
622
623 static EventSourceType clock_to_event_source_type(clockid_t clock) {
624
625 switch (clock) {
626
627 case CLOCK_REALTIME:
628 return SOURCE_TIME_REALTIME;
629
630 case CLOCK_BOOTTIME:
631 return SOURCE_TIME_BOOTTIME;
632
633 case CLOCK_MONOTONIC:
634 return SOURCE_TIME_MONOTONIC;
635
636 case CLOCK_REALTIME_ALARM:
637 return SOURCE_TIME_REALTIME_ALARM;
638
639 case CLOCK_BOOTTIME_ALARM:
640 return SOURCE_TIME_BOOTTIME_ALARM;
641
642 default:
643 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
644 }
645 }
646
647 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
648 assert(e);
649
650 switch (t) {
651
652 case SOURCE_TIME_REALTIME:
653 return &e->realtime;
654
655 case SOURCE_TIME_BOOTTIME:
656 return &e->boottime;
657
658 case SOURCE_TIME_MONOTONIC:
659 return &e->monotonic;
660
661 case SOURCE_TIME_REALTIME_ALARM:
662 return &e->realtime_alarm;
663
664 case SOURCE_TIME_BOOTTIME_ALARM:
665 return &e->boottime_alarm;
666
667 default:
668 return NULL;
669 }
670 }
671
672 static void event_free_signal_data(sd_event *e, struct signal_data *d) {
673 assert(e);
674
675 if (!d)
676 return;
677
678 hashmap_remove(e->signal_data, &d->priority);
679 safe_close(d->fd);
680 free(d);
681 }
682
683 static int event_make_signal_data(
684 sd_event *e,
685 int sig,
686 struct signal_data **ret) {
687
688 struct signal_data *d;
689 bool added = false;
690 sigset_t ss_copy;
691 int64_t priority;
692 int r;
693
694 assert(e);
695
696 if (event_pid_changed(e))
697 return -ECHILD;
698
699 if (e->signal_sources && e->signal_sources[sig])
700 priority = e->signal_sources[sig]->priority;
701 else
702 priority = SD_EVENT_PRIORITY_NORMAL;
703
704 d = hashmap_get(e->signal_data, &priority);
705 if (d) {
706 if (sigismember(&d->sigset, sig) > 0) {
707 if (ret)
708 *ret = d;
709 return 0;
710 }
711 } else {
712 d = new(struct signal_data, 1);
713 if (!d)
714 return -ENOMEM;
715
716 *d = (struct signal_data) {
717 .wakeup = WAKEUP_SIGNAL_DATA,
718 .fd = -EBADF,
719 .priority = priority,
720 };
721
722 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
723 if (r < 0) {
724 free(d);
725 return r;
726 }
727
728 added = true;
729 }
730
731 ss_copy = d->sigset;
732 assert_se(sigaddset(&ss_copy, sig) >= 0);
733
734 r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
735 &ss_copy,
736 SFD_NONBLOCK|SFD_CLOEXEC);
737 if (r < 0) {
738 r = -errno;
739 goto fail;
740 }
741
742 d->sigset = ss_copy;
743
744 if (d->fd >= 0) {
745 if (ret)
746 *ret = d;
747 return 0;
748 }
749
750 d->fd = fd_move_above_stdio(r);
751
752 struct epoll_event ev = {
753 .events = EPOLLIN,
754 .data.ptr = d,
755 };
756
757 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
758 r = -errno;
759 goto fail;
760 }
761
762 if (ret)
763 *ret = d;
764
765 return 0;
766
767 fail:
768 if (added)
769 event_free_signal_data(e, d);
770
771 return r;
772 }
773
774 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
775 assert(e);
776 assert(d);
777
778 /* Turns off the specified signal in the signal data
779 * object. If the signal mask of the object becomes empty that
780          * way, the object is removed. */
781
782 if (sigismember(&d->sigset, sig) == 0)
783 return;
784
785 assert_se(sigdelset(&d->sigset, sig) >= 0);
786
787 if (sigisemptyset(&d->sigset)) {
788                 /* If the mask is now all-zero we can get rid of the structure */
789 event_free_signal_data(e, d);
790 return;
791 }
792
793 if (event_pid_changed(e))
794 return;
795
796 assert(d->fd >= 0);
797
798 if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
799 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
800 }
801
802 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
803 struct signal_data *d;
804 static const int64_t zero_priority = 0;
805
806 assert(e);
807
808 /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
809 * and possibly drop the signalfd for it. */
810
811 if (sig == SIGCHLD &&
812 e->n_online_child_sources > 0)
813 return;
814
815 if (e->signal_sources &&
816 e->signal_sources[sig] &&
817 event_source_is_online(e->signal_sources[sig]))
818 return;
819
820 /*
821 * The specified signal might be enabled in three different queues:
822 *
823 * 1) the one that belongs to the priority passed (if it is non-NULL)
824 * 2) the one that belongs to the priority of the event source of the signal (if there is one)
825 * 3) the 0 priority (to cover the SIGCHLD case)
826 *
827 * Hence, let's remove it from all three here.
828 */
829
830 if (priority) {
831 d = hashmap_get(e->signal_data, priority);
832 if (d)
833 event_unmask_signal_data(e, d, sig);
834 }
835
836 if (e->signal_sources && e->signal_sources[sig]) {
837 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
838 if (d)
839 event_unmask_signal_data(e, d, sig);
840 }
841
842 d = hashmap_get(e->signal_data, &zero_priority);
843 if (d)
844 event_unmask_signal_data(e, d, sig);
845 }
846
847 static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
848 assert(s);
849
850 /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
851 * they are enabled/disabled or marked pending and such. */
852
853 if (s->pending)
854 prioq_reshuffle(s->event->pending, s, &s->pending_index);
855
856 if (s->prepare)
857 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
858 }
859
860 static void event_source_time_prioq_reshuffle(sd_event_source *s) {
861 struct clock_data *d;
862
863 assert(s);
864
865 /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
866 * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
867 * properly again. */
868
869 if (s->ratelimited)
870 d = &s->event->monotonic;
871 else if (EVENT_SOURCE_IS_TIME(s->type))
872 assert_se(d = event_get_clock_data(s->event, s->type));
873 else
874 return; /* no-op for an event source which is neither a timer nor ratelimited. */
875
876 prioq_reshuffle(d->earliest, s, &s->earliest_index);
877 prioq_reshuffle(d->latest, s, &s->latest_index);
878 d->needs_rearm = true;
879 }
880
881 static void event_source_time_prioq_remove(
882 sd_event_source *s,
883 struct clock_data *d) {
884
885 assert(s);
886 assert(d);
887
888 prioq_remove(d->earliest, s, &s->earliest_index);
889 prioq_remove(d->latest, s, &s->latest_index);
890 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
891 d->needs_rearm = true;
892 }
893
894 static void source_disconnect(sd_event_source *s) {
895 sd_event *event;
896 int r;
897
898 assert(s);
899
900 if (!s->event)
901 return;
902
903 assert(s->event->n_sources > 0);
904
905 switch (s->type) {
906
907 case SOURCE_IO:
908 if (s->io.fd >= 0)
909 source_io_unregister(s);
910
911 break;
912
913 case SOURCE_TIME_REALTIME:
914 case SOURCE_TIME_BOOTTIME:
915 case SOURCE_TIME_MONOTONIC:
916 case SOURCE_TIME_REALTIME_ALARM:
917 case SOURCE_TIME_BOOTTIME_ALARM:
918 /* Only remove this event source from the time event source here if it is not ratelimited. If
919 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
920 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
921
922 if (!s->ratelimited) {
923 struct clock_data *d;
924 assert_se(d = event_get_clock_data(s->event, s->type));
925 event_source_time_prioq_remove(s, d);
926 }
927
928 break;
929
930 case SOURCE_SIGNAL:
931 if (s->signal.sig > 0) {
932
933 if (s->event->signal_sources)
934 s->event->signal_sources[s->signal.sig] = NULL;
935
936 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
937
938 if (s->signal.unblock) {
939 sigset_t new_ss;
940
941 if (sigemptyset(&new_ss) < 0)
942 log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
943 else if (sigaddset(&new_ss, s->signal.sig) < 0)
944 log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
945 else {
946 r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
947 if (r != 0)
948 log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
949 }
950 }
951 }
952
953 break;
954
955 case SOURCE_CHILD:
956 if (event_pid_changed(s->event))
957 s->child.process_owned = false;
958
959 if (s->child.pid > 0) {
960 if (event_source_is_online(s)) {
961 assert(s->event->n_online_child_sources > 0);
962 s->event->n_online_child_sources--;
963 }
964
965 (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
966 }
967
968 if (EVENT_SOURCE_WATCH_PIDFD(s))
969 source_child_pidfd_unregister(s);
970 else
971 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
972
973 break;
974
975 case SOURCE_DEFER:
976 /* nothing */
977 break;
978
979 case SOURCE_POST:
980 set_remove(s->event->post_sources, s);
981 break;
982
983 case SOURCE_EXIT:
984 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
985 break;
986
987 case SOURCE_INOTIFY: {
988 struct inode_data *inode_data;
989
990 inode_data = s->inotify.inode_data;
991 if (inode_data) {
992 struct inotify_data *inotify_data;
993 assert_se(inotify_data = inode_data->inotify_data);
994
995 /* Detach this event source from the inode object */
996 LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
997 s->inotify.inode_data = NULL;
998
999 if (s->pending) {
1000 assert(inotify_data->n_pending > 0);
1001 inotify_data->n_pending--;
1002 }
1003
1004 /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is
1005 * continued to being watched. That's because inotify doesn't really have an API for that: we
1006 * can only change watch masks with access to the original inode either by fd or by path. But
1007 * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
1008 * continuously and keeping the mount busy which we can't really do. We could reconstruct the
1009 * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
1010 * there), but given the need for open_by_handle_at() which is privileged and not universally
1011 * available this would be quite an incomplete solution. Hence we go the other way, leave the
1012 * mask set, even if it is not minimized now, and ignore all events we aren't interested in
1013 * anymore after reception. Yes, this sucks, but … Linux … */
1014
1015 /* Maybe release the inode data (and its inotify) */
1016 event_gc_inode_data(s->event, inode_data);
1017 }
1018
1019 break;
1020 }
1021
1022 case SOURCE_MEMORY_PRESSURE:
1023 source_memory_pressure_remove_from_write_list(s);
1024 source_memory_pressure_unregister(s);
1025 break;
1026
1027 default:
1028 assert_not_reached();
1029 }
1030
1031 if (s->pending)
1032 prioq_remove(s->event->pending, s, &s->pending_index);
1033
1034 if (s->prepare)
1035 prioq_remove(s->event->prepare, s, &s->prepare_index);
1036
1037 if (s->ratelimited)
1038 event_source_time_prioq_remove(s, &s->event->monotonic);
1039
1040 event = TAKE_PTR(s->event);
1041 LIST_REMOVE(sources, event->sources, s);
1042 event->n_sources--;
1043
1044 /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
1045 * pidfd associated with this event source, which we'll do only on source_free(). */
1046
1047 if (!s->floating)
1048 sd_event_unref(event);
1049 }
1050
1051 static sd_event_source* source_free(sd_event_source *s) {
1052 assert(s);
1053
1054 source_disconnect(s);
1055
1056 if (s->type == SOURCE_IO && s->io.owned)
1057 s->io.fd = safe_close(s->io.fd);
1058
1059 if (s->type == SOURCE_CHILD) {
1060 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
1061
1062 if (s->child.process_owned) {
1063
1064 if (!s->child.exited) {
1065 bool sent = false;
1066
1067 if (s->child.pidfd >= 0) {
1068 if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
1069 if (errno == ESRCH) /* Already dead */
1070 sent = true;
1071 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
1072 log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
1073 s->child.pid);
1074 } else
1075 sent = true;
1076 }
1077
1078 if (!sent)
1079 if (kill(s->child.pid, SIGKILL) < 0)
1080 if (errno != ESRCH) /* Already dead */
1081 log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
1082 s->child.pid);
1083 }
1084
1085 if (!s->child.waited) {
1086 siginfo_t si = {};
1087
1088 /* Reap the child if we can */
1089 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
1090 }
1091 }
1092
1093 if (s->child.pidfd_owned)
1094 s->child.pidfd = safe_close(s->child.pidfd);
1095 }
1096
1097 if (s->type == SOURCE_MEMORY_PRESSURE) {
1098 s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
1099 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
1100 }
1101
1102 if (s->destroy_callback)
1103 s->destroy_callback(s->userdata);
1104
1105 free(s->description);
1106 return mfree(s);
1107 }
1108 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
1109
1110 static int source_set_pending(sd_event_source *s, bool b) {
1111 int r;
1112
1113 assert(s);
1114 assert(s->type != SOURCE_EXIT);
1115
1116 if (s->pending == b)
1117 return 0;
1118
1119 s->pending = b;
1120
1121 if (b) {
1122 s->pending_iteration = s->event->iteration;
1123
1124 r = prioq_put(s->event->pending, s, &s->pending_index);
1125 if (r < 0) {
1126 s->pending = false;
1127 return r;
1128 }
1129 } else
1130 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1131
1132 if (EVENT_SOURCE_IS_TIME(s->type))
1133 event_source_time_prioq_reshuffle(s);
1134
1135 if (s->type == SOURCE_SIGNAL && !b) {
1136 struct signal_data *d;
1137
1138 d = hashmap_get(s->event->signal_data, &s->priority);
1139 if (d && d->current == s)
1140 d->current = NULL;
1141 }
1142
1143 if (s->type == SOURCE_INOTIFY) {
1144
1145 assert(s->inotify.inode_data);
1146 assert(s->inotify.inode_data->inotify_data);
1147
1148 if (b)
1149 s->inotify.inode_data->inotify_data->n_pending ++;
1150 else {
1151 assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1152 s->inotify.inode_data->inotify_data->n_pending --;
1153 }
1154 }
1155
1156 return 1;
1157 }
1158
1159 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1160
1161 /* Let's allocate exactly what we need. Note that the difference of the smallest event source
1162 * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
1163 * lines. */
1164 static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
1165 [SOURCE_IO] = endoffsetof_field(sd_event_source, io),
1166 [SOURCE_TIME_REALTIME] = endoffsetof_field(sd_event_source, time),
1167 [SOURCE_TIME_BOOTTIME] = endoffsetof_field(sd_event_source, time),
1168 [SOURCE_TIME_MONOTONIC] = endoffsetof_field(sd_event_source, time),
1169 [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1170 [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1171 [SOURCE_SIGNAL] = endoffsetof_field(sd_event_source, signal),
1172 [SOURCE_CHILD] = endoffsetof_field(sd_event_source, child),
1173 [SOURCE_DEFER] = endoffsetof_field(sd_event_source, defer),
1174 [SOURCE_POST] = endoffsetof_field(sd_event_source, post),
1175 [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit),
1176 [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify),
1177 [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure),
1178 };
1179
1180 sd_event_source *s;
1181
1182 assert(e);
1183 assert(type >= 0);
1184 assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
1185 assert(size_table[type] > 0);
1186
1187 /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
1188 * size, even if we only allocate the initial part we need. */
1189 s = expand_to_usable(malloc0(size_table[type]), sizeof(sd_event_source));
1190 if (!s)
1191 return NULL;
1192
1193 /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
1194 * than what we allocated here. */
1195 s->n_ref = 1;
1196 s->event = e;
1197 s->floating = floating;
1198 s->type = type;
1199 s->pending_index = PRIOQ_IDX_NULL;
1200 s->prepare_index = PRIOQ_IDX_NULL;
1201
1202 if (!floating)
1203 sd_event_ref(e);
1204
1205 LIST_PREPEND(sources, e->sources, s);
1206 e->n_sources++;
1207
1208 return s;
1209 }
1210
1211 static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1212 assert(s);
1213
1214 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1215 }
1216
1217 _public_ int sd_event_add_io(
1218 sd_event *e,
1219 sd_event_source **ret,
1220 int fd,
1221 uint32_t events,
1222 sd_event_io_handler_t callback,
1223 void *userdata) {
1224
1225 _cleanup_(source_freep) sd_event_source *s = NULL;
1226 int r;
1227
1228 assert_return(e, -EINVAL);
1229 assert_return(e = event_resolve(e), -ENOPKG);
1230 assert_return(fd >= 0, -EBADF);
1231 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1232 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1233 assert_return(!event_pid_changed(e), -ECHILD);
1234
1235 if (!callback)
1236 callback = io_exit_callback;
1237
1238 s = source_new(e, !ret, SOURCE_IO);
1239 if (!s)
1240 return -ENOMEM;
1241
1242 s->wakeup = WAKEUP_EVENT_SOURCE;
1243 s->io.fd = fd;
1244 s->io.events = events;
1245 s->io.callback = callback;
1246 s->userdata = userdata;
1247 s->enabled = SD_EVENT_ON;
1248
1249 r = source_io_register(s, s->enabled, events);
1250 if (r < 0)
1251 return r;
1252
1253 if (ret)
1254 *ret = s;
1255 TAKE_PTR(s);
1256
1257 return 0;
1258 }
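
/* Illustrative sketch for the call above (the fd and helper are made up, error handling abridged): watch a
 * socket for input; passing NULL for ret makes the source "floating", i.e. owned by the event loop.
 *
 *     static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             if (revents & EPOLLIN)
 *                     (void) read_and_process(fd);   // hypothetical helper
 *             return 0;
 *     }
 *
 *     assert_se(sd_event_add_io(e, NULL, sock_fd, EPOLLIN, on_io, NULL) >= 0);
 */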
1259
1260 static void initialize_perturb(sd_event *e) {
1261 sd_id128_t id = {};
1262
1263 /* When we sleep for longer, we try to realign the wakeup to the same time within each
1264 * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
1265 * wakeup. However, let's take some system-specific randomness for this value, so that in a network
1266 * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
1267 * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */
1268
1269 if (_likely_(e->perturb != USEC_INFINITY))
1270 return;
1271
1272         if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
1273 e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
1274 else
1275 e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
1276 }
1277
1278 static int event_setup_timer_fd(
1279 sd_event *e,
1280 struct clock_data *d,
1281 clockid_t clock) {
1282
1283 assert(e);
1284 assert(d);
1285
1286 if (_likely_(d->fd >= 0))
1287 return 0;
1288
1289 _cleanup_close_ int fd = -EBADF;
1290
1291 fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1292 if (fd < 0)
1293 return -errno;
1294
1295 fd = fd_move_above_stdio(fd);
1296
1297 struct epoll_event ev = {
1298 .events = EPOLLIN,
1299 .data.ptr = d,
1300 };
1301
1302 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1303 return -errno;
1304
1305 d->fd = TAKE_FD(fd);
1306 return 0;
1307 }
1308
1309 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1310 assert(s);
1311
1312 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1313 }
1314
1315 static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1316 int r;
1317
1318 assert(d);
1319
1320 if (d->fd < 0) {
1321 r = event_setup_timer_fd(e, d, clock);
1322 if (r < 0)
1323 return r;
1324 }
1325
1326 r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1327 if (r < 0)
1328 return r;
1329
1330 r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1331 if (r < 0)
1332 return r;
1333
1334 return 0;
1335 }
1336
1337 static int event_source_time_prioq_put(
1338 sd_event_source *s,
1339 struct clock_data *d) {
1340
1341 int r;
1342
1343 assert(s);
1344 assert(d);
1345 assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1346
1347 r = prioq_put(d->earliest, s, &s->earliest_index);
1348 if (r < 0)
1349 return r;
1350
1351 r = prioq_put(d->latest, s, &s->latest_index);
1352 if (r < 0) {
1353 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1354 s->earliest_index = PRIOQ_IDX_NULL;
1355 return r;
1356 }
1357
1358 d->needs_rearm = true;
1359 return 0;
1360 }
1361
1362 _public_ int sd_event_add_time(
1363 sd_event *e,
1364 sd_event_source **ret,
1365 clockid_t clock,
1366 uint64_t usec,
1367 uint64_t accuracy,
1368 sd_event_time_handler_t callback,
1369 void *userdata) {
1370
1371 EventSourceType type;
1372 _cleanup_(source_freep) sd_event_source *s = NULL;
1373 struct clock_data *d;
1374 int r;
1375
1376 assert_return(e, -EINVAL);
1377 assert_return(e = event_resolve(e), -ENOPKG);
1378 assert_return(accuracy != UINT64_MAX, -EINVAL);
1379 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1380 assert_return(!event_pid_changed(e), -ECHILD);
1381
1382 if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1383 return -EOPNOTSUPP;
1384
1385 type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1386 if (type < 0)
1387 return -EOPNOTSUPP;
1388
1389 if (!callback)
1390 callback = time_exit_callback;
1391
1392 assert_se(d = event_get_clock_data(e, type));
1393
1394 r = setup_clock_data(e, d, clock);
1395 if (r < 0)
1396 return r;
1397
1398 s = source_new(e, !ret, type);
1399 if (!s)
1400 return -ENOMEM;
1401
1402 s->time.next = usec;
1403 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1404 s->time.callback = callback;
1405 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1406 s->userdata = userdata;
1407 s->enabled = SD_EVENT_ONESHOT;
1408
1409 r = event_source_time_prioq_put(s, d);
1410 if (r < 0)
1411 return r;
1412
1413 if (ret)
1414 *ret = s;
1415 TAKE_PTR(s);
1416
1417 return 0;
1418 }
1419
1420 _public_ int sd_event_add_time_relative(
1421 sd_event *e,
1422 sd_event_source **ret,
1423 clockid_t clock,
1424 uint64_t usec,
1425 uint64_t accuracy,
1426 sd_event_time_handler_t callback,
1427 void *userdata) {
1428
1429 usec_t t;
1430 int r;
1431
1432 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1433 * checks for overflow. */
1434
1435 r = sd_event_now(e, clock, &t);
1436 if (r < 0)
1437 return r;
1438
1439 if (usec >= USEC_INFINITY - t)
1440 return -EOVERFLOW;
1441
1442 return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1443 }
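
/* Illustrative sketch (values made up, error handling abridged): fire once, five seconds from now, with one
 * second of coalescing accuracy. An accuracy of 0 would select DEFAULT_ACCURACY_USEC instead.
 *
 *     static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *             log_info("timer elapsed");
 *             return 0;
 *     }
 *
 *     assert_se(sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC,
 *                                          5 * USEC_PER_SEC, USEC_PER_SEC,
 *                                          on_timer, NULL) >= 0);
 */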
1444
1445 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1446 assert(s);
1447
1448 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1449 }
1450
1451 _public_ int sd_event_add_signal(
1452 sd_event *e,
1453 sd_event_source **ret,
1454 int sig,
1455 sd_event_signal_handler_t callback,
1456 void *userdata) {
1457
1458 _cleanup_(source_freep) sd_event_source *s = NULL;
1459 struct signal_data *d;
1460 sigset_t new_ss;
1461 bool block_it;
1462 int r;
1463
1464 assert_return(e, -EINVAL);
1465 assert_return(e = event_resolve(e), -ENOPKG);
1466 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1467 assert_return(!event_pid_changed(e), -ECHILD);
1468
1469 /* Let's make sure our special flag stays outside of the valid signal range */
1470 assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);
1471
1472 if (sig & SD_EVENT_SIGNAL_PROCMASK) {
1473 sig &= ~SD_EVENT_SIGNAL_PROCMASK;
1474 assert_return(SIGNAL_VALID(sig), -EINVAL);
1475
1476 block_it = true;
1477 } else {
1478 assert_return(SIGNAL_VALID(sig), -EINVAL);
1479
1480 r = signal_is_blocked(sig);
1481 if (r < 0)
1482 return r;
1483 if (r == 0)
1484 return -EBUSY;
1485
1486 block_it = false;
1487 }
1488
1489 if (!callback)
1490 callback = signal_exit_callback;
1491
1492 if (!e->signal_sources) {
1493 e->signal_sources = new0(sd_event_source*, _NSIG);
1494 if (!e->signal_sources)
1495 return -ENOMEM;
1496 } else if (e->signal_sources[sig])
1497 return -EBUSY;
1498
1499 s = source_new(e, !ret, SOURCE_SIGNAL);
1500 if (!s)
1501 return -ENOMEM;
1502
1503 s->signal.sig = sig;
1504 s->signal.callback = callback;
1505 s->userdata = userdata;
1506 s->enabled = SD_EVENT_ON;
1507
1508 e->signal_sources[sig] = s;
1509
1510 if (block_it) {
1511 sigset_t old_ss;
1512
1513 if (sigemptyset(&new_ss) < 0)
1514 return -errno;
1515
1516 if (sigaddset(&new_ss, sig) < 0)
1517 return -errno;
1518
1519 r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
1520 if (r != 0)
1521 return -r;
1522
1523 r = sigismember(&old_ss, sig);
1524 if (r < 0)
1525 return -errno;
1526
1527 s->signal.unblock = !r;
1528 } else
1529 s->signal.unblock = false;
1530
1531 r = event_make_signal_data(e, sig, &d);
1532 if (r < 0) {
1533 if (s->signal.unblock)
1534 (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
1535
1536 return r;
1537 }
1538
1539 /* Use the signal name as description for the event source by default */
1540 (void) sd_event_source_set_description(s, signal_to_string(sig));
1541
1542 if (ret)
1543 *ret = s;
1544 TAKE_PTR(s);
1545
1546 return 0;
1547 }
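
/* Illustrative sketch (error handling abridged): handle SIGTERM and let sd-event block the signal for us via
 * the SD_EVENT_SIGNAL_PROCMASK flag, instead of adjusting the signal mask manually beforehand. With a NULL
 * callback the signal would simply exit the loop (see signal_exit_callback() above).
 *
 *     static int on_sigterm(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     assert_se(sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, on_sigterm, NULL) >= 0);
 */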
1548
1549 static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1550 assert(s);
1551
1552 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1553 }
1554
1555 static bool shall_use_pidfd(void) {
1556 /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
1557 return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
1558 }
1559
1560 _public_ int sd_event_add_child(
1561 sd_event *e,
1562 sd_event_source **ret,
1563 pid_t pid,
1564 int options,
1565 sd_event_child_handler_t callback,
1566 void *userdata) {
1567
1568 _cleanup_(source_freep) sd_event_source *s = NULL;
1569 int r;
1570
1571 assert_return(e, -EINVAL);
1572 assert_return(e = event_resolve(e), -ENOPKG);
1573 assert_return(pid > 1, -EINVAL);
1574 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1575 assert_return(options != 0, -EINVAL);
1576 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1577 assert_return(!event_pid_changed(e), -ECHILD);
1578
1579 if (!callback)
1580 callback = child_exit_callback;
1581
1582 if (e->n_online_child_sources == 0) {
1583 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1584                  * for compatibility with pre-pidfd and because we don't want to reap the child processes
1585 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1586 * take effect.
1587 *
1588 * (As an optimization we only do this check on the first child event source created.) */
1589 r = signal_is_blocked(SIGCHLD);
1590 if (r < 0)
1591 return r;
1592 if (r == 0)
1593 return -EBUSY;
1594 }
1595
1596 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1597 if (r < 0)
1598 return r;
1599
1600 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1601 return -EBUSY;
1602
1603 s = source_new(e, !ret, SOURCE_CHILD);
1604 if (!s)
1605 return -ENOMEM;
1606
1607 s->wakeup = WAKEUP_EVENT_SOURCE;
1608 s->child.options = options;
1609 s->child.callback = callback;
1610 s->userdata = userdata;
1611 s->enabled = SD_EVENT_ONESHOT;
1612
1613 /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
1614 * pin the PID, and make regular waitid() handling race-free. */
1615
1616 if (shall_use_pidfd()) {
1617 s->child.pidfd = pidfd_open(pid, 0);
1618 if (s->child.pidfd < 0) {
1619 /* Propagate errors unless the syscall is not supported or blocked */
1620 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
1621 return -errno;
1622 } else
1623 s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
1624 } else
1625 s->child.pidfd = -EBADF;
1626
1627 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1628 /* We have a pidfd and we only want to watch for exit */
1629 r = source_child_pidfd_register(s, s->enabled);
1630 if (r < 0)
1631 return r;
1632
1633 } else {
1634 /* We have no pidfd or we shall wait for some other event than WEXITED */
1635 r = event_make_signal_data(e, SIGCHLD, NULL);
1636 if (r < 0)
1637 return r;
1638
1639 e->need_process_child = true;
1640 }
1641
1642 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1643 if (r < 0)
1644 return r;
1645
1646 /* These must be done after everything succeeds. */
1647 s->child.pid = pid;
1648 e->n_online_child_sources++;
1649
1650 if (ret)
1651 *ret = s;
1652 TAKE_PTR(s);
1653 return 0;
1654 }
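
/* Illustrative sketch (the pid is made up, error handling abridged): SIGCHLD must already be blocked before
 * the first child source is added, even on pidfd-capable kernels (see the check above).
 *
 *     static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *             log_info("child " PID_FMT " exited with status %i", si->si_pid, si->si_status);
 *             return 0;
 *     }
 *
 *     sigset_t ss;
 *     assert_se(sigemptyset(&ss) >= 0 && sigaddset(&ss, SIGCHLD) >= 0);
 *     assert_se(sigprocmask(SIG_BLOCK, &ss, NULL) >= 0);
 *     assert_se(sd_event_add_child(e, NULL, child_pid, WEXITED, on_child, NULL) >= 0);
 */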
1655
1656 _public_ int sd_event_add_child_pidfd(
1657 sd_event *e,
1658 sd_event_source **ret,
1659 int pidfd,
1660 int options,
1661 sd_event_child_handler_t callback,
1662 void *userdata) {
1663
1664
1665 _cleanup_(source_freep) sd_event_source *s = NULL;
1666 pid_t pid;
1667 int r;
1668
1669 assert_return(e, -EINVAL);
1670 assert_return(e = event_resolve(e), -ENOPKG);
1671 assert_return(pidfd >= 0, -EBADF);
1672 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1673 assert_return(options != 0, -EINVAL);
1674 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1675 assert_return(!event_pid_changed(e), -ECHILD);
1676
1677 if (!callback)
1678 callback = child_exit_callback;
1679
1680 if (e->n_online_child_sources == 0) {
1681 r = signal_is_blocked(SIGCHLD);
1682 if (r < 0)
1683 return r;
1684 if (r == 0)
1685 return -EBUSY;
1686 }
1687
1688 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1689 if (r < 0)
1690 return r;
1691
1692 r = pidfd_get_pid(pidfd, &pid);
1693 if (r < 0)
1694 return r;
1695
1696 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1697 return -EBUSY;
1698
1699 s = source_new(e, !ret, SOURCE_CHILD);
1700 if (!s)
1701 return -ENOMEM;
1702
1703 s->wakeup = WAKEUP_EVENT_SOURCE;
1704 s->child.pidfd = pidfd;
1705 s->child.pid = pid;
1706 s->child.options = options;
1707 s->child.callback = callback;
1708 s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1709 s->userdata = userdata;
1710 s->enabled = SD_EVENT_ONESHOT;
1711
1712 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1713 if (r < 0)
1714 return r;
1715
1716 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1717 /* We only want to watch for WEXITED */
1718 r = source_child_pidfd_register(s, s->enabled);
1719 if (r < 0)
1720 return r;
1721 } else {
1722 /* We shall wait for some other event than WEXITED */
1723 r = event_make_signal_data(e, SIGCHLD, NULL);
1724 if (r < 0)
1725 return r;
1726
1727 e->need_process_child = true;
1728 }
1729
1730 e->n_online_child_sources++;
1731
1732 if (ret)
1733 *ret = s;
1734 TAKE_PTR(s);
1735 return 0;
1736 }
1737
1738 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1739 assert(s);
1740
1741 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1742 }
1743
1744 _public_ int sd_event_add_defer(
1745 sd_event *e,
1746 sd_event_source **ret,
1747 sd_event_handler_t callback,
1748 void *userdata) {
1749
1750 _cleanup_(source_freep) sd_event_source *s = NULL;
1751 int r;
1752
1753 assert_return(e, -EINVAL);
1754 assert_return(e = event_resolve(e), -ENOPKG);
1755 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1756 assert_return(!event_pid_changed(e), -ECHILD);
1757
1758 if (!callback)
1759 callback = generic_exit_callback;
1760
1761 s = source_new(e, !ret, SOURCE_DEFER);
1762 if (!s)
1763 return -ENOMEM;
1764
1765 s->defer.callback = callback;
1766 s->userdata = userdata;
1767 s->enabled = SD_EVENT_ONESHOT;
1768
1769 r = source_set_pending(s, true);
1770 if (r < 0)
1771 return r;
1772
1773 if (ret)
1774 *ret = s;
1775 TAKE_PTR(s);
1776
1777 return 0;
1778 }
1779
1780 _public_ int sd_event_add_post(
1781 sd_event *e,
1782 sd_event_source **ret,
1783 sd_event_handler_t callback,
1784 void *userdata) {
1785
1786 _cleanup_(source_freep) sd_event_source *s = NULL;
1787 int r;
1788
1789 assert_return(e, -EINVAL);
1790 assert_return(e = event_resolve(e), -ENOPKG);
1791 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1792 assert_return(!event_pid_changed(e), -ECHILD);
1793
1794 if (!callback)
1795 callback = generic_exit_callback;
1796
1797 s = source_new(e, !ret, SOURCE_POST);
1798 if (!s)
1799 return -ENOMEM;
1800
1801 s->post.callback = callback;
1802 s->userdata = userdata;
1803 s->enabled = SD_EVENT_ON;
1804
1805 r = set_ensure_put(&e->post_sources, NULL, s);
1806 if (r < 0)
1807 return r;
1808 assert(r > 0);
1809
1810 if (ret)
1811 *ret = s;
1812 TAKE_PTR(s);
1813
1814 return 0;
1815 }
1816
1817 _public_ int sd_event_add_exit(
1818 sd_event *e,
1819 sd_event_source **ret,
1820 sd_event_handler_t callback,
1821 void *userdata) {
1822
1823 _cleanup_(source_freep) sd_event_source *s = NULL;
1824 int r;
1825
1826 assert_return(e, -EINVAL);
1827 assert_return(e = event_resolve(e), -ENOPKG);
1828 assert_return(callback, -EINVAL);
1829 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1830 assert_return(!event_pid_changed(e), -ECHILD);
1831
1832 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1833 if (r < 0)
1834 return r;
1835
1836 s = source_new(e, !ret, SOURCE_EXIT);
1837 if (!s)
1838 return -ENOMEM;
1839
1840 s->exit.callback = callback;
1841 s->userdata = userdata;
1842 s->exit.prioq_index = PRIOQ_IDX_NULL;
1843 s->enabled = SD_EVENT_ONESHOT;
1844
1845 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1846 if (r < 0)
1847 return r;
1848
1849 if (ret)
1850 *ret = s;
1851 TAKE_PTR(s);
1852
1853 return 0;
1854 }
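
/* Illustrative sketch (the helper is made up, error handling abridged): exit sources are dispatched once
 * sd_event_exit() has been called, lowest priority value first, which makes them a convenient hook for
 * ordered cleanup work.
 *
 *     static int on_exit_cleanup(sd_event_source *s, void *userdata) {
 *             flush_caches(userdata);   // hypothetical cleanup helper
 *             return 0;
 *     }
 *
 *     assert_se(sd_event_add_exit(e, NULL, on_exit_cleanup, my_state) >= 0);
 */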
1855
1856 int sd_event_trim_memory(void) {
1857 int r;
1858
1859 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1860 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1861 * NULL callback parameter. */
1862
1863 log_debug("Memory pressure event, trimming malloc() memory.");
1864
1865 #if HAVE_GENERIC_MALLINFO
1866 generic_mallinfo before_mallinfo = generic_mallinfo_get();
1867 #endif
1868
1869 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1870 hashmap_trim_pools();
1871 r = malloc_trim(0);
1872 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1873
1874 if (r > 0)
1875 log_debug("Successfully trimmed some memory.");
1876 else
1877 log_debug("Couldn't trim any memory.");
1878
1879 usec_t period = after_timestamp - before_timestamp;
1880
1881 #if HAVE_GENERIC_MALLINFO
1882 generic_mallinfo after_mallinfo = generic_mallinfo_get();
1883 size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1884 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1885 log_struct(LOG_DEBUG,
1886 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1887 FORMAT_TIMESPAN(period, 0),
1888 FORMAT_BYTES(l)),
1889 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1890 "TRIMMED_BYTES=%zu", l,
1891 "TRIMMED_USEC=" USEC_FMT, period);
1892 #else
1893 log_struct(LOG_DEBUG,
1894 LOG_MESSAGE("Memory trimming took %s.",
1895 FORMAT_TIMESPAN(period, 0)),
1896 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1897 "TRIMMED_USEC=" USEC_FMT, period);
1898 #endif
1899
1900 return 0;
1901 }
1902
1903 static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1904 assert(s);
1905
1906 sd_event_trim_memory();
1907 return 0;
1908 }
1909
1910 _public_ int sd_event_add_memory_pressure(
1911 sd_event *e,
1912 sd_event_source **ret,
1913 sd_event_handler_t callback,
1914 void *userdata) {
1915
1916 _cleanup_free_ char *w = NULL;
1917 _cleanup_(source_freep) sd_event_source *s = NULL;
1918 _cleanup_close_ int path_fd = -1, fd = -1;
1919 _cleanup_free_ void *write_buffer = NULL;
1920         const char *watch, *watch_fallback = NULL, *env;
1921 size_t write_buffer_size = 0;
1922 struct stat st;
1923 uint32_t events;
1924 bool locked;
1925 int r;
1926
1927 assert_return(e, -EINVAL);
1928 assert_return(e = event_resolve(e), -ENOPKG);
1929 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1930 assert_return(!event_pid_changed(e), -ECHILD);
1931
1932 if (!callback)
1933 callback = memory_pressure_callback;
1934
1935 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1936 if (!s)
1937 return -ENOMEM;
1938
1939 s->wakeup = WAKEUP_EVENT_SOURCE;
1940 s->memory_pressure.callback = callback;
1941 s->userdata = userdata;
1942 s->enabled = SD_EVENT_ON;
1943 s->memory_pressure.fd = -EBADF;
1944
1945 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1946 if (env) {
1947 if (isempty(env) || path_equal(env, "/dev/null"))
1948 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1949 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1950
1951 if (!path_is_absolute(env) || !path_is_normalized(env))
1952 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1953 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1954
1955 watch = env;
1956
1957 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1958 if (env) {
1959 r = unbase64mem(env, SIZE_MAX, &write_buffer, &write_buffer_size);
1960 if (r < 0)
1961 return r;
1962 }
1963
1964 locked = true;
1965 } else {
1966
1967 r = is_pressure_supported();
1968 if (r < 0)
1969 return r;
1970 if (r == 0)
1971 return -EOPNOTSUPP;
1972
1973 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1974 * the system wide pressure if for some reason we cannot (which could be: memory controller
1975 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1976 * only use the system-wide logic. */
1977 r = cg_all_unified();
1978 if (r < 0)
1979 return r;
1980 if (r == 0)
1981 watch = "/proc/pressure/memory";
1982 else {
1983 _cleanup_free_ char *cg = NULL;
1984
1985 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
1986 if (r < 0)
1987 return r;
1988
1989 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
1990 if (!w)
1991 return -ENOMEM;
1992
1993 watch = w;
1994 watch_fallback = "/proc/pressure/memory";
1995 }
1996
1997 /* Android uses three levels in its userspace low memory killer logic:
1998 * some 70000 1000000
1999 * some 100000 1000000
2000 * full 70000 1000000
2001 *
2002 * GNOME's low memory monitor uses:
2003 * some 70000 1000000
2004 * some 100000 1000000
2005 * full 100000 1000000
2006 *
2007 * We'll default to the middle level that both agree on */
2008 if (asprintf((char**) &write_buffer,
2009 "%s " USEC_FMT " " USEC_FMT,
2010 MEMORY_PRESSURE_DEFAULT_TYPE,
2011 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2012 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2013 return -ENOMEM;
2014
2015 write_buffer_size = strlen(write_buffer) + 1;
2016 locked = false;
2017 }
2018
2019 path_fd = open(watch, O_PATH|O_CLOEXEC);
2020 if (path_fd < 0) {
2021 if (errno != ENOENT)
2022 return -errno;
2023
2024 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2025 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2026 * the PSI service apparently is not supported) */
2027 if (!watch_fallback)
2028 return locked ? -ENOENT : -EOPNOTSUPP;
2029
2030 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
2031 if (path_fd < 0 && errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2032 return -EOPNOTSUPP;
2033 if (path_fd < 0)
2034 return -errno;
2035 }
2036
2037 if (fstat(path_fd, &st) < 0)
2038 return -errno;
2039
2040 if (S_ISSOCK(st.st_mode)) {
2041 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2042 if (fd < 0)
2043 return -errno;
2044
2045 r = connect_unix_path(fd, path_fd, NULL);
2046 if (r < 0)
2047 return r;
2048
2049 events = EPOLLIN;
2050
2051 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2052 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2053 if (fd < 0)
2054 return fd;
2055
2056 if (S_ISREG(st.st_mode)) {
2057 struct statfs sfs;
2058
2059 /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
2060
2061 if (fstatfs(fd, &sfs) < 0)
2062 return -errno;
2063
2064 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2065 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2066 return -ENOTTY;
2067
2068 events = EPOLLPRI;
2069 } else
2070 /* For fifos and char devices just watch for EPOLLIN */
2071 events = EPOLLIN;
2072
2073 } else if (S_ISDIR(st.st_mode))
2074 return -EISDIR;
2075 else
2076 return -EBADF;
2077
2078 s->memory_pressure.fd = TAKE_FD(fd);
2079 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2080 s->memory_pressure.write_buffer_size = write_buffer_size;
2081 s->memory_pressure.events = events;
2082 s->memory_pressure.locked = locked;
2083
2084 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2085 * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2086 * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
2087 * event sources on which writes must be executed before the first event loop iteration is
2088 * executed. (We could also write the data here, right away, but we want to give the caller the
2089 * freedom to call sd_event_source_set_memory_pressure_type() and
2090 * sd_event_source_set_memory_pressure_period() before we write it.) */
2091
2092 if (s->memory_pressure.write_buffer_size > 0)
2093 source_memory_pressure_add_to_write_list(s);
2094 else {
2095 r = source_memory_pressure_register(s, s->enabled);
2096 if (r < 0)
2097 return r;
2098 }
2099
2100 if (ret)
2101 *ret = s;
2102 TAKE_PTR(s);
2103
2104 return 0;
2105 }
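/*
 * Illustrative usage sketch (not part of this translation unit): how a caller might hook up the
 * memory pressure logic implemented above. on_pressure() and attach_pressure() are made-up names
 * for the example; only the sd-event calls are real API.
 *
 *     static int on_pressure(sd_event_source *s, void *userdata) {
 *             sd_event_trim_memory();
 *             return 0;
 *     }
 *
 *     static int attach_pressure(sd_event *e) {
 *             sd_event_source *ps = NULL;
 *             int r;
 *
 *             r = sd_event_add_memory_pressure(e, &ps, on_pressure, NULL);
 *             if (r == -EHOSTDOWN || r == -EOPNOTSUPP)
 *                     return 0;  // explicitly disabled via the env var, or PSI not available
 *             if (r < 0)
 *                     return r;
 *
 *             // Optionally adjust the trigger before the first loop iteration, while the
 *             // write buffer is still unlocked (see the write-list handling above).
 *             return sd_event_source_set_memory_pressure_type(ps, "full");
 *     }
 */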
2106
2107 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2108 assert(e);
2109
2110 if (!d)
2111 return;
2112
2113 assert(hashmap_isempty(d->inodes));
2114 assert(hashmap_isempty(d->wd));
2115
2116 if (d->buffer_filled > 0)
2117 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
2118
2119 hashmap_free(d->inodes);
2120 hashmap_free(d->wd);
2121
2122 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2123
2124 if (d->fd >= 0) {
2125 if (!event_pid_changed(e) &&
2126 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
2127 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2128
2129 safe_close(d->fd);
2130 }
2131 free(d);
2132 }
2133
2134 static int event_make_inotify_data(
2135 sd_event *e,
2136 int64_t priority,
2137 struct inotify_data **ret) {
2138
2139 _cleanup_close_ int fd = -EBADF;
2140 struct inotify_data *d;
2141 int r;
2142
2143 assert(e);
2144
2145 d = hashmap_get(e->inotify_data, &priority);
2146 if (d) {
2147 if (ret)
2148 *ret = d;
2149 return 0;
2150 }
2151
2152 fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2153 if (fd < 0)
2154 return -errno;
2155
2156 fd = fd_move_above_stdio(fd);
2157
2158 d = new(struct inotify_data, 1);
2159 if (!d)
2160 return -ENOMEM;
2161
2162 *d = (struct inotify_data) {
2163 .wakeup = WAKEUP_INOTIFY_DATA,
2164 .fd = TAKE_FD(fd),
2165 .priority = priority,
2166 };
2167
2168 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
2169 if (r < 0) {
2170 d->fd = safe_close(d->fd);
2171 free(d);
2172 return r;
2173 }
2174
2175 struct epoll_event ev = {
2176 .events = EPOLLIN,
2177 .data.ptr = d,
2178 };
2179
2180 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2181 r = -errno;
2182 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2183 * remove the fd from the epoll first, which we don't want as we couldn't
2184 * add it in the first place. */
2185 event_free_inotify_data(e, d);
2186 return r;
2187 }
2188
2189 if (ret)
2190 *ret = d;
2191
2192 return 1;
2193 }
2194
2195 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
2196 int r;
2197
2198 assert(x);
2199 assert(y);
2200
2201 r = CMP(x->dev, y->dev);
2202 if (r != 0)
2203 return r;
2204
2205 return CMP(x->ino, y->ino);
2206 }
2207
2208 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2209 assert(d);
2210
2211 siphash24_compress(&d->dev, sizeof(d->dev), state);
2212 siphash24_compress(&d->ino, sizeof(d->ino), state);
2213 }
2214
2215 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
2216
2217 static void event_free_inode_data(
2218 sd_event *e,
2219 struct inode_data *d) {
2220
2221 assert(e);
2222
2223 if (!d)
2224 return;
2225
2226 assert(!d->event_sources);
2227
2228 if (d->fd >= 0) {
2229 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
2230 safe_close(d->fd);
2231 }
2232
2233 if (d->inotify_data) {
2234
2235 if (d->wd >= 0) {
2236 if (d->inotify_data->fd >= 0 && !event_pid_changed(e)) {
2237 /* So here's a problem. At the time this runs the watch descriptor might already be
2238 * invalidated, because an IN_IGNORED event might be queued right the moment we enter
2239 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
2240 * likely case to happen. */
2241
2242 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2243 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2244 }
2245
2246 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2247 }
2248
2249 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2250 }
2251
2252 free(d);
2253 }
2254
2255 static void event_gc_inotify_data(
2256 sd_event *e,
2257 struct inotify_data *d) {
2258
2259 assert(e);
2260
2261 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2262 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2263 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2264 * (under the expectation that the GC is called again once the counter is decremented). */
2265
2266 if (!d)
2267 return;
2268
2269 if (!hashmap_isempty(d->inodes))
2270 return;
2271
2272 if (d->n_busy > 0)
2273 return;
2274
2275 event_free_inotify_data(e, d);
2276 }
2277
2278 static void event_gc_inode_data(
2279 sd_event *e,
2280 struct inode_data *d) {
2281
2282 struct inotify_data *inotify_data;
2283
2284 assert(e);
2285
2286 if (!d)
2287 return;
2288
2289 if (d->event_sources)
2290 return;
2291
2292 inotify_data = d->inotify_data;
2293 event_free_inode_data(e, d);
2294
2295 event_gc_inotify_data(e, inotify_data);
2296 }
2297
2298 static int event_make_inode_data(
2299 sd_event *e,
2300 struct inotify_data *inotify_data,
2301 dev_t dev,
2302 ino_t ino,
2303 struct inode_data **ret) {
2304
2305 struct inode_data *d, key;
2306 int r;
2307
2308 assert(e);
2309 assert(inotify_data);
2310
2311 key = (struct inode_data) {
2312 .ino = ino,
2313 .dev = dev,
2314 };
2315
2316 d = hashmap_get(inotify_data->inodes, &key);
2317 if (d) {
2318 if (ret)
2319 *ret = d;
2320
2321 return 0;
2322 }
2323
2324 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2325 if (r < 0)
2326 return r;
2327
2328 d = new(struct inode_data, 1);
2329 if (!d)
2330 return -ENOMEM;
2331
2332 *d = (struct inode_data) {
2333 .dev = dev,
2334 .ino = ino,
2335 .wd = -1,
2336 .fd = -EBADF,
2337 .inotify_data = inotify_data,
2338 };
2339
2340 r = hashmap_put(inotify_data->inodes, d, d);
2341 if (r < 0) {
2342 free(d);
2343 return r;
2344 }
2345
2346 if (ret)
2347 *ret = d;
2348
2349 return 1;
2350 }
2351
2352 static uint32_t inode_data_determine_mask(struct inode_data *d) {
2353 bool excl_unlink = true;
2354 uint32_t combined = 0;
2355
2356 assert(d);
2357
2358 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2359 * the IN_EXCL_UNLINK flag is ANDed instead.
2360 *
2361 * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or oneshot. That's
2362 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
2363 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
2364 * events we don't care for client-side. */
2365
2366 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2367
2368 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2369 excl_unlink = false;
2370
2371 combined |= s->inotify.mask;
2372 }
2373
2374 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2375 }
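/*
 * Worked example for inode_data_determine_mask(), for illustration only: assume two event sources
 * watch the same inode, one with IN_MODIFY|IN_EXCL_UNLINK and one with IN_CREATE|IN_ONESHOT. Then:
 *
 *     excl_unlink = false                      // the second source lacks IN_EXCL_UNLINK
 *     combined    = IN_MODIFY|IN_EXCL_UNLINK|IN_CREATE|IN_ONESHOT
 *     result      = combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)
 *                 = IN_MODIFY|IN_CREATE
 *
 * i.e. the kernel watch is as wide as the union of all masks, IN_EXCL_UNLINK is only kept if every
 * source requested it, and the remaining modifier flags are handled client-side instead (IN_ONESHOT
 * via SD_EVENT_ONESHOT, IN_ONLYDIR/IN_DONT_FOLLOW at open() time in sd_event_add_inotify() below).
 */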
2376
2377 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2378 uint32_t combined_mask;
2379 int wd, r;
2380
2381 assert(d);
2382 assert(d->fd >= 0);
2383
2384 combined_mask = inode_data_determine_mask(d);
2385
2386 if (d->wd >= 0 && combined_mask == d->combined_mask)
2387 return 0;
2388
2389 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2390 if (r < 0)
2391 return r;
2392
2393 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2394 if (wd < 0)
2395 return wd;
2396
2397 if (d->wd < 0) {
2398 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2399 if (r < 0) {
2400 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2401 return r;
2402 }
2403
2404 d->wd = wd;
2405
2406 } else if (d->wd != wd) {
2407
2408 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2409 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2410 return -EINVAL;
2411 }
2412
2413 d->combined_mask = combined_mask;
2414 return 1;
2415 }
2416
2417 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2418 assert(s);
2419
2420 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2421 }
2422
2423 static int event_add_inotify_fd_internal(
2424 sd_event *e,
2425 sd_event_source **ret,
2426 int fd,
2427 bool donate,
2428 uint32_t mask,
2429 sd_event_inotify_handler_t callback,
2430 void *userdata) {
2431
2432 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
2433 _cleanup_(source_freep) sd_event_source *s = NULL;
2434 struct inotify_data *inotify_data = NULL;
2435 struct inode_data *inode_data = NULL;
2436 struct stat st;
2437 int r;
2438
2439 assert_return(e, -EINVAL);
2440 assert_return(e = event_resolve(e), -ENOPKG);
2441 assert_return(fd >= 0, -EBADF);
2442 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2443 assert_return(!event_pid_changed(e), -ECHILD);
2444
2445 if (!callback)
2446 callback = inotify_exit_callback;
2447
2448 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2449 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
2450 * callers may not request it themselves. */
2451 if (mask & IN_MASK_ADD)
2452 return -EINVAL;
2453
2454 if (fstat(fd, &st) < 0)
2455 return -errno;
2456
2457 s = source_new(e, !ret, SOURCE_INOTIFY);
2458 if (!s)
2459 return -ENOMEM;
2460
2461 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2462 s->inotify.mask = mask;
2463 s->inotify.callback = callback;
2464 s->userdata = userdata;
2465
2466 /* Allocate an inotify object for this priority, and an inode object within it */
2467 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2468 if (r < 0)
2469 return r;
2470
2471 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2472 if (r < 0) {
2473 event_gc_inotify_data(e, inotify_data);
2474 return r;
2475 }
2476
2477 /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2478 * the event source until then; for that we need the original inode. */
2479 if (inode_data->fd < 0) {
2480 if (donated_fd >= 0)
2481 inode_data->fd = TAKE_FD(donated_fd);
2482 else {
2483 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2484 if (inode_data->fd < 0) {
2485 r = -errno;
2486 event_gc_inode_data(e, inode_data);
2487 return r;
2488 }
2489 }
2490
2491 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
2492 }
2493
2494 /* Link our event source to the inode data object */
2495 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2496 s->inotify.inode_data = inode_data;
2497
2498 /* Actually realize the watch now */
2499 r = inode_data_realize_watch(e, inode_data);
2500 if (r < 0)
2501 return r;
2502
2503 if (ret)
2504 *ret = s;
2505 TAKE_PTR(s);
2506
2507 return 0;
2508 }
2509
2510 _public_ int sd_event_add_inotify_fd(
2511 sd_event *e,
2512 sd_event_source **ret,
2513 int fd,
2514 uint32_t mask,
2515 sd_event_inotify_handler_t callback,
2516 void *userdata) {
2517
2518 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2519 }
2520
2521 _public_ int sd_event_add_inotify(
2522 sd_event *e,
2523 sd_event_source **ret,
2524 const char *path,
2525 uint32_t mask,
2526 sd_event_inotify_handler_t callback,
2527 void *userdata) {
2528
2529 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2530 int fd, r;
2531
2532 assert_return(path, -EINVAL);
2533
2534 fd = open(path, O_PATH | O_CLOEXEC |
2535 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2536 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2537 if (fd < 0)
2538 return -errno;
2539
2540 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2541 if (r < 0)
2542 return r;
2543
2544 (void) sd_event_source_set_description(s, path);
2545
2546 if (ret)
2547 *ret = s;
2548
2549 return r;
2550 }
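/*
 * Illustrative usage sketch (not part of this file): watching a directory for new entries.
 * on_dir_event() and the path are made up for the example.
 *
 *     static int on_dir_event(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
 *             if ((ev->mask & (IN_CREATE|IN_MOVED_TO)) && ev->len > 0)
 *                     log_debug("New directory entry: %s", ev->name);
 *             return 0;
 *     }
 *
 *     r = sd_event_add_inotify(e, &s, "/run/mydir", IN_CREATE|IN_MOVED_TO|IN_ONLYDIR,
 *                              on_dir_event, NULL);
 *
 * Multiple such sources on the same inode share a single kernel watch, see
 * inode_data_determine_mask() above.
 */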
2551
2552 static sd_event_source* event_source_free(sd_event_source *s) {
2553 if (!s)
2554 return NULL;
2555
2556 /* Here's a special hack: when we are called from a
2557 * dispatch handler we won't free the event source
2558 * immediately, but we will detach the fd from the
2559 * epoll. This way it is safe for the caller to unref
2560 * the event source and immediately close the fd, but
2561 * we still retain a valid event source object after
2562 * the callback. */
2563
2564 if (s->dispatching)
2565 source_disconnect(s);
2566 else
2567 source_free(s);
2568
2569 return NULL;
2570 }
2571
2572 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2573
2574 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2575 assert_return(s, -EINVAL);
2576 assert_return(!event_pid_changed(s->event), -ECHILD);
2577
2578 return free_and_strdup(&s->description, description);
2579 }
2580
2581 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2582 assert_return(s, -EINVAL);
2583 assert_return(description, -EINVAL);
2584 assert_return(!event_pid_changed(s->event), -ECHILD);
2585
2586 if (!s->description)
2587 return -ENXIO;
2588
2589 *description = s->description;
2590 return 0;
2591 }
2592
2593 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2594 assert_return(s, NULL);
2595
2596 return s->event;
2597 }
2598
2599 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2600 assert_return(s, -EINVAL);
2601 assert_return(s->type != SOURCE_EXIT, -EDOM);
2602 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2603 assert_return(!event_pid_changed(s->event), -ECHILD);
2604
2605 return s->pending;
2606 }
2607
2608 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2609 assert_return(s, -EINVAL);
2610 assert_return(s->type == SOURCE_IO, -EDOM);
2611 assert_return(!event_pid_changed(s->event), -ECHILD);
2612
2613 return s->io.fd;
2614 }
2615
2616 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2617 int r;
2618
2619 assert_return(s, -EINVAL);
2620 assert_return(fd >= 0, -EBADF);
2621 assert_return(s->type == SOURCE_IO, -EDOM);
2622 assert_return(!event_pid_changed(s->event), -ECHILD);
2623
2624 if (s->io.fd == fd)
2625 return 0;
2626
2627 if (event_source_is_offline(s)) {
2628 s->io.fd = fd;
2629 s->io.registered = false;
2630 } else {
2631 int saved_fd;
2632
2633 saved_fd = s->io.fd;
2634 assert(s->io.registered);
2635
2636 s->io.fd = fd;
2637 s->io.registered = false;
2638
2639 r = source_io_register(s, s->enabled, s->io.events);
2640 if (r < 0) {
2641 s->io.fd = saved_fd;
2642 s->io.registered = true;
2643 return r;
2644 }
2645
2646 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2647 }
2648
2649 return 0;
2650 }
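/*
 * Illustrative sketch (not part of this file): reusing an IO event source across a reconnect by
 * swapping in the new fd. Note that, as implemented above, the old fd is merely removed from the
 * epoll; it is not closed here, so the caller still has to close it. swap_connection() is a
 * made-up helper name.
 *
 *     static int swap_connection(sd_event_source *s, int new_fd) {
 *             int old_fd, r;
 *
 *             old_fd = sd_event_source_get_io_fd(s);
 *
 *             r = sd_event_source_set_io_fd(s, new_fd);
 *             if (r < 0)
 *                     return r;
 *
 *             safe_close(old_fd);  // the source now references new_fd only
 *             return 0;
 *     }
 */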
2651
2652 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2653 assert_return(s, -EINVAL);
2654 assert_return(s->type == SOURCE_IO, -EDOM);
2655
2656 return s->io.owned;
2657 }
2658
2659 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2660 assert_return(s, -EINVAL);
2661 assert_return(s->type == SOURCE_IO, -EDOM);
2662
2663 s->io.owned = own;
2664 return 0;
2665 }
2666
2667 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2668 assert_return(s, -EINVAL);
2669 assert_return(events, -EINVAL);
2670 assert_return(s->type == SOURCE_IO, -EDOM);
2671 assert_return(!event_pid_changed(s->event), -ECHILD);
2672
2673 *events = s->io.events;
2674 return 0;
2675 }
2676
2677 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2678 int r;
2679
2680 assert_return(s, -EINVAL);
2681 assert_return(s->type == SOURCE_IO, -EDOM);
2682 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2683 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2684 assert_return(!event_pid_changed(s->event), -ECHILD);
2685
2686 /* edge-triggered updates are never skipped, so we can reset edges */
2687 if (s->io.events == events && !(events & EPOLLET))
2688 return 0;
2689
2690 r = source_set_pending(s, false);
2691 if (r < 0)
2692 return r;
2693
2694 if (event_source_is_online(s)) {
2695 r = source_io_register(s, s->enabled, events);
2696 if (r < 0)
2697 return r;
2698 }
2699
2700 s->io.events = events;
2701
2702 return 0;
2703 }
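/*
 * Typical usage pattern (illustrative, not part of this file): only subscribe to EPOLLOUT while
 * there is buffered output pending, and drop it again once the buffer drained, so that the loop
 * doesn't spin on an always-writable socket. have_output is a made-up parameter.
 *
 *     static int update_io_events(sd_event_source *s, bool have_output) {
 *             return sd_event_source_set_io_events(s, EPOLLIN | (have_output ? EPOLLOUT : 0));
 *     }
 */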
2704
2705 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2706 assert_return(s, -EINVAL);
2707 assert_return(revents, -EINVAL);
2708 assert_return(s->type == SOURCE_IO, -EDOM);
2709 assert_return(s->pending, -ENODATA);
2710 assert_return(!event_pid_changed(s->event), -ECHILD);
2711
2712 *revents = s->io.revents;
2713 return 0;
2714 }
2715
2716 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2717 assert_return(s, -EINVAL);
2718 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2719 assert_return(!event_pid_changed(s->event), -ECHILD);
2720
2721 return s->signal.sig;
2722 }
2723
2724 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2725 assert_return(s, -EINVAL);
2726 assert_return(!event_pid_changed(s->event), -ECHILD);
2727
2728 *priority = s->priority;
2729 return 0;
2730 }
2731
2732 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2733 bool rm_inotify = false, rm_inode = false;
2734 struct inotify_data *new_inotify_data = NULL;
2735 struct inode_data *new_inode_data = NULL;
2736 int r;
2737
2738 assert_return(s, -EINVAL);
2739 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2740 assert_return(!event_pid_changed(s->event), -ECHILD);
2741
2742 if (s->priority == priority)
2743 return 0;
2744
2745 if (s->type == SOURCE_INOTIFY) {
2746 struct inode_data *old_inode_data;
2747
2748 assert(s->inotify.inode_data);
2749 old_inode_data = s->inotify.inode_data;
2750
2751 /* We need the original fd to change the priority. If we don't have it, we can't change the priority
2752 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2753 * events we allow priority changes only until the first following iteration. */
2754 if (old_inode_data->fd < 0)
2755 return -EOPNOTSUPP;
2756
2757 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2758 if (r < 0)
2759 return r;
2760 rm_inotify = r > 0;
2761
2762 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2763 if (r < 0)
2764 goto fail;
2765 rm_inode = r > 0;
2766
2767 if (new_inode_data->fd < 0) {
2768 /* Duplicate the fd for the new inode object if we don't have any yet */
2769 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2770 if (new_inode_data->fd < 0) {
2771 r = -errno;
2772 goto fail;
2773 }
2774
2775 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
2776 }
2777
2778 /* Move the event source to the new inode data structure */
2779 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2780 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2781 s->inotify.inode_data = new_inode_data;
2782
2783 /* Now create the new watch */
2784 r = inode_data_realize_watch(s->event, new_inode_data);
2785 if (r < 0) {
2786 /* Move it back */
2787 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2788 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2789 s->inotify.inode_data = old_inode_data;
2790 goto fail;
2791 }
2792
2793 s->priority = priority;
2794
2795 event_gc_inode_data(s->event, old_inode_data);
2796
2797 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2798 struct signal_data *old, *d;
2799
2800 /* Move us from the signalfd belonging to the old
2801 * priority to the signalfd of the new priority */
2802
2803 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2804
2805 s->priority = priority;
2806
2807 r = event_make_signal_data(s->event, s->signal.sig, &d);
2808 if (r < 0) {
2809 s->priority = old->priority;
2810 return r;
2811 }
2812
2813 event_unmask_signal_data(s->event, old, s->signal.sig);
2814 } else
2815 s->priority = priority;
2816
2817 event_source_pp_prioq_reshuffle(s);
2818
2819 if (s->type == SOURCE_EXIT)
2820 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2821
2822 return 0;
2823
2824 fail:
2825 if (rm_inode)
2826 event_free_inode_data(s->event, new_inode_data);
2827
2828 if (rm_inotify)
2829 event_free_inotify_data(s->event, new_inotify_data);
2830
2831 return r;
2832 }
2833
2834 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2835 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2836 if (!s && !ret)
2837 return false;
2838
2839 assert_return(s, -EINVAL);
2840 assert_return(!event_pid_changed(s->event), -ECHILD);
2841
2842 if (ret)
2843 *ret = s->enabled;
2844
2845 return s->enabled != SD_EVENT_OFF;
2846 }
2847
2848 static int event_source_offline(
2849 sd_event_source *s,
2850 int enabled,
2851 bool ratelimited) {
2852
2853 bool was_offline;
2854 int r;
2855
2856 assert(s);
2857 assert(enabled == SD_EVENT_OFF || ratelimited);
2858
2859 /* Unset the pending flag when this event source is disabled */
2860 if (s->enabled != SD_EVENT_OFF &&
2861 enabled == SD_EVENT_OFF &&
2862 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2863 r = source_set_pending(s, false);
2864 if (r < 0)
2865 return r;
2866 }
2867
2868 was_offline = event_source_is_offline(s);
2869 s->enabled = enabled;
2870 s->ratelimited = ratelimited;
2871
2872 switch (s->type) {
2873
2874 case SOURCE_IO:
2875 source_io_unregister(s);
2876 break;
2877
2878 case SOURCE_SIGNAL:
2879 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2880 break;
2881
2882 case SOURCE_CHILD:
2883 if (!was_offline) {
2884 assert(s->event->n_online_child_sources > 0);
2885 s->event->n_online_child_sources--;
2886 }
2887
2888 if (EVENT_SOURCE_WATCH_PIDFD(s))
2889 source_child_pidfd_unregister(s);
2890 else
2891 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2892 break;
2893
2894 case SOURCE_EXIT:
2895 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2896 break;
2897
2898 case SOURCE_MEMORY_PRESSURE:
2899 source_memory_pressure_unregister(s);
2900 break;
2901
2902 case SOURCE_TIME_REALTIME:
2903 case SOURCE_TIME_BOOTTIME:
2904 case SOURCE_TIME_MONOTONIC:
2905 case SOURCE_TIME_REALTIME_ALARM:
2906 case SOURCE_TIME_BOOTTIME_ALARM:
2907 case SOURCE_DEFER:
2908 case SOURCE_POST:
2909 case SOURCE_INOTIFY:
2910 break;
2911
2912 default:
2913 assert_not_reached();
2914 }
2915
2916 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2917 event_source_time_prioq_reshuffle(s);
2918
2919 return 1;
2920 }
2921
2922 static int event_source_online(
2923 sd_event_source *s,
2924 int enabled,
2925 bool ratelimited) {
2926
2927 bool was_online;
2928 int r;
2929
2930 assert(s);
2931 assert(enabled != SD_EVENT_OFF || !ratelimited);
2932
2933 /* Unset the pending flag when this event source is enabled */
2934 if (s->enabled == SD_EVENT_OFF &&
2935 enabled != SD_EVENT_OFF &&
2936 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2937 r = source_set_pending(s, false);
2938 if (r < 0)
2939 return r;
2940 }
2941
2942 /* Are we really ready for onlining? */
2943 if (enabled == SD_EVENT_OFF || ratelimited) {
2944 /* Nope, we are not ready for onlining; just update the precise state and exit */
2945 s->enabled = enabled;
2946 s->ratelimited = ratelimited;
2947 return 0;
2948 }
2949
2950 was_online = event_source_is_online(s);
2951
2952 switch (s->type) {
2953 case SOURCE_IO:
2954 r = source_io_register(s, enabled, s->io.events);
2955 if (r < 0)
2956 return r;
2957 break;
2958
2959 case SOURCE_SIGNAL:
2960 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2961 if (r < 0) {
2962 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2963 return r;
2964 }
2965
2966 break;
2967
2968 case SOURCE_CHILD:
2969 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2970 /* yes, we have pidfd */
2971
2972 r = source_child_pidfd_register(s, enabled);
2973 if (r < 0)
2974 return r;
2975 } else {
2976 /* no pidfd, or something other to watch for than WEXITED */
2977
2978 r = event_make_signal_data(s->event, SIGCHLD, NULL);
2979 if (r < 0) {
2980 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2981 return r;
2982 }
2983 }
2984
2985 if (!was_online)
2986 s->event->n_online_child_sources++;
2987 break;
2988
2989 case SOURCE_MEMORY_PRESSURE:
2990 r = source_memory_pressure_register(s, enabled);
2991 if (r < 0)
2992 return r;
2993
2994 break;
2995
2996 case SOURCE_TIME_REALTIME:
2997 case SOURCE_TIME_BOOTTIME:
2998 case SOURCE_TIME_MONOTONIC:
2999 case SOURCE_TIME_REALTIME_ALARM:
3000 case SOURCE_TIME_BOOTTIME_ALARM:
3001 case SOURCE_EXIT:
3002 case SOURCE_DEFER:
3003 case SOURCE_POST:
3004 case SOURCE_INOTIFY:
3005 break;
3006
3007 default:
3008 assert_not_reached();
3009 }
3010
3011 s->enabled = enabled;
3012 s->ratelimited = ratelimited;
3013
3014 /* Non-failing operations below */
3015 if (s->type == SOURCE_EXIT)
3016 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
3017
3018 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3019 event_source_time_prioq_reshuffle(s);
3020
3021 return 1;
3022 }
3023
3024 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3025 int r;
3026
3027 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
3028
3029 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3030 if (m == SD_EVENT_OFF && !s)
3031 return 0;
3032
3033 assert_return(s, -EINVAL);
3034 assert_return(!event_pid_changed(s->event), -ECHILD);
3035
3036 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3037 if (s->event->state == SD_EVENT_FINISHED)
3038 return m == SD_EVENT_OFF ? 0 : -ESTALE;
3039
3040 if (s->enabled == m) /* No change? */
3041 return 0;
3042
3043 if (m == SD_EVENT_OFF)
3044 r = event_source_offline(s, m, s->ratelimited);
3045 else {
3046 if (s->enabled != SD_EVENT_OFF) {
3047 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3048 * event source is already enabled after all. */
3049 s->enabled = m;
3050 return 0;
3051 }
3052
3053 r = event_source_online(s, m, s->ratelimited);
3054 }
3055 if (r < 0)
3056 return r;
3057
3058 event_source_pp_prioq_reshuffle(s);
3059 return 0;
3060 }
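/*
 * Illustrative sketch (not part of this file): SD_EVENT_ONESHOT sources are switched to
 * SD_EVENT_OFF right before their callback runs (see source_dispatch() below), hence a handler
 * that wants to keep going simply re-arms itself. on_ready() is a made-up handler name.
 *
 *     static int on_ready(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             // ... handle one unit of work ...
 *
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);  // re-arm
 *     }
 */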
3061
3062 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
3063 assert_return(s, -EINVAL);
3064 assert_return(usec, -EINVAL);
3065 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3066 assert_return(!event_pid_changed(s->event), -ECHILD);
3067
3068 *usec = s->time.next;
3069 return 0;
3070 }
3071
3072 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
3073 int r;
3074
3075 assert_return(s, -EINVAL);
3076 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3077 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3078 assert_return(!event_pid_changed(s->event), -ECHILD);
3079
3080 r = source_set_pending(s, false);
3081 if (r < 0)
3082 return r;
3083
3084 s->time.next = usec;
3085
3086 event_source_time_prioq_reshuffle(s);
3087 return 0;
3088 }
3089
3090 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3091 usec_t t;
3092 int r;
3093
3094 assert_return(s, -EINVAL);
3095 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3096
3097 if (usec == USEC_INFINITY)
3098 return sd_event_source_set_time(s, USEC_INFINITY);
3099
3100 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3101 if (r < 0)
3102 return r;
3103
3104 usec = usec_add(t, usec);
3105 if (usec == USEC_INFINITY)
3106 return -EOVERFLOW;
3107
3108 return sd_event_source_set_time(s, usec);
3109 }
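/*
 * Illustrative sketch (not part of this file): a simple periodic timer built from a oneshot time
 * source that re-arms itself relative to "now" from within its own callback. on_tick() is a
 * made-up handler name.
 *
 *     static int on_tick(sd_event_source *s, uint64_t usec, void *userdata) {
 *             int r;
 *
 *             // ... periodic work ...
 *
 *             r = sd_event_source_set_time_relative(s, 5 * USEC_PER_SEC);
 *             if (r < 0)
 *                     return r;
 *
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
 *     }
 */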
3110
3111 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
3112 assert_return(s, -EINVAL);
3113 assert_return(usec, -EINVAL);
3114 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3115 assert_return(!event_pid_changed(s->event), -ECHILD);
3116
3117 *usec = s->time.accuracy;
3118 return 0;
3119 }
3120
3121 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
3122 int r;
3123
3124 assert_return(s, -EINVAL);
3125 assert_return(usec != UINT64_MAX, -EINVAL);
3126 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3127 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3128 assert_return(!event_pid_changed(s->event), -ECHILD);
3129
3130 r = source_set_pending(s, false);
3131 if (r < 0)
3132 return r;
3133
3134 if (usec == 0)
3135 usec = DEFAULT_ACCURACY_USEC;
3136
3137 s->time.accuracy = usec;
3138
3139 event_source_time_prioq_reshuffle(s);
3140 return 0;
3141 }
3142
3143 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3144 assert_return(s, -EINVAL);
3145 assert_return(clock, -EINVAL);
3146 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3147 assert_return(!event_pid_changed(s->event), -ECHILD);
3148
3149 *clock = event_source_type_to_clock(s->type);
3150 return 0;
3151 }
3152
3153 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
3154 assert_return(s, -EINVAL);
3155 assert_return(pid, -EINVAL);
3156 assert_return(s->type == SOURCE_CHILD, -EDOM);
3157 assert_return(!event_pid_changed(s->event), -ECHILD);
3158
3159 *pid = s->child.pid;
3160 return 0;
3161 }
3162
3163 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3164 assert_return(s, -EINVAL);
3165 assert_return(s->type == SOURCE_CHILD, -EDOM);
3166 assert_return(!event_pid_changed(s->event), -ECHILD);
3167
3168 if (s->child.pidfd < 0)
3169 return -EOPNOTSUPP;
3170
3171 return s->child.pidfd;
3172 }
3173
3174 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3175 assert_return(s, -EINVAL);
3176 assert_return(s->type == SOURCE_CHILD, -EDOM);
3177 assert_return(!event_pid_changed(s->event), -ECHILD);
3178 assert_return(SIGNAL_VALID(sig), -EINVAL);
3179
3180 /* If we have already seen an indication that the process exited, refuse sending a signal early. This way we
3181 * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3182 * available. */
3183 if (s->child.exited)
3184 return -ESRCH;
3185
3186 if (s->child.pidfd >= 0) {
3187 siginfo_t copy;
3188
3189 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
3190 * structure here */
3191 if (si)
3192 copy = *si;
3193
3194 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3195 /* Let's propagate the error only if the system call is not implemented or prohibited */
3196 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3197 return -errno;
3198 } else
3199 return 0;
3200 }
3201
3202 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3203 * this here. */
3204 if (flags != 0)
3205 return -EOPNOTSUPP;
3206
3207 if (si) {
3208 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3209 siginfo_t copy = *si;
3210
3211 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3212 return -errno;
3213 } else if (kill(s->child.pid, sig) < 0)
3214 return -errno;
3215
3216 return 0;
3217 }
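/*
 * Illustrative sketch (not part of this file): asking a tracked child to terminate. The call
 * prefers pidfd_send_signal() and falls back to kill(), as implemented above, so it is safe
 * against PID reuse where pidfds are available. child_source stands for a source previously
 * created with sd_event_add_child().
 *
 *     r = sd_event_source_send_child_signal(child_source, SIGTERM, NULL, 0);
 *     if (r == -ESRCH)
 *             log_debug("Child already exited, nothing to do.");
 *     else if (r < 0)
 *             return r;
 */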
3218
3219 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3220 assert_return(s, -EINVAL);
3221 assert_return(s->type == SOURCE_CHILD, -EDOM);
3222
3223 if (s->child.pidfd < 0)
3224 return -EOPNOTSUPP;
3225
3226 return s->child.pidfd_owned;
3227 }
3228
3229 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3230 assert_return(s, -EINVAL);
3231 assert_return(s->type == SOURCE_CHILD, -EDOM);
3232
3233 if (s->child.pidfd < 0)
3234 return -EOPNOTSUPP;
3235
3236 s->child.pidfd_owned = own;
3237 return 0;
3238 }
3239
3240 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3241 assert_return(s, -EINVAL);
3242 assert_return(s->type == SOURCE_CHILD, -EDOM);
3243
3244 return s->child.process_owned;
3245 }
3246
3247 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3248 assert_return(s, -EINVAL);
3249 assert_return(s->type == SOURCE_CHILD, -EDOM);
3250
3251 s->child.process_owned = own;
3252 return 0;
3253 }
3254
3255 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
3256 assert_return(s, -EINVAL);
3257 assert_return(mask, -EINVAL);
3258 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3259 assert_return(!event_pid_changed(s->event), -ECHILD);
3260
3261 *mask = s->inotify.mask;
3262 return 0;
3263 }
3264
3265 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
3266 int r;
3267
3268 assert_return(s, -EINVAL);
3269 assert_return(s->type != SOURCE_EXIT, -EDOM);
3270 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3271 assert_return(!event_pid_changed(s->event), -ECHILD);
3272
3273 if (s->prepare == callback)
3274 return 0;
3275
3276 if (callback && s->prepare) {
3277 s->prepare = callback;
3278 return 0;
3279 }
3280
3281 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3282 if (r < 0)
3283 return r;
3284
3285 s->prepare = callback;
3286
3287 if (callback) {
3288 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3289 if (r < 0)
3290 return r;
3291 } else
3292 prioq_remove(s->event->prepare, s, &s->prepare_index);
3293
3294 return 0;
3295 }
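/*
 * Illustrative sketch (not part of this file): a prepare callback runs right before the loop
 * polls and is a convenient place to adjust the watched IO events to the current buffer state.
 * struct connection, its outbuf_size field and io_source are made up for the example.
 *
 *     static int prepare_io(sd_event_source *s, void *userdata) {
 *             struct connection *c = userdata;
 *
 *             return sd_event_source_set_io_events(s, EPOLLIN | (c->outbuf_size > 0 ? EPOLLOUT : 0));
 *     }
 *
 *     r = sd_event_source_set_prepare(io_source, prepare_io);
 */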
3296
3297 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
3298 assert_return(s, NULL);
3299
3300 return s->userdata;
3301 }
3302
3303 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3304 void *ret;
3305
3306 assert_return(s, NULL);
3307
3308 ret = s->userdata;
3309 s->userdata = userdata;
3310
3311 return ret;
3312 }
3313
3314 static int event_source_enter_ratelimited(sd_event_source *s) {
3315 int r;
3316
3317 assert(s);
3318
3319 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
3320 * the end of the rate limit time window, much as if it was a timer event source. */
3321
3322 if (s->ratelimited)
3323 return 0; /* Already ratelimited, hence this is a NOP */
3324
3325 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3326 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3327 if (r < 0)
3328 return r;
3329
3330 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
3331 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
3332 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
3333 if (EVENT_SOURCE_IS_TIME(s->type))
3334 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3335
3336 /* Now, let's add the event source to the monotonic clock instead */
3337 r = event_source_time_prioq_put(s, &s->event->monotonic);
3338 if (r < 0)
3339 goto fail;
3340
3341 /* And let's take the event source officially offline */
3342 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3343 if (r < 0) {
3344 event_source_time_prioq_remove(s, &s->event->monotonic);
3345 goto fail;
3346 }
3347
3348 event_source_pp_prioq_reshuffle(s);
3349
3350 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3351 return 0;
3352
3353 fail:
3354 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
3355 * space for it should already be allocated. */
3356 if (EVENT_SOURCE_IS_TIME(s->type))
3357 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3358
3359 return r;
3360 }
3361
3362 static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
3363 int r;
3364
3365 assert(s);
3366
3367 if (!s->ratelimited)
3368 return 0;
3369
3370 /* Let's take the event source out of the monotonic prioq first. */
3371 event_source_time_prioq_remove(s, &s->event->monotonic);
3372
3373 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3374 if (EVENT_SOURCE_IS_TIME(s->type)) {
3375 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3376 if (r < 0)
3377 goto fail;
3378 }
3379
3380 /* Let's try to take it online again. */
3381 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3382 if (r < 0) {
3383 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3384 if (EVENT_SOURCE_IS_TIME(s->type))
3385 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3386
3387 goto fail;
3388 }
3389
3390 event_source_pp_prioq_reshuffle(s);
3391 ratelimit_reset(&s->rate_limit);
3392
3393 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3394
3395 if (run_callback && s->ratelimit_expire_callback) {
3396 s->dispatching = true;
3397 r = s->ratelimit_expire_callback(s, s->userdata);
3398 s->dispatching = false;
3399
3400 if (r < 0) {
3401 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3402 strna(s->description),
3403 event_source_type_to_string(s->type),
3404 s->exit_on_failure ? "exiting" : "disabling");
3405
3406 if (s->exit_on_failure)
3407 (void) sd_event_exit(s->event, r);
3408 }
3409
3410 if (s->n_ref == 0)
3411 source_free(s);
3412 else if (r < 0)
3413 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3414
3415 return 1;
3416 }
3417
3418 return 0;
3419
3420 fail:
3421 /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
3422 * simply put it back in it, maybe we can then process it more successfully next iteration. */
3423 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3424
3425 return r;
3426 }
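/*
 * Illustrative sketch (not part of this file): how the rate limit machinery above is driven from
 * the public API. A source limited to 10 dispatches per second goes offline once the budget is
 * used up and is brought back by process_timer() when the window ends; the optional expiry
 * callback is what event_source_leave_ratelimit() invokes above. on_ratelimit_expired is a
 * made-up sd_event_handler_t.
 *
 *     r = sd_event_source_set_ratelimit(s, USEC_PER_SEC, 10);
 *     if (r < 0)
 *             return r;
 *
 *     r = sd_event_source_set_ratelimit_expire_callback(s, on_ratelimit_expired);
 *     if (r < 0)
 *             return r;
 */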
3427
3428 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3429 usec_t c;
3430 assert(e);
3431 assert(a <= b);
3432
3433 if (a <= 0)
3434 return 0;
3435 if (a >= USEC_INFINITY)
3436 return USEC_INFINITY;
3437
3438 if (b <= a + 1)
3439 return a;
3440
3441 initialize_perturb(e);
3442
3443 /*
3444 Find a good time to wake up again between times a and b. We
3445 have two goals here:
3446
3447 a) We want to wake up as seldom as possible, hence prefer
3448 later times over earlier times.
3449
3450 b) But if we have to wake up, then let's make sure to
3451 dispatch as much as possible on the entire system.
3452
3453 We implement this by waking up everywhere at the same time
3454 within any given minute if we can, synchronised via the
3455 perturbation value determined from the boot ID. If we can't,
3456 then we try to find the same spot in every 10s, then 1s and
3457 then 250ms step. Otherwise, we pick the last possible time
3458 to wake up.
3459 */
3460
3461 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3462 if (c >= b) {
3463 if (_unlikely_(c < USEC_PER_MINUTE))
3464 return b;
3465
3466 c -= USEC_PER_MINUTE;
3467 }
3468
3469 if (c >= a)
3470 return c;
3471
3472 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3473 if (c >= b) {
3474 if (_unlikely_(c < USEC_PER_SEC*10))
3475 return b;
3476
3477 c -= USEC_PER_SEC*10;
3478 }
3479
3480 if (c >= a)
3481 return c;
3482
3483 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3484 if (c >= b) {
3485 if (_unlikely_(c < USEC_PER_SEC))
3486 return b;
3487
3488 c -= USEC_PER_SEC;
3489 }
3490
3491 if (c >= a)
3492 return c;
3493
3494 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3495 if (c >= b) {
3496 if (_unlikely_(c < USEC_PER_MSEC*250))
3497 return b;
3498
3499 c -= USEC_PER_MSEC*250;
3500 }
3501
3502 if (c >= a)
3503 return c;
3504
3505 return b;
3506 }
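/*
 * Worked example for sleep_between(), for illustration only: assume perturb = 13.5s (the per-boot
 * value is always below one minute) and a window of a = 1005s, b = 1012s on the relevant clock:
 *
 *     minute step: c = 960s  + 13.5s = 973.5s   -> below a, rejected
 *     10s step:    c = 1010s + 3.5s  = 1013.5s  -> >= b, minus 10s = 1003.5s, still below a
 *     1s step:     c = 1012s + 0.5s  = 1012.5s  -> >= b, minus 1s  = 1011.5s, within [a, b]
 *
 * so we'd wake at 1011.5s: other loops on the same boot resolve to the same ".5s" offset within
 * their second whenever their window allows it, which is what coalesces wakeups system-wide.
 */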
3507
3508 static int event_arm_timer(
3509 sd_event *e,
3510 struct clock_data *d) {
3511
3512 struct itimerspec its = {};
3513 sd_event_source *a, *b;
3514 usec_t t;
3515
3516 assert(e);
3517 assert(d);
3518
3519 if (!d->needs_rearm)
3520 return 0;
3521
3522 d->needs_rearm = false;
3523
3524 a = prioq_peek(d->earliest);
3525 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3526 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3527
3528 if (d->fd < 0)
3529 return 0;
3530
3531 if (d->next == USEC_INFINITY)
3532 return 0;
3533
3534 /* disarm */
3535 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3536 return -errno;
3537
3538 d->next = USEC_INFINITY;
3539 return 0;
3540 }
3541
3542 b = prioq_peek(d->latest);
3543 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3544 assert(b && b->enabled != SD_EVENT_OFF);
3545
3546 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3547 if (d->next == t)
3548 return 0;
3549
3550 assert_se(d->fd >= 0);
3551
3552 if (t == 0) {
3553 /* We don't want to disarm here, just mean some time looooong ago. */
3554 its.it_value.tv_sec = 0;
3555 its.it_value.tv_nsec = 1;
3556 } else
3557 timespec_store(&its.it_value, t);
3558
3559 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3560 return -errno;
3561
3562 d->next = t;
3563 return 0;
3564 }
3565
3566 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3567 assert(e);
3568 assert(s);
3569 assert(s->type == SOURCE_IO);
3570
3571 /* If the event source was already pending, we just OR in the
3572 * new revents, otherwise we reset the value. The ORing is
3573 * necessary to handle EPOLLONESHOT events properly where
3574 * readability might happen independently of writability, and
3575 * we need to keep track of both */
3576
3577 if (s->pending)
3578 s->io.revents |= revents;
3579 else
3580 s->io.revents = revents;
3581
3582 return source_set_pending(s, true);
3583 }
3584
3585 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3586 uint64_t x;
3587 ssize_t ss;
3588
3589 assert(e);
3590 assert(fd >= 0);
3591
3592 assert_return(events == EPOLLIN, -EIO);
3593
3594 ss = read(fd, &x, sizeof(x));
3595 if (ss < 0) {
3596 if (ERRNO_IS_TRANSIENT(errno))
3597 return 0;
3598
3599 return -errno;
3600 }
3601
3602 if (_unlikely_(ss != sizeof(x)))
3603 return -EIO;
3604
3605 if (next)
3606 *next = USEC_INFINITY;
3607
3608 return 0;
3609 }
3610
3611 static int process_timer(
3612 sd_event *e,
3613 usec_t n,
3614 struct clock_data *d) {
3615
3616 sd_event_source *s;
3617 bool callback_invoked = false;
3618 int r;
3619
3620 assert(e);
3621 assert(d);
3622
3623 for (;;) {
3624 s = prioq_peek(d->earliest);
3625 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3626
3627 if (!s || time_event_source_next(s) > n)
3628 break;
3629
3630 if (s->ratelimited) {
3631 /* This is an event source whose ratelimit window has ended. Let's turn it on
3632 * again. */
3633 assert(s->ratelimited);
3634
3635 r = event_source_leave_ratelimit(s, /* run_callback */ true);
3636 if (r < 0)
3637 return r;
3638 else if (r == 1)
3639 callback_invoked = true;
3640
3641 continue;
3642 }
3643
3644 if (s->enabled == SD_EVENT_OFF || s->pending)
3645 break;
3646
3647 r = source_set_pending(s, true);
3648 if (r < 0)
3649 return r;
3650
3651 event_source_time_prioq_reshuffle(s);
3652 }
3653
3654 return callback_invoked;
3655 }
3656
3657 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3658 int64_t min_priority = threshold;
3659 bool something_new = false;
3660 sd_event_source *s;
3661 int r;
3662
3663 assert(e);
3664 assert(ret_min_priority);
3665
3666 if (!e->need_process_child) {
3667 *ret_min_priority = min_priority;
3668 return 0;
3669 }
3670
3671 e->need_process_child = false;
3672
3673 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3674 * for, instead of using P_ALL. This is because we only want to get child information of very
3675 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3676 * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3677 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3678 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3679 * to handle SIGCHLD yourself.
3680 *
3681 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3682 * source is dispatched so that the callback still sees the process as a zombie. */
3683
3684 HASHMAP_FOREACH(s, e->child_sources) {
3685 assert(s->type == SOURCE_CHILD);
3686
3687 if (s->priority > threshold)
3688 continue;
3689
3690 if (s->pending)
3691 continue;
3692
3693 if (event_source_is_offline(s))
3694 continue;
3695
3696 if (s->child.exited)
3697 continue;
3698
3699 if (EVENT_SOURCE_WATCH_PIDFD(s))
3700 /* There's a usable pidfd known for this event source? Then don't waitid() for
3701 * it here */
3702 continue;
3703
3704 zero(s->child.siginfo);
3705 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3706 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3707 return negative_errno();
3708
3709 if (s->child.siginfo.si_pid != 0) {
3710 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3711
3712 if (zombie)
3713 s->child.exited = true;
3714
3715 if (!zombie && (s->child.options & WEXITED)) {
3716 /* If the child isn't dead then let's immediately remove the state
3717 * change from the queue, since there's no benefit in leaving it
3718 * queued. */
3719
3720 assert(s->child.options & (WSTOPPED|WCONTINUED));
3721 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3722 }
3723
3724 r = source_set_pending(s, true);
3725 if (r < 0)
3726 return r;
3727 if (r > 0) {
3728 something_new = true;
3729 min_priority = MIN(min_priority, s->priority);
3730 }
3731 }
3732 }
3733
3734 *ret_min_priority = min_priority;
3735 return something_new;
3736 }
3737
3738 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3739 assert(e);
3740 assert(s);
3741 assert(s->type == SOURCE_CHILD);
3742
3743 if (s->pending)
3744 return 0;
3745
3746 if (event_source_is_offline(s))
3747 return 0;
3748
3749 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3750 return 0;
3751
3752 zero(s->child.siginfo);
3753 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3754 return -errno;
3755
3756 if (s->child.siginfo.si_pid == 0)
3757 return 0;
3758
3759 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3760 s->child.exited = true;
3761
3762 return source_set_pending(s, true);
3763 }
3764
3765 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3766 int r;
3767
3768 assert(e);
3769 assert(d);
3770 assert_return(events == EPOLLIN, -EIO);
3771 assert(min_priority);
3772
3773 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3774 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3775 * per priority, and if we dequeue one, SIGCHLD might be enqueued later and we wouldn't know,
3776 * but we might have higher priority children we care about, hence we need to check that
3777 * explicitly. */
3778
3779 if (sigismember(&d->sigset, SIGCHLD))
3780 e->need_process_child = true;
3781
3782 /* If there's already an event source pending for this priority we don't read another */
3783 if (d->current)
3784 return 0;
3785
3786 for (;;) {
3787 struct signalfd_siginfo si;
3788 ssize_t n;
3789 sd_event_source *s = NULL;
3790
3791 n = read(d->fd, &si, sizeof(si));
3792 if (n < 0) {
3793 if (ERRNO_IS_TRANSIENT(errno))
3794 return 0;
3795
3796 return -errno;
3797 }
3798
3799 if (_unlikely_(n != sizeof(si)))
3800 return -EIO;
3801
3802 assert(SIGNAL_VALID(si.ssi_signo));
3803
3804 if (e->signal_sources)
3805 s = e->signal_sources[si.ssi_signo];
3806 if (!s)
3807 continue;
3808 if (s->pending)
3809 continue;
3810
3811 s->signal.siginfo = si;
3812 d->current = s;
3813
3814 r = source_set_pending(s, true);
3815 if (r < 0)
3816 return r;
3817 if (r > 0 && *min_priority >= s->priority) {
3818 *min_priority = s->priority;
3819 return 1; /* an event source with smaller priority is queued. */
3820 }
3821
3822 return 0;
3823 }
3824 }
3825
3826 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3827 ssize_t n;
3828
3829 assert(e);
3830 assert(d);
3831
3832 assert_return(revents == EPOLLIN, -EIO);
3833
3834 /* If there's already an event source pending for this priority, don't read another */
3835 if (d->n_pending > 0)
3836 return 0;
3837
3838 /* Is the read buffer non-empty? If so, let's not read more */
3839 if (d->buffer_filled > 0)
3840 return 0;
3841
3842 if (d->priority > threshold)
3843 return 0;
3844
3845 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3846 if (n < 0) {
3847 if (ERRNO_IS_TRANSIENT(errno))
3848 return 0;
3849
3850 return -errno;
3851 }
3852
3853 assert(n > 0);
3854 d->buffer_filled = (size_t) n;
3855 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
3856
3857 return 1;
3858 }
3859
3860 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3861 assert(e);
3862 assert(d);
3863 assert(sz <= d->buffer_filled);
3864
3865 if (sz == 0)
3866 return;
3867
3868 /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3869 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3870 d->buffer_filled -= sz;
3871
3872 if (d->buffer_filled == 0)
3873 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
3874 }
3875
3876 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3877 int r;
3878
3879 assert(e);
3880 assert(d);
3881
3882 /* If there's already an event source pending for this priority, don't read another */
3883 if (d->n_pending > 0)
3884 return 0;
3885
3886 while (d->buffer_filled > 0) {
3887 size_t sz;
3888
3889 /* Let's validate that the event structures are complete */
3890 if (d->buffer_filled < offsetof(struct inotify_event, name))
3891 return -EIO;
3892
3893 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3894 if (d->buffer_filled < sz)
3895 return -EIO;
3896
3897 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3898 struct inode_data *inode_data;
3899
3900 /* The queue overran, let's pass this event to all event sources connected to this inotify
3901 * object */
3902
3903 HASHMAP_FOREACH(inode_data, d->inodes)
3904 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3905
3906 if (event_source_is_offline(s))
3907 continue;
3908
3909 r = source_set_pending(s, true);
3910 if (r < 0)
3911 return r;
3912 }
3913 } else {
3914 struct inode_data *inode_data;
3915
3916 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3917 * our watch descriptor table. */
3918 if (d->buffer.ev.mask & IN_IGNORED) {
3919
3920 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3921 if (!inode_data) {
3922 event_inotify_data_drop(e, d, sz);
3923 continue;
3924 }
3925
3926 /* The watch descriptor was removed by the kernel, let's drop it here too */
3927 inode_data->wd = -1;
3928 } else {
3929 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3930 if (!inode_data) {
3931 event_inotify_data_drop(e, d, sz);
3932 continue;
3933 }
3934 }
3935
3936 /* Trigger all event sources that are interested in these events. Also trigger all event
3937 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3938 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3939
3940 if (event_source_is_offline(s))
3941 continue;
3942
3943 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3944 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3945 continue;
3946
3947 r = source_set_pending(s, true);
3948 if (r < 0)
3949 return r;
3950 }
3951 }
3952
3953 /* Something pending now? If so, let's finish, otherwise let's read more. */
3954 if (d->n_pending > 0)
3955 return 1;
3956 }
3957
3958 return 0;
3959 }
3960
3961 static int process_inotify(sd_event *e) {
3962 int r, done = 0;
3963
3964 assert(e);
3965
3966 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
3967 r = event_inotify_data_process(e, d);
3968 if (r < 0)
3969 return r;
3970 if (r > 0)
3971 done++;
3972 }
3973
3974 return done;
3975 }
3976
3977 static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
3978 assert(s);
3979 assert(s->type == SOURCE_MEMORY_PRESSURE);
3980
3981 if (s->pending)
3982 s->memory_pressure.revents |= revents;
3983 else
3984 s->memory_pressure.revents = revents;
3985
3986 return source_set_pending(s, true);
3987 }
3988
3989 static int source_memory_pressure_write(sd_event_source *s) {
3990 ssize_t n;
3991 int r;
3992
3993 assert(s);
3994 assert(s->type == SOURCE_MEMORY_PRESSURE);
3995
3996 /* Once we start writing, the buffer is locked; we allow no further changes. */
3997 s->memory_pressure.locked = true;
3998
3999 if (s->memory_pressure.write_buffer_size > 0) {
4000 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4001 if (n < 0) {
4002 if (!ERRNO_IS_TRANSIENT(errno))
4003 return -errno;
4004
4005 n = 0;
4006 }
4007 } else
4008 n = 0;
4009
4010 assert(n >= 0);
4011
4012 if ((size_t) n == s->memory_pressure.write_buffer_size) {
4013 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4014
4015 if (n > 0) {
4016 s->memory_pressure.write_buffer_size = 0;
4017
4018 /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4019 r = source_memory_pressure_register(s, s->enabled);
4020 if (r < 0)
4021 return r;
4022 }
4023 } else if (n > 0) {
4024 _cleanup_free_ void *c = NULL;
4025
4026 assert((size_t) n < s->memory_pressure.write_buffer_size);
4027
4028 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4029 if (!c)
4030 return -ENOMEM;
4031
4032 free_and_replace(s->memory_pressure.write_buffer, c);
4033 s->memory_pressure.write_buffer_size -= n;
4034 return 1;
4035 }
4036
4037 return 0;
4038 }
4039
4040 static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4041 int r;
4042
4043 assert(s);
4044 assert(s->type == SOURCE_MEMORY_PRESSURE);
4045
4046 r = source_memory_pressure_write(s);
4047 if (r < 0)
4048 return r;
4049 if (r > 0)
4050 return 1; /* If we wrote something, don't continue with dispatching the user callback.
4051 * Instead, shortcut it so that we wait for the next EPOLLOUT immediately. */
4052
4053 /* No pending incoming IO? Then let's not continue further */
4054 if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4055
4056 /* Treat IO errors on the notifier the same way as errors returned from a callback */
4057 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4058 return -EIO;
4059
4060 return 1; /* leave dispatch, we already processed everything */
4061 }
4062
4063 if (s->memory_pressure.revents & EPOLLIN) {
4064 uint8_t pipe_buf[PIPE_BUF];
4065 ssize_t n;
4066
4067 /* If the fd is readable, then flush out anything that might be queued */
4068
4069 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4070 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4071 return -errno;
4072 }
4073
4074 return 0; /* go on, dispatch to user callback */
4075 }
4076
4077 static int source_dispatch(sd_event_source *s) {
4078 EventSourceType saved_type;
4079 sd_event *saved_event;
4080 int r = 0;
4081
4082 assert(s);
4083 assert(s->pending || s->type == SOURCE_EXIT);
4084
4085 /* Save the event source type here, so that we still know it after the event callback, which might
4086 * invalidate the event source. */
4087 saved_type = s->type;
4088
4089 /* Similarly, store a reference to the event loop object, so that we can still access it after the
4090 * callback might have invalidated/disconnected the event source. */
4091 saved_event = s->event;
4092 PROTECT_EVENT(saved_event);
4093
4094 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
4095 assert(!s->ratelimited);
4096 if (!ratelimit_below(&s->rate_limit)) {
4097 r = event_source_enter_ratelimited(s);
4098 if (r < 0)
4099 return r;
4100
4101 return 1;
4102 }
4103
4104 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
4105 r = source_set_pending(s, false);
4106 if (r < 0)
4107 return r;
4108 }
4109
4110 if (s->type != SOURCE_POST) {
4111 sd_event_source *z;
4112
4113 /* If we execute a non-post source, let's mark all post sources as pending. */
4114
4115 SET_FOREACH(z, s->event->post_sources) {
4116 if (event_source_is_offline(z))
4117 continue;
4118
4119 r = source_set_pending(z, true);
4120 if (r < 0)
4121 return r;
4122 }
4123 }
4124
4125 if (s->type == SOURCE_MEMORY_PRESSURE) {
4126 r = source_memory_pressure_initiate_dispatch(s);
4127 if (r == -EIO) /* handle EIO errors similar to callback errors */
4128 goto finish;
4129 if (r < 0)
4130 return r;
4131 if (r > 0) /* already handled */
4132 return 1;
4133 }
4134
4135 if (s->enabled == SD_EVENT_ONESHOT) {
4136 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
4137 if (r < 0)
4138 return r;
4139 }
4140
4141 s->dispatching = true;
4142
4143 switch (s->type) {
4144
4145 case SOURCE_IO:
4146 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4147 break;
4148
4149 case SOURCE_TIME_REALTIME:
4150 case SOURCE_TIME_BOOTTIME:
4151 case SOURCE_TIME_MONOTONIC:
4152 case SOURCE_TIME_REALTIME_ALARM:
4153 case SOURCE_TIME_BOOTTIME_ALARM:
4154 r = s->time.callback(s, s->time.next, s->userdata);
4155 break;
4156
4157 case SOURCE_SIGNAL:
4158 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4159 break;
4160
4161 case SOURCE_CHILD: {
4162 bool zombie;
4163
4164 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
4165
4166 r = s->child.callback(s, &s->child.siginfo, s->userdata);
4167
4168 /* Now, reap the PID for good. */
4169 if (zombie) {
4170 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
4171 s->child.waited = true;
4172 }
4173
4174 break;
4175 }
4176
4177 case SOURCE_DEFER:
4178 r = s->defer.callback(s, s->userdata);
4179 break;
4180
4181 case SOURCE_POST:
4182 r = s->post.callback(s, s->userdata);
4183 break;
4184
4185 case SOURCE_EXIT:
4186 r = s->exit.callback(s, s->userdata);
4187 break;
4188
4189 case SOURCE_INOTIFY: {
4190 struct sd_event *e = s->event;
4191 struct inotify_data *d;
4192 size_t sz;
4193
4194 assert(s->inotify.inode_data);
4195 assert_se(d = s->inotify.inode_data->inotify_data);
4196
4197 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4198 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4199 assert(d->buffer_filled >= sz);
4200
4201 /* If the inotify callback destroys the event source then this likely means we don't need to
4202 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4203 * free it immediately, then we couldn't drop the event from the inotify event queue without
4204 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
4205 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4206 * explicitly GC it after we are done dropping the inotify event from the buffer. */
4207 d->n_busy++;
4208 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
4209 d->n_busy--;
4210
4211 /* When no event is pending anymore on this inotify object, then let's drop the event from
4212 * the inotify event queue buffer. */
4213 if (d->n_pending == 0)
4214 event_inotify_data_drop(e, d, sz);
4215
4216 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4217 event_gc_inotify_data(e, d);
4218 break;
4219 }
4220
4221 case SOURCE_MEMORY_PRESSURE:
4222 r = s->memory_pressure.callback(s, s->userdata);
4223 break;
4224
4225 case SOURCE_WATCHDOG:
4226 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
4227 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
4228 assert_not_reached();
4229 }
4230
4231 s->dispatching = false;
4232
4233 finish:
4234 if (r < 0) {
4235 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4236 strna(s->description),
4237 event_source_type_to_string(saved_type),
4238 s->exit_on_failure ? "exiting" : "disabling");
4239
4240 if (s->exit_on_failure)
4241 (void) sd_event_exit(saved_event, r);
4242 }
4243
4244 if (s->n_ref == 0)
4245 source_free(s);
4246 else if (r < 0)
4247 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4248
4249 return 1;
4250 }
4251
4252 static int event_prepare(sd_event *e) {
4253 int r;
4254
4255 assert(e);
4256
4257 for (;;) {
4258 sd_event_source *s;
4259
4260 s = prioq_peek(e->prepare);
4261 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
4262 break;
4263
4264 s->prepare_iteration = e->iteration;
4265 prioq_reshuffle(e->prepare, s, &s->prepare_index);
4266
4267 assert(s->prepare);
4268 s->dispatching = true;
4269 r = s->prepare(s, s->userdata);
4270 s->dispatching = false;
4271
4272 if (r < 0) {
4273 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4274 strna(s->description),
4275 event_source_type_to_string(s->type),
4276 s->exit_on_failure ? "exiting" : "disabling");
4277
4278 if (s->exit_on_failure)
4279 (void) sd_event_exit(e, r);
4280 }
4281
4282 if (s->n_ref == 0)
4283 source_free(s);
4284 else if (r < 0)
4285 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4286 }
4287
4288 return 0;
4289 }
4290
4291 static int dispatch_exit(sd_event *e) {
4292 sd_event_source *p;
4293 int r;
4294
4295 assert(e);
4296
4297 p = prioq_peek(e->exit);
4298 assert(!p || p->type == SOURCE_EXIT);
4299
4300 if (!p || event_source_is_offline(p)) {
4301 e->state = SD_EVENT_FINISHED;
4302 return 0;
4303 }
4304
4305 PROTECT_EVENT(e);
4306 e->iteration++;
4307 e->state = SD_EVENT_EXITING;
4308 r = source_dispatch(p);
4309 e->state = SD_EVENT_INITIAL;
4310 return r;
4311 }
4312
4313 static sd_event_source* event_next_pending(sd_event *e) {
4314 sd_event_source *p;
4315
4316 assert(e);
4317
4318 p = prioq_peek(e->pending);
4319 if (!p)
4320 return NULL;
4321
4322 if (event_source_is_offline(p))
4323 return NULL;
4324
4325 return p;
4326 }
4327
4328 static int arm_watchdog(sd_event *e) {
4329 struct itimerspec its = {};
4330 usec_t t;
4331
4332 assert(e);
4333 assert(e->watchdog_fd >= 0);
4334
4335 t = sleep_between(e,
4336 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4337 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
4338
4339 timespec_store(&its.it_value, t);
4340
4341 /* Make sure we never set the watchdog to 0, which tells the
4342 * kernel to disable it. */
4343 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4344 its.it_value.tv_nsec = 1;
4345
4346 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
4347 }
4348
4349 static int process_watchdog(sd_event *e) {
4350 assert(e);
4351
4352 if (!e->watchdog)
4353 return 0;
4354
4355 /* Don't notify watchdog too often */
4356 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4357 return 0;
4358
4359 sd_notify(false, "WATCHDOG=1");
4360 e->watchdog_last = e->timestamp.monotonic;
4361
4362 return arm_watchdog(e);
4363 }
4364
4365 static void event_close_inode_data_fds(sd_event *e) {
4366 struct inode_data *d;
4367
4368 assert(e);
4369
4370 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4371 * filesystems. But we can't close them right away as we need them as long as the user still wants to make
4372 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
4373 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4374 * compromise. */
4375
4376 while ((d = e->inode_data_to_close_list)) {
4377 assert(d->fd >= 0);
4378 d->fd = safe_close(d->fd);
4379
4380 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
4381 }
4382 }
4383
4384 static int event_memory_pressure_write_list(sd_event *e) {
4385 int r;
4386
4387 assert(e);
4388
4389 for (;;) {
4390 sd_event_source *s;
4391
4392 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4393 if (!s)
4394 break;
4395
4396 assert(s->type == SOURCE_MEMORY_PRESSURE);
4397 assert(s->memory_pressure.write_buffer_size > 0);
4398 s->memory_pressure.in_write_list = false;
4399
4400 r = source_memory_pressure_write(s);
4401 if (r < 0)
4402 return r;
4403 }
4404
4405 return 0;
4406 }
4407
4408 _public_ int sd_event_prepare(sd_event *e) {
4409 int r;
4410
4411 assert_return(e, -EINVAL);
4412 assert_return(e = event_resolve(e), -ENOPKG);
4413 assert_return(!event_pid_changed(e), -ECHILD);
4414 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4415 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4416
4417 /* Let's check that, if we are a default event loop, we are executed in the correct thread. We only
4418 * do this check here once, since gettid() is typically not cached, and we thus want to minimize
4419 * syscalls. */
4420 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4421
4422 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4423 PROTECT_EVENT(e);
4424
4425 if (e->exit_requested)
4426 goto pending;
4427
4428 e->iteration++;
4429
4430 e->state = SD_EVENT_PREPARING;
4431 r = event_prepare(e);
4432 e->state = SD_EVENT_INITIAL;
4433 if (r < 0)
4434 return r;
4435
4436 r = event_memory_pressure_write_list(e);
4437 if (r < 0)
4438 return r;
4439
4440 r = event_arm_timer(e, &e->realtime);
4441 if (r < 0)
4442 return r;
4443
4444 r = event_arm_timer(e, &e->boottime);
4445 if (r < 0)
4446 return r;
4447
4448 r = event_arm_timer(e, &e->monotonic);
4449 if (r < 0)
4450 return r;
4451
4452 r = event_arm_timer(e, &e->realtime_alarm);
4453 if (r < 0)
4454 return r;
4455
4456 r = event_arm_timer(e, &e->boottime_alarm);
4457 if (r < 0)
4458 return r;
4459
4460 event_close_inode_data_fds(e);
4461
4462 if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
4463 goto pending;
4464
4465 e->state = SD_EVENT_ARMED;
4466
4467 return 0;
4468
4469 pending:
4470 e->state = SD_EVENT_ARMED;
4471 r = sd_event_wait(e, 0);
4472 if (r == 0)
4473 e->state = SD_EVENT_ARMED;
4474
4475 return r;
4476 }
4477
4478 static int epoll_wait_usec(
4479 int fd,
4480 struct epoll_event *events,
4481 int maxevents,
4482 usec_t timeout) {
4483
4484 int msec;
4485 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4486
4487 #if HAVE_EPOLL_PWAIT2
4488 static bool epoll_pwait2_absent = false;
4489 int r;
4490
4491 /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4492 * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4493 * is not that obvious to implement given the libc and kernel definitions differ in the last
4494 * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
4495 * biggie); let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
4496 * missing. */
4497
4498 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
4499 r = epoll_pwait2(fd,
4500 events,
4501 maxevents,
4502 TIMESPEC_STORE(timeout),
4503 NULL);
4504 if (r >= 0)
4505 return r;
4506 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
4507 return -errno; /* Only fall back to the old epoll_wait() if the syscall is masked or not
4508 * supported. */
4509
4510 epoll_pwait2_absent = true;
4511 }
4512 #endif
4513
4514 if (timeout == USEC_INFINITY)
4515 msec = -1;
4516 else {
4517 usec_t k;
4518
4519 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4520 if (k >= INT_MAX)
4521 msec = INT_MAX; /* Saturate */
4522 else
4523 msec = (int) k;
4524 }
4525
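        /* For example, a timeout of 2500 usec rounds up to 3 msec here: rounding up ensures we never wake
         * up earlier than requested, and anything at or above INT_MAX msec saturates instead of
         * overflowing the int that epoll_wait() expects. */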
4526 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
4527 }
4528
4529 static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
4530 size_t n_event_queue, m, n_event_max;
4531 int64_t min_priority = threshold;
4532 bool something_new = false;
4533 int r;
4534
4535 assert(e);
4536 assert(ret_min_priority);
4537
4538 n_event_queue = MAX(e->n_sources, 1u);
4539 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
4540 return -ENOMEM;
4541
4542 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4543
4544 /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
4545 if (e->buffered_inotify_data_list)
4546 timeout = 0;
4547
4548 for (;;) {
4549 r = epoll_wait_usec(
4550 e->epoll_fd,
4551 e->event_queue,
4552 n_event_max,
4553 timeout);
4554 if (r < 0)
4555 return r;
4556
4557 m = (size_t) r;
4558
4559 if (m < n_event_max)
4560 break;
4561
4562 if (n_event_max >= n_event_queue * 10)
4563 break;
4564
4565 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
4566 return -ENOMEM;
4567
4568 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4569 timeout = 0;
4570 }
4571
4572 /* Set the timestamp only when this is called for the first time. */
4573 if (threshold == INT64_MAX)
4574 triple_timestamp_get(&e->timestamp);
4575
4576 for (size_t i = 0; i < m; i++) {
4577
4578 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4579 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
4580 else {
4581 WakeupType *t = e->event_queue[i].data.ptr;
4582
4583 switch (*t) {
4584
4585 case WAKEUP_EVENT_SOURCE: {
4586 sd_event_source *s = e->event_queue[i].data.ptr;
4587
4588 assert(s);
4589
4590 if (s->priority > threshold)
4591 continue;
4592
4593 min_priority = MIN(min_priority, s->priority);
4594
4595 switch (s->type) {
4596
4597 case SOURCE_IO:
4598 r = process_io(e, s, e->event_queue[i].events);
4599 break;
4600
4601 case SOURCE_CHILD:
4602 r = process_pidfd(e, s, e->event_queue[i].events);
4603 break;
4604
4605 case SOURCE_MEMORY_PRESSURE:
4606 r = process_memory_pressure(s, e->event_queue[i].events);
4607 break;
4608
4609 default:
4610 assert_not_reached();
4611 }
4612
4613 break;
4614 }
4615
4616 case WAKEUP_CLOCK_DATA: {
4617 struct clock_data *d = e->event_queue[i].data.ptr;
4618
4619 assert(d);
4620
4621 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4622 break;
4623 }
4624
4625 case WAKEUP_SIGNAL_DATA:
4626 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4627 break;
4628
4629 case WAKEUP_INOTIFY_DATA:
4630 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4631 break;
4632
4633 default:
4634 assert_not_reached();
4635 }
4636 }
4637 if (r < 0)
4638 return r;
4639 if (r > 0)
4640 something_new = true;
4641 }
4642
4643 *ret_min_priority = min_priority;
4644 return something_new;
4645 }
4646
4647 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4648 int r;
4649
4650 assert_return(e, -EINVAL);
4651 assert_return(e = event_resolve(e), -ENOPKG);
4652 assert_return(!event_pid_changed(e), -ECHILD);
4653 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4654 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4655
4656 if (e->exit_requested) {
4657 e->state = SD_EVENT_PENDING;
4658 return 1;
4659 }
4660
4661 for (int64_t threshold = INT64_MAX; ; threshold--) {
4662 int64_t epoll_min_priority, child_min_priority;
4663
4664 /* New epoll (especially IO) and child events may be triggered just after the
4665 * process_epoll() call but before process_child(), and the new IO events may have a
4666 * higher priority than the child events. To salvage these events, let's call
4667 * epoll_wait() again, but accept only events with a higher priority than the
4668 * previous ones. See issue https://github.com/systemd/systemd/issues/18190 and comments
4669 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4670 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4671
4672 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4673 if (r == -EINTR) {
4674 e->state = SD_EVENT_PENDING;
4675 return 1;
4676 }
4677 if (r < 0)
4678 goto finish;
4679 if (r == 0 && threshold < INT64_MAX)
4680 /* No new epoll event. */
4681 break;
4682
4683 r = process_child(e, threshold, &child_min_priority);
4684 if (r < 0)
4685 goto finish;
4686 if (r == 0)
4687 /* No new child event. */
4688 break;
4689
4690 threshold = MIN(epoll_min_priority, child_min_priority);
4691 if (threshold == INT64_MIN)
4692 break;
4693
4694 timeout = 0;
4695 }
4696
4697 r = process_watchdog(e);
4698 if (r < 0)
4699 goto finish;
4700
4701 r = process_inotify(e);
4702 if (r < 0)
4703 goto finish;
4704
4705 r = process_timer(e, e->timestamp.realtime, &e->realtime);
4706 if (r < 0)
4707 goto finish;
4708
4709 r = process_timer(e, e->timestamp.boottime, &e->boottime);
4710 if (r < 0)
4711 goto finish;
4712
4713 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4714 if (r < 0)
4715 goto finish;
4716
4717 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4718 if (r < 0)
4719 goto finish;
4720
4721 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4722 if (r < 0)
4723 goto finish;
4724 else if (r == 1) {
4725 /* A ratelimit expiry callback was called. Let's postpone processing pending sources and
4726 * put the loop into the initial state, in order to also evaluate (in the next iteration)
4727 * sources that were potentially re-enabled by the callback.
4728 *
4729 * Wondering why we treat only this invocation of process_timer() differently? Once an event
4730 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4731 * ratelimit expiry callback is never called for any other timer type. */
4732 r = 0;
4733 goto finish;
4734 }
4735
4736 if (event_next_pending(e)) {
4737 e->state = SD_EVENT_PENDING;
4738 return 1;
4739 }
4740
4741 r = 0;
4742
4743 finish:
4744 e->state = SD_EVENT_INITIAL;
4745
4746 return r;
4747 }
4748
4749 _public_ int sd_event_dispatch(sd_event *e) {
4750 sd_event_source *p;
4751 int r;
4752
4753 assert_return(e, -EINVAL);
4754 assert_return(e = event_resolve(e), -ENOPKG);
4755 assert_return(!event_pid_changed(e), -ECHILD);
4756 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4757 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4758
4759 if (e->exit_requested)
4760 return dispatch_exit(e);
4761
4762 p = event_next_pending(e);
4763 if (p) {
4764 PROTECT_EVENT(e);
4765
4766 e->state = SD_EVENT_RUNNING;
4767 r = source_dispatch(p);
4768 e->state = SD_EVENT_INITIAL;
4769 return r;
4770 }
4771
4772 e->state = SD_EVENT_INITIAL;
4773
4774 return 1;
4775 }
4776
4777 static void event_log_delays(sd_event *e) {
4778 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4779 size_t l, i;
4780
4781 p = b;
4782 l = sizeof(b);
4783 for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4784 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4785 e->delays[i] = 0;
4786 }
4787 log_debug("Event loop iterations: %s", b);
4788 }
4789
4790 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4791 int r;
4792
4793 assert_return(e, -EINVAL);
4794 assert_return(e = event_resolve(e), -ENOPKG);
4795 assert_return(!event_pid_changed(e), -ECHILD);
4796 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4797 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4798
4799 if (e->profile_delays && e->last_run_usec != 0) {
4800 usec_t this_run;
4801 unsigned l;
4802
4803 this_run = now(CLOCK_MONOTONIC);
4804
4805 l = log2u64(this_run - e->last_run_usec);
4806 assert(l < ELEMENTSOF(e->delays));
4807 e->delays[l]++;
4808
4809 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4810 event_log_delays(e);
4811 e->last_log_usec = this_run;
4812 }
4813 }
4814
4815 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4816 PROTECT_EVENT(e);
4817
4818 r = sd_event_prepare(e);
4819 if (r == 0)
4820 /* There was nothing? Then wait... */
4821 r = sd_event_wait(e, timeout);
4822
4823 if (e->profile_delays)
4824 e->last_run_usec = now(CLOCK_MONOTONIC);
4825
4826 if (r > 0) {
4827 /* There's something now, then let's dispatch it */
4828 r = sd_event_dispatch(e);
4829 if (r < 0)
4830 return r;
4831
4832 return 1;
4833 }
4834
4835 return r;
4836 }
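
/* Illustrative sketch, kept out of the build: sd_event_run() above is equivalent to driving the loop by
 * hand with sd_event_prepare(), sd_event_wait() and sd_event_dispatch(). The helper name run_manually()
 * is made up for the example. */
#if 0
static int run_manually(sd_event *e, uint64_t timeout) {
        int r;

        r = sd_event_prepare(e);               /* > 0: something is already pending, no need to wait */
        if (r == 0)
                r = sd_event_wait(e, timeout); /* 0 on timeout, > 0 if an event source became pending */
        if (r <= 0)
                return r;

        return sd_event_dispatch(e);           /* run the callback of the highest-priority pending source */
}
#endif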
4837
4838 _public_ int sd_event_loop(sd_event *e) {
4839 int r;
4840
4841 assert_return(e, -EINVAL);
4842 assert_return(e = event_resolve(e), -ENOPKG);
4843 assert_return(!event_pid_changed(e), -ECHILD);
4844 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4845
4846 PROTECT_EVENT(e);
4847
4848 while (e->state != SD_EVENT_FINISHED) {
4849 r = sd_event_run(e, UINT64_MAX);
4850 if (r < 0)
4851 return r;
4852 }
4853
4854 return e->exit_code;
4855 }
4856
4857 _public_ int sd_event_get_fd(sd_event *e) {
4858 assert_return(e, -EINVAL);
4859 assert_return(e = event_resolve(e), -ENOPKG);
4860 assert_return(!event_pid_changed(e), -ECHILD);
4861
4862 return e->epoll_fd;
4863 }
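
/* Illustrative sketch, kept out of the build: the fd returned by sd_event_get_fd() can be plugged into a
 * foreign poll()/epoll main loop; whenever it polls readable, dispatch without blocking by running the
 * loop with a zero timeout. Error handling is trimmed and the function name is made up. */
#if 0
#include <errno.h>
#include <poll.h>

static int drive_from_foreign_loop(sd_event *e) {
        struct pollfd pfd = {
                .fd = sd_event_get_fd(e),
                .events = POLLIN,
        };
        int code = 0, r;

        for (;;) {
                if (poll(&pfd, 1, -1) < 0)      /* block in the foreign loop, not in sd-event */
                        return -errno;

                r = sd_event_run(e, 0);         /* dispatch whatever is ready, but don't block */
                if (r < 0)
                        return r;

                if (sd_event_get_state(e) == SD_EVENT_FINISHED)
                        break;
        }

        (void) sd_event_get_exit_code(e, &code);
        return code;
}
#endif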
4864
4865 _public_ int sd_event_get_state(sd_event *e) {
4866 assert_return(e, -EINVAL);
4867 assert_return(e = event_resolve(e), -ENOPKG);
4868 assert_return(!event_pid_changed(e), -ECHILD);
4869
4870 return e->state;
4871 }
4872
4873 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4874 assert_return(e, -EINVAL);
4875 assert_return(e = event_resolve(e), -ENOPKG);
4876 assert_return(code, -EINVAL);
4877 assert_return(!event_pid_changed(e), -ECHILD);
4878
4879 if (!e->exit_requested)
4880 return -ENODATA;
4881
4882 *code = e->exit_code;
4883 return 0;
4884 }
4885
4886 _public_ int sd_event_exit(sd_event *e, int code) {
4887 assert_return(e, -EINVAL);
4888 assert_return(e = event_resolve(e), -ENOPKG);
4889 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4890 assert_return(!event_pid_changed(e), -ECHILD);
4891
4892 e->exit_requested = true;
4893 e->exit_code = code;
4894
4895 return 0;
4896 }
4897
4898 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4899 assert_return(e, -EINVAL);
4900 assert_return(e = event_resolve(e), -ENOPKG);
4901 assert_return(usec, -EINVAL);
4902 assert_return(!event_pid_changed(e), -ECHILD);
4903
4904 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4905 return -EOPNOTSUPP;
4906
4907 if (!triple_timestamp_is_set(&e->timestamp)) {
4908 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4909 *usec = now(clock);
4910 return 1;
4911 }
4912
4913 *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4914 return 0;
4915 }
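
/* Illustrative sketch, kept out of the build: the usual pattern for a relative timer is to take the
 * cached loop time from sd_event_now() and add an offset when calling sd_event_add_time(). The callback
 * name on_timer() and the five second delay are made up; an accuracy of 0 selects the default. */
#if 0
static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
        /* ... */
        return 0;
}

static int arm_relative_timer(sd_event *e) {
        uint64_t now_usec;
        int r;

        r = sd_event_now(e, CLOCK_MONOTONIC, &now_usec);
        if (r < 0)
                return r;

        return sd_event_add_time(e,
                                 NULL,                         /* no return pointer: the loop owns the source */
                                 CLOCK_MONOTONIC,
                                 now_usec + 5 * USEC_PER_SEC,  /* fire in roughly five seconds */
                                 0,                            /* default accuracy */
                                 on_timer, NULL);
}
#endif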
4916
4917 _public_ int sd_event_default(sd_event **ret) {
4918 sd_event *e = NULL;
4919 int r;
4920
4921 if (!ret)
4922 return !!default_event;
4923
4924 if (default_event) {
4925 *ret = sd_event_ref(default_event);
4926 return 0;
4927 }
4928
4929 r = sd_event_new(&e);
4930 if (r < 0)
4931 return r;
4932
4933 e->default_event_ptr = &default_event;
4934 e->tid = gettid();
4935 default_event = e;
4936
4937 *ret = e;
4938 return 1;
4939 }
4940
4941 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
4942 assert_return(e, -EINVAL);
4943 assert_return(e = event_resolve(e), -ENOPKG);
4944 assert_return(tid, -EINVAL);
4945 assert_return(!event_pid_changed(e), -ECHILD);
4946
4947 if (e->tid != 0) {
4948 *tid = e->tid;
4949 return 0;
4950 }
4951
4952 return -ENXIO;
4953 }
4954
4955 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
4956 int r;
4957
4958 assert_return(e, -EINVAL);
4959 assert_return(e = event_resolve(e), -ENOPKG);
4960 assert_return(!event_pid_changed(e), -ECHILD);
4961
4962 if (e->watchdog == !!b)
4963 return e->watchdog;
4964
4965 if (b) {
4966 r = sd_watchdog_enabled(false, &e->watchdog_period);
4967 if (r <= 0)
4968 return r;
4969
4970 /* Issue first ping immediately */
4971 sd_notify(false, "WATCHDOG=1");
4972 e->watchdog_last = now(CLOCK_MONOTONIC);
4973
4974 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
4975 if (e->watchdog_fd < 0)
4976 return -errno;
4977
4978 r = arm_watchdog(e);
4979 if (r < 0)
4980 goto fail;
4981
4982 struct epoll_event ev = {
4983 .events = EPOLLIN,
4984 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
4985 };
4986
4987 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
4988 r = -errno;
4989 goto fail;
4990 }
4991
4992 } else {
4993 if (e->watchdog_fd >= 0) {
4994 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
4995 e->watchdog_fd = safe_close(e->watchdog_fd);
4996 }
4997 }
4998
4999 e->watchdog = !!b;
5000 return e->watchdog;
5001
5002 fail:
5003 e->watchdog_fd = safe_close(e->watchdog_fd);
5004 return r;
5005 }
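
/* Illustrative sketch, kept out of the build: with WatchdogSec= set in the unit file a single call is
 * enough to let the loop send WATCHDOG=1 keep-alives on its own, as implemented above. A return value of
 * 0 merely means the service manager didn't ask for watchdog notifications. */
#if 0
static int enable_watchdog_keepalive(sd_event *e) {
        int r;

        r = sd_event_set_watchdog(e, true);
        if (r < 0)
                return r;
        if (r == 0)
                log_debug("Watchdog keep-alives not requested by the service manager, continuing without.");

        return 0;
}
#endif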
5006
5007 _public_ int sd_event_get_watchdog(sd_event *e) {
5008 assert_return(e, -EINVAL);
5009 assert_return(e = event_resolve(e), -ENOPKG);
5010 assert_return(!event_pid_changed(e), -ECHILD);
5011
5012 return e->watchdog;
5013 }
5014
5015 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5016 assert_return(e, -EINVAL);
5017 assert_return(e = event_resolve(e), -ENOPKG);
5018 assert_return(!event_pid_changed(e), -ECHILD);
5019
5020 *ret = e->iteration;
5021 return 0;
5022 }
5023
5024 _public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5025 assert_return(s, -EINVAL);
5026
5027 s->destroy_callback = callback;
5028 return 0;
5029 }
5030
5031 _public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5032 assert_return(s, -EINVAL);
5033
5034 if (ret)
5035 *ret = s->destroy_callback;
5036
5037 return !!s->destroy_callback;
5038 }
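
/* Illustrative sketch, kept out of the build: a destroy callback is the usual way to tie a userdata
 * object's lifetime to the event source, so it is freed exactly once when the source goes away. The
 * context type and names are made up. */
#if 0
struct my_context {
        char *name;
};

static void my_context_destroy(void *userdata) {
        struct my_context *c = userdata;

        if (!c)
                return;

        free(c->name);
        free(c);
}

/* After creating a source 's' with a struct my_context as userdata:
 *
 *     r = sd_event_source_set_destroy_callback(s, my_context_destroy);
 */
#endif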
5039
5040 _public_ int sd_event_source_get_floating(sd_event_source *s) {
5041 assert_return(s, -EINVAL);
5042
5043 return s->floating;
5044 }
5045
5046 _public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5047 assert_return(s, -EINVAL);
5048
5049 if (s->floating == !!b)
5050 return 0;
5051
5052 if (!s->event) /* Already disconnected */
5053 return -ESTALE;
5054
5055 s->floating = b;
5056
5057 if (b) {
5058 sd_event_source_ref(s);
5059 sd_event_unref(s->event);
5060 } else {
5061 sd_event_ref(s->event);
5062 sd_event_source_unref(s);
5063 }
5064
5065 return 1;
5066 }
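
/* Illustrative sketch, kept out of the build: marking a source floating hands its lifetime over to the
 * event loop, so the caller can drop its own reference right away, mirroring the ref/unref dance above.
 * Passing a NULL return pointer to the sd_event_add_xyz() constructors has the same effect. The helper
 * name is made up. */
#if 0
static int add_fire_and_forget_defer(sd_event *e, sd_event_handler_t callback, void *userdata) {
        sd_event_source *s = NULL;
        int r;

        r = sd_event_add_defer(e, &s, callback, userdata);
        if (r < 0)
                return r;

        r = sd_event_source_set_floating(s, true);   /* the loop now keeps the source alive */
        sd_event_source_unref(s);                    /* drop our own reference */

        return r;
}
#endif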
5067
5068 _public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5069 assert_return(s, -EINVAL);
5070 assert_return(s->type != SOURCE_EXIT, -EDOM);
5071
5072 return s->exit_on_failure;
5073 }
5074
5075 _public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5076 assert_return(s, -EINVAL);
5077 assert_return(s->type != SOURCE_EXIT, -EDOM);
5078
5079 if (s->exit_on_failure == !!b)
5080 return 0;
5081
5082 s->exit_on_failure = b;
5083 return 1;
5084 }
5085
5086 _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5087 int r;
5088
5089 assert_return(s, -EINVAL);
5090
5091 /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
5092 * so is a programming error. */
5093 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5094
5095 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5096 * non-ratelimited. */
5097 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5098 if (r < 0)
5099 return r;
5100
5101 s->rate_limit = (RateLimit) { interval, burst };
5102 return 0;
5103 }
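
/* Illustrative sketch, kept out of the build: rate limiting caps how often a source may be dispatched,
 * here to at most 10 dispatches per 1s interval for a made-up IO source; once the interval elapses the
 * source is brought back online automatically. */
#if 0
static int add_ratelimited_io(sd_event *e, sd_event_source **ret, int fd,
                              sd_event_io_handler_t callback, void *userdata) {
        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
        int r;

        r = sd_event_add_io(e, &s, fd, EPOLLIN, callback, userdata);
        if (r < 0)
                return r;

        /* At most 10 dispatches per second; excess wake-ups are paused until the interval is over. */
        r = sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10);
        if (r < 0)
                return r;

        *ret = TAKE_PTR(s);
        return 0;
}
#endif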
5104
5105 _public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5106 assert_return(s, -EINVAL);
5107
5108 s->ratelimit_expire_callback = callback;
5109 return 0;
5110 }
5111
5112 _public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5113 assert_return(s, -EINVAL);
5114
5115 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5116 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
5117 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5118 return -EDOM;
5119
5120 if (!ratelimit_configured(&s->rate_limit))
5121 return -ENOEXEC;
5122
5123 if (ret_interval)
5124 *ret_interval = s->rate_limit.interval;
5125 if (ret_burst)
5126 *ret_burst = s->rate_limit.burst;
5127
5128 return 0;
5129 }
5130
5131 _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5132 assert_return(s, -EINVAL);
5133
5134 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5135 return false;
5136
5137 if (!ratelimit_configured(&s->rate_limit))
5138 return false;
5139
5140 return s->ratelimited;
5141 }
5142
5143 _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5144 bool change = false;
5145 int r;
5146
5147 assert_return(e, -EINVAL);
5148
5149 if (b) {
5150 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5151 * to. But we also don't want them to pin the event loop itself. Hence we mark them as
5152 * floating after creation (and undo this before deleting them again). */
5153
5154 if (!e->sigint_event_source) {
5155 r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5156 if (r < 0)
5157 return r;
5158
5159 assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5160 change = true;
5161 }
5162
5163 if (!e->sigterm_event_source) {
5164 r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5165 if (r < 0) {
5166 if (change) {
5167 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5168 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5169 }
5170
5171 return r;
5172 }
5173
5174 assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5175 change = true;
5176 }
5177
5178 } else {
5179 if (e->sigint_event_source) {
5180 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5181 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5182 change = true;
5183 }
5184
5185 if (e->sigterm_event_source) {
5186 assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5187 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5188 change = true;
5189 }
5190 }
5191
5192 return change;
5193 }
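
/* Illustrative sketch, kept out of the build: sd_event_set_signal_exit() is the one-call way to make
 * SIGINT/SIGTERM terminate the loop cleanly; together with sd_event_default() and sd_event_loop() this
 * is the skeleton of a small daemon. Source setup is elided. */
#if 0
static int run_small_daemon(void) {
        _cleanup_(sd_event_unrefp) sd_event *e = NULL;
        int r;

        r = sd_event_default(&e);
        if (r < 0)
                return r;

        r = sd_event_set_signal_exit(e, true);   /* also blocks the signals, via SD_EVENT_SIGNAL_PROCMASK */
        if (r < 0)
                return r;

        /* ... add IO/timer/child sources here ... */

        return sd_event_loop(e);                 /* returns the code passed to sd_event_exit() */
}
#endif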
5194
5195 _public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5196 _cleanup_free_ char *b = NULL;
5197 _cleanup_free_ void *w = NULL;
5198
5199 assert_return(s, -EINVAL);
5200 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5201 assert_return(ty, -EINVAL);
5202
5203 if (!STR_IN_SET(ty, "some", "full"))
5204 return -EINVAL;
5205
5206 if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5207 return -EBUSY;
5208
5209 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5210 if (!space)
5211 return -EINVAL;
5212
5213 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5214 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5215 if (!b)
5216 return -ENOMEM;
5217 if (!STR_IN_SET(b, "some", "full"))
5218 return -EINVAL;
5219
5220 if (streq(b, ty))
5221 return 0;
5222
5223 size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5224 w = new(char, nl);
5225 if (!w)
5226 return -ENOMEM;
5227
5228 memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5229
5230 free_and_replace(s->memory_pressure.write_buffer, w);
5231 s->memory_pressure.write_buffer_size = nl;
5232 s->memory_pressure.locked = false;
5233
5234 return 1;
5235 }
5236
5237 _public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5238 _cleanup_free_ char *b = NULL;
5239 _cleanup_free_ void *w = NULL;
5240
5241 assert_return(s, -EINVAL);
5242 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5243
5244 if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5245 return -ERANGE;
5246 if (window_usec <= 0 || window_usec >= UINT64_MAX)
5247 return -ERANGE;
5248 if (threshold_usec > window_usec)
5249 return -EINVAL;
5250
5251 if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5252 return -EBUSY;
5253
5254 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5255 if (!space)
5256 return -EINVAL;
5257
5258 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5259 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5260 if (!b)
5261 return -ENOMEM;
5262 if (!STR_IN_SET(b, "some", "full"))
5263 return -EINVAL;
5264
5265 if (asprintf((char**) &w,
5266 "%s " USEC_FMT " " USEC_FMT "",
5267 b,
5268 threshold_usec,
5269 window_usec) < 0)
5270 return -ENOMEM;
5271
5272 l = strlen(w) + 1;
5273 if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5274 return 0;
5275
5276 free_and_replace(s->memory_pressure.write_buffer, w);
5277 s->memory_pressure.write_buffer_size = l;
5278 s->memory_pressure.locked = false;
5279
5280 return 1;
5281 }
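
/* Illustrative sketch, kept out of the build: a memory pressure source is typically configured right
 * after creation, before the write buffer is locked by the first loop iteration; here a made-up callback
 * is woken once "some" stall time exceeds 70ms within a 1s window. */
#if 0
static int on_memory_pressure(sd_event_source *s, void *userdata) {
        /* ... release caches, trim buffers, ... */
        return 0;
}

static int add_pressure_source(sd_event *e) {
        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
        int r;

        r = sd_event_add_memory_pressure(e, &s, on_memory_pressure, NULL);
        if (r < 0)
                return r;   /* e.g. PSI not available on this system */

        r = sd_event_source_set_memory_pressure_type(s, "some");
        if (r < 0)
                return r;

        r = sd_event_source_set_memory_pressure_period(s,
                                                       70 * USEC_PER_MSEC,  /* threshold */
                                                       1 * USEC_PER_SEC);   /* window */
        if (r < 0)
                return r;

        r = sd_event_source_set_floating(s, true);   /* let the loop own the source */
        if (r < 0)
                return r;

        return 0;
}
#endif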