/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <sys/timerfd.h>

#include "sd-daemon.h"
#include "sd-messages.h"

#include "alloc-util.h"
#include "errno-util.h"
#include "event-source.h"
#include "format-util.h"
#include "glyph-util.h"
#include "hexdecoct.h"
#include "logarithm.h"
#include "memory-util.h"
#include "missing_magic.h"
#include "missing_wait.h"
#include "origin-id.h"
#include "path-util.h"
#include "pidfd-util.h"
#include "process-util.h"
#include "signal-util.h"
#include "siphash24.h"
#include "socket-util.h"
#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "time-util.h"

#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
static bool EVENT_SOURCE_WATCH_PIDFD(const sd_event_source *s) {
        /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
        return s->type == SOURCE_CHILD &&
               s->child.options == WEXITED;
}

static bool event_source_is_online(sd_event_source *s) {
        return s->enabled != SD_EVENT_OFF && !s->ratelimited;
}

static bool event_source_is_offline(sd_event_source *s) {
        return s->enabled == SD_EVENT_OFF || s->ratelimited;
}
static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
        [SOURCE_TIME_REALTIME] = "realtime",
        [SOURCE_TIME_BOOTTIME] = "boottime",
        [SOURCE_TIME_MONOTONIC] = "monotonic",
        [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
        [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
        [SOURCE_SIGNAL] = "signal",
        [SOURCE_CHILD] = "child",
        [SOURCE_DEFER] = "defer",
        [SOURCE_POST] = "post",
        [SOURCE_EXIT] = "exit",
        [SOURCE_WATCHDOG] = "watchdog",
        [SOURCE_INOTIFY] = "inotify",
        [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);

#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)

#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_MEMORY_PRESSURE)

/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
struct sd_event {
        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close_list);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, buffered_inotify_data_list);

        /* A list of memory pressure event sources that still need their subscription string written */
        LIST_HEAD(sd_event_source, memory_pressure_write_list);

        triple_timestamp timestamp;

        bool exit_requested:1;
        bool need_process_child:1;
        bool profile_delays:1;

        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        sd_event_source *sigint_event_source, *sigterm_event_source;

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8];
};

DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);

static thread_local sd_event *default_event = NULL;
static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);

static sd_event* event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}
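
/* Note: most public entry points below run event_resolve() via "assert_return(e = event_resolve(e), -ENOPKG)",
 * so passing the special SD_EVENT_DEFAULT pointer transparently selects the thread-local default loop stored
 * in default_event (and fails with -ENOPKG if no default loop exists on the calling thread). */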
static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Lower priority values first */
        r = CMP(x->priority, y->priority);
        if (r != 0)
                return r;

        /* Older entries first */
        return CMP(x->pending_iteration, y->pending_iteration);
}

static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Move most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        r = CMP(x->prepare_iteration, y->prepare_iteration);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}
static usec_t time_event_source_next(const sd_event_source *s) {
        /* We have two kinds of event sources that have elapsation times associated with them: the actual
         * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
         * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
         * looking at here. */

        if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Otherwise this must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return s->time.next;

        return USEC_INFINITY;
}

static usec_t time_event_source_latest(const sd_event_source *s) {
        if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
                               * same, as we should avoid adding additional inaccuracy on an inaccuracy time
                               * window */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return usec_add(s->time.next, s->time.accuracy);

        return USEC_INFINITY;
}

static bool event_source_timer_candidate(const sd_event_source *s) {
        /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
         * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
        return !s->pending || s->ratelimited;
}
static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
        r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
        if (r != 0)
                return r;

        return CMP(time_func(x), time_func(y));
}

static int earliest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_next);
}

static int latest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_latest);
}

static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}
static void free_clock_data(struct clock_data *d) {
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        prioq_free(d->earliest);
        prioq_free(d->latest);
}
static sd_event* event_free(sd_event *e) {
        sd_event_source *s;

        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);

        while ((s = e->sources)) {
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->inotify_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);

        free(e->event_queue);

        return mfree(e);
}
_public_ int sd_event_new(sd_event **ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new(sd_event, 1);
        if (!e)
                return -ENOMEM;

        *e = (sd_event) {
                .watchdog_fd = -EBADF,
                .realtime.wakeup = WAKEUP_CLOCK_DATA,
                .realtime.fd = -EBADF,
                .realtime.next = USEC_INFINITY,
                .boottime.wakeup = WAKEUP_CLOCK_DATA,
                .boottime.fd = -EBADF,
                .boottime.next = USEC_INFINITY,
                .monotonic.wakeup = WAKEUP_CLOCK_DATA,
                .monotonic.fd = -EBADF,
                .monotonic.next = USEC_INFINITY,
                .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .realtime_alarm.fd = -EBADF,
                .realtime_alarm.next = USEC_INFINITY,
                .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .boottime_alarm.fd = -EBADF,
                .boottime_alarm.next = USEC_INFINITY,
                .perturb = USEC_INFINITY,
                .origin_id = origin_id_query(),
        };

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
                          glyph(GLYPH_ELLIPSIS));
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}
/* Define manually so we can add the origin check */
_public_ sd_event* sd_event_ref(sd_event *e) {
        if (!e)
                return NULL;
        if (event_origin_changed(e))
                return NULL;

        e->n_ref++;
        return e;
}

_public_ sd_event* sd_event_unref(sd_event *e) {
        if (!e)
                return NULL;
        if (event_origin_changed(e))
                return NULL;

        assert(e->n_ref > 0);
        if (--e->n_ref > 0)
                return NULL;

        return event_free(e);
}

#define PROTECT_EVENT(e)                                                \
        _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
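
/* A minimal usage sketch of the reference counting above (hypothetical caller dispatch_something()):
 * PROTECT_EVENT() pins the loop for the duration of a scope so that a callback dropping the last user
 * reference cannot free the sd_event object mid-dispatch:
 *
 *     static int dispatch_something(sd_event *e) {
 *             PROTECT_EVENT(e);           // holds a reference until the end of this scope
 *             ...                         // safe to run callbacks that may unref 'e'
 *             return 0;
 *     }
 */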
_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
        int r;

        r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
        if (r < 0)
                log_debug_errno(r, "Failed to disable event source %p (%s): %m",
                                s, strna(s->description));

        return sd_event_source_unref(s);
}
static void source_io_unregister(sd_event_source *s) {
        assert(s->type == SOURCE_IO);

        if (event_origin_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}
static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->io.fd, &ev) < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}
static void source_child_pidfd_unregister(sd_event_source *s) {
        assert(s->type == SOURCE_CHILD);

        if (event_origin_changed(s->event))
                return;

        if (!s->child.registered)
                return;

        if (EVENT_SOURCE_WATCH_PIDFD(s))
                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

        s->child.registered = false;
}
static int source_child_pidfd_register(sd_event_source *s, int enabled) {
        assert(s->type == SOURCE_CHILD);
        assert(enabled != SD_EVENT_OFF);

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                struct epoll_event ev = {
                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                };

                if (epoll_ctl(s->event->epoll_fd,
                              s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                              s->child.pidfd, &ev) < 0)
                        return -errno;
        }

        s->child.registered = true;

        return 0;
}
static void source_memory_pressure_unregister(sd_event_source *s) {
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (event_origin_changed(s->event))
                return;

        if (!s->memory_pressure.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->memory_pressure.registered = false;
}
static int source_memory_pressure_register(sd_event_source *s, int enabled) {
        assert(s->type == SOURCE_MEMORY_PRESSURE);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
                          (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->memory_pressure.fd, &ev) < 0)
                return -errno;

        s->memory_pressure.registered = true;

        return 0;
}
static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (s->memory_pressure.in_write_list)
                return;

        LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = true;
}

static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        if (!s->memory_pressure.in_write_list)
                return;

        LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
        s->memory_pressure.in_write_list = false;
}
static clockid_t event_source_type_to_clock(EventSourceType t) {
        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}

static EventSourceType clock_to_event_source_type(clockid_t clock) {
        switch (clock) {

        case CLOCK_REALTIME:
                return SOURCE_TIME_REALTIME;

        case CLOCK_BOOTTIME:
                return SOURCE_TIME_BOOTTIME;

        case CLOCK_MONOTONIC:
                return SOURCE_TIME_MONOTONIC;

        case CLOCK_REALTIME_ALARM:
                return SOURCE_TIME_REALTIME_ALARM;

        case CLOCK_BOOTTIME_ALARM:
                return SOURCE_TIME_BOOTTIME_ALARM;

        default:
                return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        }
}
static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}
static void event_free_signal_data(sd_event *e, struct signal_data *d) {
        if (!d)
                return;

        hashmap_remove(e->signal_data, &d->priority);
        safe_close(d->fd);
        free(d);
}
static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct signal_data *d;
        int64_t priority;
        sigset_t ss_copy;
        int r;

        if (event_origin_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                d = new(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                *d = (struct signal_data) {
                        .wakeup = WAKEUP_SIGNAL_DATA,
                        .fd = -EBADF,
                        .priority = priority,
                };

                r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }
        }

        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
                     &ss_copy,
                     SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        struct epoll_event ev = {
                .events = EPOLLIN,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        event_free_signal_data(e, d);
        return r;
}
static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way, the signal data object is freed. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {
                /* If all the mask is all-zero we can get rid of the structure */
                event_free_signal_data(e, d);
                return;
        }

        if (event_origin_changed(e))
                return;

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}
static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
         * and possibly drop the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_online_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            event_source_is_online(e->signal_sources[sig]))
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}
static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
        /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
         * they are enabled/disabled or marked pending and such. */

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
}
static void event_source_time_prioq_reshuffle(sd_event_source *s) {
        struct clock_data *d;

        /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
         * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
         * properly again. */

        if (s->ratelimited)
                d = &s->event->monotonic;
        else if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(d = event_get_clock_data(s->event, s->type));
        else
                return; /* no-op for an event source which is neither a timer nor ratelimited. */

        prioq_reshuffle(d->earliest, s, &s->earliest_index);
        prioq_reshuffle(d->latest, s, &s->latest_index);
        d->needs_rearm = true;
}
static void event_source_time_prioq_remove(
                sd_event_source *s,
                struct clock_data *d) {

        prioq_remove(d->earliest, s, &s->earliest_index);
        prioq_remove(d->latest, s, &s->latest_index);
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        d->needs_rearm = true;
}
static void source_disconnect(sd_event_source *s) {
        sd_event *event;
        int r;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                source_io_unregister(s);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from the time event source here if it is not ratelimited. If
                 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                        if (s->signal.unblock) {
                                sigset_t new_ss;

                                if (sigemptyset(&new_ss) < 0)
                                        log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
                                else if (sigaddset(&new_ss, s->signal.sig) < 0)
                                        log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
                                else {
                                        r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
                                        if (r != 0)
                                                log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
                                }
                        }
                }

                break;

        case SOURCE_CHILD:
                if (event_origin_changed(s->event))
                        s->child.process_owned = false;

                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        assert_se(hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid)));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is
                         * continued to being watched. That's because inotify doesn't really have an API for that: we
                         * can only change watch masks with access to the original inode either by fd or by path. But
                         * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
                         * continuously and keeping the mount busy which we can't really do. We could reconstruct the
                         * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
                         * there), but given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way, leave the
                         * mask set, even if it is not minimized now, and ignore all events we aren't interested in
                         * anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_remove_from_write_list(s);
                source_memory_pressure_unregister(s);
                break;

        default:
                assert_not_reached();
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        sd_event_unref(event);
}
static sd_event_source* source_free(sd_event_source *s) {
        int r;

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                s->io.fd = safe_close(s->io.fd);

        if (s->type == SOURCE_CHILD) {
                /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */

                if (s->child.process_owned) {
                        assert(s->child.pid > 0);
                        assert(s->child.pidfd >= 0);

                        if (!s->child.exited) {
                                r = RET_NERRNO(pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0));
                                if (r < 0 && r != -ESRCH)
                                        log_debug_errno(r, "Failed to kill process " PID_FMT ", ignoring: %m",
                                                        s->child.pid);
                        }

                        if (!s->child.waited) {
                                siginfo_t si = {};

                                /* Reap the child if we can */
                                (void) waitid(P_PIDFD, s->child.pidfd, &si, WEXITED);
                        }
                }

                if (s->child.pidfd_owned)
                        s->child.pidfd = safe_close(s->child.pidfd);
        }

        if (s->type == SOURCE_MEMORY_PRESSURE) {
                s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
                s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
        }

        if (s->destroy_callback)
                s->destroy_callback(s->userdata);

        free(s->description);
        return mfree(s);
}

DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_reshuffle(s);

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        if (s->type == SOURCE_INOTIFY) {

                assert(s->inotify.inode_data);
                assert(s->inotify.inode_data->inotify_data);

                if (b)
                        s->inotify.inode_data->inotify_data->n_pending++;
                else {
                        assert(s->inotify.inode_data->inotify_data->n_pending > 0);
                        s->inotify.inode_data->inotify_data->n_pending--;
                }
        }

        return 0;
}
static sd_event_source* source_new(sd_event *e, bool floating, EventSourceType type) {

        /* Let's allocate exactly what we need. Note that the difference of the smallest event source
         * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
         * lines. */
        static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
                [SOURCE_IO]                  = endoffsetof_field(sd_event_source, io),
                [SOURCE_TIME_REALTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME]       = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_MONOTONIC]      = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
                [SOURCE_SIGNAL]              = endoffsetof_field(sd_event_source, signal),
                [SOURCE_CHILD]               = endoffsetof_field(sd_event_source, child),
                [SOURCE_DEFER]               = endoffsetof_field(sd_event_source, defer),
                [SOURCE_POST]                = endoffsetof_field(sd_event_source, post),
                [SOURCE_EXIT]                = endoffsetof_field(sd_event_source, exit),
                [SOURCE_INOTIFY]             = endoffsetof_field(sd_event_source, inotify),
                [SOURCE_MEMORY_PRESSURE]     = endoffsetof_field(sd_event_source, memory_pressure),
        };

        sd_event_source *s;

        assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
        assert(size_table[type] > 0);

        s = malloc0(size_table[type]);
        if (!s)
                return NULL;
        /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
         * size, even if we only allocate the initial part we need. */
        s = expand_to_usable(s, sizeof(sd_event_source));

        /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
         * than what we allocated here. */
        s->event = e;
        s->type = type;
        s->floating = floating;

        s->pending_index = PRIOQ_IDX_NULL;
        s->prepare_index = PRIOQ_IDX_NULL;

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}
static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = io_exit_callback;

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
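
/* A minimal usage sketch for the IO source API above (hypothetical handler name on_socket_ready(); the fd
 * and loop come from the caller):
 *
 *     static int on_socket_ready(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             if (revents & EPOLLIN) {
 *                     ...; // read from fd
 *             }
 *             return 0;
 *     }
 *
 *     r = sd_event_add_io(e, &source, fd, EPOLLIN, on_socket_ready, NULL);
 *
 * Passing a NULL callback installs io_exit_callback(), i.e. the event loop exits with the integer encoded
 * in userdata once the fd becomes ready. */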
static void initialize_perturb(sd_event *e) {
        sd_id128_t id = {};

        /* When we sleep for longer, we try to realign the wakeup to the same time within each
         * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
         * wakeup. However, let's take some system-specific randomness for this value, so that in a network
         * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
         * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
                e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
        else
                e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
}
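
/* Worked example: if the two 64-bit halves of the boot ID XOR to some value X, the perturbation is
 * X % USEC_PER_MINUTE, i.e. a fixed offset in the range [0, 60s) that stays stable for this boot.
 * Coalescible timers on this machine are then aligned to that offset, while machines with other boot IDs
 * land on different offsets, spreading wakeups across a fleet with synchronized clocks. */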
static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        if (_likely_(d->fd >= 0))
                return 0;

        _cleanup_close_ int fd = -EBADF;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        struct epoll_event ev = {
                .events = EPOLLIN,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
                return -errno;

        d->fd = TAKE_FD(fd);
        return 0;
}
static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
        int r;

        r = event_setup_timer_fd(e, d, clock);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        return 0;
}
static int event_source_time_prioq_put(
                sd_event_source *s,
                struct clock_data *d) {

        int r;

        assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

        r = prioq_put(d->earliest, s, &s->earliest_index);
        if (r < 0)
                return r;

        r = prioq_put(d->latest, s, &s->latest_index);
        if (r < 0) {
                assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
                s->earliest_index = PRIOQ_IDX_NULL;
                return r;
        }

        d->needs_rearm = true;
        return 0;
}
_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != UINT64_MAX, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        assert_se(d = event_get_clock_data(e, type));

        r = setup_clock_data(e, d, clock);
        if (r < 0)
                return r;

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = event_source_time_prioq_put(s, d);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
_public_ int sd_event_add_time_relative(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        usec_t t;
        int r;

        /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
         * checks for overflow. */

        r = sd_event_now(e, clock, &t);
        if (r < 0)
                return r;

        if (usec >= USEC_INFINITY - t)
                return -EOVERFLOW;

        return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
}
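
/* A minimal usage sketch (hypothetical handler name on_timeout()): arm a one-shot timer 5s from now on the
 * monotonic clock, with default accuracy:
 *
 *     static int on_timeout(sd_event_source *s, uint64_t usec, void *userdata) {
 *             ...;
 *             return 0;
 *     }
 *
 *     r = sd_event_add_time_relative(e, &source, CLOCK_MONOTONIC, 5 * USEC_PER_SEC, 0, on_timeout, NULL);
 *
 * An accuracy of 0 is replaced with DEFAULT_ACCURACY_USEC (250ms) in sd_event_add_time() above, and the
 * source is created in SD_EVENT_ONESHOT mode. */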
static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        sigset_t new_ss;
        bool block_it;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        /* Let's make sure our special flag stays outside of the valid signal range */
        assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);

        if (sig & SD_EVENT_SIGNAL_PROCMASK) {
                sig &= ~SD_EVENT_SIGNAL_PROCMASK;
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                block_it = true;
        } else {
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                r = signal_is_blocked(sig);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;

                block_it = false;
        }

        if (!callback)
                callback = signal_exit_callback;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        if (block_it) {
                sigset_t old_ss;

                if (sigemptyset(&new_ss) < 0)
                        return -errno;

                if (sigaddset(&new_ss, sig) < 0)
                        return -errno;

                r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
                if (r != 0)
                        return -r;

                r = sigismember(&old_ss, sig);
                if (r < 0)
                        return -errno;

                s->signal.unblock = !r;
        } else
                s->signal.unblock = false;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                if (s->signal.unblock)
                        (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);

                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
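
/* Callers that have not blocked the signal themselves can pass "sig | SD_EVENT_SIGNAL_PROCMASK" and the
 * loop will add the signal to the calling thread's signal mask (and undo that when the source goes away,
 * see s->signal.unblock above). A minimal sketch (hypothetical handler on_sigusr1()):
 *
 *     r = sd_event_add_signal(e, &source, SIGUSR1 | SD_EVENT_SIGNAL_PROCMASK, on_sigusr1, NULL);
 *
 * Without the flag, the signal must already be blocked on the calling thread or the call is refused. */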
static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd and because we don't want to reap the child processes
                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
                 * take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->child.pid = pid;

        /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
         * pin the PID, and make regular waitid() handling race-free. */

        s->child.pidfd = pidfd_open(pid, 0);
        if (s->child.pidfd < 0)
                return -errno;

        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;
        } else {
                /* We shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. */
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
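
/* Note for users of the child APIs above: SIGCHLD must already be blocked on the calling thread before the
 * first child source is created (the signal_is_blocked(SIGCHLD) check only runs while
 * n_online_child_sources == 0), and the handler receives a const siginfo_t* describing the child's state
 * change. With options == WEXITED the source is serviced purely via its pidfd (see
 * EVENT_SOURCE_WATCH_PIDFD()); any other options additionally route through SIGCHLD processing. */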
_public_ int sd_event_add_child_pidfd(
                sd_event *e,
                sd_event_source **ret,
                int pidfd,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        pid_t pid;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pidfd >= 0, -EBADF);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        r = pidfd_get_pid(pidfd, &pid);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.pid = pid;
        s->child.pidfd = pidfd;
        s->child.options = options;
        s->child.callback = callback;
        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for WEXITED */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;
        } else {
                /* We shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
static int generic_exit_callback(sd_event_source *s, void *userdata) {
        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_defer(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_DEFER);
        if (!s)
                return -ENOMEM;

        s->defer.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = source_set_pending(s, true);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
_public_ int sd_event_add_post(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = generic_exit_callback;

        s = source_new(e, !ret, SOURCE_POST);
        if (!s)
                return -ENOMEM;

        s->post.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = set_ensure_put(&e->post_sources, NULL, s);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
_public_ int sd_event_add_exit(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(callback, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
        if (r < 0)
                return r;

        s = source_new(e, !ret, SOURCE_EXIT);
        if (!s)
                return -ENOMEM;

        s->exit.callback = callback;
        s->userdata = userdata;
        s->exit.prioq_index = PRIOQ_IDX_NULL;
        s->enabled = SD_EVENT_ONESHOT;

        r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
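
/* Exit sources registered above are not dispatched during normal operation; they are queued on e->exit and
 * run when the loop is exiting (i.e. after sd_event_exit() has been called), ordered by the
 * exit_prioq_compare() rules near the top of this file: enabled sources first, then ascending priority. */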
_public_ int sd_event_trim_memory(void) {
        int r;

        /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
         * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
         * NULL callback parameter. */

        log_debug("Memory pressure event, trimming malloc() memory.");

        struct mallinfo2 before_mallinfo = mallinfo2();

        usec_t before_timestamp = now(CLOCK_MONOTONIC);
        hashmap_trim_pools();
        r = malloc_trim(0);
        usec_t after_timestamp = now(CLOCK_MONOTONIC);

        if (r > 0)
                log_debug("Successfully trimmed some memory.");
        else
                log_debug("Couldn't trim any memory.");

        usec_t period = after_timestamp - before_timestamp;

        struct mallinfo2 after_mallinfo = mallinfo2();
        size_t l = LESS_BY(before_mallinfo.hblkhd, after_mallinfo.hblkhd) +
                   LESS_BY(before_mallinfo.arena, after_mallinfo.arena);
        log_struct(LOG_DEBUG,
                   LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
                               FORMAT_TIMESPAN(period, 0),
                               FORMAT_BYTES(l)),
                   LOG_MESSAGE_ID(SD_MESSAGE_MEMORY_TRIM_STR),
                   LOG_ITEM("TRIMMED_BYTES=%zu", l),
                   LOG_ITEM("TRIMMED_USEC=" USEC_FMT, period));

        return 0;
}
static int memory_pressure_callback(sd_event_source *s, void *userdata) {
        sd_event_trim_memory();
        return 0;
}
_public_ int sd_event_add_memory_pressure(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_free_ char *w = NULL;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
        _cleanup_free_ void *write_buffer = NULL;
        const char *watch, *watch_fallback = NULL, *env;
        size_t write_buffer_size = 0;
        struct stat st;
        uint32_t events;
        bool locked;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = memory_pressure_callback;

        s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->memory_pressure.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;
        s->memory_pressure.fd = -EBADF;

        env = secure_getenv("MEMORY_PRESSURE_WATCH");
        if (env) {
                if (isempty(env) || path_equal(env, "/dev/null"))
                        return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
                                               "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");

                if (!path_is_absolute(env) || !path_is_normalized(env))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);

                watch = env;

                env = secure_getenv("MEMORY_PRESSURE_WRITE");
                if (env) {
                        r = unbase64mem(env, &write_buffer, &write_buffer_size);
                        if (r < 0)
                                return r;
                }

                locked = true;
        } else {

                r = is_pressure_supported();
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EOPNOTSUPP;

                /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
                 * the system wide pressure if for some reason we cannot (which could be: memory controller
                 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
                 * only use the system-wide logic. */
                r = cg_all_unified();
                if (r < 0)
                        return r;
                if (r == 0)
                        watch = "/proc/pressure/memory";
                else {
                        _cleanup_free_ char *cg = NULL;

                        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
                        if (r < 0)
                                return r;

                        w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
                        if (!w)
                                return -ENOMEM;

                        watch = w;
                        watch_fallback = "/proc/pressure/memory";
                }

                /* Android uses three levels in its userspace low memory killer logic:
                 *     some 70000 1000000
                 *     some 100000 1000000
                 *     full 70000 1000000
                 *
                 * GNOME's low memory monitor uses:
                 *     some 70000 1000000
                 *     some 100000 1000000
                 *     full 100000 1000000
                 *
                 * We'll default to the middle level that both agree on. Except we do it on a 2s window
                 * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration the
                 * kernel will allow us to do unprivileged, also in the future. */
                if (asprintf((char**) &write_buffer,
                             "%s " USEC_FMT " " USEC_FMT,
                             MEMORY_PRESSURE_DEFAULT_TYPE,
                             MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
                             MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
                        return -ENOMEM;

                write_buffer_size = strlen(write_buffer) + 1;
                locked = false;
        }

        path_fd = open(watch, O_PATH|O_CLOEXEC);
        if (path_fd < 0) {
                if (errno != ENOENT)
                        return -errno;

                /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
                 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
                 * the PSI service apparently is not supported) */
                if (!watch_fallback)
                        return locked ? -ENOENT : -EOPNOTSUPP;

                path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
                if (path_fd < 0) {
                        if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
                                return -EOPNOTSUPP;
                        return -errno;
                }
        }

        if (fstat(path_fd, &st) < 0)
                return -errno;

        if (S_ISSOCK(st.st_mode)) {
                fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
                if (fd < 0)
                        return -errno;

                r = connect_unix_path(fd, path_fd, NULL);
                if (r < 0)
                        return r;

                events = EPOLLIN;

        } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
                fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
                if (fd < 0)
                        return fd;

                if (S_ISREG(st.st_mode)) {
                        struct statfs sfs;

                        /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */

                        if (fstatfs(fd, &sfs) < 0)
                                return -errno;

                        if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
                            !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
                                return -ENOTTY;

                        events = EPOLLPRI;
                } else
                        /* For fifos and char devices just watch for EPOLLIN */
                        events = EPOLLIN;

        } else if (S_ISDIR(st.st_mode))
                return -EISDIR;
        else
                return -EBADF;

        s->memory_pressure.fd = TAKE_FD(fd);
        s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
        s->memory_pressure.write_buffer_size = write_buffer_size;
        s->memory_pressure.events = events;
        s->memory_pressure.locked = locked;

        /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
         * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
         * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
         * event sources on which writes must be executed before the first event loop iteration is
         * executed. (We could also write the data here, right away, but we want to give the caller the
         * freedom to call sd_event_source_set_memory_pressure_type() and
         * sd_event_source_set_memory_pressure_rate() before we write it.) */

        if (s->memory_pressure.write_buffer_size > 0)
                source_memory_pressure_add_to_write_list(s);

        r = source_memory_pressure_register(s, s->enabled);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
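
/* The defaults assembled above amount to a PSI trigger line of the form "some <threshold-usec> <window-usec>"
 * (200ms of stall within a 2s window, per the comment above), which gets written to memory.pressure or
 * /proc/pressure/memory before the fd is registered with the epoll. Setting $MEMORY_PRESSURE_WATCH overrides
 * the watch file, and $MEMORY_PRESSURE_WRITE may carry a base64-encoded replacement trigger string (see the
 * unbase64mem() call above). */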
static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
        if (!d)
                return;

        assert(hashmap_isempty(d->inodes));
        assert(hashmap_isempty(d->wd));

        if (d->buffer_filled > 0)
                LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);

        hashmap_free(d->inodes);
        hashmap_free(d->wd);

        assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);

        if (!event_origin_changed(e) &&
            epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");

        safe_close(d->fd);
        free(d);
}
static int event_make_inotify_data(
                sd_event *e,
                int64_t priority,
                struct inotify_data **ret) {

        _cleanup_close_ int fd = -EBADF;
        struct inotify_data *d;
        int r;

        d = hashmap_get(e->inotify_data, &priority);
        if (d) {
                if (ret)
                        *ret = d;
                return 0;
        }

        fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        d = new(struct inotify_data, 1);
        if (!d)
                return -ENOMEM;

        *d = (struct inotify_data) {
                .wakeup = WAKEUP_INOTIFY_DATA,
                .fd = TAKE_FD(fd),
                .priority = priority,
        };

        r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
        if (r < 0) {
                d->fd = safe_close(d->fd);
                free(d);
                return r;
        }

        struct epoll_event ev = {
                .events = EPOLLIN,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
                                            * remove the fd from the epoll first, which we don't want as we couldn't
                                            * add it in the first place. */
                event_free_inotify_data(e, d);
                return r;
        }

        if (ret)
                *ret = d;

        return 0;
}
static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
        int r;

        r = CMP(x->dev, y->dev);
        if (r != 0)
                return r;

        return CMP(x->ino, y->ino);
}

static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
        siphash24_compress_typesafe(d->dev, state);
        siphash24_compress_typesafe(d->ino, state);
}

DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
static void event_free_inode_data(
                sd_event *e,
                struct inode_data *d) {

        if (!d)
                return;

        assert(!d->event_sources);

        if (d->fd >= 0) {
                LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
                safe_close(d->fd);
        }

        if (d->inotify_data) {

                if (d->wd >= 0) {
                        if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
                                /* So here's a problem. At the time this runs the watch descriptor might already be
                                 * invalidated, because an IN_IGNORED event might be queued right the moment we enter
                                 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
                                 * likely case to happen. */

                                if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
                                        log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
                        }

                        assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
                }

                assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
        }

        free(d->path);
        free(d);
}
static void event_gc_inotify_data(
                sd_event *e,
                struct inotify_data *d) {

        /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
         * any inode with it anymore, which in turn happens if no event source of this priority is interested
         * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
         * (under the expectation that the GC is called again once the counter is decremented). */

        if (!d)
                return;

        if (!hashmap_isempty(d->inodes))
                return;

        if (d->n_busy > 0)
                return;

        event_free_inotify_data(e, d);
}

static void event_gc_inode_data(
                sd_event *e,
                struct inode_data *d) {

        struct inotify_data *inotify_data;

        if (!d)
                return;

        if (d->event_sources)
                return;

        inotify_data = d->inotify_data;
        event_free_inode_data(e, d);

        event_gc_inotify_data(e, inotify_data);
}
static int event_make_inode_data(
                sd_event *e,
                struct inotify_data *inotify_data,
                dev_t dev,
                ino_t ino,
                struct inode_data **ret) {

        struct inode_data *d, key;
        int r;

        assert(inotify_data);

        key = (struct inode_data) {
                .ino = ino,
                .dev = dev,
        };

        d = hashmap_get(inotify_data->inodes, &key);
        if (d) {
                if (ret)
                        *ret = d;
                return 0;
        }

        r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
        if (r < 0)
                return r;

        d = new(struct inode_data, 1);
        if (!d)
                return -ENOMEM;

        *d = (struct inode_data) {
                .dev = dev,
                .ino = ino,
                .wd = -1,
                .fd = -EBADF,
                .inotify_data = inotify_data,
        };

        r = hashmap_put(inotify_data->inodes, d, d);
        if (r < 0) {
                free(d);
                return r;
        }

        if (ret)
                *ret = d;

        return 0;
}
static uint32_t inode_data_determine_mask(struct inode_data *d) {
        bool excl_unlink = true;
        uint32_t combined = 0;

        /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
         * the IN_EXCL_UNLINK flag is ANDed instead.
         *
         * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
         * because we cannot change the mask anymore after the event source was created once, since the kernel has no
         * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
         * events we don't care for client-side. */

        LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {

                if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
                        excl_unlink = false;

                combined |= s->inotify.mask;
        }

        return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
}
static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
        uint32_t combined_mask;
        int wd, r;

        combined_mask = inode_data_determine_mask(d);

        if (d->wd >= 0 && combined_mask == d->combined_mask)
                return 0;

        r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
        if (r < 0)
                return r;

        wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
        if (wd < 0)
                return wd;

        if (d->wd < 0) {
                r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
                if (r < 0) {
                        (void) inotify_rm_watch(d->inotify_data->fd, wd);
                        return r;
                }

                d->wd = wd;

        } else if (d->wd != wd) {

                log_debug("Weird, the watch descriptor we already knew for this inode changed?");
                (void) inotify_rm_watch(d->fd, wd);
                return -EINVAL;
        }

        d->combined_mask = combined_mask;
        return 0;
}
static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
static int event_add_inotify_fd_internal(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                bool donate,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct inotify_data *inotify_data = NULL;
        struct inode_data *inode_data = NULL;
        struct stat st;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = inotify_exit_callback;

        /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
         * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
         * the user can't use them for us. */
        if (mask & IN_MASK_ADD)
                return -EINVAL;

        if (fstat(fd, &st) < 0)
                return -errno;

        s = source_new(e, !ret, SOURCE_INOTIFY);
        if (!s)
                return -ENOMEM;

        s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
        s->inotify.mask = mask;
        s->inotify.callback = callback;
        s->userdata = userdata;

        /* Allocate an inotify object for this priority, and an inode object within it */
        r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
        if (r < 0)
                return r;

        r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
        if (r < 0) {
                event_gc_inotify_data(e, inotify_data);
                return r;
        }

        /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
         * the event source, until then, for which we need the original inode. */
        if (inode_data->fd < 0) {
                if (donated_fd >= 0)
                        inode_data->fd = TAKE_FD(donated_fd);
                else {
                        inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
                        if (inode_data->fd < 0) {
                                r = -errno;
                                event_gc_inode_data(e, inode_data);
                                return r;
                        }
                }

                LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);

                _cleanup_free_ char *path = NULL;
                r = fd_get_path(inode_data->fd, &path);
                if (r < 0 && r != -ENOSYS) { /* The path is optional, hence ignore -ENOSYS. */
                        event_gc_inode_data(e, inode_data);
                        return r;
                }

                free_and_replace(inode_data->path, path);
        }

        /* Link our event source to the inode data object */
        LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
        s->inotify.inode_data = inode_data;

        /* Actually realize the watch now */
        r = inode_data_realize_watch(e, inode_data);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
_public_ int sd_event_add_inotify_fd(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
}

_public_ int sd_event_add_inotify(
                sd_event *e,
                sd_event_source **ret,
                const char *path,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */

        assert_return(path, -EINVAL);

        fd = open(path, O_PATH | O_CLOEXEC |
                  (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
                  (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));

        r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);

        (void) sd_event_source_set_description(s, path);
}
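
/* Illustrative sketch (not part of this file): a minimal caller of the public
 * sd_event_add_inotify() API above, watching /tmp for newly created files. The program
 * name and the watched path are arbitrary example choices. */
#if 0
#include <stdio.h>
#include <sys/inotify.h>
#include <systemd/sd-event.h>

static int on_inotify(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
        if (ev->len > 0)
                printf("saw: %s\n", ev->name);
        return 0;
}

int main(void) {
        sd_event *e = NULL;
        sd_event_source *src = NULL;
        int r;

        r = sd_event_default(&e);
        if (r < 0)
                return 1;

        /* IN_ONLYDIR maps to O_DIRECTORY in the open() call above. */
        r = sd_event_add_inotify(e, &src, "/tmp", IN_CREATE | IN_MOVED_TO | IN_ONLYDIR, on_inotify, NULL);
        if (r < 0)
                return 1;

        r = sd_event_loop(e);

        sd_event_source_unref(src);
        sd_event_unref(e);
        return r < 0;
}
#endif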
static sd_event_source* event_source_free(sd_event_source *s) {

        /* Here's a special hack: when we are called from a
         * dispatch handler we won't free the event source
         * immediately, but we will detach the fd from the
         * epoll. This way it is safe for the caller to unref
         * the event source and immediately close the fd, but
         * we still retain a valid event source object after
         * the callback. */

        source_disconnect(s);
}

DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return free_and_strdup(&s->description, description);
}

_public_ int sd_event_source_get_description(sd_event_source *s, const char **ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);

        if (!s->description)

        *ret = s->description;
}

_public_ sd_event* sd_event_source_get_event(sd_event_source *s) {
        assert_return(s, NULL);
        assert_return(!event_origin_changed(s->event), NULL);
}

_public_ int sd_event_source_get_pending(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);
}

_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);
}
_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {

        assert_return(s, -EINVAL);
        assert_return(fd >= 0, -EBADF);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        saved_fd = s->io.fd;

        assert(event_source_is_offline(s) == !s->io.registered);

        if (s->io.registered) {
                s->io.registered = false;

                r = source_io_register(s, s->enabled, s->io.events);

                        s->io.fd = saved_fd;
                        s->io.registered = true;

                (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);

        safe_close(saved_fd);
}
_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);
}

_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);
}

_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *ret = s->io.events;
}
_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* edge-triggered updates are never skipped, so we can reset edges */
        if (s->io.events == events && !(events & EPOLLET))

        r = source_set_pending(s, false);

        if (event_source_is_online(s)) {
                r = source_io_register(s, s->enabled, events);

        s->io.events = events;
}
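
/* Illustrative sketch (not part of this file): a caller that only subscribes to EPOLLOUT
 * while it actually has something queued for writing, relying on the short-cut above that
 * skips re-registration when the mask is unchanged. "have_data_to_write" is a hypothetical
 * flag owned by the application. */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <sys/epoll.h>
#include <systemd/sd-event.h>

static int update_io_interest(sd_event_source *io_source, bool have_data_to_write) {
        uint32_t events = EPOLLIN;

        if (have_data_to_write)
                events |= EPOLLOUT;

        /* No-op if the mask didn't change (unless EPOLLET is set, see above). */
        return sd_event_source_set_io_events(io_source, events);
}
#endif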
_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(s->type == SOURCE_IO, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *ret = s->io.revents;
}

_public_ int sd_event_source_get_signal(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_SIGNAL, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->signal.sig;
}

_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);
}
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        bool rm_inotify = false, rm_inode = false;
        struct inotify_data *new_inotify_data = NULL;
        struct inode_data *new_inode_data = NULL;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->priority == priority)

        if (s->type == SOURCE_INOTIFY) {
                struct inode_data *old_inode_data;

                assert(s->inotify.inode_data);
                old_inode_data = s->inotify.inode_data;

                /* We need the original fd to change the priority. If we don't have it we can't change the priority,
                 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
                 * events we allow priority changes only until the first following iteration. */
                if (old_inode_data->fd < 0)

                r = event_make_inotify_data(s->event, priority, &new_inotify_data);

                r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);

                if (new_inode_data->fd < 0) {
                        /* Duplicate the fd for the new inode object if we don't have any yet */
                        new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
                        if (new_inode_data->fd < 0) {

                        LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);

                        _cleanup_free_ char *path = NULL;
                        r = fd_get_path(new_inode_data->fd, &path);
                        if (r < 0 && r != -ENOSYS)

                        free_and_replace(new_inode_data->path, path);

                /* Move the event source to the new inode data structure */
                LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
                LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
                s->inotify.inode_data = new_inode_data;

                /* Now create the new watch */
                r = inode_data_realize_watch(s->event, new_inode_data);

                        LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
                        LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
                        s->inotify.inode_data = old_inode_data;

                s->priority = priority;

                event_gc_inode_data(s->event, old_inode_data);

        } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);

                        s->priority = old->priority;

                event_unmask_signal_data(s->event, old, s->signal.sig);

        s->priority = priority;

        event_source_pp_prioq_reshuffle(s);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        event_free_inode_data(s->event, new_inode_data);

        event_free_inotify_data(s->event, new_inotify_data);
}
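
/* Illustrative sketch (not part of this file): bumping a source to a higher (i.e. numerically
 * lower) priority so it is dispatched before SD_EVENT_PRIORITY_NORMAL sources. For inotify
 * sources this only works until the first event loop iteration after creation, as explained
 * above. */
#if 0
#include <systemd/sd-event.h>

static int make_important(sd_event_source *s) {
        /* Lower values are dispatched first: IMPORTANT < NORMAL < IDLE. */
        return sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IMPORTANT);
}
#endif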
_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {

        /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */

        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->enabled != SD_EVENT_OFF;
}
static int event_source_offline(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        assert(enabled == SD_EVENT_OFF || ratelimited);

        /* Unset the pending flag when this event source is disabled */
        if (s->enabled != SD_EVENT_OFF &&
            enabled == SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);

        was_offline = event_source_is_offline(s);
        s->enabled = enabled;
        s->ratelimited = ratelimited;

                source_io_unregister(s);

                event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                assert(s->event->n_online_child_sources > 0);
                s->event->n_online_child_sources--;

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);

                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_unregister(s);

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:

        case SOURCE_INOTIFY:

                assert_not_reached();

        /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
        event_source_time_prioq_reshuffle(s);
}
static int event_source_online(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        assert(enabled != SD_EVENT_OFF || !ratelimited);

        /* Unset the pending flag when this event source is enabled */
        if (s->enabled == SD_EVENT_OFF &&
            enabled != SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);

        /* Are we really ready for onlining? */
        if (enabled == SD_EVENT_OFF || ratelimited) {
                /* Nope, we are not ready for onlining, then just update the precise state and exit */
                s->enabled = enabled;
                s->ratelimited = ratelimited;

        was_online = event_source_is_online(s);

                r = source_io_register(s, enabled, s->io.events);

                r = event_make_signal_data(s->event, s->signal.sig, NULL);

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                        /* yes, we can rely on pidfd */

                        r = source_child_pidfd_register(s, enabled);

                        /* something other to watch for than WEXITED */

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);

                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                s->event->n_online_child_sources++;

        case SOURCE_MEMORY_PRESSURE:
                r = source_memory_pressure_register(s, enabled);

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:

        case SOURCE_INOTIFY:

                assert_not_reached();

        s->enabled = enabled;
        s->ratelimited = ratelimited;

        /* Non-failing operations below */
        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
        event_source_time_prioq_reshuffle(s);
}
_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {

        assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);

        /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
        if (m == SD_EVENT_OFF && !s)

        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
        if (s->event->state == SD_EVENT_FINISHED)
                return m == SD_EVENT_OFF ? 0 : -ESTALE;

        if (s->enabled == m) /* No change? */

        if (m == SD_EVENT_OFF)
                r = event_source_offline(s, m, s->ratelimited);

                if (s->enabled != SD_EVENT_OFF) {
                        /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
                         * event source is already enabled after all. */

                r = event_source_online(s, m, s->ratelimited);

        event_source_pp_prioq_reshuffle(s);
}
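
/* Illustrative sketch (not part of this file): because SD_EVENT_OFF on a NULL source is
 * accepted as a no-op ("quick mode" above), and because turning sources off is also allowed
 * on a finished loop, tear-down paths can disable sources without extra checks. */
#if 0
#include <systemd/sd-event.h>

static void pause_source(sd_event_source *s) {
        /* Safe even if s is NULL or the event loop already finished. */
        (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
}
#endif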
_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *ret = s->time.next;
}

_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        r = source_set_pending(s, false);

        s->time.next = usec;

        event_source_time_prioq_reshuffle(s);
}
_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {

        assert_return(s, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (usec == USEC_INFINITY)
                return sd_event_source_set_time(s, USEC_INFINITY);

        r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);

        usec = usec_add(t, usec);
        if (usec == USEC_INFINITY)

        return sd_event_source_set_time(s, usec);
}
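
/* Illustrative sketch (not part of this file): a repeating 5s timer built from a oneshot
 * time source that re-arms itself relative to the cached "now" of its clock via
 * sd_event_source_set_time_relative() from within its own callback. The 5s interval is an
 * arbitrary example value. */
#if 0
#include <stdint.h>
#include <time.h>
#include <systemd/sd-event.h>

static int on_tick(sd_event_source *s, uint64_t usec, void *userdata) {
        /* Re-arm 5s from now, then switch the (already disabled) oneshot source back on. */
        (void) sd_event_source_set_time_relative(s, 5 * 1000 * 1000ULL);
        return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
}

static int add_tick_timer(sd_event *e, sd_event_source **ret) {
        /* Accuracy 0 selects the default accuracy (DEFAULT_ACCURACY_USEC, i.e. 250ms). */
        return sd_event_add_time_relative(e, ret, CLOCK_MONOTONIC,
                                          5 * 1000 * 1000ULL, 0, on_tick, NULL);
}
#endif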
_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *ret = s->time.accuracy;
}

_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {

        assert_return(s, -EINVAL);
        assert_return(usec != UINT64_MAX, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        r = source_set_pending(s, false);

        usec = DEFAULT_ACCURACY_USEC;

        s->time.accuracy = usec;

        event_source_time_prioq_reshuffle(s);
}

_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *ret = event_source_type_to_clock(s->type);
}
_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *ret = s->child.pid;
}

_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->child.pidfd;
}
_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);
        assert_return(SIGNAL_VALID(sig), -EINVAL);
        assert(s->child.pidfd >= 0);

        /* If we already have seen indication the process exited refuse sending a signal early. */
        if (s->child.exited)

        assert(!s->child.waited);

        /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the structure here. */

        return RET_NERRNO(pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, flags));
}
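
/* Illustrative sketch (not part of this file): watching a forked child and asking it to
 * terminate via the pidfd-based signal helper above. Note that sd_event_add_child()
 * requires SIGCHLD to be blocked in the calling thread. */
#if 0
#include <signal.h>
#include <sys/wait.h>
#include <systemd/sd-event.h>

static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
        /* The process is still a zombie here (WNOWAIT above); it is reaped right after
         * this callback returns. */
        return sd_event_exit(sd_event_source_get_event(s), si->si_status);
}

static int watch_and_stop_child(sd_event *e, pid_t pid, sd_event_source **ret) {
        int r;

        r = sd_event_add_child(e, ret, pid, WEXITED, on_child, NULL);
        if (r < 0)
                return r;

        /* Delivered through the pidfd, hence immune to PID recycling. */
        return sd_event_source_send_child_signal(*ret, SIGTERM, NULL, 0);
}
#endif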
_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);
        assert(s->child.pidfd >= 0);

        return s->child.pidfd_owned;
}

_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);
        assert(s->child.pidfd >= 0);

        s->child.pidfd_owned = own;
}

_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->child.process_owned;
}

_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_CHILD, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->child.process_owned = own;
}
_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(s->type == SOURCE_INOTIFY, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        *ret = s->inotify.mask;
}

_public_ int sd_event_source_get_inotify_path(sd_event_source *s, const char **ret) {
        assert_return(s, -EINVAL);
        assert_return(ret, -EINVAL);
        assert_return(s->type == SOURCE_INOTIFY, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (!s->inotify.inode_data)
                return -ESTALE; /* already disconnected. */

        if (!s->inotify.inode_data->path)
                return -ENOSYS; /* /proc was not mounted? */

        *ret = s->inotify.inode_data->path;
}
_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {

        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->prepare == callback)

        if (callback && s->prepare) {
                s->prepare = callback;

        r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);

        s->prepare = callback;

                r = prioq_put(s->event->prepare, s, &s->prepare_index);

                        prioq_remove(s->event->prepare, s, &s->prepare_index);
}
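
/* Illustrative sketch (not part of this file): a prepare callback runs right before the
 * loop polls; here it enables an IO source only while a hypothetical application queue
 * has data, avoiding spurious EPOLLOUT wakeups. "my_queue" and "my_queue_empty()" are
 * made-up names. */
#if 0
#include <stdbool.h>
#include <systemd/sd-event.h>

struct my_queue;
bool my_queue_empty(struct my_queue *q);

static int on_prepare(sd_event_source *io_source, void *userdata) {
        struct my_queue *q = userdata;

        return sd_event_source_set_enabled(io_source,
                                           my_queue_empty(q) ? SD_EVENT_OFF : SD_EVENT_ON);
}

/* Installed with: sd_event_source_set_prepare(io_source, on_prepare); */
#endif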
_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
        assert_return(s, NULL);
        assert_return(!event_origin_changed(s->event), NULL);
}

_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {

        assert_return(s, NULL);
        assert_return(!event_origin_changed(s->event), NULL);

        s->userdata = userdata;
}
static int event_source_enter_ratelimited(sd_event_source *s) {

        /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
         * the end of the rate limit time window, much as if it was a timer event source. */

                return 0; /* Already ratelimited, this is a NOP hence */

        /* Make sure we can install a CLOCK_MONOTONIC event further down. */
        r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);

        /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
         * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
         * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));

        /* Now, let's add the event source to the monotonic clock instead */
        r = event_source_time_prioq_put(s, &s->event->monotonic);

        /* And let's take the event source officially offline */
        r = event_source_offline(s, s->enabled, /* ratelimited= */ true);

                event_source_time_prioq_remove(s, &s->event->monotonic);

        event_source_pp_prioq_reshuffle(s);

        log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));

        /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
         * space for it should already be allocated. */
        if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
}
static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {

        if (!s->ratelimited)

        /* Let's take the event source out of the monotonic prioq first. */
        event_source_time_prioq_remove(s, &s->event->monotonic);

        /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
        if (EVENT_SOURCE_IS_TIME(s->type)) {
                r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));

        /* Let's try to take it online again. */
        r = event_source_online(s, s->enabled, /* ratelimited= */ false);

                /* Do something roughly sensible when this failed: undo the two prioq ops above */
                if (EVENT_SOURCE_IS_TIME(s->type))
                        event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));

        event_source_pp_prioq_reshuffle(s);
        ratelimit_reset(&s->rate_limit);

        log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));

        if (run_callback && s->ratelimit_expire_callback) {
                s->dispatching = true;
                r = s->ratelimit_expire_callback(s, s->userdata);
                s->dispatching = false;

                        log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
                                        strna(s->description),
                                        event_source_type_to_string(s->type),
                                        s->exit_on_failure ? "exiting" : "disabling");

                        if (s->exit_on_failure)
                                (void) sd_event_exit(s->event, r);

                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);

        /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
         * simply put it back in it, maybe we can then process it more successfully next iteration. */
        assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
}
static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {

        if (a >= USEC_INFINITY)
                return USEC_INFINITY;

        initialize_perturb(e);

        /*
          Find a good time to wake up again between times a and b. We
          have two goals here:

          a) We want to wake up as seldom as possible, hence prefer
             later times over earlier times.

          b) But if we have to wake up, then let's make sure to
             dispatch as much as possible on the entire system.

          We implement this by waking up everywhere at the same time
          within any given minute if we can, synchronised via the
          perturbation value determined from the boot ID. If we can't,
          then we try to find the same spot in every 10s, then 1s and
          then 250ms step. Otherwise, we pick the last possible time
          to wake up.
        */

        c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
        if (_unlikely_(c < USEC_PER_MINUTE))

        c -= USEC_PER_MINUTE;

        c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
        if (_unlikely_(c < USEC_PER_SEC*10))

        c -= USEC_PER_SEC*10;

        c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
        if (_unlikely_(c < USEC_PER_SEC))

        c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
        if (_unlikely_(c < USEC_PER_MSEC*250))

        c -= USEC_PER_MSEC*250;
}
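
/* Illustrative sketch (not part of this file): the coalescing rule above for a single
 * granularity step, using small numbers instead of microseconds. With a perturbation of 17,
 * a = 100 and b = 130, the 60-step candidate 137 is pulled back to 77 and rejected (it lies
 * before a), while the 10-step candidate 137 is pulled back to 127 and accepted. */
#if 0
static unsigned long long coalesce_step(unsigned long long a, unsigned long long b,
                                        unsigned long long step, unsigned long long perturb) {
        unsigned long long c = (b / step) * step + (perturb % step);

        if (c >= b) {
                if (c < step)
                        return b;       /* would underflow, give up on this granularity */
                c -= step;
        }

        return c >= a ? c : 0;          /* 0: no hit here, the caller tries a finer step */
}

/* coalesce_step(100, 130, 60, 17) == 0, coalesce_step(100, 130, 10, 17) == 127 */
#endif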
static int event_arm_timer(
                sd_event *e,
                struct clock_data *d) {

        struct itimerspec its = {};
        sd_event_source *a, *b;

        if (!d->needs_rearm)

        d->needs_rearm = false;

        a = prioq_peek(d->earliest);
        assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
        if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {

                if (d->next == USEC_INFINITY)

                if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)

                d->next = USEC_INFINITY;

        b = prioq_peek(d->latest);
        assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
        assert(b && b->enabled != SD_EVENT_OFF);

        t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));

        assert_se(d->fd >= 0);

                /* We don't want to disarm here, just mean some time looooong ago. */
                its.it_value.tv_sec = 0;
                its.it_value.tv_nsec = 1;

                timespec_store(&its.it_value, t);

        if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)

}
static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {

        assert(s->type == SOURCE_IO);

        /* If the event source was already pending, we just OR in the
         * new revents, otherwise we reset the value. The ORing is
         * necessary to handle EPOLLONESHOT events properly where
         * readability might happen independently of writability, and
         * we need to keep track of both */

                s->io.revents |= revents;

                s->io.revents = revents;

        return source_set_pending(s, true);
}

static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {

        assert_return(events == EPOLLIN, -EIO);

        ss = read(fd, &x, sizeof(x));

                if (ERRNO_IS_TRANSIENT(errno))

        if (_unlikely_(ss != sizeof(x)))

                *next = USEC_INFINITY;
}
static int process_timer(
                sd_event *e,
                usec_t n,
                struct clock_data *d) {

        bool callback_invoked = false;

        s = prioq_peek(d->earliest);
        assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

        if (!s || time_event_source_next(s) > n)

        if (s->ratelimited) {
                /* This is an event source whose ratelimit window has ended. Let's turn it on
                 * again. */
                assert(s->ratelimited);

                r = event_source_leave_ratelimit(s, /* run_callback */ true);

                callback_invoked = true;

        if (s->enabled == SD_EVENT_OFF || s->pending)

        r = source_set_pending(s, true);

        event_source_time_prioq_reshuffle(s);

        return callback_invoked;
}
static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
        int64_t min_priority = threshold;
        bool something_new = false;

        assert(ret_min_priority);

        if (!e->need_process_child) {
                *ret_min_priority = min_priority;

        e->need_process_child = false;

        /* So, this is ugly. We iteratively invoke waitid() + WNOHANG with each child process we shall wait for,
         * instead of using P_ALL. This is because we only want to get child information of very specific
         * child processes, and not all of them. We might not have processed the SIGCHLD event
         * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
         * hence we really don't want anything flushed out of the kernel's queue that we don't care
         * about. Since this is O(n) this means that if you have a lot of processes you probably want
         * to handle SIGCHLD yourself.
         *
         * We do not reap the children here (by using WNOWAIT), this is only done after the event
         * source is dispatched so that the callback still sees the process as a zombie. */

        HASHMAP_FOREACH(s, e->child_sources) {
                assert(s->type == SOURCE_CHILD);
                assert(s->child.pidfd >= 0);

                if (s->priority > threshold)

                if (event_source_is_offline(s))

                if (s->child.exited)

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        /* There's a usable pidfd known for this event source? Then don't waitid() for
                         * it here. */

                zero(s->child.siginfo);
                if (waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo,
                           WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
                        return negative_errno();

                if (s->child.siginfo.si_pid != 0) {
                        bool zombie = SIGINFO_CODE_IS_DEAD(s->child.siginfo.si_code);

                                s->child.exited = true;
                        else if (s->child.options & WEXITED) {
                                /* If the child isn't dead then let's immediately remove the state change
                                 * from the queue, since there's no benefit in leaving it queued. */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                (void) waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));

                        r = source_set_pending(s, true);

                        something_new = true;
                        min_priority = MIN(min_priority, s->priority);

        *ret_min_priority = min_priority;
        return something_new;
}
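
/* Illustrative sketch (not part of this file): the comment above suggests handling SIGCHLD
 * yourself when watching many processes. Rough approach, assuming no sd_event child sources
 * are in use: block SIGCHLD, add one signal source, and drain all exited children per
 * wakeup. */
#if 0
#include <errno.h>
#include <signal.h>
#include <sys/signalfd.h>
#include <sys/wait.h>
#include <systemd/sd-event.h>

static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        for (;;) {
                siginfo_t info = {};

                /* WNOHANG: reap everything that already exited, then stop. */
                if (waitid(P_ALL, 0, &info, WEXITED | WNOHANG) < 0 || info.si_pid == 0)
                        break;

                /* Look up info.si_pid in an application-level table here. */
        }
        return 0;
}

static int setup_sigchld(sd_event *e) {
        sigset_t mask;

        /* sd_event_add_signal() requires the signal to be blocked. */
        sigemptyset(&mask);
        sigaddset(&mask, SIGCHLD);
        if (sigprocmask(SIG_BLOCK, &mask, NULL) < 0)
                return -errno;

        return sd_event_add_signal(e, NULL, SIGCHLD, on_sigchld, NULL);
}
#endif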
static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {

        assert(s->type == SOURCE_CHILD);
        assert(s->child.pidfd >= 0);

        if (event_source_is_offline(s))

        if (!EVENT_SOURCE_WATCH_PIDFD(s))

        /* Note that pidfd would also generate EPOLLHUP when the process gets reaped. But at this point we
         * only permit EPOLLIN, under the assumption that upon EPOLLHUP the child source should already
         * be set to pending, and we would have returned early above. */
        assert(!s->child.exited);

        zero(s->child.siginfo);
        if (waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)

        if (s->child.siginfo.si_pid == 0)

        if (SIGINFO_CODE_IS_DEAD(s->child.siginfo.si_code))
                s->child.exited = true;

        return source_set_pending(s, true);
}
static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {

        assert_return(events == EPOLLIN, -EIO);
        assert(min_priority);

        /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
         * sure to recheck the children we watch. This is because we only ever dequeue the first signal
         * per priority, and if we dequeue one, and SIGCHLD might be enqueued later we wouldn't know,
         * but we might have higher priority children we care about hence we need to check that
         * explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

        /* If there's already an event source pending for this priority we don't read another */

                struct signalfd_siginfo si;

                sd_event_source *s = NULL;

                n = read(d->fd, &si, sizeof(si));

                        if (ERRNO_IS_TRANSIENT(errno))

                if (_unlikely_(n != sizeof(si)))

                if (_unlikely_(!SIGNAL_VALID(si.ssi_signo)))

                if (e->signal_sources)
                        s = e->signal_sources[si.ssi_signo];

                s->signal.siginfo = si;

                r = source_set_pending(s, true);

                if (r > 0 && *min_priority >= s->priority) {
                        *min_priority = s->priority;
                        return 1; /* an event source with smaller priority is queued. */
}
static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {

        assert_return(revents == EPOLLIN, -EIO);

        /* If there's already an event source pending for this priority, don't read another */
        if (d->n_pending > 0)

        /* Is the read buffer non-empty? If so, let's not read more */
        if (d->buffer_filled > 0)

        if (d->priority > threshold)

        n = read(d->fd, &d->buffer, sizeof(d->buffer));

                if (ERRNO_IS_TRANSIENT(errno))

        d->buffer_filled = (size_t) n;
        LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
}

static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {

        assert(sz <= d->buffer_filled);

        /* Move the rest of the buffer to the front, in order to get things properly aligned again */
        memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
        d->buffer_filled -= sz;

        if (d->buffer_filled == 0)
                LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
}
static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {

        /* If there's already an event source pending for this priority, don't read another */
        if (d->n_pending > 0)

        while (d->buffer_filled > 0) {

                /* Let's validate that the event structures are complete */
                if (d->buffer_filled < offsetof(struct inotify_event, name))

                sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
                if (d->buffer_filled < sz)

                if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
                        struct inode_data *inode_data;

                        /* The queue overran, let's pass this event to all event sources connected to this inotify
                         * object. */

                        HASHMAP_FOREACH(inode_data, d->inodes)
                                LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {

                                        if (event_source_is_offline(s))

                                        r = source_set_pending(s, true);

                        struct inode_data *inode_data;

                        /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
                         * our watch descriptor table. */
                        if (d->buffer.ev.mask & IN_IGNORED) {

                                inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));

                                        event_inotify_data_drop(e, d, sz);

                                /* The watch descriptor was removed by the kernel, let's drop it here too */
                                inode_data->wd = -1;

                                inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));

                                        event_inotify_data_drop(e, d, sz);

                        /* Trigger all event sources that are interested in these events. Also trigger all event
                         * sources if IN_IGNORED or IN_UNMOUNT is set. */
                        LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {

                                if (event_source_is_offline(s))

                                if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
                                    (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)

                                r = source_set_pending(s, true);

                /* Something pending now? If so, let's finish, otherwise let's read more. */
                if (d->n_pending > 0)
}

static int process_inotify(sd_event *e) {

        LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
                r = event_inotify_data_process(e, d);
}
static int process_memory_pressure(sd_event_source *s, uint32_t revents) {

        assert(s->type == SOURCE_MEMORY_PRESSURE);

                s->memory_pressure.revents |= revents;

                s->memory_pressure.revents = revents;

        return source_set_pending(s, true);
}
static int source_memory_pressure_write(sd_event_source *s) {

        assert(s->type == SOURCE_MEMORY_PRESSURE);

        /* once we start writing, the buffer is locked, we allow no further changes. */
        s->memory_pressure.locked = true;

        if (s->memory_pressure.write_buffer_size > 0) {
                n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);

                        if (!ERRNO_IS_TRANSIENT(errno)) {
                                /* If the kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
                                 * files, but then generates EOPNOTSUPP on read() and write() (instead of on
                                 * open()!). This sucks hard, since we can only detect this kind of failure
                                 * so late. Let's make the best of it, and turn off the event source like we
                                 * do for failed event source handlers. */

                                log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
                                assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);

        if ((size_t) n == s->memory_pressure.write_buffer_size) {
                s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);

                s->memory_pressure.write_buffer_size = 0;

                /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
                r = source_memory_pressure_register(s, s->enabled);

                _cleanup_free_ void *c = NULL;

                assert((size_t) n < s->memory_pressure.write_buffer_size);

                c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);

                free_and_replace(s->memory_pressure.write_buffer, c);
                s->memory_pressure.write_buffer_size -= n;
}
static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {

        assert(s->type == SOURCE_MEMORY_PRESSURE);

        r = source_memory_pressure_write(s);

                return 1; /* if we wrote something, then don't continue with dispatching the user callback.
                           * Instead, shortcut it so that we wait for the next EPOLLOUT immediately. */

        /* No pending incoming IO? Then let's not continue further */
        if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {

                /* Treat IO errors on the notifier the same way as errors returned from a callback */
                if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)

                return 1; /* leave dispatch, we already processed everything */

        if (s->memory_pressure.revents & EPOLLIN) {
                uint8_t pipe_buf[PIPE_BUF];

                /* If the fd is readable, then flush out anything that might be queued */

                n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
                if (n < 0 && !ERRNO_IS_TRANSIENT(errno))

        return 0; /* go on, dispatch to user callback */
}
static int source_dispatch(sd_event_source *s) {
        EventSourceType saved_type;
        sd_event *saved_event;

        assert(s->pending || s->type == SOURCE_EXIT);

        /* Save the event source type, here, so that we still know it after the event callback which might
         * invalidate the event. */
        saved_type = s->type;

        /* Similarly, store a reference to the event loop object, so that we can still access it after the
         * callback might have invalidated/disconnected the event source. */
        saved_event = s->event;
        PROTECT_EVENT(saved_event);

        /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
        assert(!s->ratelimited);
        if (!ratelimit_below(&s->rate_limit)) {
                r = event_source_enter_ratelimited(s);

        if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);

        if (s->type != SOURCE_POST) {

                /* If we execute a non-post source, let's mark all post sources as pending. */

                SET_FOREACH(z, s->event->post_sources) {
                        if (event_source_is_offline(z))

                        r = source_set_pending(z, true);

        if (s->type == SOURCE_MEMORY_PRESSURE) {
                r = source_memory_pressure_initiate_dispatch(s);
                if (r == -EIO) /* handle EIO errors similar to callback errors */

                if (r > 0) /* already handled */

        if (s->enabled == SD_EVENT_ONESHOT) {
                r = sd_event_source_set_enabled(s, SD_EVENT_OFF);

        s->dispatching = true;

                r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                r = s->time.callback(s, s->time.next, s->userdata);

                r = s->signal.callback(s, &s->signal.siginfo, s->userdata);

        case SOURCE_CHILD: {
                bool zombie = SIGINFO_CODE_IS_DEAD(s->child.siginfo.si_code);

                r = s->child.callback(s, &s->child.siginfo, s->userdata);

                /* Now, reap the PID for good. */

                        (void) waitid(P_PIDFD, s->child.pidfd, &s->child.siginfo, WNOHANG|WEXITED);
                        s->child.waited = true;

                r = s->defer.callback(s, s->userdata);

                r = s->post.callback(s, s->userdata);

                r = s->exit.callback(s, s->userdata);

        case SOURCE_INOTIFY: {
                struct sd_event *e = s->event;
                struct inotify_data *d;

                assert(s->inotify.inode_data);
                assert_se(d = s->inotify.inode_data->inotify_data);

                assert(d->buffer_filled >= offsetof(struct inotify_event, name));
                sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
                assert(d->buffer_filled >= sz);

                /* If the inotify callback destroys the event source then this likely means we don't need to
                 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
                 * free it immediately, then we couldn't drop the event from the inotify event queue without
                 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
                 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
                 * explicitly GC it after we are done dropping the inotify event from the buffer. */

                r = s->inotify.callback(s, &d->buffer.ev, s->userdata);

                /* When no event is pending anymore on this inotify object, then let's drop the event from
                 * the inotify event queue buffer. */
                if (d->n_pending == 0)
                        event_inotify_data_drop(e, d, sz);

                /* Now we don't want to access 'd' anymore, it's OK to GC now. */
                event_gc_inotify_data(e, d);

        case SOURCE_MEMORY_PRESSURE:
                r = s->memory_pressure.callback(s, s->userdata);

        case SOURCE_WATCHDOG:
        case _SOURCE_EVENT_SOURCE_TYPE_MAX:
        case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
                assert_not_reached();

        s->dispatching = false;

                log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
                                strna(s->description),
                                event_source_type_to_string(saved_type),
                                s->exit_on_failure ? "exiting" : "disabling");

                if (s->exit_on_failure)
                        (void) sd_event_exit(saved_event, r);

                assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
}
static int event_prepare(sd_event *e) {

                s = prioq_peek(e->prepare);
                if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))

                s->prepare_iteration = e->iteration;
                prioq_reshuffle(e->prepare, s, &s->prepare_index);

                s->dispatching = true;
                r = s->prepare(s, s->userdata);
                s->dispatching = false;

                        log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
                                        strna(s->description),
                                        event_source_type_to_string(s->type),
                                        s->exit_on_failure ? "exiting" : "disabling");

                        if (s->exit_on_failure)
                                (void) sd_event_exit(e, r);

                        assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
}
static int dispatch_exit(sd_event *e) {

        p = prioq_peek(e->exit);
        assert(!p || p->type == SOURCE_EXIT);

        if (!p || event_source_is_offline(p)) {
                e->state = SD_EVENT_FINISHED;

        e->state = SD_EVENT_EXITING;
        r = source_dispatch(p);
        e->state = SD_EVENT_INITIAL;
}

static sd_event_source* event_next_pending(sd_event *e) {

        p = prioq_peek(e->pending);

        if (event_source_is_offline(p))

}
static int arm_watchdog(sd_event *e) {
        struct itimerspec its = {};

        assert(e->watchdog_fd >= 0);

        t = sleep_between(e,
                          usec_add(e->watchdog_last, (e->watchdog_period / 2)),
                          usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));

        timespec_store(&its.it_value, t);

        /* Make sure we never set the watchdog to 0, which tells the
         * kernel to disable it. */
        if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
                its.it_value.tv_nsec = 1;

        return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
}

static int process_watchdog(sd_event *e) {

        /* Don't notify watchdog too often */
        if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)

        sd_notify(false, "WATCHDOG=1");
        e->watchdog_last = e->timestamp.monotonic;

        return arm_watchdog(e);
}
static void event_close_inode_data_fds(sd_event *e) {
        struct inode_data *d;

        /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
         * filesystems. But we can't close them right-away as we need them as long as the user still wants to make
         * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
         * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
         * compromise. */

        while ((d = e->inode_data_to_close_list)) {

                d->fd = safe_close(d->fd);

                LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
}

static int event_memory_pressure_write_list(sd_event *e) {

                s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);

                assert(s->type == SOURCE_MEMORY_PRESSURE);
                assert(s->memory_pressure.write_buffer_size > 0);
                s->memory_pressure.in_write_list = false;

                r = source_memory_pressure_write(s);
}
_public_ int sd_event_prepare(sd_event *e) {

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
         * this check here once, since gettid() is typically not cached, and thus want to minimize
         * syscalls. */
        assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */

        if (e->exit_requested)

        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;

        r = event_memory_pressure_write_list(e);

        r = event_arm_timer(e, &e->realtime);

        r = event_arm_timer(e, &e->boottime);

        r = event_arm_timer(e, &e->monotonic);

        r = event_arm_timer(e, &e->realtime_alarm);

        r = event_arm_timer(e, &e->boottime_alarm);

        event_close_inode_data_fds(e);

        if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)

        e->state = SD_EVENT_ARMED;

        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);

        e->state = SD_EVENT_ARMED;
}
static int epoll_wait_usec(
                int fd,
                struct epoll_event *events,
                int maxevents,
                usec_t timeout) {

        /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */

#if HAVE_EPOLL_PWAIT2
        static bool epoll_pwait2_absent = false;

        /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
         * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
         * is not that obvious to implement given the libc and kernel definitions differ in the last
         * argument. Moreover, the only reason to use it is the more accurate timeouts (which is not a
         * biggie), let's hence rely on glibc's definitions, and fallback to epoll_pwait() when that's
         * missing. */

        if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
                r = epoll_pwait2(fd,
                                 events,
                                 maxevents,
                                 TIMESPEC_STORE(timeout),
                                 NULL);

                if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                        return -errno; /* Only fallback to old epoll_wait() if the syscall is masked or not
                                        * supported. */

                epoll_pwait2_absent = true;
#endif

        if (timeout == USEC_INFINITY)

        k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);

                msec = INT_MAX; /* Saturate */

        return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
}
static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
        size_t n_event_queue, m, n_event_max;
        int64_t min_priority = threshold;
        bool something_new = false;

        assert(ret_min_priority);

        n_event_queue = MAX(e->n_sources, 1u);
        if (!GREEDY_REALLOC(e->event_queue, n_event_queue))

        n_event_max = MALLOC_ELEMENTSOF(e->event_queue);

        /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
        if (e->buffered_inotify_data_list)

        r = epoll_wait_usec(
                        e->epoll_fd,
                        e->event_queue,
                        n_event_max,
                        timeout);

        if (m < n_event_max)

        if (n_event_max >= n_event_queue * 10)

        if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))

        n_event_max = MALLOC_ELEMENTSOF(e->event_queue);

        /* Set timestamp only when this is called first time. */
        if (threshold == INT64_MAX)
                triple_timestamp_now(&e->timestamp);

        for (size_t i = 0; i < m; i++) {

                if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);

                        WakeupType *t = e->event_queue[i].data.ptr;

                        case WAKEUP_EVENT_SOURCE: {
                                sd_event_source *s = e->event_queue[i].data.ptr;

                                if (s->priority > threshold)

                                min_priority = MIN(min_priority, s->priority);

                                        r = process_io(e, s, e->event_queue[i].events);

                                        r = process_pidfd(e, s, e->event_queue[i].events);

                                case SOURCE_MEMORY_PRESSURE:
                                        r = process_memory_pressure(s, e->event_queue[i].events);

                                        assert_not_reached();

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = e->event_queue[i].data.ptr;

                                r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);

                        case WAKEUP_INOTIFY_DATA:
                                r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);

                                assert_not_reached();

                        something_new = true;

        *ret_min_priority = min_priority;
        return something_new;
}
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;

        for (int64_t threshold = INT64_MAX; ; threshold--) {
                int64_t epoll_min_priority, child_min_priority;

                /* There may be a possibility that new epoll (especially IO) and child events are
                 * triggered just after the process_epoll() call but before process_child(), and the new IO
                 * events may have higher priority than the child events. To salvage these events,
                 * let's call epoll_wait() again, but accept only events with higher priority than the
                 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
                 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
                 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */

                r = process_epoll(e, timeout, threshold, &epoll_min_priority);

                        e->state = SD_EVENT_PENDING;

                if (r == 0 && threshold < INT64_MAX)
                        /* No new epoll event. */

                r = process_child(e, threshold, &child_min_priority);

                        /* No new child event. */

                threshold = MIN(epoll_min_priority, child_min_priority);
                if (threshold == INT64_MIN)

        r = process_watchdog(e);

        r = process_inotify(e);

        r = process_timer(e, e->timestamp.realtime, &e->realtime);

        r = process_timer(e, e->timestamp.boottime, &e->boottime);

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);

        /* Ratelimit expiry callback was called. Let's postpone processing pending sources and
         * put the loop in the initial state in order to evaluate (in the next iteration) also sources
         * that were potentially re-enabled by the callback.
         *
         * Wondering why we treat only this invocation of process_timer() differently? Once an event
         * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
         * ratelimit expiry callback is never called for any other timer type. */

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;

        e->state = SD_EVENT_INITIAL;
}
_public_ int sd_event_dispatch(sd_event *e) {

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_PENDING, -EBUSY);

        if (e->exit_requested)
                return dispatch_exit(e);

        p = event_next_pending(e);

                e->state = SD_EVENT_RUNNING;
                r = source_dispatch(p);
                e->state = SD_EVENT_INITIAL;

        e->state = SD_EVENT_INITIAL;
}

static void event_log_delays(sd_event *e) {
        char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;

        FOREACH_ELEMENT(delay, e->delays) {
                l = strpcpyf(&p, l, "%u ", *delay);

        log_debug("Event loop iterations: %s", b);
}
_public_ int sd_event_run(sd_event *e, uint64_t timeout) {

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        if (e->profile_delays && e->last_run_usec != 0) {

                this_run = now(CLOCK_MONOTONIC);

                l = log2u64(this_run - e->last_run_usec);
                assert(l < ELEMENTSOF(e->delays));

                if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
                        event_log_delays(e);
                        e->last_log_usec = this_run;

        /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */

        r = sd_event_prepare(e);

                /* There was nothing? Then wait... */
                r = sd_event_wait(e, timeout);

        if (e->profile_delays)
                e->last_run_usec = now(CLOCK_MONOTONIC);

                /* There's something now, then let's dispatch it */
                r = sd_event_dispatch(e);
}

_public_ int sd_event_loop(sd_event *e) {

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        while (e->state != SD_EVENT_FINISHED) {
                r = sd_event_run(e, UINT64_MAX);

        return e->exit_code;
}
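
/* Illustrative sketch (not part of this file): the three lower-level calls that one
 * sd_event_run() iteration is built from, useful when driving the loop from a foreign
 * poll loop that waits on sd_event_get_fd() itself. */
#if 0
#include <stdint.h>
#include <systemd/sd-event.h>

static int run_one_iteration(sd_event *e, uint64_t timeout_usec) {
        int r;

        r = sd_event_prepare(e);                /* run prepare callbacks, arm timerfds */
        if (r < 0)
                return r;
        if (r == 0) {                           /* nothing pending yet, so poll */
                r = sd_event_wait(e, timeout_usec);
                if (r <= 0)
                        return r;               /* error, or timeout without events */
        }

        return sd_event_dispatch(e);            /* dispatch a single event source */
}
#endif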
_public_ int sd_event_get_fd(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
}

_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);
}

_public_ int sd_event_get_exit_code(sd_event *e, int *ret) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!e->exit_requested)

        *ret = e->exit_code;
}

_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;
}
_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *ret) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(ret, -EINVAL);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))

        if (!triple_timestamp_is_set(&e->timestamp)) {
                /* Implicitly fall back to now() if we never ran before and thus have no cached time. */

        *ret = triple_timestamp_by_clock(&e->timestamp, clock);
}

_public_ int sd_event_default(sd_event **ret) {

                return !!default_event;

        if (default_event) {
                *ret = sd_event_ref(default_event);

        r = sd_event_new(&e);

        e->default_event_ptr = &default_event;
}

_public_ int sd_event_get_tid(sd_event *e, pid_t *ret) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(ret, -EINVAL);
        assert_return(!event_origin_changed(e), -ECHILD);
}
_public_ int sd_event_set_watchdog(sd_event *e, int b) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (e->watchdog == !!b)
                return e->watchdog;

        if (b) {
                r = sd_watchdog_enabled(false, &e->watchdog_period);
                if (r <= 0)
                        return r;

                /* Issue first ping immediately */
                sd_notify(false, "WATCHDOG=1");
                e->watchdog_last = now(CLOCK_MONOTONIC);

                e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
                if (e->watchdog_fd < 0)
                        return -errno;

                r = arm_watchdog(e);
                if (r < 0)
                        goto fail;

                struct epoll_event ev = {
                        .events = EPOLLIN,
                        .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
                };

                if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
                        r = -errno;
                        goto fail;
                }

        } else {
                if (e->watchdog_fd >= 0) {
                        (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
                        e->watchdog_fd = safe_close(e->watchdog_fd);
                }
        }

        e->watchdog = !!b;
        return e->watchdog;

fail:
        e->watchdog_fd = safe_close(e->watchdog_fd);
        return r;
}

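/* Illustrative usage sketch (not part of this file's build): a service that wants automatic WATCHDOG=1 pings
 * only needs to opt in once after creating its loop; the period comes from $WATCHDOG_USEC (i.e. WatchdogSec=).
 * Names are hypothetical.
 *
 *     sd_event *e = NULL;
 *     if (sd_event_default(&e) >= 0)
 *             (void) sd_event_set_watchdog(e, true);   // effectively a no-op if no watchdog was requested
 */
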
_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        return e->watchdog;
}

_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_origin_changed(e), -ECHILD);

        *ret = e->iteration;
        return 0;
}

_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
        assert_return(s, -EINVAL);
        assert_return(s->event, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->destroy_callback = callback;
        return 0;
}

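/* Illustrative usage sketch (not part of this file's build): the destroy callback is the usual hook for
 * releasing userdata when the source goes away, whichever side drops the last reference. Names are
 * hypothetical.
 *
 *     static void free_context(void *userdata) {
 *             free(userdata);
 *     }
 *
 *     // after sd_event_add_io(e, &s, fd, EPOLLIN, on_io, ctx):
 *     // (void) sd_event_source_set_destroy_callback(s, free_context);
 */
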
_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (ret)
                *ret = s->destroy_callback;

        return !!s->destroy_callback;
}

_public_ int sd_event_source_get_floating(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->floating;
}

_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->floating == !!b)
                return 0;

        if (!s->event) /* Already disconnected */
                return -ESTALE;

        s->floating = b;

        if (b) {
                sd_event_source_ref(s);
                sd_event_unref(s->event);
        } else {
                sd_event_ref(s->event);
                sd_event_source_unref(s);
        }

        return 1;
}

_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        return s->exit_on_failure;
}

_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->exit_on_failure == !!b)
                return 0;

        s->exit_on_failure = b;
        return 1;
}

_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
        int r;

        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
         * so is a programming error. */
        assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);

        /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
         * non-ratelimited. */
        r = event_source_leave_ratelimit(s, /* run_callback */ false);
        if (r < 0)
                return r;

        s->rate_limit = (RateLimit) { interval, burst };
        return 0;
}

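/* Illustrative usage sketch (not part of this file's build): capping a chatty IO source to at most 100
 * dispatches per one-second interval; once the burst is exhausted the source is taken offline until the
 * interval elapses. Names are hypothetical.
 *
 *     // sd_event_source *s obtained from sd_event_add_io(...):
 *     // (void) sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 100);
 */
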
_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        s->ratelimit_expire_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
         * don't use assert_return(). Unlike turning on ratelimiting, it's not really a programming error. */
        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return -EDOM;

        if (!ratelimit_configured(&s->rate_limit))
                return -ENOEXEC;

        if (ret_interval)
                *ret_interval = s->rate_limit.interval;
        if (ret_burst)
                *ret_burst = s->rate_limit.burst;

        return 0;
}

_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return false;

        if (!ratelimit_configured(&s->rate_limit))
                return false;

        return s->ratelimited;
}

_public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
        int r;

        assert_return(s, -EINVAL);

        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return 0;

        if (!ratelimit_configured(&s->rate_limit))
                return 0;

        if (!s->ratelimited)
                return 0;

        r = event_source_leave_ratelimit(s, /* run_callback */ false);
        if (r < 0)
                return r;

        return 1; /* tell caller that we indeed just left the ratelimit state */
}

_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
        bool change = false;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (b) {
                /* We want to maintain pointers to these event sources, so that we can destroy them when told
                 * so. But we also don't want them to pin the event loop itself. Hence we mark them as
                 * floating after creation (and undo this before deleting them again). */

                if (!e->sigint_event_source) {
                        r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0)
                                return r;

                        assert_se(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
                        change = true;
                }

                if (!e->sigterm_event_source) {
                        r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                        if (r < 0) {
                                if (change) {
                                        assert_se(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                                }

                                return r;
                        }

                        assert_se(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
                        change = true;
                }

        } else {
                if (e->sigint_event_source) {
                        assert_se(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
                        e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
                        change = true;
                }

                if (e->sigterm_event_source) {
                        assert_se(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
                        e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
                        change = true;
                }
        }

        return change;
}

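/* Illustrative usage sketch (not part of this file's build): sd_event_set_signal_exit() is the shortcut for
 * the common "exit cleanly on SIGINT/SIGTERM" pattern, replacing two manual sd_event_add_signal() calls.
 * Names are hypothetical.
 *
 *     sd_event *e = NULL;
 *     if (sd_event_default(&e) >= 0 &&
 *         sd_event_set_signal_exit(e, true) >= 0)
 *             (void) sd_event_loop(e);   // Ctrl-C or SIGTERM now ends the loop gracefully
 */
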
_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
        _cleanup_free_ char *b = NULL;
        _cleanup_free_ void *w = NULL;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
        assert_return(ty, -EINVAL);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (!STR_IN_SET(ty, "some", "full"))
                return -EINVAL;

        if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
                return -EBUSY;

        char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
        if (!space)
                return -EINVAL;

        size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
        b = memdup_suffix0(s->memory_pressure.write_buffer, l);
        if (!b)
                return -ENOMEM;
        if (!STR_IN_SET(b, "some", "full"))
                return -EINVAL;

        if (streq(b, ty))
                return 0;

        size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
        w = new(char, nl);
        if (!w)
                return -ENOMEM;

        memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));

        free_and_replace(s->memory_pressure.write_buffer, w);
        s->memory_pressure.write_buffer_size = nl;
        s->memory_pressure.locked = false;

        return 1;
}

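/* Illustrative usage sketch (not part of this file's build): switching a memory pressure source to watch
 * "full" stalls has to happen before the first dispatch locks the parameters. The handler name is
 * hypothetical.
 *
 *     sd_event_source *mp = NULL;
 *     if (sd_event_add_memory_pressure(e, &mp, on_pressure, NULL) >= 0)
 *             (void) sd_event_source_set_memory_pressure_type(mp, "full");
 */
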
_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
        _cleanup_free_ char *b = NULL;
        _cleanup_free_ void *w = NULL;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
                return -ERANGE;
        if (window_usec <= 0 || window_usec >= UINT64_MAX)
                return -ERANGE;
        if (threshold_usec > window_usec)
                return -EINVAL;

        if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
                return -EBUSY;

        char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
        if (!space)
                return -EINVAL;

        size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
        b = memdup_suffix0(s->memory_pressure.write_buffer, l);
        if (!b)
                return -ENOMEM;
        if (!STR_IN_SET(b, "some", "full"))
                return -EINVAL;

        if (asprintf((char**) &w,
                     "%s " USEC_FMT " " USEC_FMT "",
                     b, threshold_usec, window_usec) < 0)
                return -ENOMEM;

        l = strlen(w) + 1;
        if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
                return 0;

        free_and_replace(s->memory_pressure.write_buffer, w);
        s->memory_pressure.write_buffer_size = l;
        s->memory_pressure.locked = false;