1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/epoll.h>
4 #if HAVE_PIDFD_OPEN
5 #include <sys/pidfd.h>
6 #endif
7 #include <sys/timerfd.h>
8 #include <sys/wait.h>
9
10 #include "sd-daemon.h"
11 #include "sd-event.h"
12 #include "sd-id128.h"
13 #include "sd-messages.h"
14
15 #include "alloc-util.h"
16 #include "env-util.h"
17 #include "event-source.h"
18 #include "fd-util.h"
19 #include "fs-util.h"
20 #include "glyph-util.h"
21 #include "hashmap.h"
22 #include "hexdecoct.h"
23 #include "list.h"
24 #include "logarithm.h"
25 #include "macro.h"
26 #include "mallinfo-util.h"
27 #include "memory-util.h"
28 #include "missing_magic.h"
29 #include "missing_syscall.h"
30 #include "missing_threads.h"
31 #include "origin-id.h"
32 #include "path-util.h"
33 #include "prioq.h"
34 #include "process-util.h"
35 #include "psi-util.h"
36 #include "set.h"
37 #include "signal-util.h"
38 #include "socket-util.h"
39 #include "stat-util.h"
40 #include "string-table.h"
41 #include "string-util.h"
42 #include "strxcpyx.h"
43 #include "time-util.h"
44
45 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
46
47 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
48 /* Returns true if this is a PID event source that can be implemented by watching EPOLLIN on the pidfd */
49 return s &&
50 s->type == SOURCE_CHILD &&
51 s->child.pidfd >= 0 &&
52 s->child.options == WEXITED;
53 }
54
55 static bool event_source_is_online(sd_event_source *s) {
56 assert(s);
57 return s->enabled != SD_EVENT_OFF && !s->ratelimited;
58 }
59
60 static bool event_source_is_offline(sd_event_source *s) {
61 assert(s);
62 return s->enabled == SD_EVENT_OFF || s->ratelimited;
63 }
64
65 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
66 [SOURCE_IO] = "io",
67 [SOURCE_TIME_REALTIME] = "realtime",
68 [SOURCE_TIME_BOOTTIME] = "boottime",
69 [SOURCE_TIME_MONOTONIC] = "monotonic",
70 [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
71 [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
72 [SOURCE_SIGNAL] = "signal",
73 [SOURCE_CHILD] = "child",
74 [SOURCE_DEFER] = "defer",
75 [SOURCE_POST] = "post",
76 [SOURCE_EXIT] = "exit",
77 [SOURCE_WATCHDOG] = "watchdog",
78 [SOURCE_INOTIFY] = "inotify",
79 [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
80 };
81
82 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
83
84 #define EVENT_SOURCE_IS_TIME(t) \
85 IN_SET((t), \
86 SOURCE_TIME_REALTIME, \
87 SOURCE_TIME_BOOTTIME, \
88 SOURCE_TIME_MONOTONIC, \
89 SOURCE_TIME_REALTIME_ALARM, \
90 SOURCE_TIME_BOOTTIME_ALARM)
91
92 #define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
93 IN_SET((t), \
94 SOURCE_IO, \
95 SOURCE_TIME_REALTIME, \
96 SOURCE_TIME_BOOTTIME, \
97 SOURCE_TIME_MONOTONIC, \
98 SOURCE_TIME_REALTIME_ALARM, \
99 SOURCE_TIME_BOOTTIME_ALARM, \
100 SOURCE_SIGNAL, \
101 SOURCE_DEFER, \
102 SOURCE_INOTIFY, \
103 SOURCE_MEMORY_PRESSURE)
104
105 /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
106 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
107 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
108 #define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
109
110 struct sd_event {
111 unsigned n_ref;
112
113 int epoll_fd;
114 int watchdog_fd;
115
116 Prioq *pending;
117 Prioq *prepare;
118
119 /* timerfd_create() only supports these five clocks so far. We
120 * can add support for more clocks when the kernel learns to
121 * deal with them, too. */
122 struct clock_data realtime;
123 struct clock_data boottime;
124 struct clock_data monotonic;
125 struct clock_data realtime_alarm;
126 struct clock_data boottime_alarm;
127
128 usec_t perturb;
129
130 sd_event_source **signal_sources; /* indexed by signal number */
131 Hashmap *signal_data; /* indexed by priority */
132
133 Hashmap *child_sources;
134 unsigned n_online_child_sources;
135
136 Set *post_sources;
137
138 Prioq *exit;
139
140 Hashmap *inotify_data; /* indexed by priority */
141
142 /* A list of inode structures that still have an fd open, which we need to close before the next loop iteration */
143 LIST_HEAD(struct inode_data, inode_data_to_close_list);
144
145 /* A list of inotify objects that already have events buffered which aren't processed yet */
146 LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
147
148 /* A list of memory pressure event sources that still need their subscription string written */
149 LIST_HEAD(sd_event_source, memory_pressure_write_list);
150
151 uint64_t origin_id;
152
153 uint64_t iteration;
154 triple_timestamp timestamp;
155 int state;
156
157 bool exit_requested:1;
158 bool need_process_child:1;
159 bool watchdog:1;
160 bool profile_delays:1;
161
162 int exit_code;
163
164 pid_t tid;
165 sd_event **default_event_ptr;
166
167 usec_t watchdog_last, watchdog_period;
168
169 unsigned n_sources;
170
171 struct epoll_event *event_queue;
172
173 LIST_HEAD(sd_event_source, sources);
174
175 sd_event_source *sigint_event_source, *sigterm_event_source;
176
177 usec_t last_run_usec, last_log_usec;
178 unsigned delays[sizeof(usec_t) * 8];
179 };
180
181 DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);
182
183 static thread_local sd_event *default_event = NULL;
184
185 static void source_disconnect(sd_event_source *s);
186 static void event_gc_inode_data(sd_event *e, struct inode_data *d);
187
188 static sd_event *event_resolve(sd_event *e) {
189 return e == SD_EVENT_DEFAULT ? default_event : e;
190 }
191
192 static int pending_prioq_compare(const void *a, const void *b) {
193 const sd_event_source *x = a, *y = b;
194 int r;
195
196 assert(x->pending);
197 assert(y->pending);
198
199 /* Enabled ones first */
200 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
201 if (r != 0)
202 return r;
203
204 /* Non rate-limited ones first. */
205 r = CMP(!!x->ratelimited, !!y->ratelimited);
206 if (r != 0)
207 return r;
208
209 /* Lower priority values first */
210 r = CMP(x->priority, y->priority);
211 if (r != 0)
212 return r;
213
214 /* Older entries first */
215 return CMP(x->pending_iteration, y->pending_iteration);
216 }
217
218 static int prepare_prioq_compare(const void *a, const void *b) {
219 const sd_event_source *x = a, *y = b;
220 int r;
221
222 assert(x->prepare);
223 assert(y->prepare);
224
225 /* Enabled ones first */
226 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
227 if (r != 0)
228 return r;
229
230 /* Non rate-limited ones first. */
231 r = CMP(!!x->ratelimited, !!y->ratelimited);
232 if (r != 0)
233 return r;
234
235 /* Move most recently prepared ones last, so that we can stop
236 * preparing as soon as we hit one that has already been
237 * prepared in the current iteration */
238 r = CMP(x->prepare_iteration, y->prepare_iteration);
239 if (r != 0)
240 return r;
241
242 /* Lower priority values first */
243 return CMP(x->priority, y->priority);
244 }
245
246 static usec_t time_event_source_next(const sd_event_source *s) {
247 assert(s);
248
249 /* We have two kinds of event sources that have elapsation times associated with them: the actual
250 * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
251 * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
252 * looking at here. */
253
254 if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
255 assert(s->rate_limit.begin != 0);
256 assert(s->rate_limit.interval != 0);
257 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
258 }
259
260 /* Otherwise this must be a time event source, if not ratelimited */
261 if (EVENT_SOURCE_IS_TIME(s->type))
262 return s->time.next;
263
264 return USEC_INFINITY;
265 }
266
267 static usec_t time_event_source_latest(const sd_event_source *s) {
268 assert(s);
269
270 if (s->ratelimited) { /* For ratelimited event sources the earliest and the latest time shall actually be the
271 * same, as we should avoid adding additional inaccuracy on top of the ratelimit time
272 * window */
273 assert(s->rate_limit.begin != 0);
274 assert(s->rate_limit.interval != 0);
275 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
276 }
277
278 /* Must be a time event source, if not ratelimited */
279 if (EVENT_SOURCE_IS_TIME(s->type))
280 return usec_add(s->time.next, s->time.accuracy);
281
282 return USEC_INFINITY;
283 }
284
285 static bool event_source_timer_candidate(const sd_event_source *s) {
286 assert(s);
287
288 /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
289 * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
290 return !s->pending || s->ratelimited;
291 }
292
293 static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
294 const sd_event_source *x = a, *y = b;
295 int r;
296
297 /* Enabled ones first */
298 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
299 if (r != 0)
300 return r;
301
302 /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
303 r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
304 if (r != 0)
305 return r;
306
307 /* Order by time */
308 return CMP(time_func(x), time_func(y));
309 }
310
311 static int earliest_time_prioq_compare(const void *a, const void *b) {
312 return time_prioq_compare(a, b, time_event_source_next);
313 }
314
315 static int latest_time_prioq_compare(const void *a, const void *b) {
316 return time_prioq_compare(a, b, time_event_source_latest);
317 }
318
319 static int exit_prioq_compare(const void *a, const void *b) {
320 const sd_event_source *x = a, *y = b;
321 int r;
322
323 assert(x->type == SOURCE_EXIT);
324 assert(y->type == SOURCE_EXIT);
325
326 /* Enabled ones first */
327 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
328 if (r != 0)
329 return r;
330
331 /* Lower priority values first */
332 return CMP(x->priority, y->priority);
333 }
334
335 static void free_clock_data(struct clock_data *d) {
336 assert(d);
337 assert(d->wakeup == WAKEUP_CLOCK_DATA);
338
339 safe_close(d->fd);
340 prioq_free(d->earliest);
341 prioq_free(d->latest);
342 }
343
344 static sd_event *event_free(sd_event *e) {
345 sd_event_source *s;
346
347 assert(e);
348
349 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
350 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
351
352 while ((s = e->sources)) {
353 assert(s->floating);
354 source_disconnect(s);
355 sd_event_source_unref(s);
356 }
357
358 assert(e->n_sources == 0);
359
360 if (e->default_event_ptr)
361 *(e->default_event_ptr) = NULL;
362
363 safe_close(e->epoll_fd);
364 safe_close(e->watchdog_fd);
365
366 free_clock_data(&e->realtime);
367 free_clock_data(&e->boottime);
368 free_clock_data(&e->monotonic);
369 free_clock_data(&e->realtime_alarm);
370 free_clock_data(&e->boottime_alarm);
371
372 prioq_free(e->pending);
373 prioq_free(e->prepare);
374 prioq_free(e->exit);
375
376 free(e->signal_sources);
377 hashmap_free(e->signal_data);
378
379 hashmap_free(e->inotify_data);
380
381 hashmap_free(e->child_sources);
382 set_free(e->post_sources);
383
384 free(e->event_queue);
385
386 return mfree(e);
387 }
388
389 _public_ int sd_event_new(sd_event** ret) {
390 sd_event *e;
391 int r;
392
393 assert_return(ret, -EINVAL);
394
395 e = new(sd_event, 1);
396 if (!e)
397 return -ENOMEM;
398
399 *e = (sd_event) {
400 .n_ref = 1,
401 .epoll_fd = -EBADF,
402 .watchdog_fd = -EBADF,
403 .realtime.wakeup = WAKEUP_CLOCK_DATA,
404 .realtime.fd = -EBADF,
405 .realtime.next = USEC_INFINITY,
406 .boottime.wakeup = WAKEUP_CLOCK_DATA,
407 .boottime.fd = -EBADF,
408 .boottime.next = USEC_INFINITY,
409 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
410 .monotonic.fd = -EBADF,
411 .monotonic.next = USEC_INFINITY,
412 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
413 .realtime_alarm.fd = -EBADF,
414 .realtime_alarm.next = USEC_INFINITY,
415 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
416 .boottime_alarm.fd = -EBADF,
417 .boottime_alarm.next = USEC_INFINITY,
418 .perturb = USEC_INFINITY,
419 .origin_id = origin_id_query(),
420 };
421
422 r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
423 if (r < 0)
424 goto fail;
425
426 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
427 if (e->epoll_fd < 0) {
428 r = -errno;
429 goto fail;
430 }
431
432 e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
433
434 if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
435 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
436 special_glyph(SPECIAL_GLYPH_ELLIPSIS));
437 e->profile_delays = true;
438 }
439
440 *ret = e;
441 return 0;
442
443 fail:
444 event_free(e);
445 return r;
446 }
447
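/* A minimal usage sketch of the allocation/run/cleanup lifecycle (illustrative only, not part of this
 * file's logic): a loop with a single one-shot timer that requests exit after five seconds. The handler
 * name "on_timeout", the helper "run_loop" and the timeout value are assumptions made up for the example.
 *
 *     #include <systemd/sd-event.h>
 *
 *     static int on_timeout(sd_event_source *s, uint64_t usec, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     static int run_loop(void) {
 *             sd_event *e = NULL;
 *             int r;
 *
 *             r = sd_event_new(&e);
 *             if (r < 0)
 *                     return r;
 *
 *             // 5000000 µs = 5s; passing NULL as source pointer makes the source "floating", owned by the loop
 *             r = sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC, 5000000, 0, on_timeout, NULL);
 *             if (r >= 0)
 *                     r = sd_event_loop(e);
 *
 *             sd_event_unref(e);
 *             return r;
 *     }
 */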
448 /* Define manually so we can add the origin check */
449 _public_ sd_event *sd_event_ref(sd_event *e) {
450 if (!e)
451 return NULL;
452 if (event_origin_changed(e))
453 return NULL;
454
455 e->n_ref++;
456
457 return e;
458 }
459
460 _public_ sd_event* sd_event_unref(sd_event *e) {
461 if (!e)
462 return NULL;
463 if (event_origin_changed(e))
464 return NULL;
465
466 assert(e->n_ref > 0);
467 if (--e->n_ref > 0)
468 return NULL;
469
470 return event_free(e);
471 }
472
473 #define PROTECT_EVENT(e) \
474 _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
475
476 _public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
477 if (s)
478 (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
479 return sd_event_source_unref(s);
480 }
481
482 static void source_io_unregister(sd_event_source *s) {
483 assert(s);
484 assert(s->type == SOURCE_IO);
485
486 if (event_origin_changed(s->event))
487 return;
488
489 if (!s->io.registered)
490 return;
491
492 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
493 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
494 strna(s->description), event_source_type_to_string(s->type));
495
496 s->io.registered = false;
497 }
498
499 static int source_io_register(
500 sd_event_source *s,
501 int enabled,
502 uint32_t events) {
503
504 assert(s);
505 assert(s->type == SOURCE_IO);
506 assert(enabled != SD_EVENT_OFF);
507
508 struct epoll_event ev = {
509 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
510 .data.ptr = s,
511 };
512
513 if (epoll_ctl(s->event->epoll_fd,
514 s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
515 s->io.fd, &ev) < 0)
516 return -errno;
517
518 s->io.registered = true;
519
520 return 0;
521 }
522
523 static void source_child_pidfd_unregister(sd_event_source *s) {
524 assert(s);
525 assert(s->type == SOURCE_CHILD);
526
527 if (event_origin_changed(s->event))
528 return;
529
530 if (!s->child.registered)
531 return;
532
533 if (EVENT_SOURCE_WATCH_PIDFD(s))
534 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
535 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
536 strna(s->description), event_source_type_to_string(s->type));
537
538 s->child.registered = false;
539 }
540
541 static int source_child_pidfd_register(sd_event_source *s, int enabled) {
542 assert(s);
543 assert(s->type == SOURCE_CHILD);
544 assert(enabled != SD_EVENT_OFF);
545
546 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
547 struct epoll_event ev = {
548 .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
549 .data.ptr = s,
550 };
551
552 if (epoll_ctl(s->event->epoll_fd,
553 s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
554 s->child.pidfd, &ev) < 0)
555 return -errno;
556 }
557
558 s->child.registered = true;
559 return 0;
560 }
561
562 static void source_memory_pressure_unregister(sd_event_source *s) {
563 assert(s);
564 assert(s->type == SOURCE_MEMORY_PRESSURE);
565
566 if (event_origin_changed(s->event))
567 return;
568
569 if (!s->memory_pressure.registered)
570 return;
571
572 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
573 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
574 strna(s->description), event_source_type_to_string(s->type));
575
576 s->memory_pressure.registered = false;
577 }
578
579 static int source_memory_pressure_register(sd_event_source *s, int enabled) {
580 assert(s);
581 assert(s->type == SOURCE_MEMORY_PRESSURE);
582 assert(enabled != SD_EVENT_OFF);
583
584 struct epoll_event ev = {
585 .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
586 (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
587 .data.ptr = s,
588 };
589
590 if (epoll_ctl(s->event->epoll_fd,
591 s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
592 s->memory_pressure.fd, &ev) < 0)
593 return -errno;
594
595 s->memory_pressure.registered = true;
596 return 0;
597 }
598
599 static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
600 assert(s);
601 assert(s->type == SOURCE_MEMORY_PRESSURE);
602
603 if (s->memory_pressure.in_write_list)
604 return;
605
606 LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
607 s->memory_pressure.in_write_list = true;
608 }
609
610 static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
611 assert(s);
612 assert(s->type == SOURCE_MEMORY_PRESSURE);
613
614 if (!s->memory_pressure.in_write_list)
615 return;
616
617 LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
618 s->memory_pressure.in_write_list = false;
619 }
620
621 static clockid_t event_source_type_to_clock(EventSourceType t) {
622
623 switch (t) {
624
625 case SOURCE_TIME_REALTIME:
626 return CLOCK_REALTIME;
627
628 case SOURCE_TIME_BOOTTIME:
629 return CLOCK_BOOTTIME;
630
631 case SOURCE_TIME_MONOTONIC:
632 return CLOCK_MONOTONIC;
633
634 case SOURCE_TIME_REALTIME_ALARM:
635 return CLOCK_REALTIME_ALARM;
636
637 case SOURCE_TIME_BOOTTIME_ALARM:
638 return CLOCK_BOOTTIME_ALARM;
639
640 default:
641 return (clockid_t) -1;
642 }
643 }
644
645 static EventSourceType clock_to_event_source_type(clockid_t clock) {
646
647 switch (clock) {
648
649 case CLOCK_REALTIME:
650 return SOURCE_TIME_REALTIME;
651
652 case CLOCK_BOOTTIME:
653 return SOURCE_TIME_BOOTTIME;
654
655 case CLOCK_MONOTONIC:
656 return SOURCE_TIME_MONOTONIC;
657
658 case CLOCK_REALTIME_ALARM:
659 return SOURCE_TIME_REALTIME_ALARM;
660
661 case CLOCK_BOOTTIME_ALARM:
662 return SOURCE_TIME_BOOTTIME_ALARM;
663
664 default:
665 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
666 }
667 }
668
669 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
670 assert(e);
671
672 switch (t) {
673
674 case SOURCE_TIME_REALTIME:
675 return &e->realtime;
676
677 case SOURCE_TIME_BOOTTIME:
678 return &e->boottime;
679
680 case SOURCE_TIME_MONOTONIC:
681 return &e->monotonic;
682
683 case SOURCE_TIME_REALTIME_ALARM:
684 return &e->realtime_alarm;
685
686 case SOURCE_TIME_BOOTTIME_ALARM:
687 return &e->boottime_alarm;
688
689 default:
690 return NULL;
691 }
692 }
693
694 static void event_free_signal_data(sd_event *e, struct signal_data *d) {
695 assert(e);
696
697 if (!d)
698 return;
699
700 hashmap_remove(e->signal_data, &d->priority);
701 safe_close(d->fd);
702 free(d);
703 }
704
705 static int event_make_signal_data(
706 sd_event *e,
707 int sig,
708 struct signal_data **ret) {
709
710 struct signal_data *d;
711 bool added = false;
712 sigset_t ss_copy;
713 int64_t priority;
714 int r;
715
716 assert(e);
717
718 if (event_origin_changed(e))
719 return -ECHILD;
720
721 if (e->signal_sources && e->signal_sources[sig])
722 priority = e->signal_sources[sig]->priority;
723 else
724 priority = SD_EVENT_PRIORITY_NORMAL;
725
726 d = hashmap_get(e->signal_data, &priority);
727 if (d) {
728 if (sigismember(&d->sigset, sig) > 0) {
729 if (ret)
730 *ret = d;
731 return 0;
732 }
733 } else {
734 d = new(struct signal_data, 1);
735 if (!d)
736 return -ENOMEM;
737
738 *d = (struct signal_data) {
739 .wakeup = WAKEUP_SIGNAL_DATA,
740 .fd = -EBADF,
741 .priority = priority,
742 };
743
744 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
745 if (r < 0) {
746 free(d);
747 return r;
748 }
749
750 added = true;
751 }
752
753 ss_copy = d->sigset;
754 assert_se(sigaddset(&ss_copy, sig) >= 0);
755
756 r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
757 &ss_copy,
758 SFD_NONBLOCK|SFD_CLOEXEC);
759 if (r < 0) {
760 r = -errno;
761 goto fail;
762 }
763
764 d->sigset = ss_copy;
765
766 if (d->fd >= 0) {
767 if (ret)
768 *ret = d;
769 return 0;
770 }
771
772 d->fd = fd_move_above_stdio(r);
773
774 struct epoll_event ev = {
775 .events = EPOLLIN,
776 .data.ptr = d,
777 };
778
779 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
780 r = -errno;
781 goto fail;
782 }
783
784 if (ret)
785 *ret = d;
786
787 return 0;
788
789 fail:
790 if (added)
791 event_free_signal_data(e, d);
792
793 return r;
794 }
795
796 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
797 assert(e);
798 assert(d);
799
800 /* Turns off the specified signal in the signal data
801 * object. If the signal mask of the object becomes empty that
802 * way, the object is removed altogether. */
803
804 if (sigismember(&d->sigset, sig) == 0)
805 return;
806
807 assert_se(sigdelset(&d->sigset, sig) >= 0);
808
809 if (sigisemptyset(&d->sigset)) {
810 /* If the mask is now all-zero we can get rid of the structure */
811 event_free_signal_data(e, d);
812 return;
813 }
814
815 if (event_origin_changed(e))
816 return;
817
818 assert(d->fd >= 0);
819
820 if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
821 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
822 }
823
824 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
825 struct signal_data *d;
826 static const int64_t zero_priority = 0;
827
828 assert(e);
829
830 /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
831 * and possibly drop the signalfd for it. */
832
833 if (sig == SIGCHLD &&
834 e->n_online_child_sources > 0)
835 return;
836
837 if (e->signal_sources &&
838 e->signal_sources[sig] &&
839 event_source_is_online(e->signal_sources[sig]))
840 return;
841
842 /*
843 * The specified signal might be enabled in three different queues:
844 *
845 * 1) the one that belongs to the priority passed (if it is non-NULL)
846 * 2) the one that belongs to the priority of the event source of the signal (if there is one)
847 * 3) the 0 priority (to cover the SIGCHLD case)
848 *
849 * Hence, let's remove it from all three here.
850 */
851
852 if (priority) {
853 d = hashmap_get(e->signal_data, priority);
854 if (d)
855 event_unmask_signal_data(e, d, sig);
856 }
857
858 if (e->signal_sources && e->signal_sources[sig]) {
859 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
860 if (d)
861 event_unmask_signal_data(e, d, sig);
862 }
863
864 d = hashmap_get(e->signal_data, &zero_priority);
865 if (d)
866 event_unmask_signal_data(e, d, sig);
867 }
868
869 static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
870 assert(s);
871
872 /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
873 * they are enabled/disabled or marked pending and such. */
874
875 if (s->pending)
876 prioq_reshuffle(s->event->pending, s, &s->pending_index);
877
878 if (s->prepare)
879 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
880 }
881
882 static void event_source_time_prioq_reshuffle(sd_event_source *s) {
883 struct clock_data *d;
884
885 assert(s);
886
887 /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
888 * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
889 * properly again. */
890
891 if (s->ratelimited)
892 d = &s->event->monotonic;
893 else if (EVENT_SOURCE_IS_TIME(s->type))
894 assert_se(d = event_get_clock_data(s->event, s->type));
895 else
896 return; /* no-op for an event source which is neither a timer nor ratelimited. */
897
898 prioq_reshuffle(d->earliest, s, &s->earliest_index);
899 prioq_reshuffle(d->latest, s, &s->latest_index);
900 d->needs_rearm = true;
901 }
902
903 static void event_source_time_prioq_remove(
904 sd_event_source *s,
905 struct clock_data *d) {
906
907 assert(s);
908 assert(d);
909
910 prioq_remove(d->earliest, s, &s->earliest_index);
911 prioq_remove(d->latest, s, &s->latest_index);
912 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
913 d->needs_rearm = true;
914 }
915
916 static void source_disconnect(sd_event_source *s) {
917 sd_event *event;
918 int r;
919
920 assert(s);
921
922 if (!s->event)
923 return;
924
925 assert(s->event->n_sources > 0);
926
927 switch (s->type) {
928
929 case SOURCE_IO:
930 if (s->io.fd >= 0)
931 source_io_unregister(s);
932
933 break;
934
935 case SOURCE_TIME_REALTIME:
936 case SOURCE_TIME_BOOTTIME:
937 case SOURCE_TIME_MONOTONIC:
938 case SOURCE_TIME_REALTIME_ALARM:
939 case SOURCE_TIME_BOOTTIME_ALARM:
940 /* Only remove this event source from the time prioq here if it is not ratelimited. If
941 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
942 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
943
944 if (!s->ratelimited) {
945 struct clock_data *d;
946 assert_se(d = event_get_clock_data(s->event, s->type));
947 event_source_time_prioq_remove(s, d);
948 }
949
950 break;
951
952 case SOURCE_SIGNAL:
953 if (s->signal.sig > 0) {
954
955 if (s->event->signal_sources)
956 s->event->signal_sources[s->signal.sig] = NULL;
957
958 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
959
960 if (s->signal.unblock) {
961 sigset_t new_ss;
962
963 if (sigemptyset(&new_ss) < 0)
964 log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
965 else if (sigaddset(&new_ss, s->signal.sig) < 0)
966 log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
967 else {
968 r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
969 if (r != 0)
970 log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
971 }
972 }
973 }
974
975 break;
976
977 case SOURCE_CHILD:
978 if (event_origin_changed(s->event))
979 s->child.process_owned = false;
980
981 if (s->child.pid > 0) {
982 if (event_source_is_online(s)) {
983 assert(s->event->n_online_child_sources > 0);
984 s->event->n_online_child_sources--;
985 }
986
987 (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
988 }
989
990 if (EVENT_SOURCE_WATCH_PIDFD(s))
991 source_child_pidfd_unregister(s);
992 else
993 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
994
995 break;
996
997 case SOURCE_DEFER:
998 /* nothing */
999 break;
1000
1001 case SOURCE_POST:
1002 set_remove(s->event->post_sources, s);
1003 break;
1004
1005 case SOURCE_EXIT:
1006 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
1007 break;
1008
1009 case SOURCE_INOTIFY: {
1010 struct inode_data *inode_data;
1011
1012 inode_data = s->inotify.inode_data;
1013 if (inode_data) {
1014 struct inotify_data *inotify_data;
1015 assert_se(inotify_data = inode_data->inotify_data);
1016
1017 /* Detach this event source from the inode object */
1018 LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
1019 s->inotify.inode_data = NULL;
1020
1021 if (s->pending) {
1022 assert(inotify_data->n_pending > 0);
1023 inotify_data->n_pending--;
1024 }
1025
1026 /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
1027 * continues to be watched. That's because inotify doesn't really have an API for that: we
1028 * can only change watch masks with access to the original inode either by fd or by path. But
1029 * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
1030 * continuously and keeping the mount busy which we can't really do. We could reconstruct the
1031 * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
1032 * there), but given the need for open_by_handle_at() which is privileged and not universally
1033 * available this would be quite an incomplete solution. Hence we go the other way, leave the
1034 * mask set, even if it is not minimized now, and ignore all events we aren't interested in
1035 * anymore after reception. Yes, this sucks, but … Linux … */
1036
1037 /* Maybe release the inode data (and its inotify) */
1038 event_gc_inode_data(s->event, inode_data);
1039 }
1040
1041 break;
1042 }
1043
1044 case SOURCE_MEMORY_PRESSURE:
1045 source_memory_pressure_remove_from_write_list(s);
1046 source_memory_pressure_unregister(s);
1047 break;
1048
1049 default:
1050 assert_not_reached();
1051 }
1052
1053 if (s->pending)
1054 prioq_remove(s->event->pending, s, &s->pending_index);
1055
1056 if (s->prepare)
1057 prioq_remove(s->event->prepare, s, &s->prepare_index);
1058
1059 if (s->ratelimited)
1060 event_source_time_prioq_remove(s, &s->event->monotonic);
1061
1062 event = TAKE_PTR(s->event);
1063 LIST_REMOVE(sources, event->sources, s);
1064 event->n_sources--;
1065
1066 /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
1067 * pidfd associated with this event source, which we'll do only on source_free(). */
1068
1069 if (!s->floating)
1070 sd_event_unref(event);
1071 }
1072
1073 static sd_event_source* source_free(sd_event_source *s) {
1074 assert(s);
1075
1076 source_disconnect(s);
1077
1078 if (s->type == SOURCE_IO && s->io.owned)
1079 s->io.fd = safe_close(s->io.fd);
1080
1081 if (s->type == SOURCE_CHILD) {
1082 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
1083
1084 if (s->child.process_owned) {
1085
1086 if (!s->child.exited) {
1087 bool sent = false;
1088
1089 if (s->child.pidfd >= 0) {
1090 if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
1091 if (errno == ESRCH) /* Already dead */
1092 sent = true;
1093 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
1094 log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
1095 s->child.pid);
1096 } else
1097 sent = true;
1098 }
1099
1100 if (!sent)
1101 if (kill(s->child.pid, SIGKILL) < 0)
1102 if (errno != ESRCH) /* Already dead */
1103 log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
1104 s->child.pid);
1105 }
1106
1107 if (!s->child.waited) {
1108 siginfo_t si = {};
1109
1110 /* Reap the child if we can */
1111 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
1112 }
1113 }
1114
1115 if (s->child.pidfd_owned)
1116 s->child.pidfd = safe_close(s->child.pidfd);
1117 }
1118
1119 if (s->type == SOURCE_MEMORY_PRESSURE) {
1120 s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
1121 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
1122 }
1123
1124 if (s->destroy_callback)
1125 s->destroy_callback(s->userdata);
1126
1127 free(s->description);
1128 return mfree(s);
1129 }
1130 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
1131
1132 static int source_set_pending(sd_event_source *s, bool b) {
1133 int r;
1134
1135 assert(s);
1136 assert(s->type != SOURCE_EXIT);
1137
1138 if (s->pending == b)
1139 return 0;
1140
1141 s->pending = b;
1142
1143 if (b) {
1144 s->pending_iteration = s->event->iteration;
1145
1146 r = prioq_put(s->event->pending, s, &s->pending_index);
1147 if (r < 0) {
1148 s->pending = false;
1149 return r;
1150 }
1151 } else
1152 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1153
1154 if (EVENT_SOURCE_IS_TIME(s->type))
1155 event_source_time_prioq_reshuffle(s);
1156
1157 if (s->type == SOURCE_SIGNAL && !b) {
1158 struct signal_data *d;
1159
1160 d = hashmap_get(s->event->signal_data, &s->priority);
1161 if (d && d->current == s)
1162 d->current = NULL;
1163 }
1164
1165 if (s->type == SOURCE_INOTIFY) {
1166
1167 assert(s->inotify.inode_data);
1168 assert(s->inotify.inode_data->inotify_data);
1169
1170 if (b)
1171 s->inotify.inode_data->inotify_data->n_pending++;
1172 else {
1173 assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1174 s->inotify.inode_data->inotify_data->n_pending--;
1175 }
1176 }
1177
1178 return 1;
1179 }
1180
1181 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1182
1183 /* Let's allocate exactly what we need. Note that the difference between the smallest and the largest
1184 * event source structure is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
1185 * lines. */
1186 static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
1187 [SOURCE_IO] = endoffsetof_field(sd_event_source, io),
1188 [SOURCE_TIME_REALTIME] = endoffsetof_field(sd_event_source, time),
1189 [SOURCE_TIME_BOOTTIME] = endoffsetof_field(sd_event_source, time),
1190 [SOURCE_TIME_MONOTONIC] = endoffsetof_field(sd_event_source, time),
1191 [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1192 [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1193 [SOURCE_SIGNAL] = endoffsetof_field(sd_event_source, signal),
1194 [SOURCE_CHILD] = endoffsetof_field(sd_event_source, child),
1195 [SOURCE_DEFER] = endoffsetof_field(sd_event_source, defer),
1196 [SOURCE_POST] = endoffsetof_field(sd_event_source, post),
1197 [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit),
1198 [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify),
1199 [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure),
1200 };
1201
1202 sd_event_source *s;
1203
1204 assert(e);
1205 assert(type >= 0);
1206 assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
1207 assert(size_table[type] > 0);
1208
1209 s = malloc0(size_table[type]);
1210 if (!s)
1211 return NULL;
1212 /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
1213 * size, even if we only allocate the initial part we need. */
1214 s = expand_to_usable(s, sizeof(sd_event_source));
1215
1216 /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
1217 * than what we allocated here. */
1218 s->n_ref = 1;
1219 s->event = e;
1220 s->floating = floating;
1221 s->type = type;
1222 s->pending_index = PRIOQ_IDX_NULL;
1223 s->prepare_index = PRIOQ_IDX_NULL;
1224
1225 if (!floating)
1226 sd_event_ref(e);
1227
1228 LIST_PREPEND(sources, e->sources, s);
1229 e->n_sources++;
1230
1231 return s;
1232 }
1233
1234 static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1235 assert(s);
1236
1237 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1238 }
1239
1240 _public_ int sd_event_add_io(
1241 sd_event *e,
1242 sd_event_source **ret,
1243 int fd,
1244 uint32_t events,
1245 sd_event_io_handler_t callback,
1246 void *userdata) {
1247
1248 _cleanup_(source_freep) sd_event_source *s = NULL;
1249 int r;
1250
1251 assert_return(e, -EINVAL);
1252 assert_return(e = event_resolve(e), -ENOPKG);
1253 assert_return(fd >= 0, -EBADF);
1254 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1255 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1256 assert_return(!event_origin_changed(e), -ECHILD);
1257
1258 if (!callback)
1259 callback = io_exit_callback;
1260
1261 s = source_new(e, !ret, SOURCE_IO);
1262 if (!s)
1263 return -ENOMEM;
1264
1265 s->wakeup = WAKEUP_EVENT_SOURCE;
1266 s->io.fd = fd;
1267 s->io.events = events;
1268 s->io.callback = callback;
1269 s->userdata = userdata;
1270 s->enabled = SD_EVENT_ON;
1271
1272 r = source_io_register(s, s->enabled, events);
1273 if (r < 0)
1274 return r;
1275
1276 if (ret)
1277 *ret = s;
1278 TAKE_PTR(s);
1279
1280 return 0;
1281 }
1282
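/* A hedged usage sketch for an IO event source (illustrative only; "on_readable", "io_source" and
 * "conn_fd" are names invented for the example): watch a non-blocking fd for input and disable the
 * source on EOF or error. Edge-triggered (EPOLLET) operation would additionally require draining the
 * fd completely in the handler.
 *
 *     static int on_readable(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             char buf[4096];
 *             ssize_t n;
 *
 *             if (revents & (EPOLLHUP|EPOLLERR))
 *                     return sd_event_source_set_enabled(s, SD_EVENT_OFF);
 *
 *             n = read(fd, buf, sizeof(buf));
 *             if (n < 0)
 *                     return errno == EAGAIN ? 0 : -errno;
 *             if (n == 0) // EOF
 *                     return sd_event_source_set_enabled(s, SD_EVENT_OFF);
 *
 *             // ... process the n bytes read ...
 *             return 0;
 *     }
 *
 *     // registration, e.g. somewhere during setup:
 *     //     r = sd_event_add_io(e, &io_source, conn_fd, EPOLLIN, on_readable, NULL);
 */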
1283 static void initialize_perturb(sd_event *e) {
1284 sd_id128_t id = {};
1285
1286 /* When we sleep for longer, we try to realign the wakeup to the same time within each
1287 * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
1288 * wakeup. However, let's take some system-specific randomness for this value, so that in a network
1289 * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
1290 * perturbation usec offset from the boot ID (or the machine ID if that fails, e.g. because /proc is not mounted). */
1291
1292 if (_likely_(e->perturb != USEC_INFINITY))
1293 return;
1294
1295 if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
1296 e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
1297 else
1298 e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
1299 }
1300
1301 static int event_setup_timer_fd(
1302 sd_event *e,
1303 struct clock_data *d,
1304 clockid_t clock) {
1305
1306 assert(e);
1307 assert(d);
1308
1309 if (_likely_(d->fd >= 0))
1310 return 0;
1311
1312 _cleanup_close_ int fd = -EBADF;
1313
1314 fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1315 if (fd < 0)
1316 return -errno;
1317
1318 fd = fd_move_above_stdio(fd);
1319
1320 struct epoll_event ev = {
1321 .events = EPOLLIN,
1322 .data.ptr = d,
1323 };
1324
1325 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1326 return -errno;
1327
1328 d->fd = TAKE_FD(fd);
1329 return 0;
1330 }
1331
1332 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1333 assert(s);
1334
1335 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1336 }
1337
1338 static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1339 int r;
1340
1341 assert(d);
1342
1343 if (d->fd < 0) {
1344 r = event_setup_timer_fd(e, d, clock);
1345 if (r < 0)
1346 return r;
1347 }
1348
1349 r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1350 if (r < 0)
1351 return r;
1352
1353 r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1354 if (r < 0)
1355 return r;
1356
1357 return 0;
1358 }
1359
1360 static int event_source_time_prioq_put(
1361 sd_event_source *s,
1362 struct clock_data *d) {
1363
1364 int r;
1365
1366 assert(s);
1367 assert(d);
1368 assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1369
1370 r = prioq_put(d->earliest, s, &s->earliest_index);
1371 if (r < 0)
1372 return r;
1373
1374 r = prioq_put(d->latest, s, &s->latest_index);
1375 if (r < 0) {
1376 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1377 s->earliest_index = PRIOQ_IDX_NULL;
1378 return r;
1379 }
1380
1381 d->needs_rearm = true;
1382 return 0;
1383 }
1384
1385 _public_ int sd_event_add_time(
1386 sd_event *e,
1387 sd_event_source **ret,
1388 clockid_t clock,
1389 uint64_t usec,
1390 uint64_t accuracy,
1391 sd_event_time_handler_t callback,
1392 void *userdata) {
1393
1394 EventSourceType type;
1395 _cleanup_(source_freep) sd_event_source *s = NULL;
1396 struct clock_data *d;
1397 int r;
1398
1399 assert_return(e, -EINVAL);
1400 assert_return(e = event_resolve(e), -ENOPKG);
1401 assert_return(accuracy != UINT64_MAX, -EINVAL);
1402 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1403 assert_return(!event_origin_changed(e), -ECHILD);
1404
1405 if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1406 return -EOPNOTSUPP;
1407
1408 type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1409 if (type < 0)
1410 return -EOPNOTSUPP;
1411
1412 if (!callback)
1413 callback = time_exit_callback;
1414
1415 assert_se(d = event_get_clock_data(e, type));
1416
1417 r = setup_clock_data(e, d, clock);
1418 if (r < 0)
1419 return r;
1420
1421 s = source_new(e, !ret, type);
1422 if (!s)
1423 return -ENOMEM;
1424
1425 s->time.next = usec;
1426 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1427 s->time.callback = callback;
1428 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1429 s->userdata = userdata;
1430 s->enabled = SD_EVENT_ONESHOT;
1431
1432 r = event_source_time_prioq_put(s, d);
1433 if (r < 0)
1434 return r;
1435
1436 if (ret)
1437 *ret = s;
1438 TAKE_PTR(s);
1439
1440 return 0;
1441 }
1442
1443 _public_ int sd_event_add_time_relative(
1444 sd_event *e,
1445 sd_event_source **ret,
1446 clockid_t clock,
1447 uint64_t usec,
1448 uint64_t accuracy,
1449 sd_event_time_handler_t callback,
1450 void *userdata) {
1451
1452 usec_t t;
1453 int r;
1454
1455 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1456 * checks for overflow. */
1457
1458 r = sd_event_now(e, clock, &t);
1459 if (r < 0)
1460 return r;
1461
1462 if (usec >= USEC_INFINITY - t)
1463 return -EOVERFLOW;
1464
1465 return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1466 }
1467
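/* A hedged sketch of a periodic timer built on top of this call (illustrative only; the 1s interval
 * and the handler name "on_tick" are assumptions): since time event sources default to
 * SD_EVENT_ONESHOT, a repeating timer simply re-arms itself from its own handler.
 *
 *     static int on_tick(sd_event_source *s, uint64_t usec, void *userdata) {
 *             int r;
 *
 *             // ... periodic work ...
 *
 *             r = sd_event_source_set_time_relative(s, 1000000); // fire again in 1s
 *             if (r < 0)
 *                     return r;
 *
 *             return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
 *     }
 *
 *     // initial arming:
 *     //     r = sd_event_add_time_relative(e, &timer_source, CLOCK_MONOTONIC, 1000000, 0, on_tick, NULL);
 */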
1468 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1469 assert(s);
1470
1471 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1472 }
1473
1474 _public_ int sd_event_add_signal(
1475 sd_event *e,
1476 sd_event_source **ret,
1477 int sig,
1478 sd_event_signal_handler_t callback,
1479 void *userdata) {
1480
1481 _cleanup_(source_freep) sd_event_source *s = NULL;
1482 struct signal_data *d;
1483 sigset_t new_ss;
1484 bool block_it;
1485 int r;
1486
1487 assert_return(e, -EINVAL);
1488 assert_return(e = event_resolve(e), -ENOPKG);
1489 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1490 assert_return(!event_origin_changed(e), -ECHILD);
1491
1492 /* Let's make sure our special flag stays outside of the valid signal range */
1493 assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);
1494
1495 if (sig & SD_EVENT_SIGNAL_PROCMASK) {
1496 sig &= ~SD_EVENT_SIGNAL_PROCMASK;
1497 assert_return(SIGNAL_VALID(sig), -EINVAL);
1498
1499 block_it = true;
1500 } else {
1501 assert_return(SIGNAL_VALID(sig), -EINVAL);
1502
1503 r = signal_is_blocked(sig);
1504 if (r < 0)
1505 return r;
1506 if (r == 0)
1507 return -EBUSY;
1508
1509 block_it = false;
1510 }
1511
1512 if (!callback)
1513 callback = signal_exit_callback;
1514
1515 if (!e->signal_sources) {
1516 e->signal_sources = new0(sd_event_source*, _NSIG);
1517 if (!e->signal_sources)
1518 return -ENOMEM;
1519 } else if (e->signal_sources[sig])
1520 return -EBUSY;
1521
1522 s = source_new(e, !ret, SOURCE_SIGNAL);
1523 if (!s)
1524 return -ENOMEM;
1525
1526 s->signal.sig = sig;
1527 s->signal.callback = callback;
1528 s->userdata = userdata;
1529 s->enabled = SD_EVENT_ON;
1530
1531 e->signal_sources[sig] = s;
1532
1533 if (block_it) {
1534 sigset_t old_ss;
1535
1536 if (sigemptyset(&new_ss) < 0)
1537 return -errno;
1538
1539 if (sigaddset(&new_ss, sig) < 0)
1540 return -errno;
1541
1542 r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
1543 if (r != 0)
1544 return -r;
1545
1546 r = sigismember(&old_ss, sig);
1547 if (r < 0)
1548 return -errno;
1549
1550 s->signal.unblock = !r;
1551 } else
1552 s->signal.unblock = false;
1553
1554 r = event_make_signal_data(e, sig, &d);
1555 if (r < 0) {
1556 if (s->signal.unblock)
1557 (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
1558
1559 return r;
1560 }
1561
1562 /* Use the signal name as description for the event source by default */
1563 (void) sd_event_source_set_description(s, signal_to_string(sig));
1564
1565 if (ret)
1566 *ret = s;
1567 TAKE_PTR(s);
1568
1569 return 0;
1570 }
1571
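/* A hedged usage sketch (illustrative only): with the SD_EVENT_SIGNAL_PROCMASK flag the event loop
 * blocks the signal by itself, so no explicit sigprocmask() call is needed; and since a NULL callback
 * falls back to signal_exit_callback() above, this single call is enough to make SIGTERM terminate the
 * loop cleanly:
 *
 *     r = sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
 *     if (r < 0)
 *             return r;
 */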
1572 static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1573 assert(s);
1574
1575 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1576 }
1577
1578 static bool shall_use_pidfd(void) {
1579 /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
1580 return secure_getenv_bool("SYSTEMD_PIDFD") != 0;
1581 }
1582
1583 _public_ int sd_event_add_child(
1584 sd_event *e,
1585 sd_event_source **ret,
1586 pid_t pid,
1587 int options,
1588 sd_event_child_handler_t callback,
1589 void *userdata) {
1590
1591 _cleanup_(source_freep) sd_event_source *s = NULL;
1592 int r;
1593
1594 assert_return(e, -EINVAL);
1595 assert_return(e = event_resolve(e), -ENOPKG);
1596 assert_return(pid > 1, -EINVAL);
1597 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1598 assert_return(options != 0, -EINVAL);
1599 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1600 assert_return(!event_origin_changed(e), -ECHILD);
1601
1602 if (!callback)
1603 callback = child_exit_callback;
1604
1605 if (e->n_online_child_sources == 0) {
1606 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1607 * for compatibility with pre-pidfd and because we don't want to reap the child processes
1608 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1609 * take effect.
1610 *
1611 * (As an optimization we only do this check on the first child event source created.) */
1612 r = signal_is_blocked(SIGCHLD);
1613 if (r < 0)
1614 return r;
1615 if (r == 0)
1616 return -EBUSY;
1617 }
1618
1619 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1620 if (r < 0)
1621 return r;
1622
1623 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1624 return -EBUSY;
1625
1626 s = source_new(e, !ret, SOURCE_CHILD);
1627 if (!s)
1628 return -ENOMEM;
1629
1630 s->wakeup = WAKEUP_EVENT_SOURCE;
1631 s->child.options = options;
1632 s->child.callback = callback;
1633 s->userdata = userdata;
1634 s->enabled = SD_EVENT_ONESHOT;
1635
1636 /* We always take a pidfd here if we can, even if we wait for something other than WEXITED, so that we
1637 * pin the PID, and make regular waitid() handling race-free. */
1638
1639 if (shall_use_pidfd()) {
1640 s->child.pidfd = pidfd_open(pid, 0);
1641 if (s->child.pidfd < 0) {
1642 /* Propagate errors unless the syscall is not supported or blocked */
1643 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
1644 return -errno;
1645 } else
1646 s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
1647 } else
1648 s->child.pidfd = -EBADF;
1649
1650 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1651 /* We have a pidfd and we only want to watch for exit */
1652 r = source_child_pidfd_register(s, s->enabled);
1653 if (r < 0)
1654 return r;
1655
1656 } else {
1657 /* We have no pidfd or we shall wait for some event other than WEXITED */
1658 r = event_make_signal_data(e, SIGCHLD, NULL);
1659 if (r < 0)
1660 return r;
1661
1662 e->need_process_child = true;
1663 }
1664
1665 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1666 if (r < 0)
1667 return r;
1668
1669 /* These must be done after everything succeeds. */
1670 s->child.pid = pid;
1671 e->n_online_child_sources++;
1672
1673 if (ret)
1674 *ret = s;
1675 TAKE_PTR(s);
1676 return 0;
1677 }
1678
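/* A hedged usage sketch (illustrative only; "on_child_exit" and the fork() placement are assumptions
 * made for the example): as required above, SIGCHLD must be blocked in the calling thread before the
 * child source is added, even when a pidfd is used internally.
 *
 *     static int on_child_exit(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *             log_debug("Child " PID_FMT " exited with status %i.", si->si_pid, si->si_status);
 *             return 0;
 *     }
 *
 *     ...
 *     sigset_t ss;
 *     pid_t pid;
 *
 *     assert_se(sigemptyset(&ss) >= 0);
 *     assert_se(sigaddset(&ss, SIGCHLD) >= 0);
 *     assert_se(sigprocmask(SIG_BLOCK, &ss, NULL) >= 0);
 *
 *     pid = fork();
 *     if (pid < 0)
 *             return -errno;
 *     if (pid == 0) {
 *             // child: exec or do the work, then exit
 *             _exit(EXIT_SUCCESS);
 *     }
 *
 *     r = sd_event_add_child(e, NULL, pid, WEXITED, on_child_exit, NULL);
 *     if (r < 0)
 *             return r;
 */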
1679 _public_ int sd_event_add_child_pidfd(
1680 sd_event *e,
1681 sd_event_source **ret,
1682 int pidfd,
1683 int options,
1684 sd_event_child_handler_t callback,
1685 void *userdata) {
1686
1687
1688 _cleanup_(source_freep) sd_event_source *s = NULL;
1689 pid_t pid;
1690 int r;
1691
1692 assert_return(e, -EINVAL);
1693 assert_return(e = event_resolve(e), -ENOPKG);
1694 assert_return(pidfd >= 0, -EBADF);
1695 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1696 assert_return(options != 0, -EINVAL);
1697 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1698 assert_return(!event_origin_changed(e), -ECHILD);
1699
1700 if (!callback)
1701 callback = child_exit_callback;
1702
1703 if (e->n_online_child_sources == 0) {
1704 r = signal_is_blocked(SIGCHLD);
1705 if (r < 0)
1706 return r;
1707 if (r == 0)
1708 return -EBUSY;
1709 }
1710
1711 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1712 if (r < 0)
1713 return r;
1714
1715 r = pidfd_get_pid(pidfd, &pid);
1716 if (r < 0)
1717 return r;
1718
1719 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1720 return -EBUSY;
1721
1722 s = source_new(e, !ret, SOURCE_CHILD);
1723 if (!s)
1724 return -ENOMEM;
1725
1726 s->wakeup = WAKEUP_EVENT_SOURCE;
1727 s->child.pidfd = pidfd;
1728 s->child.pid = pid;
1729 s->child.options = options;
1730 s->child.callback = callback;
1731 s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1732 s->userdata = userdata;
1733 s->enabled = SD_EVENT_ONESHOT;
1734
1735 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1736 if (r < 0)
1737 return r;
1738
1739 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1740 /* We only want to watch for WEXITED */
1741 r = source_child_pidfd_register(s, s->enabled);
1742 if (r < 0)
1743 return r;
1744 } else {
1745 /* We shall wait for some event other than WEXITED */
1746 r = event_make_signal_data(e, SIGCHLD, NULL);
1747 if (r < 0)
1748 return r;
1749
1750 e->need_process_child = true;
1751 }
1752
1753 e->n_online_child_sources++;
1754
1755 if (ret)
1756 *ret = s;
1757 TAKE_PTR(s);
1758 return 0;
1759 }
1760
1761 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1762 assert(s);
1763
1764 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1765 }
1766
1767 _public_ int sd_event_add_defer(
1768 sd_event *e,
1769 sd_event_source **ret,
1770 sd_event_handler_t callback,
1771 void *userdata) {
1772
1773 _cleanup_(source_freep) sd_event_source *s = NULL;
1774 int r;
1775
1776 assert_return(e, -EINVAL);
1777 assert_return(e = event_resolve(e), -ENOPKG);
1778 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1779 assert_return(!event_origin_changed(e), -ECHILD);
1780
1781 if (!callback)
1782 callback = generic_exit_callback;
1783
1784 s = source_new(e, !ret, SOURCE_DEFER);
1785 if (!s)
1786 return -ENOMEM;
1787
1788 s->defer.callback = callback;
1789 s->userdata = userdata;
1790 s->enabled = SD_EVENT_ONESHOT;
1791
1792 r = source_set_pending(s, true);
1793 if (r < 0)
1794 return r;
1795
1796 if (ret)
1797 *ret = s;
1798 TAKE_PTR(s);
1799
1800 return 0;
1801 }
1802
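/* A hedged usage sketch (illustrative only; "on_idle" is an invented name): a defer source created
 * this way starts out pending and SD_EVENT_ONESHOT, i.e. its callback runs once during the next event
 * loop iteration and the source is then disabled again. Re-enable it with
 * sd_event_source_set_enabled(s, SD_EVENT_ONESHOT) to schedule another run.
 *
 *     static int on_idle(sd_event_source *s, void *userdata) {
 *             // ... work deferred out of the current dispatch ...
 *             return 0;
 *     }
 *
 *     r = sd_event_add_defer(e, NULL, on_idle, NULL);
 *     if (r < 0)
 *             return r;
 */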
1803 _public_ int sd_event_add_post(
1804 sd_event *e,
1805 sd_event_source **ret,
1806 sd_event_handler_t callback,
1807 void *userdata) {
1808
1809 _cleanup_(source_freep) sd_event_source *s = NULL;
1810 int r;
1811
1812 assert_return(e, -EINVAL);
1813 assert_return(e = event_resolve(e), -ENOPKG);
1814 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1815 assert_return(!event_origin_changed(e), -ECHILD);
1816
1817 if (!callback)
1818 callback = generic_exit_callback;
1819
1820 s = source_new(e, !ret, SOURCE_POST);
1821 if (!s)
1822 return -ENOMEM;
1823
1824 s->post.callback = callback;
1825 s->userdata = userdata;
1826 s->enabled = SD_EVENT_ON;
1827
1828 r = set_ensure_put(&e->post_sources, NULL, s);
1829 if (r < 0)
1830 return r;
1831 assert(r > 0);
1832
1833 if (ret)
1834 *ret = s;
1835 TAKE_PTR(s);
1836
1837 return 0;
1838 }
1839
1840 _public_ int sd_event_add_exit(
1841 sd_event *e,
1842 sd_event_source **ret,
1843 sd_event_handler_t callback,
1844 void *userdata) {
1845
1846 _cleanup_(source_freep) sd_event_source *s = NULL;
1847 int r;
1848
1849 assert_return(e, -EINVAL);
1850 assert_return(e = event_resolve(e), -ENOPKG);
1851 assert_return(callback, -EINVAL);
1852 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1853 assert_return(!event_origin_changed(e), -ECHILD);
1854
1855 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1856 if (r < 0)
1857 return r;
1858
1859 s = source_new(e, !ret, SOURCE_EXIT);
1860 if (!s)
1861 return -ENOMEM;
1862
1863 s->exit.callback = callback;
1864 s->userdata = userdata;
1865 s->exit.prioq_index = PRIOQ_IDX_NULL;
1866 s->enabled = SD_EVENT_ONESHOT;
1867
1868 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1869 if (r < 0)
1870 return r;
1871
1872 if (ret)
1873 *ret = s;
1874 TAKE_PTR(s);
1875
1876 return 0;
1877 }
1878
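/* A hedged usage sketch (illustrative only; "on_exit_cleanup" is an invented name): exit sources are
 * dispatched only once sd_event_exit() has been called, in priority order, which makes them a
 * convenient place for teardown work that should happen before sd_event_loop() returns.
 *
 *     static int on_exit_cleanup(sd_event_source *s, void *userdata) {
 *             // ... flush state, close connections, etc. ...
 *             return 0;
 *     }
 *
 *     r = sd_event_add_exit(e, NULL, on_exit_cleanup, NULL);
 *     if (r < 0)
 *             return r;
 */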
1879 _public_ int sd_event_trim_memory(void) {
1880 int r;
1881
1882 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1883 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1884 * NULL callback parameter. */
1885
1886 log_debug("Memory pressure event, trimming malloc() memory.");
1887
1888 #if HAVE_GENERIC_MALLINFO
1889 generic_mallinfo before_mallinfo = generic_mallinfo_get();
1890 #endif
1891
1892 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1893 hashmap_trim_pools();
1894 r = malloc_trim(0);
1895 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1896
1897 if (r > 0)
1898 log_debug("Successfully trimmed some memory.");
1899 else
1900 log_debug("Couldn't trim any memory.");
1901
1902 usec_t period = after_timestamp - before_timestamp;
1903
1904 #if HAVE_GENERIC_MALLINFO
1905 generic_mallinfo after_mallinfo = generic_mallinfo_get();
1906 size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1907 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1908 log_struct(LOG_DEBUG,
1909 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1910 FORMAT_TIMESPAN(period, 0),
1911 FORMAT_BYTES(l)),
1912 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1913 "TRIMMED_BYTES=%zu", l,
1914 "TRIMMED_USEC=" USEC_FMT, period);
1915 #else
1916 log_struct(LOG_DEBUG,
1917 LOG_MESSAGE("Memory trimming took %s.",
1918 FORMAT_TIMESPAN(period, 0)),
1919 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1920 "TRIMMED_USEC=" USEC_FMT, period);
1921 #endif
1922
1923 return 0;
1924 }
1925
1926 static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1927 assert(s);
1928
1929 sd_event_trim_memory();
1930 return 0;
1931 }
1932
1933 _public_ int sd_event_add_memory_pressure(
1934 sd_event *e,
1935 sd_event_source **ret,
1936 sd_event_handler_t callback,
1937 void *userdata) {
1938
1939 _cleanup_free_ char *w = NULL;
1940 _cleanup_(source_freep) sd_event_source *s = NULL;
1941 _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
1942 _cleanup_free_ void *write_buffer = NULL;
1943 const char *watch, *watch_fallback = NULL, *env;
1944 size_t write_buffer_size = 0;
1945 struct stat st;
1946 uint32_t events;
1947 bool locked;
1948 int r;
1949
1950 assert_return(e, -EINVAL);
1951 assert_return(e = event_resolve(e), -ENOPKG);
1952 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1953 assert_return(!event_origin_changed(e), -ECHILD);
1954
1955 if (!callback)
1956 callback = memory_pressure_callback;
1957
1958 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1959 if (!s)
1960 return -ENOMEM;
1961
1962 s->wakeup = WAKEUP_EVENT_SOURCE;
1963 s->memory_pressure.callback = callback;
1964 s->userdata = userdata;
1965 s->enabled = SD_EVENT_ON;
1966 s->memory_pressure.fd = -EBADF;
1967
1968 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1969 if (env) {
1970 if (isempty(env) || path_equal(env, "/dev/null"))
1971 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1972 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1973
1974 if (!path_is_absolute(env) || !path_is_normalized(env))
1975 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1976 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1977
1978 watch = env;
1979
1980 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1981 if (env) {
1982 r = unbase64mem(env, &write_buffer, &write_buffer_size);
1983 if (r < 0)
1984 return r;
1985 }
1986
1987 locked = true;
1988 } else {
1989
1990 r = is_pressure_supported();
1991 if (r < 0)
1992 return r;
1993 if (r == 0)
1994 return -EOPNOTSUPP;
1995
1996 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1997 * the system wide pressure if for some reason we cannot (which could be: memory controller
1998 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1999 * only use the system-wide logic. */
2000 r = cg_all_unified();
2001 if (r < 0)
2002 return r;
2003 if (r == 0)
2004 watch = "/proc/pressure/memory";
2005 else {
2006 _cleanup_free_ char *cg = NULL;
2007
2008 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
2009 if (r < 0)
2010 return r;
2011
2012 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
2013 if (!w)
2014 return -ENOMEM;
2015
2016 watch = w;
2017 watch_fallback = "/proc/pressure/memory";
2018 }
2019
2020 /* Android uses three levels in its userspace low memory killer logic:
2021 * some 70000 1000000
2022 * some 100000 1000000
2023 * full 70000 1000000
2024 *
2025 * GNOME's low memory monitor uses:
2026 * some 70000 1000000
2027 * some 100000 1000000
2028 * full 100000 1000000
2029 *
2030 * We'll default to the middle level that both agree on. Except we do it on a 2s window
2031                  * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration the
2032                  * kernel allows unprivileged processes to use, also in the future. */
2033 if (asprintf((char**) &write_buffer,
2034 "%s " USEC_FMT " " USEC_FMT,
2035 MEMORY_PRESSURE_DEFAULT_TYPE,
2036 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2037 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2038 return -ENOMEM;
2039
2040 write_buffer_size = strlen(write_buffer) + 1;
2041 locked = false;
2042 }
2043
2044 path_fd = open(watch, O_PATH|O_CLOEXEC);
2045 if (path_fd < 0) {
2046 if (errno != ENOENT)
2047 return -errno;
2048
2049 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2050 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2051 * the PSI service apparently is not supported) */
2052 if (!watch_fallback)
2053 return locked ? -ENOENT : -EOPNOTSUPP;
2054
2055 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
2056 if (path_fd < 0) {
2057 if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2058 return -EOPNOTSUPP;
2059 return -errno;
2060 }
2061 }
2062
2063 if (fstat(path_fd, &st) < 0)
2064 return -errno;
2065
2066 if (S_ISSOCK(st.st_mode)) {
2067 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2068 if (fd < 0)
2069 return -errno;
2070
2071 r = connect_unix_path(fd, path_fd, NULL);
2072 if (r < 0)
2073 return r;
2074
2075 events = EPOLLIN;
2076
2077 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2078 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2079 if (fd < 0)
2080 return fd;
2081
2082 if (S_ISREG(st.st_mode)) {
2083 struct statfs sfs;
2084
2085 /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
2086
2087 if (fstatfs(fd, &sfs) < 0)
2088 return -errno;
2089
2090 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2091 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2092 return -ENOTTY;
2093
2094 events = EPOLLPRI;
2095 } else
2096 /* For fifos and char devices just watch for EPOLLIN */
2097 events = EPOLLIN;
2098
2099 } else if (S_ISDIR(st.st_mode))
2100 return -EISDIR;
2101 else
2102 return -EBADF;
2103
2104 s->memory_pressure.fd = TAKE_FD(fd);
2105 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2106 s->memory_pressure.write_buffer_size = write_buffer_size;
2107 s->memory_pressure.events = events;
2108 s->memory_pressure.locked = locked;
2109
2110 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2111 * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2112 * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
2113 * event sources on which writes must be executed before the first event loop iteration is
2114 * executed. (We could also write the data here, right away, but we want to give the caller the
2115 * freedom to call sd_event_source_set_memory_pressure_type() and
2116          * sd_event_source_set_memory_pressure_rate() before we write it.) */
2117
2118 if (s->memory_pressure.write_buffer_size > 0)
2119 source_memory_pressure_add_to_write_list(s);
2120 else {
2121 r = source_memory_pressure_register(s, s->enabled);
2122 if (r < 0)
2123 return r;
2124 }
2125
2126 if (ret)
2127 *ret = s;
2128 TAKE_PTR(s);
2129
2130 return 0;
2131 }
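
/* Illustrative usage sketch (not part of this file's logic): a caller that is happy with the default
 * behaviour, i.e. trimming allocator caches under pressure via sd_event_trim_memory(), can pass a NULL
 * callback and treat lack of PSI support as non-fatal. The error codes match what
 * sd_event_add_memory_pressure() returns above (-EOPNOTSUPP if PSI is unavailable, -EHOSTDOWN if it was
 * explicitly disabled via $MEMORY_PRESSURE_WATCH):
 *
 *     sd_event *e = NULL;
 *     sd_event_source *mp = NULL;
 *     int r;
 *
 *     r = sd_event_default(&e);
 *     if (r < 0)
 *             return r;
 *
 *     r = sd_event_add_memory_pressure(e, &mp, NULL, NULL);
 *     if (r < 0 && r != -EOPNOTSUPP && r != -EHOSTDOWN)
 *             return r;
 *
 *     return sd_event_loop(e);
 */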
2132
2133 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2134 assert(e);
2135
2136 if (!d)
2137 return;
2138
2139 assert(hashmap_isempty(d->inodes));
2140 assert(hashmap_isempty(d->wd));
2141
2142 if (d->buffer_filled > 0)
2143 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
2144
2145 hashmap_free(d->inodes);
2146 hashmap_free(d->wd);
2147
2148 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2149
2150 if (d->fd >= 0) {
2151 if (!event_origin_changed(e) &&
2152 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
2153 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2154
2155 safe_close(d->fd);
2156 }
2157 free(d);
2158 }
2159
2160 static int event_make_inotify_data(
2161 sd_event *e,
2162 int64_t priority,
2163 struct inotify_data **ret) {
2164
2165 _cleanup_close_ int fd = -EBADF;
2166 struct inotify_data *d;
2167 int r;
2168
2169 assert(e);
2170
2171 d = hashmap_get(e->inotify_data, &priority);
2172 if (d) {
2173 if (ret)
2174 *ret = d;
2175 return 0;
2176 }
2177
2178         fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2179 if (fd < 0)
2180 return -errno;
2181
2182 fd = fd_move_above_stdio(fd);
2183
2184 d = new(struct inotify_data, 1);
2185 if (!d)
2186 return -ENOMEM;
2187
2188 *d = (struct inotify_data) {
2189 .wakeup = WAKEUP_INOTIFY_DATA,
2190 .fd = TAKE_FD(fd),
2191 .priority = priority,
2192 };
2193
2194 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
2195 if (r < 0) {
2196 d->fd = safe_close(d->fd);
2197 free(d);
2198 return r;
2199 }
2200
2201 struct epoll_event ev = {
2202 .events = EPOLLIN,
2203 .data.ptr = d,
2204 };
2205
2206 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2207 r = -errno;
2208 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2209 * remove the fd from the epoll first, which we don't want as we couldn't
2210 * add it in the first place. */
2211 event_free_inotify_data(e, d);
2212 return r;
2213 }
2214
2215 if (ret)
2216 *ret = d;
2217
2218 return 1;
2219 }
2220
2221 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
2222 int r;
2223
2224 assert(x);
2225 assert(y);
2226
2227 r = CMP(x->dev, y->dev);
2228 if (r != 0)
2229 return r;
2230
2231 return CMP(x->ino, y->ino);
2232 }
2233
2234 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2235 assert(d);
2236
2237 siphash24_compress_typesafe(d->dev, state);
2238 siphash24_compress_typesafe(d->ino, state);
2239 }
2240
2241 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
2242
2243 static void event_free_inode_data(
2244 sd_event *e,
2245 struct inode_data *d) {
2246
2247 assert(e);
2248
2249 if (!d)
2250 return;
2251
2252 assert(!d->event_sources);
2253
2254 if (d->fd >= 0) {
2255 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
2256 safe_close(d->fd);
2257 }
2258
2259 if (d->inotify_data) {
2260
2261 if (d->wd >= 0) {
2262 if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
2263 /* So here's a problem. At the time this runs the watch descriptor might already be
2264                                  * invalidated, because an IN_IGNORED event might be queued right at the moment we
2265                                  * enter the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's
2266                                  * quite likely to happen. */
2267
2268 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2269 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2270 }
2271
2272 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2273 }
2274
2275 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2276 }
2277
2278 free(d->path);
2279 free(d);
2280 }
2281
2282 static void event_gc_inotify_data(
2283 sd_event *e,
2284 struct inotify_data *d) {
2285
2286 assert(e);
2287
2288 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2289 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2290 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2291 * (under the expectation that the GC is called again once the counter is decremented). */
2292
2293 if (!d)
2294 return;
2295
2296 if (!hashmap_isempty(d->inodes))
2297 return;
2298
2299 if (d->n_busy > 0)
2300 return;
2301
2302 event_free_inotify_data(e, d);
2303 }
2304
2305 static void event_gc_inode_data(
2306 sd_event *e,
2307 struct inode_data *d) {
2308
2309 struct inotify_data *inotify_data;
2310
2311 assert(e);
2312
2313 if (!d)
2314 return;
2315
2316 if (d->event_sources)
2317 return;
2318
2319 inotify_data = d->inotify_data;
2320 event_free_inode_data(e, d);
2321
2322 event_gc_inotify_data(e, inotify_data);
2323 }
2324
2325 static int event_make_inode_data(
2326 sd_event *e,
2327 struct inotify_data *inotify_data,
2328 dev_t dev,
2329 ino_t ino,
2330 struct inode_data **ret) {
2331
2332 struct inode_data *d, key;
2333 int r;
2334
2335 assert(e);
2336 assert(inotify_data);
2337
2338 key = (struct inode_data) {
2339 .ino = ino,
2340 .dev = dev,
2341 };
2342
2343 d = hashmap_get(inotify_data->inodes, &key);
2344 if (d) {
2345 if (ret)
2346 *ret = d;
2347
2348 return 0;
2349 }
2350
2351 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2352 if (r < 0)
2353 return r;
2354
2355 d = new(struct inode_data, 1);
2356 if (!d)
2357 return -ENOMEM;
2358
2359 *d = (struct inode_data) {
2360 .dev = dev,
2361 .ino = ino,
2362 .wd = -1,
2363 .fd = -EBADF,
2364 .inotify_data = inotify_data,
2365 };
2366
2367 r = hashmap_put(inotify_data->inodes, d, d);
2368 if (r < 0) {
2369 free(d);
2370 return r;
2371 }
2372
2373 if (ret)
2374 *ret = d;
2375
2376 return 1;
2377 }
2378
2379 static uint32_t inode_data_determine_mask(struct inode_data *d) {
2380 bool excl_unlink = true;
2381 uint32_t combined = 0;
2382
2383 assert(d);
2384
2385 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2386 * the IN_EXCL_UNLINK flag is ANDed instead.
2387 *
2388          * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or oneshot. That's
2389 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
2390 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
2391 * events we don't care for client-side. */
2392
2393 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2394
2395 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2396 excl_unlink = false;
2397
2398 combined |= s->inotify.mask;
2399 }
2400
2401 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2402 }
2403
2404 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2405 uint32_t combined_mask;
2406 int wd, r;
2407
2408 assert(d);
2409 assert(d->fd >= 0);
2410
2411 combined_mask = inode_data_determine_mask(d);
2412
2413 if (d->wd >= 0 && combined_mask == d->combined_mask)
2414 return 0;
2415
2416 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2417 if (r < 0)
2418 return r;
2419
2420 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2421 if (wd < 0)
2422 return wd;
2423
2424 if (d->wd < 0) {
2425 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2426 if (r < 0) {
2427 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2428 return r;
2429 }
2430
2431 d->wd = wd;
2432
2433 } else if (d->wd != wd) {
2434
2435 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2436                 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2437 return -EINVAL;
2438 }
2439
2440 d->combined_mask = combined_mask;
2441 return 1;
2442 }
2443
2444 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2445 assert(s);
2446
2447 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2448 }
2449
2450 static int event_add_inotify_fd_internal(
2451 sd_event *e,
2452 sd_event_source **ret,
2453 int fd,
2454 bool donate,
2455 uint32_t mask,
2456 sd_event_inotify_handler_t callback,
2457 void *userdata) {
2458
2459 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
2460 _cleanup_(source_freep) sd_event_source *s = NULL;
2461 struct inotify_data *inotify_data = NULL;
2462 struct inode_data *inode_data = NULL;
2463 struct stat st;
2464 int r;
2465
2466 assert_return(e, -EINVAL);
2467 assert_return(e = event_resolve(e), -ENOPKG);
2468 assert_return(fd >= 0, -EBADF);
2469 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2470 assert_return(!event_origin_changed(e), -ECHILD);
2471
2472 if (!callback)
2473 callback = inotify_exit_callback;
2474
2475 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2476 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
2477 * the user can't use them for us. */
2478 if (mask & IN_MASK_ADD)
2479 return -EINVAL;
2480
2481 if (fstat(fd, &st) < 0)
2482 return -errno;
2483
2484 s = source_new(e, !ret, SOURCE_INOTIFY);
2485 if (!s)
2486 return -ENOMEM;
2487
2488 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2489 s->inotify.mask = mask;
2490 s->inotify.callback = callback;
2491 s->userdata = userdata;
2492
2493 /* Allocate an inotify object for this priority, and an inode object within it */
2494 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2495 if (r < 0)
2496 return r;
2497
2498 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2499 if (r < 0) {
2500 event_gc_inotify_data(e, inotify_data);
2501 return r;
2502 }
2503
2504         /* Keep the O_PATH fd around until the first iteration of the event loop, so that we can still change
2505          * the priority of the event source until then; for that we need an fd for the original inode. */
2506 if (inode_data->fd < 0) {
2507 if (donated_fd >= 0)
2508 inode_data->fd = TAKE_FD(donated_fd);
2509 else {
2510 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2511 if (inode_data->fd < 0) {
2512 r = -errno;
2513 event_gc_inode_data(e, inode_data);
2514 return r;
2515 }
2516 }
2517
2518 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
2519
2520 _cleanup_free_ char *path = NULL;
2521 r = fd_get_path(inode_data->fd, &path);
2522 if (r < 0 && r != -ENOSYS) { /* The path is optional, hence ignore -ENOSYS. */
2523 event_gc_inode_data(e, inode_data);
2524 return r;
2525 }
2526
2527 free_and_replace(inode_data->path, path);
2528 }
2529
2530 /* Link our event source to the inode data object */
2531 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2532 s->inotify.inode_data = inode_data;
2533
2534 /* Actually realize the watch now */
2535 r = inode_data_realize_watch(e, inode_data);
2536 if (r < 0)
2537 return r;
2538
2539 if (ret)
2540 *ret = s;
2541 TAKE_PTR(s);
2542
2543 return 0;
2544 }
2545
2546 _public_ int sd_event_add_inotify_fd(
2547 sd_event *e,
2548 sd_event_source **ret,
2549 int fd,
2550 uint32_t mask,
2551 sd_event_inotify_handler_t callback,
2552 void *userdata) {
2553
2554 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2555 }
2556
2557 _public_ int sd_event_add_inotify(
2558 sd_event *e,
2559 sd_event_source **ret,
2560 const char *path,
2561 uint32_t mask,
2562 sd_event_inotify_handler_t callback,
2563 void *userdata) {
2564
2565 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2566 int fd, r;
2567
2568 assert_return(path, -EINVAL);
2569
2570 fd = open(path, O_PATH | O_CLOEXEC |
2571 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2572 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2573 if (fd < 0)
2574 return -errno;
2575
2576 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2577 if (r < 0)
2578 return r;
2579
2580 (void) sd_event_source_set_description(s, path);
2581
2582 if (ret)
2583 *ret = s;
2584
2585 return r;
2586 }
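
/* Illustrative usage sketch: watching a directory for newly appearing files with the public API added
 * above. The handler signature matches sd_event_inotify_handler_t; the path and handler name are made up
 * for the example:
 *
 *     static int on_dir_event(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
 *             if (ev->len > 0)
 *                     log_debug("Saw inotify event for '%s'.", ev->name);
 *             return 0;
 *     }
 *
 *     r = sd_event_add_inotify(e, &src, "/run/my-dir", IN_CREATE|IN_MOVED_TO, on_dir_event, NULL);
 *
 * sd_event_add_inotify_fd() is equivalent but takes an already opened (e.g. O_PATH) file descriptor,
 * which the caller keeps ownership of, since the call duplicates it internally. */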
2587
2588 static sd_event_source* event_source_free(sd_event_source *s) {
2589 if (!s)
2590 return NULL;
2591
2592 /* Here's a special hack: when we are called from a
2593 * dispatch handler we won't free the event source
2594 * immediately, but we will detach the fd from the
2595 * epoll. This way it is safe for the caller to unref
2596 * the event source and immediately close the fd, but
2597 * we still retain a valid event source object after
2598 * the callback. */
2599
2600 if (s->dispatching)
2601 source_disconnect(s);
2602 else
2603 source_free(s);
2604
2605 return NULL;
2606 }
2607
2608 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2609
2610 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2611 assert_return(s, -EINVAL);
2612 assert_return(!event_origin_changed(s->event), -ECHILD);
2613
2614 return free_and_strdup(&s->description, description);
2615 }
2616
2617 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2618 assert_return(s, -EINVAL);
2619 assert_return(description, -EINVAL);
2620
2621 if (!s->description)
2622 return -ENXIO;
2623
2624 *description = s->description;
2625 return 0;
2626 }
2627
2628 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2629 assert_return(s, NULL);
2630 assert_return(!event_origin_changed(s->event), NULL);
2631
2632 return s->event;
2633 }
2634
2635 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2636 assert_return(s, -EINVAL);
2637 assert_return(s->type != SOURCE_EXIT, -EDOM);
2638 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2639 assert_return(!event_origin_changed(s->event), -ECHILD);
2640
2641 return s->pending;
2642 }
2643
2644 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2645 assert_return(s, -EINVAL);
2646 assert_return(s->type == SOURCE_IO, -EDOM);
2647 assert_return(!event_origin_changed(s->event), -ECHILD);
2648
2649 return s->io.fd;
2650 }
2651
2652 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2653 int saved_fd, r;
2654
2655 assert_return(s, -EINVAL);
2656 assert_return(fd >= 0, -EBADF);
2657 assert_return(s->type == SOURCE_IO, -EDOM);
2658 assert_return(!event_origin_changed(s->event), -ECHILD);
2659
2660 if (s->io.fd == fd)
2661 return 0;
2662
2663 saved_fd = s->io.fd;
2664 s->io.fd = fd;
2665
2666 assert(event_source_is_offline(s) == !s->io.registered);
2667
2668 if (s->io.registered) {
2669 s->io.registered = false;
2670
2671 r = source_io_register(s, s->enabled, s->io.events);
2672 if (r < 0) {
2673 s->io.fd = saved_fd;
2674 s->io.registered = true;
2675 return r;
2676 }
2677
2678 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2679 }
2680
2681 if (s->io.owned)
2682 safe_close(saved_fd);
2683
2684 return 0;
2685 }
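
/* Illustrative usage sketch: swapping the file descriptor of an existing IO event source, e.g. after
 * reconnecting a socket, without destroying the source. connect_to_server() is a hypothetical helper:
 *
 *     int new_fd = connect_to_server();
 *     if (new_fd >= 0) {
 *             r = sd_event_source_set_io_fd(source, new_fd);
 *             if (r < 0)
 *                     safe_close(new_fd);
 *     }
 *
 * If the source owns its fd (see sd_event_source_set_io_fd_own() below), the previously set fd is closed
 * automatically, as implemented above. */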
2686
2687 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2688 assert_return(s, -EINVAL);
2689 assert_return(s->type == SOURCE_IO, -EDOM);
2690 assert_return(!event_origin_changed(s->event), -ECHILD);
2691
2692 return s->io.owned;
2693 }
2694
2695 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2696 assert_return(s, -EINVAL);
2697 assert_return(s->type == SOURCE_IO, -EDOM);
2698 assert_return(!event_origin_changed(s->event), -ECHILD);
2699
2700 s->io.owned = own;
2701 return 0;
2702 }
2703
2704 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2705 assert_return(s, -EINVAL);
2706 assert_return(events, -EINVAL);
2707 assert_return(s->type == SOURCE_IO, -EDOM);
2708 assert_return(!event_origin_changed(s->event), -ECHILD);
2709
2710 *events = s->io.events;
2711 return 0;
2712 }
2713
2714 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2715 int r;
2716
2717 assert_return(s, -EINVAL);
2718 assert_return(s->type == SOURCE_IO, -EDOM);
2719 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2720 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2721 assert_return(!event_origin_changed(s->event), -ECHILD);
2722
2723 /* edge-triggered updates are never skipped, so we can reset edges */
2724 if (s->io.events == events && !(events & EPOLLET))
2725 return 0;
2726
2727 r = source_set_pending(s, false);
2728 if (r < 0)
2729 return r;
2730
2731 if (event_source_is_online(s)) {
2732 r = source_io_register(s, s->enabled, events);
2733 if (r < 0)
2734 return r;
2735 }
2736
2737 s->io.events = events;
2738
2739 return 0;
2740 }
2741
2742 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2743 assert_return(s, -EINVAL);
2744 assert_return(revents, -EINVAL);
2745 assert_return(s->type == SOURCE_IO, -EDOM);
2746 assert_return(s->pending, -ENODATA);
2747 assert_return(!event_origin_changed(s->event), -ECHILD);
2748
2749 *revents = s->io.revents;
2750 return 0;
2751 }
2752
2753 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2754 assert_return(s, -EINVAL);
2755 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2756 assert_return(!event_origin_changed(s->event), -ECHILD);
2757
2758 return s->signal.sig;
2759 }
2760
2761 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2762 assert_return(s, -EINVAL);
2763 assert_return(!event_origin_changed(s->event), -ECHILD);
2764
2765 *priority = s->priority;
2766 return 0;
2767 }
2768
2769 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2770 bool rm_inotify = false, rm_inode = false;
2771 struct inotify_data *new_inotify_data = NULL;
2772 struct inode_data *new_inode_data = NULL;
2773 int r;
2774
2775 assert_return(s, -EINVAL);
2776 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2777 assert_return(!event_origin_changed(s->event), -ECHILD);
2778
2779 if (s->priority == priority)
2780 return 0;
2781
2782 if (s->type == SOURCE_INOTIFY) {
2783 struct inode_data *old_inode_data;
2784
2785 assert(s->inotify.inode_data);
2786 old_inode_data = s->inotify.inode_data;
2787
2788                 /* We need the original fd to change the priority. If we don't have it we can't change the priority
2789                  * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2790 * events we allow priority changes only until the first following iteration. */
2791 if (old_inode_data->fd < 0)
2792 return -EOPNOTSUPP;
2793
2794 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2795 if (r < 0)
2796 return r;
2797 rm_inotify = r > 0;
2798
2799 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2800 if (r < 0)
2801 goto fail;
2802 rm_inode = r > 0;
2803
2804 if (new_inode_data->fd < 0) {
2805 /* Duplicate the fd for the new inode object if we don't have any yet */
2806 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2807 if (new_inode_data->fd < 0) {
2808 r = -errno;
2809 goto fail;
2810 }
2811
2812 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
2813
2814 _cleanup_free_ char *path = NULL;
2815 r = fd_get_path(new_inode_data->fd, &path);
2816 if (r < 0 && r != -ENOSYS)
2817 goto fail;
2818
2819 free_and_replace(new_inode_data->path, path);
2820 }
2821
2822 /* Move the event source to the new inode data structure */
2823 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2824 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2825 s->inotify.inode_data = new_inode_data;
2826
2827 /* Now create the new watch */
2828 r = inode_data_realize_watch(s->event, new_inode_data);
2829 if (r < 0) {
2830 /* Move it back */
2831 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2832 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2833 s->inotify.inode_data = old_inode_data;
2834 goto fail;
2835 }
2836
2837 s->priority = priority;
2838
2839 event_gc_inode_data(s->event, old_inode_data);
2840
2841 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2842 struct signal_data *old, *d;
2843
2844 /* Move us from the signalfd belonging to the old
2845 * priority to the signalfd of the new priority */
2846
2847 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2848
2849 s->priority = priority;
2850
2851 r = event_make_signal_data(s->event, s->signal.sig, &d);
2852 if (r < 0) {
2853 s->priority = old->priority;
2854 return r;
2855 }
2856
2857 event_unmask_signal_data(s->event, old, s->signal.sig);
2858 } else
2859 s->priority = priority;
2860
2861 event_source_pp_prioq_reshuffle(s);
2862
2863 if (s->type == SOURCE_EXIT)
2864 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2865
2866 return 0;
2867
2868 fail:
2869 if (rm_inode)
2870 event_free_inode_data(s->event, new_inode_data);
2871
2872 if (rm_inotify)
2873 event_free_inotify_data(s->event, new_inotify_data);
2874
2875 return r;
2876 }
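
/* Illustrative usage sketch: bumping the priority of an event source so it is dispatched before others at
 * the default priority. Note the caveat implemented above: for inotify sources the priority can only be
 * changed until the first event loop iteration; afterwards -EOPNOTSUPP is returned.
 *
 *     r = sd_event_source_set_priority(src, SD_EVENT_PRIORITY_IMPORTANT);
 *     if (r < 0)
 *             return r;
 */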
2877
2878 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2879 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2880 if (!s && !ret)
2881 return false;
2882
2883 assert_return(s, -EINVAL);
2884 assert_return(!event_origin_changed(s->event), -ECHILD);
2885
2886 if (ret)
2887 *ret = s->enabled;
2888
2889 return s->enabled != SD_EVENT_OFF;
2890 }
2891
2892 static int event_source_offline(
2893 sd_event_source *s,
2894 int enabled,
2895 bool ratelimited) {
2896
2897 bool was_offline;
2898 int r;
2899
2900 assert(s);
2901 assert(enabled == SD_EVENT_OFF || ratelimited);
2902
2903 /* Unset the pending flag when this event source is disabled */
2904 if (s->enabled != SD_EVENT_OFF &&
2905 enabled == SD_EVENT_OFF &&
2906 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2907 r = source_set_pending(s, false);
2908 if (r < 0)
2909 return r;
2910 }
2911
2912 was_offline = event_source_is_offline(s);
2913 s->enabled = enabled;
2914 s->ratelimited = ratelimited;
2915
2916 switch (s->type) {
2917
2918 case SOURCE_IO:
2919 source_io_unregister(s);
2920 break;
2921
2922 case SOURCE_SIGNAL:
2923 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2924 break;
2925
2926 case SOURCE_CHILD:
2927 if (!was_offline) {
2928 assert(s->event->n_online_child_sources > 0);
2929 s->event->n_online_child_sources--;
2930 }
2931
2932 if (EVENT_SOURCE_WATCH_PIDFD(s))
2933 source_child_pidfd_unregister(s);
2934 else
2935 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2936 break;
2937
2938 case SOURCE_EXIT:
2939 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2940 break;
2941
2942 case SOURCE_MEMORY_PRESSURE:
2943 source_memory_pressure_unregister(s);
2944 break;
2945
2946 case SOURCE_TIME_REALTIME:
2947 case SOURCE_TIME_BOOTTIME:
2948 case SOURCE_TIME_MONOTONIC:
2949 case SOURCE_TIME_REALTIME_ALARM:
2950 case SOURCE_TIME_BOOTTIME_ALARM:
2951 case SOURCE_DEFER:
2952 case SOURCE_POST:
2953 case SOURCE_INOTIFY:
2954 break;
2955
2956 default:
2957 assert_not_reached();
2958 }
2959
2960 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2961 event_source_time_prioq_reshuffle(s);
2962
2963 return 1;
2964 }
2965
2966 static int event_source_online(
2967 sd_event_source *s,
2968 int enabled,
2969 bool ratelimited) {
2970
2971 bool was_online;
2972 int r;
2973
2974 assert(s);
2975 assert(enabled != SD_EVENT_OFF || !ratelimited);
2976
2977 /* Unset the pending flag when this event source is enabled */
2978 if (s->enabled == SD_EVENT_OFF &&
2979 enabled != SD_EVENT_OFF &&
2980 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2981 r = source_set_pending(s, false);
2982 if (r < 0)
2983 return r;
2984 }
2985
2986 /* Are we really ready for onlining? */
2987 if (enabled == SD_EVENT_OFF || ratelimited) {
2988                 /* Nope, we are not ready for onlining, so just update the precise state and exit */
2989 s->enabled = enabled;
2990 s->ratelimited = ratelimited;
2991 return 0;
2992 }
2993
2994 was_online = event_source_is_online(s);
2995
2996 switch (s->type) {
2997 case SOURCE_IO:
2998 r = source_io_register(s, enabled, s->io.events);
2999 if (r < 0)
3000 return r;
3001 break;
3002
3003 case SOURCE_SIGNAL:
3004 r = event_make_signal_data(s->event, s->signal.sig, NULL);
3005 if (r < 0) {
3006 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
3007 return r;
3008 }
3009
3010 break;
3011
3012 case SOURCE_CHILD:
3013 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
3014 /* yes, we have pidfd */
3015
3016 r = source_child_pidfd_register(s, enabled);
3017 if (r < 0)
3018 return r;
3019 } else {
3020                         /* no pidfd, or something other than WEXITED to watch for */
3021
3022 r = event_make_signal_data(s->event, SIGCHLD, NULL);
3023 if (r < 0) {
3024 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
3025 return r;
3026 }
3027 }
3028
3029 if (!was_online)
3030 s->event->n_online_child_sources++;
3031 break;
3032
3033 case SOURCE_MEMORY_PRESSURE:
3034 r = source_memory_pressure_register(s, enabled);
3035 if (r < 0)
3036 return r;
3037
3038 break;
3039
3040 case SOURCE_TIME_REALTIME:
3041 case SOURCE_TIME_BOOTTIME:
3042 case SOURCE_TIME_MONOTONIC:
3043 case SOURCE_TIME_REALTIME_ALARM:
3044 case SOURCE_TIME_BOOTTIME_ALARM:
3045 case SOURCE_EXIT:
3046 case SOURCE_DEFER:
3047 case SOURCE_POST:
3048 case SOURCE_INOTIFY:
3049 break;
3050
3051 default:
3052 assert_not_reached();
3053 }
3054
3055 s->enabled = enabled;
3056 s->ratelimited = ratelimited;
3057
3058 /* Non-failing operations below */
3059 if (s->type == SOURCE_EXIT)
3060 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
3061
3062 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3063 event_source_time_prioq_reshuffle(s);
3064
3065 return 1;
3066 }
3067
3068 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3069 int r;
3070
3071 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
3072
3073 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3074 if (m == SD_EVENT_OFF && !s)
3075 return 0;
3076
3077 assert_return(s, -EINVAL);
3078 assert_return(!event_origin_changed(s->event), -ECHILD);
3079
3080 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3081 if (s->event->state == SD_EVENT_FINISHED)
3082 return m == SD_EVENT_OFF ? 0 : -ESTALE;
3083
3084 if (s->enabled == m) /* No change? */
3085 return 0;
3086
3087 if (m == SD_EVENT_OFF)
3088 r = event_source_offline(s, m, s->ratelimited);
3089 else {
3090 if (s->enabled != SD_EVENT_OFF) {
3091 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3092 * event source is already enabled after all. */
3093 s->enabled = m;
3094 return 0;
3095 }
3096
3097 r = event_source_online(s, m, s->ratelimited);
3098 }
3099 if (r < 0)
3100 return r;
3101
3102 event_source_pp_prioq_reshuffle(s);
3103 return 0;
3104 }
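
/* Illustrative usage sketch: thanks to the "quick mode" above, sources that may or may not have been
 * allocated can be turned off unconditionally without a NULL check:
 *
 *     (void) sd_event_source_set_enabled(maybe_source, SD_EVENT_OFF);
 *
 * where maybe_source is a possibly-NULL sd_event_source pointer owned by the caller. */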
3105
3106 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
3107 assert_return(s, -EINVAL);
3108 assert_return(usec, -EINVAL);
3109 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3110 assert_return(!event_origin_changed(s->event), -ECHILD);
3111
3112 *usec = s->time.next;
3113 return 0;
3114 }
3115
3116 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
3117 int r;
3118
3119 assert_return(s, -EINVAL);
3120 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3121 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3122 assert_return(!event_origin_changed(s->event), -ECHILD);
3123
3124 r = source_set_pending(s, false);
3125 if (r < 0)
3126 return r;
3127
3128 s->time.next = usec;
3129
3130 event_source_time_prioq_reshuffle(s);
3131 return 0;
3132 }
3133
3134 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3135 usec_t t;
3136 int r;
3137
3138 assert_return(s, -EINVAL);
3139 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3140 assert_return(!event_origin_changed(s->event), -ECHILD);
3141
3142 if (usec == USEC_INFINITY)
3143 return sd_event_source_set_time(s, USEC_INFINITY);
3144
3145 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3146 if (r < 0)
3147 return r;
3148
3149 usec = usec_add(t, usec);
3150 if (usec == USEC_INFINITY)
3151 return -EOVERFLOW;
3152
3153 return sd_event_source_set_time(s, usec);
3154 }
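
/* Illustrative usage sketch: a simple periodic timer built from a oneshot time source that re-arms itself
 * relative to the current time on each dispatch. on_tick(), do_periodic_work() and the 5s interval are
 * made up for the example:
 *
 *     static int on_tick(sd_event_source *s, uint64_t usec, void *userdata) {
 *             do_periodic_work();
 *             (void) sd_event_source_set_time_relative(s, 5 * USEC_PER_SEC);
 *             (void) sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
 *             return 0;
 *     }
 *
 *     r = sd_event_add_time_relative(e, &timer, CLOCK_MONOTONIC, 5 * USEC_PER_SEC,
 *                                    250 * USEC_PER_MSEC, on_tick, NULL);
 */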
3155
3156 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
3157 assert_return(s, -EINVAL);
3158 assert_return(usec, -EINVAL);
3159 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3160 assert_return(!event_origin_changed(s->event), -ECHILD);
3161
3162 *usec = s->time.accuracy;
3163 return 0;
3164 }
3165
3166 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
3167 int r;
3168
3169 assert_return(s, -EINVAL);
3170 assert_return(usec != UINT64_MAX, -EINVAL);
3171 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3172 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3173 assert_return(!event_origin_changed(s->event), -ECHILD);
3174
3175 r = source_set_pending(s, false);
3176 if (r < 0)
3177 return r;
3178
3179 if (usec == 0)
3180 usec = DEFAULT_ACCURACY_USEC;
3181
3182 s->time.accuracy = usec;
3183
3184 event_source_time_prioq_reshuffle(s);
3185 return 0;
3186 }
3187
3188 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3189 assert_return(s, -EINVAL);
3190 assert_return(clock, -EINVAL);
3191 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3192 assert_return(!event_origin_changed(s->event), -ECHILD);
3193
3194 *clock = event_source_type_to_clock(s->type);
3195 return 0;
3196 }
3197
3198 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
3199 assert_return(s, -EINVAL);
3200 assert_return(pid, -EINVAL);
3201 assert_return(s->type == SOURCE_CHILD, -EDOM);
3202 assert_return(!event_origin_changed(s->event), -ECHILD);
3203
3204 *pid = s->child.pid;
3205 return 0;
3206 }
3207
3208 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3209 assert_return(s, -EINVAL);
3210 assert_return(s->type == SOURCE_CHILD, -EDOM);
3211 assert_return(!event_origin_changed(s->event), -ECHILD);
3212
3213 if (s->child.pidfd < 0)
3214 return -EOPNOTSUPP;
3215
3216 return s->child.pidfd;
3217 }
3218
3219 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3220 assert_return(s, -EINVAL);
3221 assert_return(s->type == SOURCE_CHILD, -EDOM);
3222 assert_return(!event_origin_changed(s->event), -ECHILD);
3223 assert_return(SIGNAL_VALID(sig), -EINVAL);
3224
3225         /* If we have already seen an indication that the process exited, refuse sending a signal early. This way we
3226 * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3227 * available. */
3228 if (s->child.exited)
3229 return -ESRCH;
3230
3231 if (s->child.pidfd >= 0) {
3232 siginfo_t copy;
3233
3234                 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, hence let's copy the
3235 * structure here */
3236 if (si)
3237 copy = *si;
3238
3239 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3240 /* Let's propagate the error only if the system call is not implemented or prohibited */
3241 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3242 return -errno;
3243 } else
3244 return 0;
3245 }
3246
3247 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3248 * this here. */
3249 if (flags != 0)
3250 return -EOPNOTSUPP;
3251
3252 if (si) {
3253 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3254 siginfo_t copy = *si;
3255
3256 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3257 return -errno;
3258 } else if (kill(s->child.pid, sig) < 0)
3259 return -errno;
3260
3261 return 0;
3262 }
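
/* Illustrative usage sketch: asking a watched child to terminate. As implemented above, this prefers
 * pidfd_send_signal() when a pidfd is available and refuses with -ESRCH once the child is known to have
 * exited, so there is no risk of signalling a recycled PID:
 *
 *     r = sd_event_source_send_child_signal(child_src, SIGTERM, NULL, 0);
 *     if (r < 0 && r != -ESRCH)
 *             return r;
 */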
3263
3264 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3265 assert_return(s, -EINVAL);
3266 assert_return(s->type == SOURCE_CHILD, -EDOM);
3267 assert_return(!event_origin_changed(s->event), -ECHILD);
3268
3269 if (s->child.pidfd < 0)
3270 return -EOPNOTSUPP;
3271
3272 return s->child.pidfd_owned;
3273 }
3274
3275 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3276 assert_return(s, -EINVAL);
3277 assert_return(s->type == SOURCE_CHILD, -EDOM);
3278 assert_return(!event_origin_changed(s->event), -ECHILD);
3279
3280 if (s->child.pidfd < 0)
3281 return -EOPNOTSUPP;
3282
3283 s->child.pidfd_owned = own;
3284 return 0;
3285 }
3286
3287 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3288 assert_return(s, -EINVAL);
3289 assert_return(s->type == SOURCE_CHILD, -EDOM);
3290 assert_return(!event_origin_changed(s->event), -ECHILD);
3291
3292 return s->child.process_owned;
3293 }
3294
3295 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3296 assert_return(s, -EINVAL);
3297 assert_return(s->type == SOURCE_CHILD, -EDOM);
3298 assert_return(!event_origin_changed(s->event), -ECHILD);
3299
3300 s->child.process_owned = own;
3301 return 0;
3302 }
3303
3304 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret) {
3305 assert_return(s, -EINVAL);
3306 assert_return(ret, -EINVAL);
3307 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3308 assert_return(!event_origin_changed(s->event), -ECHILD);
3309
3310 *ret = s->inotify.mask;
3311 return 0;
3312 }
3313
3314 _public_ int sd_event_source_get_inotify_path(sd_event_source *s, const char **ret) {
3315 assert_return(s, -EINVAL);
3316 assert_return(ret, -EINVAL);
3317 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3318 assert_return(!event_origin_changed(s->event), -ECHILD);
3319
3320 if (!s->inotify.inode_data)
3321 return -ESTALE; /* already disconnected. */
3322
3323 if (!s->inotify.inode_data->path)
3324 return -ENOSYS; /* /proc was not mounted? */
3325
3326 *ret = s->inotify.inode_data->path;
3327 return 0;
3328 }
3329
3330 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
3331 int r;
3332
3333 assert_return(s, -EINVAL);
3334 assert_return(s->type != SOURCE_EXIT, -EDOM);
3335 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3336 assert_return(!event_origin_changed(s->event), -ECHILD);
3337
3338 if (s->prepare == callback)
3339 return 0;
3340
3341 if (callback && s->prepare) {
3342 s->prepare = callback;
3343 return 0;
3344 }
3345
3346 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3347 if (r < 0)
3348 return r;
3349
3350 s->prepare = callback;
3351
3352 if (callback) {
3353 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3354 if (r < 0)
3355 return r;
3356 } else
3357 prioq_remove(s->event->prepare, s, &s->prepare_index);
3358
3359 return 0;
3360 }
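
/* Illustrative usage sketch: prepare callbacks are typically used to bridge libraries that buffer events
 * internally. Right before polling, the callback checks the library's internal queue and enables a defer
 * source if there is already something to process. The names below are hypothetical:
 *
 *     static int my_prepare(sd_event_source *s, void *userdata) {
 *             MyLib *l = userdata;
 *
 *             return sd_event_source_set_enabled(s, my_lib_has_buffered_events(l) ? SD_EVENT_ONESHOT : SD_EVENT_OFF);
 *     }
 *
 *     r = sd_event_source_set_prepare(defer_src, my_prepare);
 */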
3361
3362 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
3363 assert_return(s, NULL);
3364 assert_return(!event_origin_changed(s->event), NULL);
3365
3366 return s->userdata;
3367 }
3368
3369 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3370 void *ret;
3371
3372 assert_return(s, NULL);
3373 assert_return(!event_origin_changed(s->event), NULL);
3374
3375 ret = s->userdata;
3376 s->userdata = userdata;
3377
3378 return ret;
3379 }
3380
3381 static int event_source_enter_ratelimited(sd_event_source *s) {
3382 int r;
3383
3384 assert(s);
3385
3386         /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, keyed
3387          * by the end of its rate limit time window, much as if it were a timer event source. */
3388
3389 if (s->ratelimited)
3390 return 0; /* Already ratelimited, this is a NOP hence */
3391
3392 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3393 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3394 if (r < 0)
3395 return r;
3396
3397         /* Timer event sources are already using the earliest/latest queues for their timer scheduling. Let's
3398          * first remove them from the prioq appropriate for their own clock, so that we can then reuse the
3399          * prioq fields of the event source for adding it to the CLOCK_MONOTONIC prioq instead. */
3400 if (EVENT_SOURCE_IS_TIME(s->type))
3401 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3402
3403 /* Now, let's add the event source to the monotonic clock instead */
3404 r = event_source_time_prioq_put(s, &s->event->monotonic);
3405 if (r < 0)
3406 goto fail;
3407
3408 /* And let's take the event source officially offline */
3409 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3410 if (r < 0) {
3411 event_source_time_prioq_remove(s, &s->event->monotonic);
3412 goto fail;
3413 }
3414
3415 event_source_pp_prioq_reshuffle(s);
3416
3417 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3418 return 0;
3419
3420 fail:
3421         /* Reinstall the time event source in its priority queue as before. This shouldn't fail, since the
3422          * queue space for it should already be allocated. */
3423 if (EVENT_SOURCE_IS_TIME(s->type))
3424 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3425
3426 return r;
3427 }
3428
3429 static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
3430 int r;
3431
3432 assert(s);
3433
3434 if (!s->ratelimited)
3435 return 0;
3436
3437 /* Let's take the event source out of the monotonic prioq first. */
3438 event_source_time_prioq_remove(s, &s->event->monotonic);
3439
3440 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3441 if (EVENT_SOURCE_IS_TIME(s->type)) {
3442 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3443 if (r < 0)
3444 goto fail;
3445 }
3446
3447 /* Let's try to take it online again. */
3448 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3449 if (r < 0) {
3450 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3451 if (EVENT_SOURCE_IS_TIME(s->type))
3452 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3453
3454 goto fail;
3455 }
3456
3457 event_source_pp_prioq_reshuffle(s);
3458 ratelimit_reset(&s->rate_limit);
3459
3460 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3461
3462 if (run_callback && s->ratelimit_expire_callback) {
3463 s->dispatching = true;
3464 r = s->ratelimit_expire_callback(s, s->userdata);
3465 s->dispatching = false;
3466
3467 if (r < 0) {
3468 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3469 strna(s->description),
3470 event_source_type_to_string(s->type),
3471 s->exit_on_failure ? "exiting" : "disabling");
3472
3473 if (s->exit_on_failure)
3474 (void) sd_event_exit(s->event, r);
3475 }
3476
3477 if (s->n_ref == 0)
3478 source_free(s);
3479 else if (r < 0)
3480 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3481
3482 return 1;
3483 }
3484
3485 return 0;
3486
3487 fail:
3488         /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
3489 * simply put it back in it, maybe we can then process it more successfully next iteration. */
3490 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3491
3492 return r;
3493 }
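
/* Illustrative usage sketch of the public side of the rate limiting implemented above: capping how often
 * a busy IO source may be dispatched, and getting notified when the limit expires. The interval and burst
 * values are arbitrary:
 *
 *     r = sd_event_source_set_ratelimit(io_src, 1 * USEC_PER_SEC, 10);
 *     if (r < 0)
 *             return r;
 *
 *     (void) sd_event_source_set_ratelimit_expire_callback(io_src, on_ratelimit_expired);
 *
 * where on_ratelimit_expired() is a caller-supplied sd_event_handler_t, invoked via
 * event_source_leave_ratelimit() above once the window ends. */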
3494
3495 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3496 usec_t c;
3497 assert(e);
3498 assert(a <= b);
3499
3500 if (a <= 0)
3501 return 0;
3502 if (a >= USEC_INFINITY)
3503 return USEC_INFINITY;
3504
3505 if (b <= a + 1)
3506 return a;
3507
3508 initialize_perturb(e);
3509
3510 /*
3511 Find a good time to wake up again between times a and b. We
3512 have two goals here:
3513
3514 a) We want to wake up as seldom as possible, hence prefer
3515 later times over earlier times.
3516
3517 b) But if we have to wake up, then let's make sure to
3518 dispatch as much as possible on the entire system.
3519
3520 We implement this by waking up everywhere at the same time
3521 within any given minute if we can, synchronised via the
3522 perturbation value determined from the boot ID. If we can't,
3523        then we try to find the same spot within every 10s, then 1s and
3524        then 250ms window. Otherwise, we pick the last possible time
3525 to wake up.
3526 */
3527
3528 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3529 if (c >= b) {
3530 if (_unlikely_(c < USEC_PER_MINUTE))
3531 return b;
3532
3533 c -= USEC_PER_MINUTE;
3534 }
3535
3536 if (c >= a)
3537 return c;
3538
3539 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3540 if (c >= b) {
3541 if (_unlikely_(c < USEC_PER_SEC*10))
3542 return b;
3543
3544 c -= USEC_PER_SEC*10;
3545 }
3546
3547 if (c >= a)
3548 return c;
3549
3550 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3551 if (c >= b) {
3552 if (_unlikely_(c < USEC_PER_SEC))
3553 return b;
3554
3555 c -= USEC_PER_SEC;
3556 }
3557
3558 if (c >= a)
3559 return c;
3560
3561 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3562 if (c >= b) {
3563 if (_unlikely_(c < USEC_PER_MSEC*250))
3564 return b;
3565
3566 c -= USEC_PER_MSEC*250;
3567 }
3568
3569 if (c >= a)
3570 return c;
3571
3572 return b;
3573 }
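
/* Worked example for sleep_between(), with assumed numbers: say a=100s, b=150s (in usec) and the
 * boot-ID-derived perturbation is 37.5s. The minute-granularity candidate is (150/60)*60 + 37.5 = 157.5s,
 * which is >= b, so one minute is subtracted, giving 97.5s, which is < a and therefore unusable. The
 * 10s-granularity candidate is (150/10)*10 + (37.5 mod 10) = 157.5s, again >= b, so 10s is subtracted,
 * giving 147.5s, which lies within [a, b] and is returned. All local event loops sharing this
 * perturbation thus wake up at the same :07.5 offset within each 10s window. */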
3574
3575 static int event_arm_timer(
3576 sd_event *e,
3577 struct clock_data *d) {
3578
3579 struct itimerspec its = {};
3580 sd_event_source *a, *b;
3581 usec_t t;
3582
3583 assert(e);
3584 assert(d);
3585
3586 if (!d->needs_rearm)
3587 return 0;
3588
3589 d->needs_rearm = false;
3590
3591 a = prioq_peek(d->earliest);
3592 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3593 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3594
3595 if (d->fd < 0)
3596 return 0;
3597
3598 if (d->next == USEC_INFINITY)
3599 return 0;
3600
3601 /* disarm */
3602 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3603 return -errno;
3604
3605 d->next = USEC_INFINITY;
3606 return 0;
3607 }
3608
3609 b = prioq_peek(d->latest);
3610 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3611 assert(b && b->enabled != SD_EVENT_OFF);
3612
3613 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3614 if (d->next == t)
3615 return 0;
3616
3617 assert_se(d->fd >= 0);
3618
3619 if (t == 0) {
3620                 /* We don't want to disarm here, just indicate some time looooong ago. */
3621 its.it_value.tv_sec = 0;
3622 its.it_value.tv_nsec = 1;
3623 } else
3624 timespec_store(&its.it_value, t);
3625
3626 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3627 return -errno;
3628
3629 d->next = t;
3630 return 0;
3631 }
3632
3633 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3634 assert(e);
3635 assert(s);
3636 assert(s->type == SOURCE_IO);
3637
3638 /* If the event source was already pending, we just OR in the
3639 * new revents, otherwise we reset the value. The ORing is
3640 * necessary to handle EPOLLONESHOT events properly where
3641 * readability might happen independently of writability, and
3642 * we need to keep track of both */
3643
3644 if (s->pending)
3645 s->io.revents |= revents;
3646 else
3647 s->io.revents = revents;
3648
3649 return source_set_pending(s, true);
3650 }
3651
3652 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3653 uint64_t x;
3654 ssize_t ss;
3655
3656 assert(e);
3657 assert(fd >= 0);
3658
3659 assert_return(events == EPOLLIN, -EIO);
3660
3661 ss = read(fd, &x, sizeof(x));
3662 if (ss < 0) {
3663 if (ERRNO_IS_TRANSIENT(errno))
3664 return 0;
3665
3666 return -errno;
3667 }
3668
3669 if (_unlikely_(ss != sizeof(x)))
3670 return -EIO;
3671
3672 if (next)
3673 *next = USEC_INFINITY;
3674
3675 return 0;
3676 }
3677
3678 static int process_timer(
3679 sd_event *e,
3680 usec_t n,
3681 struct clock_data *d) {
3682
3683 sd_event_source *s;
3684 bool callback_invoked = false;
3685 int r;
3686
3687 assert(e);
3688 assert(d);
3689
3690 for (;;) {
3691 s = prioq_peek(d->earliest);
3692 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3693
3694 if (!s || time_event_source_next(s) > n)
3695 break;
3696
3697 if (s->ratelimited) {
3698                         /* This is an event source whose ratelimit window has ended. Let's turn it on
3699 * again. */
3700 assert(s->ratelimited);
3701
3702 r = event_source_leave_ratelimit(s, /* run_callback */ true);
3703 if (r < 0)
3704 return r;
3705 else if (r == 1)
3706 callback_invoked = true;
3707
3708 continue;
3709 }
3710
3711 if (s->enabled == SD_EVENT_OFF || s->pending)
3712 break;
3713
3714 r = source_set_pending(s, true);
3715 if (r < 0)
3716 return r;
3717
3718 event_source_time_prioq_reshuffle(s);
3719 }
3720
3721 return callback_invoked;
3722 }
3723
3724 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3725 int64_t min_priority = threshold;
3726 bool something_new = false;
3727 sd_event_source *s;
3728 int r;
3729
3730 assert(e);
3731 assert(ret_min_priority);
3732
3733 if (!e->need_process_child) {
3734 *ret_min_priority = min_priority;
3735 return 0;
3736 }
3737
3738 e->need_process_child = false;
3739
3740 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3741 * for, instead of using P_ALL. This is because we only want to get child information of very
3742 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3743          * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3744 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3745 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3746 * to handle SIGCHLD yourself.
3747 *
3748 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3749 * source is dispatched so that the callback still sees the process as a zombie. */
3750
3751 HASHMAP_FOREACH(s, e->child_sources) {
3752 assert(s->type == SOURCE_CHILD);
3753
3754 if (s->priority > threshold)
3755 continue;
3756
3757 if (s->pending)
3758 continue;
3759
3760 if (event_source_is_offline(s))
3761 continue;
3762
3763 if (s->child.exited)
3764 continue;
3765
3766 if (EVENT_SOURCE_WATCH_PIDFD(s))
3767 /* There's a usable pidfd known for this event source? Then don't waitid() for
3768 * it here */
3769 continue;
3770
3771 zero(s->child.siginfo);
3772 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3773 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3774 return negative_errno();
3775
3776 if (s->child.siginfo.si_pid != 0) {
3777 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3778
3779 if (zombie)
3780 s->child.exited = true;
3781
3782 if (!zombie && (s->child.options & WEXITED)) {
3783 /* If the child isn't dead then let's immediately remove the state
3784 * change from the queue, since there's no benefit in leaving it
3785 * queued. */
3786
3787 assert(s->child.options & (WSTOPPED|WCONTINUED));
3788 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3789 }
3790
3791 r = source_set_pending(s, true);
3792 if (r < 0)
3793 return r;
3794 if (r > 0) {
3795 something_new = true;
3796 min_priority = MIN(min_priority, s->priority);
3797 }
3798 }
3799 }
3800
3801 *ret_min_priority = min_priority;
3802 return something_new;
3803 }
3804
3805 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3806 assert(e);
3807 assert(s);
3808 assert(s->type == SOURCE_CHILD);
3809
3810 if (s->pending)
3811 return 0;
3812
3813 if (event_source_is_offline(s))
3814 return 0;
3815
3816 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3817 return 0;
3818
3819 zero(s->child.siginfo);
3820 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3821 return -errno;
3822
3823 if (s->child.siginfo.si_pid == 0)
3824 return 0;
3825
3826 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3827 s->child.exited = true;
3828
3829 return source_set_pending(s, true);
3830 }
3831
3832 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3833 int r;
3834
3835 assert(e);
3836 assert(d);
3837 assert_return(events == EPOLLIN, -EIO);
3838 assert(min_priority);
3839
3840 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3841 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3842          * per priority, and if we dequeue one, a SIGCHLD enqueued later might go unnoticed, even though
3843          * we might have higher priority children we care about, hence we need to check that
3844          * explicitly. */
3845
3846 if (sigismember(&d->sigset, SIGCHLD))
3847 e->need_process_child = true;
3848
3849 /* If there's already an event source pending for this priority we don't read another */
3850 if (d->current)
3851 return 0;
3852
3853 for (;;) {
3854 struct signalfd_siginfo si;
3855 ssize_t n;
3856 sd_event_source *s = NULL;
3857
3858 n = read(d->fd, &si, sizeof(si));
3859 if (n < 0) {
3860 if (ERRNO_IS_TRANSIENT(errno))
3861 return 0;
3862
3863 return -errno;
3864 }
3865
3866 if (_unlikely_(n != sizeof(si)))
3867 return -EIO;
3868
3869 assert(SIGNAL_VALID(si.ssi_signo));
3870
3871 if (e->signal_sources)
3872 s = e->signal_sources[si.ssi_signo];
3873 if (!s)
3874 continue;
3875 if (s->pending)
3876 continue;
3877
3878 s->signal.siginfo = si;
3879 d->current = s;
3880
3881 r = source_set_pending(s, true);
3882 if (r < 0)
3883 return r;
3884 if (r > 0 && *min_priority >= s->priority) {
3885 *min_priority = s->priority;
3886 return 1; /* an event source with smaller priority is queued. */
3887 }
3888
3889 return 0;
3890 }
3891 }
3892
3893 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3894 ssize_t n;
3895
3896 assert(e);
3897 assert(d);
3898
3899 assert_return(revents == EPOLLIN, -EIO);
3900
3901 /* If there's already an event source pending for this priority, don't read another */
3902 if (d->n_pending > 0)
3903 return 0;
3904
3905 /* Is the read buffer non-empty? If so, let's not read more */
3906 if (d->buffer_filled > 0)
3907 return 0;
3908
3909 if (d->priority > threshold)
3910 return 0;
3911
3912 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3913 if (n < 0) {
3914 if (ERRNO_IS_TRANSIENT(errno))
3915 return 0;
3916
3917 return -errno;
3918 }
3919
3920 assert(n > 0);
3921 d->buffer_filled = (size_t) n;
3922 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
3923
3924 return 1;
3925 }
3926
3927 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3928 assert(e);
3929 assert(d);
3930 assert(sz <= d->buffer_filled);
3931
3932 if (sz == 0)
3933 return;
3934
3935         /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3936 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3937 d->buffer_filled -= sz;
3938
3939 if (d->buffer_filled == 0)
3940 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
3941 }
3942
3943 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3944 int r;
3945
3946 assert(e);
3947 assert(d);
3948
3949 /* If there's already an event source pending for this priority, don't read another */
3950 if (d->n_pending > 0)
3951 return 0;
3952
3953 while (d->buffer_filled > 0) {
3954 size_t sz;
3955
3956 /* Let's validate that the event structures are complete */
3957 if (d->buffer_filled < offsetof(struct inotify_event, name))
3958 return -EIO;
3959
3960 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3961 if (d->buffer_filled < sz)
3962 return -EIO;
3963
3964 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3965 struct inode_data *inode_data;
3966
3967 /* The queue overran, let's pass this event to all event sources connected to this inotify
3968 * object */
3969
3970 HASHMAP_FOREACH(inode_data, d->inodes)
3971 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3972
3973 if (event_source_is_offline(s))
3974 continue;
3975
3976 r = source_set_pending(s, true);
3977 if (r < 0)
3978 return r;
3979 }
3980 } else {
3981 struct inode_data *inode_data;
3982
3983 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3984 * our watch descriptor table. */
3985 if (d->buffer.ev.mask & IN_IGNORED) {
3986
3987 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3988 if (!inode_data) {
3989 event_inotify_data_drop(e, d, sz);
3990 continue;
3991 }
3992
3993 /* The watch descriptor was removed by the kernel, let's drop it here too */
3994 inode_data->wd = -1;
3995 } else {
3996 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3997 if (!inode_data) {
3998 event_inotify_data_drop(e, d, sz);
3999 continue;
4000 }
4001 }
4002
4003 /* Trigger all event sources that are interested in these events. Also trigger all event
4004 * sources if IN_IGNORED or IN_UNMOUNT is set. */
4005 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
4006
4007 if (event_source_is_offline(s))
4008 continue;
4009
4010 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
4011 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
4012 continue;
4013
4014 r = source_set_pending(s, true);
4015 if (r < 0)
4016 return r;
4017 }
4018 }
4019
4020 /* Something pending now? If so, let's finish, otherwise let's read more. */
4021 if (d->n_pending > 0)
4022 return 1;
4023 }
4024
4025 return 0;
4026 }
4027
4028 static int process_inotify(sd_event *e) {
4029 int r, done = 0;
4030
4031 assert(e);
4032
4033 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
4034 r = event_inotify_data_process(e, d);
4035 if (r < 0)
4036 return r;
4037 if (r > 0)
4038 done++;
4039 }
4040
4041 return done;
4042 }
4043
4044 static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
4045 assert(s);
4046 assert(s->type == SOURCE_MEMORY_PRESSURE);
4047
4048 if (s->pending)
4049 s->memory_pressure.revents |= revents;
4050 else
4051 s->memory_pressure.revents = revents;
4052
4053 return source_set_pending(s, true);
4054 }
4055
4056 static int source_memory_pressure_write(sd_event_source *s) {
4057 ssize_t n;
4058 int r;
4059
4060 assert(s);
4061 assert(s->type == SOURCE_MEMORY_PRESSURE);
4062
4063         /* Once we start writing, the buffer is locked; we allow no further changes. */
4064 s->memory_pressure.locked = true;
4065
4066 if (s->memory_pressure.write_buffer_size > 0) {
4067 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4068 if (n < 0) {
4069 if (!ERRNO_IS_TRANSIENT(errno)) {
4070                                 /* If the kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
4071                                  * files, but then generate EOPNOTSUPP on read() and write() (instead of on
4072 * open()!). This sucks hard, since we can only detect this kind of failure
4073 * so late. Let's make the best of it, and turn off the event source like we
4074 * do for failed event source handlers. */
4075
4076 log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
4077 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4078 return 0;
4079 }
4080
4081 n = 0;
4082 }
4083 } else
4084 n = 0;
4085
4086 assert(n >= 0);
4087
4088 if ((size_t) n == s->memory_pressure.write_buffer_size) {
4089 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4090
4091 if (n > 0) {
4092 s->memory_pressure.write_buffer_size = 0;
4093
4094 /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4095 r = source_memory_pressure_register(s, s->enabled);
4096 if (r < 0)
4097 return r;
4098 }
4099 } else if (n > 0) {
4100 _cleanup_free_ void *c = NULL;
4101
4102 assert((size_t) n < s->memory_pressure.write_buffer_size);
4103
4104 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4105 if (!c)
4106 return -ENOMEM;
4107
4108 free_and_replace(s->memory_pressure.write_buffer, c);
4109 s->memory_pressure.write_buffer_size -= n;
4110 return 1;
4111 }
4112
4113 return 0;
4114 }
4115
4116 static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4117 int r;
4118
4119 assert(s);
4120 assert(s->type == SOURCE_MEMORY_PRESSURE);
4121
4122 r = source_memory_pressure_write(s);
4123 if (r < 0)
4124 return r;
4125 if (r > 0)
4126                 return 1; /* If we wrote something, then don't continue with dispatching the user callback.
4127                            * Instead, short-circuit here so that we wait for the next EPOLLOUT immediately. */
4128
4129 /* No pending incoming IO? Then let's not continue further */
4130 if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4131
4132                 /* Treat IO errors on the notifier the same way as errors returned from a callback */
4133 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4134 return -EIO;
4135
4136 return 1; /* leave dispatch, we already processed everything */
4137 }
4138
4139 if (s->memory_pressure.revents & EPOLLIN) {
4140 uint8_t pipe_buf[PIPE_BUF];
4141 ssize_t n;
4142
4143 /* If the fd is readable, then flush out anything that might be queued */
4144
4145 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4146 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4147 return -errno;
4148 }
4149
4150 return 0; /* go on, dispatch to user callback */
4151 }
4152
4153 static int source_dispatch(sd_event_source *s) {
4154 EventSourceType saved_type;
4155 sd_event *saved_event;
4156 int r = 0;
4157
4158 assert(s);
4159 assert(s->pending || s->type == SOURCE_EXIT);
4160
4161 /* Save the event source type, here, so that we still know it after the event callback which might
4162 * invalidate the event. */
4163 saved_type = s->type;
4164
4165 /* Similarly, store a reference to the event loop object, so that we can still access it after the
4166 * callback might have invalidated/disconnected the event source. */
4167 saved_event = s->event;
4168 PROTECT_EVENT(saved_event);
4169
4170 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
4171 assert(!s->ratelimited);
4172 if (!ratelimit_below(&s->rate_limit)) {
4173 r = event_source_enter_ratelimited(s);
4174 if (r < 0)
4175 return r;
4176
4177 return 1;
4178 }
4179
4180 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
4181 r = source_set_pending(s, false);
4182 if (r < 0)
4183 return r;
4184 }
4185
4186 if (s->type != SOURCE_POST) {
4187 sd_event_source *z;
4188
4189 /* If we execute a non-post source, let's mark all post sources as pending. */
4190
4191 SET_FOREACH(z, s->event->post_sources) {
4192 if (event_source_is_offline(z))
4193 continue;
4194
4195 r = source_set_pending(z, true);
4196 if (r < 0)
4197 return r;
4198 }
4199 }
4200
4201 if (s->type == SOURCE_MEMORY_PRESSURE) {
4202 r = source_memory_pressure_initiate_dispatch(s);
4203 if (r == -EIO) /* handle EIO errors similar to callback errors */
4204 goto finish;
4205 if (r < 0)
4206 return r;
4207 if (r > 0) /* already handled */
4208 return 1;
4209 }
4210
4211 if (s->enabled == SD_EVENT_ONESHOT) {
4212 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
4213 if (r < 0)
4214 return r;
4215 }
4216
4217 s->dispatching = true;
4218
4219 switch (s->type) {
4220
4221 case SOURCE_IO:
4222 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4223 break;
4224
4225 case SOURCE_TIME_REALTIME:
4226 case SOURCE_TIME_BOOTTIME:
4227 case SOURCE_TIME_MONOTONIC:
4228 case SOURCE_TIME_REALTIME_ALARM:
4229 case SOURCE_TIME_BOOTTIME_ALARM:
4230 r = s->time.callback(s, s->time.next, s->userdata);
4231 break;
4232
4233 case SOURCE_SIGNAL:
4234 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4235 break;
4236
4237 case SOURCE_CHILD: {
4238 bool zombie;
4239
4240 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
4241
4242 r = s->child.callback(s, &s->child.siginfo, s->userdata);
4243
4244 /* Now, reap the PID for good. */
4245 if (zombie) {
4246 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
4247 s->child.waited = true;
4248 }
4249
4250 break;
4251 }
4252
4253 case SOURCE_DEFER:
4254 r = s->defer.callback(s, s->userdata);
4255 break;
4256
4257 case SOURCE_POST:
4258 r = s->post.callback(s, s->userdata);
4259 break;
4260
4261 case SOURCE_EXIT:
4262 r = s->exit.callback(s, s->userdata);
4263 break;
4264
4265 case SOURCE_INOTIFY: {
4266 struct sd_event *e = s->event;
4267 struct inotify_data *d;
4268 size_t sz;
4269
4270 assert(s->inotify.inode_data);
4271 assert_se(d = s->inotify.inode_data->inotify_data);
4272
4273 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4274 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4275 assert(d->buffer_filled >= sz);
4276
4277 /* If the inotify callback destroys the event source then this likely means we don't need to
4278 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4279                  * free it immediately, then we couldn't drop the event from the inotify event queue without
4280                  * memory corruption anymore, as is done below. Hence, let's not free it immediately, but mark it
4281 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4282 * explicitly GC it after we are done dropping the inotify event from the buffer. */
4283 d->n_busy++;
4284 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
4285 d->n_busy--;
4286
4287 /* When no event is pending anymore on this inotify object, then let's drop the event from
4288 * the inotify event queue buffer. */
4289 if (d->n_pending == 0)
4290 event_inotify_data_drop(e, d, sz);
4291
4292 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4293 event_gc_inotify_data(e, d);
4294 break;
4295 }
4296
4297 case SOURCE_MEMORY_PRESSURE:
4298 r = s->memory_pressure.callback(s, s->userdata);
4299 break;
4300
4301 case SOURCE_WATCHDOG:
4302 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
4303 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
4304 assert_not_reached();
4305 }
4306
4307 s->dispatching = false;
4308
4309 finish:
4310 if (r < 0) {
4311 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4312 strna(s->description),
4313 event_source_type_to_string(saved_type),
4314 s->exit_on_failure ? "exiting" : "disabling");
4315
4316 if (s->exit_on_failure)
4317 (void) sd_event_exit(saved_event, r);
4318 }
4319
4320 if (s->n_ref == 0)
4321 source_free(s);
4322 else if (r < 0)
4323 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4324
4325 return 1;
4326 }
4327
4328 static int event_prepare(sd_event *e) {
4329 int r;
4330
4331 assert(e);
4332
4333 for (;;) {
4334 sd_event_source *s;
4335
4336 s = prioq_peek(e->prepare);
4337 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
4338 break;
4339
4340 s->prepare_iteration = e->iteration;
4341 prioq_reshuffle(e->prepare, s, &s->prepare_index);
4342
4343 assert(s->prepare);
4344 s->dispatching = true;
4345 r = s->prepare(s, s->userdata);
4346 s->dispatching = false;
4347
4348 if (r < 0) {
4349 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4350 strna(s->description),
4351 event_source_type_to_string(s->type),
4352 s->exit_on_failure ? "exiting" : "disabling");
4353
4354 if (s->exit_on_failure)
4355 (void) sd_event_exit(e, r);
4356 }
4357
4358 if (s->n_ref == 0)
4359 source_free(s);
4360 else if (r < 0)
4361 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4362 }
4363
4364 return 0;
4365 }
4366
4367 static int dispatch_exit(sd_event *e) {
4368 sd_event_source *p;
4369 int r;
4370
4371 assert(e);
4372
4373 p = prioq_peek(e->exit);
4374 assert(!p || p->type == SOURCE_EXIT);
4375
4376 if (!p || event_source_is_offline(p)) {
4377 e->state = SD_EVENT_FINISHED;
4378 return 0;
4379 }
4380
4381 PROTECT_EVENT(e);
4382 e->iteration++;
4383 e->state = SD_EVENT_EXITING;
4384 r = source_dispatch(p);
4385 e->state = SD_EVENT_INITIAL;
4386 return r;
4387 }
4388
4389 static sd_event_source* event_next_pending(sd_event *e) {
4390 sd_event_source *p;
4391
4392 assert(e);
4393
4394 p = prioq_peek(e->pending);
4395 if (!p)
4396 return NULL;
4397
4398 if (event_source_is_offline(p))
4399 return NULL;
4400
4401 return p;
4402 }
4403
4404 static int arm_watchdog(sd_event *e) {
4405 struct itimerspec its = {};
4406 usec_t t;
4407
4408 assert(e);
4409 assert(e->watchdog_fd >= 0);
4410
4411 t = sleep_between(e,
4412 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4413 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
4414
4415 timespec_store(&its.it_value, t);
4416
4417 /* Make sure we never set the watchdog to 0, which tells the
4418 * kernel to disable it. */
4419 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4420 its.it_value.tv_nsec = 1;
4421
4422 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
4423 }
4424
4425 static int process_watchdog(sd_event *e) {
4426 assert(e);
4427
4428 if (!e->watchdog)
4429 return 0;
4430
4431         /* Don't notify the watchdog too often */
4432 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4433 return 0;
4434
4435 sd_notify(false, "WATCHDOG=1");
4436 e->watchdog_last = e->timestamp.monotonic;
4437
4438 return arm_watchdog(e);
4439 }
4440
4441 static void event_close_inode_data_fds(sd_event *e) {
4442 struct inode_data *d;
4443
4444 assert(e);
4445
4446 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4447          * filesystems. But we can't close them right away as we need them as long as the user still wants to make
4448 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
4449 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4450 * compromise. */
4451
4452 while ((d = e->inode_data_to_close_list)) {
4453 assert(d->fd >= 0);
4454 d->fd = safe_close(d->fd);
4455
4456 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
4457 }
4458 }
4459
4460 static int event_memory_pressure_write_list(sd_event *e) {
4461 int r;
4462
4463 assert(e);
4464
4465 for (;;) {
4466 sd_event_source *s;
4467
4468 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4469 if (!s)
4470 break;
4471
4472 assert(s->type == SOURCE_MEMORY_PRESSURE);
4473 assert(s->memory_pressure.write_buffer_size > 0);
4474 s->memory_pressure.in_write_list = false;
4475
4476 r = source_memory_pressure_write(s);
4477 if (r < 0)
4478 return r;
4479 }
4480
4481 return 0;
4482 }
4483
4484 _public_ int sd_event_prepare(sd_event *e) {
4485 int r;
4486
4487 assert_return(e, -EINVAL);
4488 assert_return(e = event_resolve(e), -ENOPKG);
4489 assert_return(!event_origin_changed(e), -ECHILD);
4490 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4491 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4492
4493 /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
4494          * this check here once, since gettid() is typically not cached, and we thus want to minimize
4495          * syscalls. */
4496 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4497
4498 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4499 PROTECT_EVENT(e);
4500
4501 if (e->exit_requested)
4502 goto pending;
4503
4504 e->iteration++;
4505
4506 e->state = SD_EVENT_PREPARING;
4507 r = event_prepare(e);
4508 e->state = SD_EVENT_INITIAL;
4509 if (r < 0)
4510 return r;
4511
4512 r = event_memory_pressure_write_list(e);
4513 if (r < 0)
4514 return r;
4515
4516 r = event_arm_timer(e, &e->realtime);
4517 if (r < 0)
4518 return r;
4519
4520 r = event_arm_timer(e, &e->boottime);
4521 if (r < 0)
4522 return r;
4523
4524 r = event_arm_timer(e, &e->monotonic);
4525 if (r < 0)
4526 return r;
4527
4528 r = event_arm_timer(e, &e->realtime_alarm);
4529 if (r < 0)
4530 return r;
4531
4532 r = event_arm_timer(e, &e->boottime_alarm);
4533 if (r < 0)
4534 return r;
4535
4536 event_close_inode_data_fds(e);
4537
4538 if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
4539 goto pending;
4540
4541 e->state = SD_EVENT_ARMED;
4542
4543 return 0;
4544
4545 pending:
4546 e->state = SD_EVENT_ARMED;
4547 r = sd_event_wait(e, 0);
4548 if (r == 0)
4549 e->state = SD_EVENT_ARMED;
4550
4551 return r;
4552 }
4553
4554 static int epoll_wait_usec(
4555 int fd,
4556 struct epoll_event *events,
4557 int maxevents,
4558 usec_t timeout) {
4559
4560 int msec;
4561 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4562
4563 #if HAVE_EPOLL_PWAIT2
4564 static bool epoll_pwait2_absent = false;
4565 int r;
4566
4567 /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4568 * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4569 * is not that obvious to implement given the libc and kernel definitions differ in the last
4570 * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
4571          * biggie); let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
4572 * missing. */
4573
4574 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
4575 r = epoll_pwait2(fd,
4576 events,
4577 maxevents,
4578 TIMESPEC_STORE(timeout),
4579 NULL);
4580 if (r >= 0)
4581 return r;
4582 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
4583                         return -errno; /* Only fall back to the old epoll_wait() if the syscall is masked or not
4584 * supported. */
4585
4586 epoll_pwait2_absent = true;
4587 }
4588 #endif
4589
4590 if (timeout == USEC_INFINITY)
4591 msec = -1;
4592 else {
4593 usec_t k;
4594
4595 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4596 if (k >= INT_MAX)
4597 msec = INT_MAX; /* Saturate */
4598 else
4599 msec = (int) k;
4600 }
4601
4602 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
4603 }
4604
4605 static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
4606 size_t n_event_queue, m, n_event_max;
4607 int64_t min_priority = threshold;
4608 bool something_new = false;
4609 int r;
4610
4611 assert(e);
4612 assert(ret_min_priority);
4613
4614 n_event_queue = MAX(e->n_sources, 1u);
4615 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
4616 return -ENOMEM;
4617
4618 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4619
4620         /* If we still have inotify data buffered, then query the other fds, but don't wait on them */
4621 if (e->buffered_inotify_data_list)
4622 timeout = 0;
4623
4624 for (;;) {
4625 r = epoll_wait_usec(
4626 e->epoll_fd,
4627 e->event_queue,
4628 n_event_max,
4629 timeout);
4630 if (r < 0)
4631 return r;
4632
4633 m = (size_t) r;
4634
4635 if (m < n_event_max)
4636 break;
4637
4638 if (n_event_max >= n_event_queue * 10)
4639 break;
4640
4641 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
4642 return -ENOMEM;
4643
4644 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4645 timeout = 0;
4646 }
4647
4648         /* Set the timestamp only when this is called for the first time. */
4649 if (threshold == INT64_MAX)
4650 triple_timestamp_now(&e->timestamp);
4651
4652 for (size_t i = 0; i < m; i++) {
4653
4654 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4655 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
4656 else {
4657 WakeupType *t = e->event_queue[i].data.ptr;
4658
4659 switch (*t) {
4660
4661 case WAKEUP_EVENT_SOURCE: {
4662 sd_event_source *s = e->event_queue[i].data.ptr;
4663
4664 assert(s);
4665
4666 if (s->priority > threshold)
4667 continue;
4668
4669 min_priority = MIN(min_priority, s->priority);
4670
4671 switch (s->type) {
4672
4673 case SOURCE_IO:
4674 r = process_io(e, s, e->event_queue[i].events);
4675 break;
4676
4677 case SOURCE_CHILD:
4678 r = process_pidfd(e, s, e->event_queue[i].events);
4679 break;
4680
4681 case SOURCE_MEMORY_PRESSURE:
4682 r = process_memory_pressure(s, e->event_queue[i].events);
4683 break;
4684
4685 default:
4686 assert_not_reached();
4687 }
4688
4689 break;
4690 }
4691
4692 case WAKEUP_CLOCK_DATA: {
4693 struct clock_data *d = e->event_queue[i].data.ptr;
4694
4695 assert(d);
4696
4697 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4698 break;
4699 }
4700
4701 case WAKEUP_SIGNAL_DATA:
4702 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4703 break;
4704
4705 case WAKEUP_INOTIFY_DATA:
4706 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4707 break;
4708
4709 default:
4710 assert_not_reached();
4711 }
4712 }
4713 if (r < 0)
4714 return r;
4715 if (r > 0)
4716 something_new = true;
4717 }
4718
4719 *ret_min_priority = min_priority;
4720 return something_new;
4721 }
4722
4723 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4724 int r;
4725
4726 assert_return(e, -EINVAL);
4727 assert_return(e = event_resolve(e), -ENOPKG);
4728 assert_return(!event_origin_changed(e), -ECHILD);
4729 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4730 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4731
4732 if (e->exit_requested) {
4733 e->state = SD_EVENT_PENDING;
4734 return 1;
4735 }
4736
4737 for (int64_t threshold = INT64_MAX; ; threshold--) {
4738 int64_t epoll_min_priority, child_min_priority;
4739
4740                 /* It is possible that new epoll (especially IO) and child events are triggered just
4741                  * after the process_epoll() call but before process_child(), and the new IO events may
4742                  * have a higher priority than the child events. To salvage these events, let's call
4743                  * epoll_wait() again, but accept only events with a higher priority than the previous
4744                  * ones. See issue https://github.com/systemd/systemd/issues/18190 and comments
4745 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4746 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4747
4748 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4749 if (r == -EINTR) {
4750 e->state = SD_EVENT_PENDING;
4751 return 1;
4752 }
4753 if (r < 0)
4754 goto finish;
4755 if (r == 0 && threshold < INT64_MAX)
4756 /* No new epoll event. */
4757 break;
4758
4759 r = process_child(e, threshold, &child_min_priority);
4760 if (r < 0)
4761 goto finish;
4762 if (r == 0)
4763 /* No new child event. */
4764 break;
4765
4766 threshold = MIN(epoll_min_priority, child_min_priority);
4767 if (threshold == INT64_MIN)
4768 break;
4769
4770 timeout = 0;
4771 }
4772
4773 r = process_watchdog(e);
4774 if (r < 0)
4775 goto finish;
4776
4777 r = process_inotify(e);
4778 if (r < 0)
4779 goto finish;
4780
4781 r = process_timer(e, e->timestamp.realtime, &e->realtime);
4782 if (r < 0)
4783 goto finish;
4784
4785 r = process_timer(e, e->timestamp.boottime, &e->boottime);
4786 if (r < 0)
4787 goto finish;
4788
4789 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4790 if (r < 0)
4791 goto finish;
4792
4793 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4794 if (r < 0)
4795 goto finish;
4796
4797 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4798 if (r < 0)
4799 goto finish;
4800 else if (r == 1) {
4801                 /* A ratelimit expiry callback was called. Let's postpone processing pending sources and
4802                  * put the loop in the initial state, in order to also evaluate (in the next iteration)
4803                  * sources that were potentially re-enabled by the callback.
4804                  *
4805                  * Wondering why we treat only this invocation of process_timer() differently? Once an event
4806                  * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4807                  * ratelimit expiry callback is never called for any other timer type. */
4808 r = 0;
4809 goto finish;
4810 }
4811
4812 if (event_next_pending(e)) {
4813 e->state = SD_EVENT_PENDING;
4814 return 1;
4815 }
4816
4817 r = 0;
4818
4819 finish:
4820 e->state = SD_EVENT_INITIAL;
4821
4822 return r;
4823 }
4824
4825 _public_ int sd_event_dispatch(sd_event *e) {
4826 sd_event_source *p;
4827 int r;
4828
4829 assert_return(e, -EINVAL);
4830 assert_return(e = event_resolve(e), -ENOPKG);
4831 assert_return(!event_origin_changed(e), -ECHILD);
4832 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4833 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4834
4835 if (e->exit_requested)
4836 return dispatch_exit(e);
4837
4838 p = event_next_pending(e);
4839 if (p) {
4840 PROTECT_EVENT(e);
4841
4842 e->state = SD_EVENT_RUNNING;
4843 r = source_dispatch(p);
4844 e->state = SD_EVENT_INITIAL;
4845 return r;
4846 }
4847
4848 e->state = SD_EVENT_INITIAL;
4849
4850 return 1;
4851 }
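
/*
 * Editorial usage sketch, not part of this file: how sd_event_prepare(), sd_event_wait() and
 * sd_event_dispatch() above fit together when driving the loop by hand, roughly what
 * sd_event_run() does internally. Assumes an already set-up loop `e`; error handling is
 * reduced to returning the failure.
 */
#include <systemd/sd-event.h>

static int manual_loop(sd_event *e) {
        int r;

        while (sd_event_get_state(e) != SD_EVENT_FINISHED) {
                r = sd_event_prepare(e);                   /* > 0: sources are already pending */
                if (r < 0)
                        return r;

                if (r == 0) {
                        r = sd_event_wait(e, UINT64_MAX);  /* block until something is pending */
                        if (r < 0)
                                return r;
                }

                if (r > 0) {
                        r = sd_event_dispatch(e);          /* run one pending source's callback */
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}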
4852
4853 static void event_log_delays(sd_event *e) {
4854 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4855 size_t l, i;
4856
4857 p = b;
4858 l = sizeof(b);
4859 for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4860 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4861 e->delays[i] = 0;
4862 }
4863 log_debug("Event loop iterations: %s", b);
4864 }
4865
4866 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4867 int r;
4868
4869 assert_return(e, -EINVAL);
4870 assert_return(e = event_resolve(e), -ENOPKG);
4871 assert_return(!event_origin_changed(e), -ECHILD);
4872 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4873 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4874
4875 if (e->profile_delays && e->last_run_usec != 0) {
4876 usec_t this_run;
4877 unsigned l;
4878
4879 this_run = now(CLOCK_MONOTONIC);
4880
4881 l = log2u64(this_run - e->last_run_usec);
4882 assert(l < ELEMENTSOF(e->delays));
4883 e->delays[l]++;
4884
4885 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4886 event_log_delays(e);
4887 e->last_log_usec = this_run;
4888 }
4889 }
4890
4891 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4892 PROTECT_EVENT(e);
4893
4894 r = sd_event_prepare(e);
4895 if (r == 0)
4896 /* There was nothing? Then wait... */
4897 r = sd_event_wait(e, timeout);
4898
4899 if (e->profile_delays)
4900 e->last_run_usec = now(CLOCK_MONOTONIC);
4901
4902 if (r > 0) {
4903                 /* There's something now, so let's dispatch it */
4904 r = sd_event_dispatch(e);
4905 if (r < 0)
4906 return r;
4907
4908 return 1;
4909 }
4910
4911 return r;
4912 }
4913
4914 _public_ int sd_event_loop(sd_event *e) {
4915 int r;
4916
4917 assert_return(e, -EINVAL);
4918 assert_return(e = event_resolve(e), -ENOPKG);
4919 assert_return(!event_origin_changed(e), -ECHILD);
4920 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4921
4922
4923 PROTECT_EVENT(e);
4924
4925 while (e->state != SD_EVENT_FINISHED) {
4926 r = sd_event_run(e, UINT64_MAX);
4927 if (r < 0)
4928 return r;
4929 }
4930
4931 return e->exit_code;
4932 }
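
/*
 * Editorial usage sketch, not part of this file: a minimal standalone program driving the loop
 * with sd_event_loop() above. It watches stdin and asks the loop to exit once input becomes
 * readable. Assumes linking against libsystemd; the handler name is illustrative only.
 */
#include <sys/epoll.h>
#include <systemd/sd-event.h>
#include <unistd.h>

static int on_stdin(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        /* Ask the loop to terminate; sd_event_loop() then returns this exit code. */
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

int main(void) {
        sd_event *e = NULL;
        int r;

        r = sd_event_default(&e);                 /* the calling thread's default event loop */
        if (r < 0)
                return 1;

        r = sd_event_add_io(e, NULL, STDIN_FILENO, EPOLLIN, on_stdin, NULL);
        if (r < 0) {
                sd_event_unref(e);
                return 1;
        }

        r = sd_event_loop(e);                     /* runs until sd_event_exit() is called */
        sd_event_unref(e);
        return r < 0 ? 1 : 0;
}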
4933
4934 _public_ int sd_event_get_fd(sd_event *e) {
4935 assert_return(e, -EINVAL);
4936 assert_return(e = event_resolve(e), -ENOPKG);
4937 assert_return(!event_origin_changed(e), -ECHILD);
4938
4939 return e->epoll_fd;
4940 }
4941
4942 _public_ int sd_event_get_state(sd_event *e) {
4943 assert_return(e, -EINVAL);
4944 assert_return(e = event_resolve(e), -ENOPKG);
4945 assert_return(!event_origin_changed(e), -ECHILD);
4946
4947 return e->state;
4948 }
4949
4950 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4951 assert_return(e, -EINVAL);
4952 assert_return(e = event_resolve(e), -ENOPKG);
4953 assert_return(!event_origin_changed(e), -ECHILD);
4954
4955 if (!e->exit_requested)
4956 return -ENODATA;
4957
4958 if (code)
4959 *code = e->exit_code;
4960 return 0;
4961 }
4962
4963 _public_ int sd_event_exit(sd_event *e, int code) {
4964 assert_return(e, -EINVAL);
4965 assert_return(e = event_resolve(e), -ENOPKG);
4966 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4967 assert_return(!event_origin_changed(e), -ECHILD);
4968
4969 e->exit_requested = true;
4970 e->exit_code = code;
4971
4972 return 0;
4973 }
4974
4975 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4976 assert_return(e, -EINVAL);
4977 assert_return(e = event_resolve(e), -ENOPKG);
4978 assert_return(usec, -EINVAL);
4979 assert_return(!event_origin_changed(e), -ECHILD);
4980
4981 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4982 return -EOPNOTSUPP;
4983
4984 if (!triple_timestamp_is_set(&e->timestamp)) {
4985 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4986 *usec = now(clock);
4987 return 1;
4988 }
4989
4990 *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4991 return 0;
4992 }
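
/*
 * Editorial usage sketch, not part of this file: sd_event_now() above is typically used to
 * schedule a timer relative to the loop's cached wakeup timestamp instead of querying the
 * clock directly. Hypothetical 5-second one-shot timer; error handling is trimmed.
 */
#include <systemd/sd-event.h>

static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

static int add_timer_in_5s(sd_event *e) {
        uint64_t now_usec;
        int r;

        r = sd_event_now(e, CLOCK_MONOTONIC, &now_usec);  /* cached if the loop already ran */
        if (r < 0)
                return r;

        /* Passing 0 as the accuracy argument selects the library's default accuracy. */
        return sd_event_add_time(e, NULL, CLOCK_MONOTONIC,
                                 now_usec + 5 * 1000000ULL, 0,
                                 on_timer, NULL);
}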
4993
4994 _public_ int sd_event_default(sd_event **ret) {
4995 sd_event *e = NULL;
4996 int r;
4997
4998 if (!ret)
4999 return !!default_event;
5000
5001 if (default_event) {
5002 *ret = sd_event_ref(default_event);
5003 return 0;
5004 }
5005
5006 r = sd_event_new(&e);
5007 if (r < 0)
5008 return r;
5009
5010 e->default_event_ptr = &default_event;
5011 e->tid = gettid();
5012 default_event = e;
5013
5014 *ret = e;
5015 return 1;
5016 }
5017
5018 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
5019 assert_return(e, -EINVAL);
5020 assert_return(e = event_resolve(e), -ENOPKG);
5021 assert_return(tid, -EINVAL);
5022 assert_return(!event_origin_changed(e), -ECHILD);
5023
5024 if (e->tid != 0) {
5025 *tid = e->tid;
5026 return 0;
5027 }
5028
5029 return -ENXIO;
5030 }
5031
5032 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
5033 int r;
5034
5035 assert_return(e, -EINVAL);
5036 assert_return(e = event_resolve(e), -ENOPKG);
5037 assert_return(!event_origin_changed(e), -ECHILD);
5038
5039 if (e->watchdog == !!b)
5040 return e->watchdog;
5041
5042 if (b) {
5043 r = sd_watchdog_enabled(false, &e->watchdog_period);
5044 if (r <= 0)
5045 return r;
5046
5047 /* Issue first ping immediately */
5048 sd_notify(false, "WATCHDOG=1");
5049 e->watchdog_last = now(CLOCK_MONOTONIC);
5050
5051 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
5052 if (e->watchdog_fd < 0)
5053 return -errno;
5054
5055 r = arm_watchdog(e);
5056 if (r < 0)
5057 goto fail;
5058
5059 struct epoll_event ev = {
5060 .events = EPOLLIN,
5061 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
5062 };
5063
5064 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
5065 r = -errno;
5066 goto fail;
5067 }
5068
5069 } else {
5070 if (e->watchdog_fd >= 0) {
5071 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
5072 e->watchdog_fd = safe_close(e->watchdog_fd);
5073 }
5074 }
5075
5076 e->watchdog = b;
5077 return e->watchdog;
5078
5079 fail:
5080 e->watchdog_fd = safe_close(e->watchdog_fd);
5081 return r;
5082 }
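
/*
 * Editorial usage sketch, not part of this file: with sd_event_set_watchdog() above enabled,
 * the loop itself sends "WATCHDOG=1" notifications (between 1/2 and 3/4 of the period, see
 * arm_watchdog() above), so a Type=notify service with WatchdogSec= needs no manual pings.
 * The helper name is illustrative only.
 */
#include <systemd/sd-event.h>

static int setup_watchdog(sd_event *e) {
        int r;

        r = sd_event_set_watchdog(e, 1);
        if (r < 0)
                return r;   /* error */
        if (r == 0)
                return 0;   /* $WATCHDOG_USEC not set by the service manager; nothing to do */

        return 1;           /* watchdog pings are now handled by the event loop */
}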
5083
5084 _public_ int sd_event_get_watchdog(sd_event *e) {
5085 assert_return(e, -EINVAL);
5086 assert_return(e = event_resolve(e), -ENOPKG);
5087 assert_return(!event_origin_changed(e), -ECHILD);
5088
5089 return e->watchdog;
5090 }
5091
5092 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5093 assert_return(e, -EINVAL);
5094 assert_return(e = event_resolve(e), -ENOPKG);
5095 assert_return(!event_origin_changed(e), -ECHILD);
5096
5097 *ret = e->iteration;
5098 return 0;
5099 }
5100
5101 _public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5102 assert_return(s, -EINVAL);
5103 assert_return(s->event, -EINVAL);
5104 assert_return(!event_origin_changed(s->event), -ECHILD);
5105
5106 s->destroy_callback = callback;
5107 return 0;
5108 }
5109
5110 _public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5111 assert_return(s, -EINVAL);
5112 assert_return(!event_origin_changed(s->event), -ECHILD);
5113
5114 if (ret)
5115 *ret = s->destroy_callback;
5116
5117 return !!s->destroy_callback;
5118 }
5119
5120 _public_ int sd_event_source_get_floating(sd_event_source *s) {
5121 assert_return(s, -EINVAL);
5122 assert_return(!event_origin_changed(s->event), -ECHILD);
5123
5124 return s->floating;
5125 }
5126
5127 _public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5128 assert_return(s, -EINVAL);
5129 assert_return(!event_origin_changed(s->event), -ECHILD);
5130
5131 if (s->floating == !!b)
5132 return 0;
5133
5134 if (!s->event) /* Already disconnected */
5135 return -ESTALE;
5136
5137 s->floating = b;
5138
5139 if (b) {
5140 sd_event_source_ref(s);
5141 sd_event_unref(s->event);
5142 } else {
5143 sd_event_ref(s->event);
5144 sd_event_source_unref(s);
5145 }
5146
5147 return 1;
5148 }
5149
5150 _public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5151 assert_return(s, -EINVAL);
5152 assert_return(s->type != SOURCE_EXIT, -EDOM);
5153 assert_return(!event_origin_changed(s->event), -ECHILD);
5154
5155 return s->exit_on_failure;
5156 }
5157
5158 _public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5159 assert_return(s, -EINVAL);
5160 assert_return(s->type != SOURCE_EXIT, -EDOM);
5161 assert_return(!event_origin_changed(s->event), -ECHILD);
5162
5163 if (s->exit_on_failure == !!b)
5164 return 0;
5165
5166 s->exit_on_failure = b;
5167 return 1;
5168 }
5169
5170 _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5171 int r;
5172
5173 assert_return(s, -EINVAL);
5174 assert_return(!event_origin_changed(s->event), -ECHILD);
5175
5176         /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
5177 * so is a programming error. */
5178 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5179
5180 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5181 * non-ratelimited. */
5182 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5183 if (r < 0)
5184 return r;
5185
5186 s->rate_limit = (RateLimit) { interval, burst };
5187 return 0;
5188 }
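
/*
 * Editorial usage sketch, not part of this file: capping how often an IO event source may fire
 * via sd_event_source_set_ratelimit() above. The numbers are hypothetical: at most 10 dispatches
 * per 1-second interval; once exceeded the source is taken offline until the interval elapses.
 * An optional expiry callback can be installed with sd_event_source_set_ratelimit_expire_callback()
 * below.
 */
#include <systemd/sd-event.h>

static int cap_io_source(sd_event_source *s) {
        /* interval in microseconds, burst as a plain dispatch count */
        return sd_event_source_set_ratelimit(s, 1 * 1000000ULL, 10);
}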
5189
5190 _public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5191 assert_return(s, -EINVAL);
5192 assert_return(!event_origin_changed(s->event), -ECHILD);
5193
5194 s->ratelimit_expire_callback = callback;
5195 return 0;
5196 }
5197
5198 _public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5199 assert_return(s, -EINVAL);
5200 assert_return(!event_origin_changed(s->event), -ECHILD);
5201
5202 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5203 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
5204 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5205 return -EDOM;
5206
5207 if (!ratelimit_configured(&s->rate_limit))
5208 return -ENOEXEC;
5209
5210 if (ret_interval)
5211 *ret_interval = s->rate_limit.interval;
5212 if (ret_burst)
5213 *ret_burst = s->rate_limit.burst;
5214
5215 return 0;
5216 }
5217
5218 _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5219 assert_return(s, -EINVAL);
5220 assert_return(!event_origin_changed(s->event), -ECHILD);
5221
5222 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5223 return false;
5224
5225 if (!ratelimit_configured(&s->rate_limit))
5226 return false;
5227
5228 return s->ratelimited;
5229 }
5230
5231 _public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
5232 int r;
5233
5234 assert_return(s, -EINVAL);
5235
5236 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5237 return 0;
5238
5239 if (!ratelimit_configured(&s->rate_limit))
5240 return 0;
5241
5242 if (!s->ratelimited)
5243 return 0;
5244
5245 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5246 if (r < 0)
5247 return r;
5248
5249 return 1; /* tell caller that we indeed just left the ratelimit state */
5250 }
5251
5252 _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5253 bool change = false;
5254 int r;
5255
5256 assert_return(e, -EINVAL);
5257
5258 if (b) {
5259                 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5260                  * to. But we also don't want them to pin the event loop itself. Hence we mark them as
5261 * floating after creation (and undo this before deleting them again). */
5262
5263 if (!e->sigint_event_source) {
5264 r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5265 if (r < 0)
5266 return r;
5267
5268 assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5269 change = true;
5270 }
5271
5272 if (!e->sigterm_event_source) {
5273 r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5274 if (r < 0) {
5275 if (change) {
5276 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5277 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5278 }
5279
5280 return r;
5281 }
5282
5283 assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5284 change = true;
5285 }
5286
5287 } else {
5288 if (e->sigint_event_source) {
5289 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5290 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5291 change = true;
5292 }
5293
5294 if (e->sigterm_event_source) {
5295 assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5296 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5297 change = true;
5298 }
5299 }
5300
5301 return change;
5302 }
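
/*
 * Editorial usage sketch, not part of this file: sd_event_set_signal_exit() above installs
 * floating SIGINT/SIGTERM sources whose default handlers terminate the loop, which is usually
 * all a daemon needs for a clean shutdown. The helper name is illustrative only.
 */
#include <systemd/sd-event.h>

static int run_daemon_loop(sd_event *e) {
        int r;

        r = sd_event_set_signal_exit(e, 1);   /* ^C or SIGTERM now ends sd_event_loop() */
        if (r < 0)
                return r;

        return sd_event_loop(e);
}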
5303
5304 _public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5305 _cleanup_free_ char *b = NULL;
5306 _cleanup_free_ void *w = NULL;
5307
5308 assert_return(s, -EINVAL);
5309 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5310 assert_return(ty, -EINVAL);
5311 assert_return(!event_origin_changed(s->event), -ECHILD);
5312
5313 if (!STR_IN_SET(ty, "some", "full"))
5314 return -EINVAL;
5315
5316         if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5317 return -EBUSY;
5318
5319 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5320 if (!space)
5321 return -EINVAL;
5322
5323 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5324 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5325 if (!b)
5326 return -ENOMEM;
5327 if (!STR_IN_SET(b, "some", "full"))
5328 return -EINVAL;
5329
5330 if (streq(b, ty))
5331 return 0;
5332
5333 size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5334 w = new(char, nl);
5335 if (!w)
5336 return -ENOMEM;
5337
5338 memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5339
5340 free_and_replace(s->memory_pressure.write_buffer, w);
5341 s->memory_pressure.write_buffer_size = nl;
5342 s->memory_pressure.locked = false;
5343
5344 return 1;
5345 }
5346
5347 _public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5348 _cleanup_free_ char *b = NULL;
5349 _cleanup_free_ void *w = NULL;
5350
5351 assert_return(s, -EINVAL);
5352 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5353 assert_return(!event_origin_changed(s->event), -ECHILD);
5354
5355 if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5356 return -ERANGE;
5357 if (window_usec <= 0 || window_usec >= UINT64_MAX)
5358 return -ERANGE;
5359 if (threshold_usec > window_usec)
5360 return -EINVAL;
5361
5362         if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5363 return -EBUSY;
5364
5365 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5366 if (!space)
5367 return -EINVAL;
5368
5369 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5370 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5371 if (!b)
5372 return -ENOMEM;
5373 if (!STR_IN_SET(b, "some", "full"))
5374 return -EINVAL;
5375
5376 if (asprintf((char**) &w,
5377 "%s " USEC_FMT " " USEC_FMT "",
5378 b,
5379 threshold_usec,
5380 window_usec) < 0)
5381 return -EINVAL;
5382
5383 l = strlen(w) + 1;
5384 if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5385 return 0;
5386
5387 free_and_replace(s->memory_pressure.write_buffer, w);
5388 s->memory_pressure.write_buffer_size = l;
5389 s->memory_pressure.locked = false;
5390
5391 return 1;
5392 }
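
/*
 * Editorial usage sketch, not part of this file: combining sd_event_add_memory_pressure() with
 * the two setters above. The thresholds are hypothetical: watch for "full" stalls of at least
 * 100 ms within a 1 s window. The parameters must be adjusted before the loop first writes the
 * configuration to the kernel, since that locks them. Cleanup of the source on error is omitted.
 */
#include <systemd/sd-event.h>

static int on_pressure(sd_event_source *s, void *userdata) {
        /* e.g. drop caches, shrink memory pools, ... */
        return 0;
}

static int add_pressure_source(sd_event *e) {
        sd_event_source *s = NULL;
        int r;

        r = sd_event_add_memory_pressure(e, &s, on_pressure, NULL);
        if (r < 0)
                return r;

        r = sd_event_source_set_memory_pressure_type(s, "full");
        if (r < 0)
                return r;

        return sd_event_source_set_memory_pressure_period(s,
                                                          100 * 1000ULL,    /* threshold: 100 ms */
                                                          1 * 1000000ULL);  /* window: 1 s */
}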