[thirdparty/systemd.git] src/libsystemd/sd-event/sd-event.c (blob @ 338609b186905d81519c382e5070e793377f67b8)
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/epoll.h>
4 #include <sys/timerfd.h>
5 #include <sys/wait.h>
6
7 #include "sd-daemon.h"
8 #include "sd-event.h"
9 #include "sd-id128.h"
10 #include "sd-messages.h"
11
12 #include "alloc-util.h"
13 #include "env-util.h"
14 #include "event-source.h"
15 #include "fd-util.h"
16 #include "fs-util.h"
17 #include "glyph-util.h"
18 #include "hashmap.h"
19 #include "hexdecoct.h"
20 #include "list.h"
21 #include "logarithm.h"
22 #include "macro.h"
23 #include "mallinfo-util.h"
24 #include "memory-util.h"
25 #include "missing_magic.h"
26 #include "missing_syscall.h"
27 #include "missing_threads.h"
28 #include "origin-id.h"
29 #include "path-util.h"
30 #include "prioq.h"
31 #include "process-util.h"
32 #include "psi-util.h"
33 #include "set.h"
34 #include "signal-util.h"
35 #include "socket-util.h"
36 #include "stat-util.h"
37 #include "string-table.h"
38 #include "string-util.h"
39 #include "strxcpyx.h"
40 #include "time-util.h"
41
42 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
43
44 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
45 /* Returns true if this is a PID event source that can be implemented by watching EPOLLIN on its pidfd */
46 return s &&
47 s->type == SOURCE_CHILD &&
48 s->child.pidfd >= 0 &&
49 s->child.options == WEXITED;
50 }
51
52 static bool event_source_is_online(sd_event_source *s) {
53 assert(s);
54 return s->enabled != SD_EVENT_OFF && !s->ratelimited;
55 }
56
57 static bool event_source_is_offline(sd_event_source *s) {
58 assert(s);
59 return s->enabled == SD_EVENT_OFF || s->ratelimited;
60 }
61
62 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
63 [SOURCE_IO] = "io",
64 [SOURCE_TIME_REALTIME] = "realtime",
65 [SOURCE_TIME_BOOTTIME] = "boottime",
66 [SOURCE_TIME_MONOTONIC] = "monotonic",
67 [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
68 [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
69 [SOURCE_SIGNAL] = "signal",
70 [SOURCE_CHILD] = "child",
71 [SOURCE_DEFER] = "defer",
72 [SOURCE_POST] = "post",
73 [SOURCE_EXIT] = "exit",
74 [SOURCE_WATCHDOG] = "watchdog",
75 [SOURCE_INOTIFY] = "inotify",
76 [SOURCE_MEMORY_PRESSURE] = "memory-pressure",
77 };
78
79 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
80
81 #define EVENT_SOURCE_IS_TIME(t) \
82 IN_SET((t), \
83 SOURCE_TIME_REALTIME, \
84 SOURCE_TIME_BOOTTIME, \
85 SOURCE_TIME_MONOTONIC, \
86 SOURCE_TIME_REALTIME_ALARM, \
87 SOURCE_TIME_BOOTTIME_ALARM)
88
89 #define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
90 IN_SET((t), \
91 SOURCE_IO, \
92 SOURCE_TIME_REALTIME, \
93 SOURCE_TIME_BOOTTIME, \
94 SOURCE_TIME_MONOTONIC, \
95 SOURCE_TIME_REALTIME_ALARM, \
96 SOURCE_TIME_BOOTTIME_ALARM, \
97 SOURCE_SIGNAL, \
98 SOURCE_DEFER, \
99 SOURCE_INOTIFY, \
100 SOURCE_MEMORY_PRESSURE)
101
102 /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
103 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
104 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
105 #define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
106
107 struct sd_event {
108 unsigned n_ref;
109
110 int epoll_fd;
111 int watchdog_fd;
112
113 Prioq *pending;
114 Prioq *prepare;
115
116 /* timerfd_create() only supports these five clocks so far. We
117 * can add support for more clocks when the kernel learns to
118 * deal with them, too. */
119 struct clock_data realtime;
120 struct clock_data boottime;
121 struct clock_data monotonic;
122 struct clock_data realtime_alarm;
123 struct clock_data boottime_alarm;
124
125 usec_t perturb;
126
127 sd_event_source **signal_sources; /* indexed by signal number */
128 Hashmap *signal_data; /* indexed by priority */
129
130 Hashmap *child_sources;
131 unsigned n_online_child_sources;
132
133 Set *post_sources;
134
135 Prioq *exit;
136
137 Hashmap *inotify_data; /* indexed by priority */
138
139 /* A list of inode structures that still have an fd open, which we need to close before the next loop iteration */
140 LIST_HEAD(struct inode_data, inode_data_to_close_list);
141
142 /* A list of inotify objects that already have events buffered which aren't processed yet */
143 LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
144
145 /* A list of memory pressure event sources that still need their subscription string written */
146 LIST_HEAD(sd_event_source, memory_pressure_write_list);
147
148 uint64_t origin_id;
149
150 uint64_t iteration;
151 triple_timestamp timestamp;
152 int state;
153
154 bool exit_requested:1;
155 bool need_process_child:1;
156 bool watchdog:1;
157 bool profile_delays:1;
158
159 int exit_code;
160
161 pid_t tid;
162 sd_event **default_event_ptr;
163
164 usec_t watchdog_last, watchdog_period;
165
166 unsigned n_sources;
167
168 struct epoll_event *event_queue;
169
170 LIST_HEAD(sd_event_source, sources);
171
172 sd_event_source *sigint_event_source, *sigterm_event_source;
173
174 usec_t last_run_usec, last_log_usec;
175 unsigned delays[sizeof(usec_t) * 8];
176 };
177
178 DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);
179
180 static thread_local sd_event *default_event = NULL;
181
182 static void source_disconnect(sd_event_source *s);
183 static void event_gc_inode_data(sd_event *e, struct inode_data *d);
184
185 static sd_event *event_resolve(sd_event *e) {
186 return e == SD_EVENT_DEFAULT ? default_event : e;
187 }
188
189 static int pending_prioq_compare(const void *a, const void *b) {
190 const sd_event_source *x = a, *y = b;
191 int r;
192
193 assert(x->pending);
194 assert(y->pending);
195
196 /* Enabled ones first */
197 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
198 if (r != 0)
199 return r;
200
201 /* Non rate-limited ones first. */
202 r = CMP(!!x->ratelimited, !!y->ratelimited);
203 if (r != 0)
204 return r;
205
206 /* Lower priority values first */
207 r = CMP(x->priority, y->priority);
208 if (r != 0)
209 return r;
210
211 /* Older entries first */
212 return CMP(x->pending_iteration, y->pending_iteration);
213 }
214
215 static int prepare_prioq_compare(const void *a, const void *b) {
216 const sd_event_source *x = a, *y = b;
217 int r;
218
219 assert(x->prepare);
220 assert(y->prepare);
221
222 /* Enabled ones first */
223 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
224 if (r != 0)
225 return r;
226
227 /* Non rate-limited ones first. */
228 r = CMP(!!x->ratelimited, !!y->ratelimited);
229 if (r != 0)
230 return r;
231
232 /* Move most recently prepared ones last, so that we can stop
233 * preparing as soon as we hit one that has already been
234 * prepared in the current iteration */
235 r = CMP(x->prepare_iteration, y->prepare_iteration);
236 if (r != 0)
237 return r;
238
239 /* Lower priority values first */
240 return CMP(x->priority, y->priority);
241 }
242
243 static usec_t time_event_source_next(const sd_event_source *s) {
244 assert(s);
245
246 /* We have two kinds of event sources that have expiration times associated with them: the actual
247 * time-based ones and the ones for which a ratelimit can be in effect (where we want to be notified
248 * once the ratelimit time window ends). Let's return the next expiration time depending on what we are
249 * looking at here. */
250
251 if (s->ratelimited) { /* If rate-limited, the next expiration is when the ratelimit time window ends */
252 assert(s->rate_limit.begin != 0);
253 assert(s->rate_limit.interval != 0);
254 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
255 }
256
257 /* Otherwise this must be a time event source, if not ratelimited */
258 if (EVENT_SOURCE_IS_TIME(s->type))
259 return s->time.next;
260
261 return USEC_INFINITY;
262 }
263
264 static usec_t time_event_source_latest(const sd_event_source *s) {
265 assert(s);
266
267 if (s->ratelimited) { /* For rate-limited event sources the earliest and the latest time shall be the
268 * same, since we should avoid stacking additional inaccuracy on top of a time
269 * window that is itself an inaccuracy */
270 assert(s->rate_limit.begin != 0);
271 assert(s->rate_limit.interval != 0);
272 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
273 }
274
275 /* Must be a time event source, if not ratelimited */
276 if (EVENT_SOURCE_IS_TIME(s->type))
277 return usec_add(s->time.next, s->time.accuracy);
278
279 return USEC_INFINITY;
280 }
281
282 static bool event_source_timer_candidate(const sd_event_source *s) {
283 assert(s);
284
285 /* Returns true for event sources that either are not pending yet (i.e. for which it's worth marking them pending)
286 * or which are currently ratelimited (i.e. for which it's worth leaving the ratelimited state) */
287 return !s->pending || s->ratelimited;
288 }
289
290 static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
291 const sd_event_source *x = a, *y = b;
292 int r;
293
294 /* Enabled ones first */
295 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
296 if (r != 0)
297 return r;
298
299 /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
300 r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
301 if (r != 0)
302 return r;
303
304 /* Order by time */
305 return CMP(time_func(x), time_func(y));
306 }
307
308 static int earliest_time_prioq_compare(const void *a, const void *b) {
309 return time_prioq_compare(a, b, time_event_source_next);
310 }
311
312 static int latest_time_prioq_compare(const void *a, const void *b) {
313 return time_prioq_compare(a, b, time_event_source_latest);
314 }
315
316 static int exit_prioq_compare(const void *a, const void *b) {
317 const sd_event_source *x = a, *y = b;
318 int r;
319
320 assert(x->type == SOURCE_EXIT);
321 assert(y->type == SOURCE_EXIT);
322
323 /* Enabled ones first */
324 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
325 if (r != 0)
326 return r;
327
328 /* Lower priority values first */
329 return CMP(x->priority, y->priority);
330 }
331
332 static void free_clock_data(struct clock_data *d) {
333 assert(d);
334 assert(d->wakeup == WAKEUP_CLOCK_DATA);
335
336 safe_close(d->fd);
337 prioq_free(d->earliest);
338 prioq_free(d->latest);
339 }
340
341 static sd_event *event_free(sd_event *e) {
342 sd_event_source *s;
343
344 assert(e);
345
346 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
347 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
348
349 while ((s = e->sources)) {
350 assert(s->floating);
351 source_disconnect(s);
352 sd_event_source_unref(s);
353 }
354
355 assert(e->n_sources == 0);
356
357 if (e->default_event_ptr)
358 *(e->default_event_ptr) = NULL;
359
360 safe_close(e->epoll_fd);
361 safe_close(e->watchdog_fd);
362
363 free_clock_data(&e->realtime);
364 free_clock_data(&e->boottime);
365 free_clock_data(&e->monotonic);
366 free_clock_data(&e->realtime_alarm);
367 free_clock_data(&e->boottime_alarm);
368
369 prioq_free(e->pending);
370 prioq_free(e->prepare);
371 prioq_free(e->exit);
372
373 free(e->signal_sources);
374 hashmap_free(e->signal_data);
375
376 hashmap_free(e->inotify_data);
377
378 hashmap_free(e->child_sources);
379 set_free(e->post_sources);
380
381 free(e->event_queue);
382
383 return mfree(e);
384 }
385
386 _public_ int sd_event_new(sd_event** ret) {
387 sd_event *e;
388 int r;
389
390 assert_return(ret, -EINVAL);
391
392 e = new(sd_event, 1);
393 if (!e)
394 return -ENOMEM;
395
396 *e = (sd_event) {
397 .n_ref = 1,
398 .epoll_fd = -EBADF,
399 .watchdog_fd = -EBADF,
400 .realtime.wakeup = WAKEUP_CLOCK_DATA,
401 .realtime.fd = -EBADF,
402 .realtime.next = USEC_INFINITY,
403 .boottime.wakeup = WAKEUP_CLOCK_DATA,
404 .boottime.fd = -EBADF,
405 .boottime.next = USEC_INFINITY,
406 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
407 .monotonic.fd = -EBADF,
408 .monotonic.next = USEC_INFINITY,
409 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
410 .realtime_alarm.fd = -EBADF,
411 .realtime_alarm.next = USEC_INFINITY,
412 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
413 .boottime_alarm.fd = -EBADF,
414 .boottime_alarm.next = USEC_INFINITY,
415 .perturb = USEC_INFINITY,
416 .origin_id = origin_id_query(),
417 };
418
419 r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
420 if (r < 0)
421 goto fail;
422
423 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
424 if (e->epoll_fd < 0) {
425 r = -errno;
426 goto fail;
427 }
428
429 e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
430
431 if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
432 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
433 special_glyph(SPECIAL_GLYPH_ELLIPSIS));
434 e->profile_delays = true;
435 }
436
437 *ret = e;
438 return 0;
439
440 fail:
441 event_free(e);
442 return r;
443 }
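
/* Illustrative usage sketch: a minimal consumer of the public API as installed in
 * <systemd/sd-event.h>, creating a loop, running it and cleaning up. Error handling is
 * abbreviated; "run_loop" is a hypothetical name.
 *
 *     #include <systemd/sd-event.h>
 *
 *     int run_loop(void) {
 *             sd_event *e = NULL;
 *             int r;
 *
 *             r = sd_event_new(&e);        // or sd_event_default(&e) for the per-thread instance
 *             if (r < 0)
 *                     return r;
 *
 *             r = sd_event_loop(e);        // dispatches sources until sd_event_exit() is called
 *             sd_event_unref(e);
 *             return r;
 *     }
 */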
444
445 /* Define manually so we can add the origin check */
446 _public_ sd_event *sd_event_ref(sd_event *e) {
447 if (!e)
448 return NULL;
449 if (event_origin_changed(e))
450 return NULL;
451
452 e->n_ref++;
453
454 return e;
455 }
456
457 _public_ sd_event* sd_event_unref(sd_event *e) {
458 if (!e)
459 return NULL;
460 if (event_origin_changed(e))
461 return NULL;
462
463 assert(e->n_ref > 0);
464 if (--e->n_ref > 0)
465 return NULL;
466
467 return event_free(e);
468 }
469
470 #define PROTECT_EVENT(e) \
471 _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
472
473 _public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
474 if (s)
475 (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
476 return sd_event_source_unref(s);
477 }
478
479 static void source_io_unregister(sd_event_source *s) {
480 assert(s);
481 assert(s->type == SOURCE_IO);
482
483 if (event_origin_changed(s->event))
484 return;
485
486 if (!s->io.registered)
487 return;
488
489 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
490 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
491 strna(s->description), event_source_type_to_string(s->type));
492
493 s->io.registered = false;
494 }
495
496 static int source_io_register(
497 sd_event_source *s,
498 int enabled,
499 uint32_t events) {
500
501 assert(s);
502 assert(s->type == SOURCE_IO);
503 assert(enabled != SD_EVENT_OFF);
504
505 struct epoll_event ev = {
506 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
507 .data.ptr = s,
508 };
509
510 if (epoll_ctl(s->event->epoll_fd,
511 s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
512 s->io.fd, &ev) < 0)
513 return -errno;
514
515 s->io.registered = true;
516
517 return 0;
518 }
519
520 static void source_child_pidfd_unregister(sd_event_source *s) {
521 assert(s);
522 assert(s->type == SOURCE_CHILD);
523
524 if (event_origin_changed(s->event))
525 return;
526
527 if (!s->child.registered)
528 return;
529
530 if (EVENT_SOURCE_WATCH_PIDFD(s))
531 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
532 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
533 strna(s->description), event_source_type_to_string(s->type));
534
535 s->child.registered = false;
536 }
537
538 static int source_child_pidfd_register(sd_event_source *s, int enabled) {
539 assert(s);
540 assert(s->type == SOURCE_CHILD);
541 assert(enabled != SD_EVENT_OFF);
542
543 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
544 struct epoll_event ev = {
545 .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
546 .data.ptr = s,
547 };
548
549 if (epoll_ctl(s->event->epoll_fd,
550 s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
551 s->child.pidfd, &ev) < 0)
552 return -errno;
553 }
554
555 s->child.registered = true;
556 return 0;
557 }
558
559 static void source_memory_pressure_unregister(sd_event_source *s) {
560 assert(s);
561 assert(s->type == SOURCE_MEMORY_PRESSURE);
562
563 if (event_origin_changed(s->event))
564 return;
565
566 if (!s->memory_pressure.registered)
567 return;
568
569 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
570 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
571 strna(s->description), event_source_type_to_string(s->type));
572
573 s->memory_pressure.registered = false;
574 }
575
576 static int source_memory_pressure_register(sd_event_source *s, int enabled) {
577 assert(s);
578 assert(s->type == SOURCE_MEMORY_PRESSURE);
579 assert(enabled != SD_EVENT_OFF);
580
581 struct epoll_event ev = {
582 .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
583 (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
584 .data.ptr = s,
585 };
586
587 if (epoll_ctl(s->event->epoll_fd,
588 s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
589 s->memory_pressure.fd, &ev) < 0)
590 return -errno;
591
592 s->memory_pressure.registered = true;
593 return 0;
594 }
595
596 static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
597 assert(s);
598 assert(s->type == SOURCE_MEMORY_PRESSURE);
599
600 if (s->memory_pressure.in_write_list)
601 return;
602
603 LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
604 s->memory_pressure.in_write_list = true;
605 }
606
607 static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
608 assert(s);
609 assert(s->type == SOURCE_MEMORY_PRESSURE);
610
611 if (!s->memory_pressure.in_write_list)
612 return;
613
614 LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
615 s->memory_pressure.in_write_list = false;
616 }
617
618 static clockid_t event_source_type_to_clock(EventSourceType t) {
619
620 switch (t) {
621
622 case SOURCE_TIME_REALTIME:
623 return CLOCK_REALTIME;
624
625 case SOURCE_TIME_BOOTTIME:
626 return CLOCK_BOOTTIME;
627
628 case SOURCE_TIME_MONOTONIC:
629 return CLOCK_MONOTONIC;
630
631 case SOURCE_TIME_REALTIME_ALARM:
632 return CLOCK_REALTIME_ALARM;
633
634 case SOURCE_TIME_BOOTTIME_ALARM:
635 return CLOCK_BOOTTIME_ALARM;
636
637 default:
638 return (clockid_t) -1;
639 }
640 }
641
642 static EventSourceType clock_to_event_source_type(clockid_t clock) {
643
644 switch (clock) {
645
646 case CLOCK_REALTIME:
647 return SOURCE_TIME_REALTIME;
648
649 case CLOCK_BOOTTIME:
650 return SOURCE_TIME_BOOTTIME;
651
652 case CLOCK_MONOTONIC:
653 return SOURCE_TIME_MONOTONIC;
654
655 case CLOCK_REALTIME_ALARM:
656 return SOURCE_TIME_REALTIME_ALARM;
657
658 case CLOCK_BOOTTIME_ALARM:
659 return SOURCE_TIME_BOOTTIME_ALARM;
660
661 default:
662 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
663 }
664 }
665
666 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
667 assert(e);
668
669 switch (t) {
670
671 case SOURCE_TIME_REALTIME:
672 return &e->realtime;
673
674 case SOURCE_TIME_BOOTTIME:
675 return &e->boottime;
676
677 case SOURCE_TIME_MONOTONIC:
678 return &e->monotonic;
679
680 case SOURCE_TIME_REALTIME_ALARM:
681 return &e->realtime_alarm;
682
683 case SOURCE_TIME_BOOTTIME_ALARM:
684 return &e->boottime_alarm;
685
686 default:
687 return NULL;
688 }
689 }
690
691 static void event_free_signal_data(sd_event *e, struct signal_data *d) {
692 assert(e);
693
694 if (!d)
695 return;
696
697 hashmap_remove(e->signal_data, &d->priority);
698 safe_close(d->fd);
699 free(d);
700 }
701
702 static int event_make_signal_data(
703 sd_event *e,
704 int sig,
705 struct signal_data **ret) {
706
707 struct signal_data *d;
708 bool added = false;
709 sigset_t ss_copy;
710 int64_t priority;
711 int r;
712
713 assert(e);
714
715 if (event_origin_changed(e))
716 return -ECHILD;
717
718 if (e->signal_sources && e->signal_sources[sig])
719 priority = e->signal_sources[sig]->priority;
720 else
721 priority = SD_EVENT_PRIORITY_NORMAL;
722
723 d = hashmap_get(e->signal_data, &priority);
724 if (d) {
725 if (sigismember(&d->sigset, sig) > 0) {
726 if (ret)
727 *ret = d;
728 return 0;
729 }
730 } else {
731 d = new(struct signal_data, 1);
732 if (!d)
733 return -ENOMEM;
734
735 *d = (struct signal_data) {
736 .wakeup = WAKEUP_SIGNAL_DATA,
737 .fd = -EBADF,
738 .priority = priority,
739 };
740
741 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
742 if (r < 0) {
743 free(d);
744 return r;
745 }
746
747 added = true;
748 }
749
750 ss_copy = d->sigset;
751 assert_se(sigaddset(&ss_copy, sig) >= 0);
752
753 r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
754 &ss_copy,
755 SFD_NONBLOCK|SFD_CLOEXEC);
756 if (r < 0) {
757 r = -errno;
758 goto fail;
759 }
760
761 d->sigset = ss_copy;
762
763 if (d->fd >= 0) {
764 if (ret)
765 *ret = d;
766 return 0;
767 }
768
769 d->fd = fd_move_above_stdio(r);
770
771 struct epoll_event ev = {
772 .events = EPOLLIN,
773 .data.ptr = d,
774 };
775
776 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
777 r = -errno;
778 goto fail;
779 }
780
781 if (ret)
782 *ret = d;
783
784 return 0;
785
786 fail:
787 if (added)
788 event_free_signal_data(e, d);
789
790 return r;
791 }
792
793 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
794 assert(e);
795 assert(d);
796
797 /* Turns off the specified signal in the signal data
798 * object. If the signal mask of the object becomes empty that
799 * way, the object is removed altogether. */
800
801 if (sigismember(&d->sigset, sig) == 0)
802 return;
803
804 assert_se(sigdelset(&d->sigset, sig) >= 0);
805
806 if (sigisemptyset(&d->sigset)) {
807 /* If the mask is now all-zero we can get rid of the structure */
808 event_free_signal_data(e, d);
809 return;
810 }
811
812 if (event_origin_changed(e))
813 return;
814
815 assert(d->fd >= 0);
816
817 if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
818 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
819 }
820
821 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
822 struct signal_data *d;
823 static const int64_t zero_priority = 0;
824
825 assert(e);
826
827 /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
828 * and possibly drop the signalfd for it. */
829
830 if (sig == SIGCHLD &&
831 e->n_online_child_sources > 0)
832 return;
833
834 if (e->signal_sources &&
835 e->signal_sources[sig] &&
836 event_source_is_online(e->signal_sources[sig]))
837 return;
838
839 /*
840 * The specified signal might be enabled in three different queues:
841 *
842 * 1) the one that belongs to the priority passed (if it is non-NULL)
843 * 2) the one that belongs to the priority of the event source of the signal (if there is one)
844 * 3) the 0 priority (to cover the SIGCHLD case)
845 *
846 * Hence, let's remove it from all three here.
847 */
848
849 if (priority) {
850 d = hashmap_get(e->signal_data, priority);
851 if (d)
852 event_unmask_signal_data(e, d, sig);
853 }
854
855 if (e->signal_sources && e->signal_sources[sig]) {
856 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
857 if (d)
858 event_unmask_signal_data(e, d, sig);
859 }
860
861 d = hashmap_get(e->signal_data, &zero_priority);
862 if (d)
863 event_unmask_signal_data(e, d, sig);
864 }
865
866 static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
867 assert(s);
868
869 /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
870 * they are enabled/disabled or marked pending and such. */
871
872 if (s->pending)
873 prioq_reshuffle(s->event->pending, s, &s->pending_index);
874
875 if (s->prepare)
876 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
877 }
878
879 static void event_source_time_prioq_reshuffle(sd_event_source *s) {
880 struct clock_data *d;
881
882 assert(s);
883
884 /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
885 * pending, enable state, and ratelimiting state. Makes sure the two prioqs are ordered
886 * properly again. */
887
888 if (s->ratelimited)
889 d = &s->event->monotonic;
890 else if (EVENT_SOURCE_IS_TIME(s->type))
891 assert_se(d = event_get_clock_data(s->event, s->type));
892 else
893 return; /* no-op for an event source which is neither a timer nor ratelimited. */
894
895 prioq_reshuffle(d->earliest, s, &s->earliest_index);
896 prioq_reshuffle(d->latest, s, &s->latest_index);
897 d->needs_rearm = true;
898 }
899
900 static void event_source_time_prioq_remove(
901 sd_event_source *s,
902 struct clock_data *d) {
903
904 assert(s);
905 assert(d);
906
907 prioq_remove(d->earliest, s, &s->earliest_index);
908 prioq_remove(d->latest, s, &s->latest_index);
909 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
910 d->needs_rearm = true;
911 }
912
913 static void source_disconnect(sd_event_source *s) {
914 sd_event *event;
915 int r;
916
917 assert(s);
918
919 if (!s->event)
920 return;
921
922 assert(s->event->n_sources > 0);
923
924 switch (s->type) {
925
926 case SOURCE_IO:
927 if (s->io.fd >= 0)
928 source_io_unregister(s);
929
930 break;
931
932 case SOURCE_TIME_REALTIME:
933 case SOURCE_TIME_BOOTTIME:
934 case SOURCE_TIME_MONOTONIC:
935 case SOURCE_TIME_REALTIME_ALARM:
936 case SOURCE_TIME_BOOTTIME_ALARM:
937 /* Only remove this event source from its clock's time prioqs here if it is not ratelimited. If
938 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
939 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
940
941 if (!s->ratelimited) {
942 struct clock_data *d;
943 assert_se(d = event_get_clock_data(s->event, s->type));
944 event_source_time_prioq_remove(s, d);
945 }
946
947 break;
948
949 case SOURCE_SIGNAL:
950 if (s->signal.sig > 0) {
951
952 if (s->event->signal_sources)
953 s->event->signal_sources[s->signal.sig] = NULL;
954
955 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
956
957 if (s->signal.unblock) {
958 sigset_t new_ss;
959
960 if (sigemptyset(&new_ss) < 0)
961 log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
962 else if (sigaddset(&new_ss, s->signal.sig) < 0)
963 log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
964 else {
965 r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
966 if (r != 0)
967 log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
968 }
969 }
970 }
971
972 break;
973
974 case SOURCE_CHILD:
975 if (event_origin_changed(s->event))
976 s->child.process_owned = false;
977
978 if (s->child.pid > 0) {
979 if (event_source_is_online(s)) {
980 assert(s->event->n_online_child_sources > 0);
981 s->event->n_online_child_sources--;
982 }
983
984 (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
985 }
986
987 if (EVENT_SOURCE_WATCH_PIDFD(s))
988 source_child_pidfd_unregister(s);
989 else
990 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
991
992 break;
993
994 case SOURCE_DEFER:
995 /* nothing */
996 break;
997
998 case SOURCE_POST:
999 set_remove(s->event->post_sources, s);
1000 break;
1001
1002 case SOURCE_EXIT:
1003 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
1004 break;
1005
1006 case SOURCE_INOTIFY: {
1007 struct inode_data *inode_data;
1008
1009 inode_data = s->inotify.inode_data;
1010 if (inode_data) {
1011 struct inotify_data *inotify_data;
1012 assert_se(inotify_data = inode_data->inotify_data);
1013
1014 /* Detach this event source from the inode object */
1015 LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
1016 s->inotify.inode_data = NULL;
1017
1018 if (s->pending) {
1019 assert(inotify_data->n_pending > 0);
1020 inotify_data->n_pending--;
1021 }
1022
1023 /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
1024 * continues to be watched. That's because inotify doesn't really have an API for that: we
1025 * can only change watch masks with access to the original inode either by fd or by path. But
1026 * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
1027 * continuously and keeping the mount busy which we can't really do. We could reconstruct the
1028 * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
1029 * there), but given the need for open_by_handle_at() which is privileged and not universally
1030 * available this would be quite an incomplete solution. Hence we go the other way, leave the
1031 * mask set, even if it is not minimized now, and ignore all events we aren't interested in
1032 * anymore after reception. Yes, this sucks, but … Linux … */
1033
1034 /* Maybe release the inode data (and its inotify) */
1035 event_gc_inode_data(s->event, inode_data);
1036 }
1037
1038 break;
1039 }
1040
1041 case SOURCE_MEMORY_PRESSURE:
1042 source_memory_pressure_remove_from_write_list(s);
1043 source_memory_pressure_unregister(s);
1044 break;
1045
1046 default:
1047 assert_not_reached();
1048 }
1049
1050 if (s->pending)
1051 prioq_remove(s->event->pending, s, &s->pending_index);
1052
1053 if (s->prepare)
1054 prioq_remove(s->event->prepare, s, &s->prepare_index);
1055
1056 if (s->ratelimited)
1057 event_source_time_prioq_remove(s, &s->event->monotonic);
1058
1059 event = TAKE_PTR(s->event);
1060 LIST_REMOVE(sources, event->sources, s);
1061 event->n_sources--;
1062
1063 /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
1064 * pidfd associated with this event source, which we'll do only on source_free(). */
1065
1066 if (!s->floating)
1067 sd_event_unref(event);
1068 }
1069
1070 static sd_event_source* source_free(sd_event_source *s) {
1071 assert(s);
1072
1073 source_disconnect(s);
1074
1075 if (s->type == SOURCE_IO && s->io.owned)
1076 s->io.fd = safe_close(s->io.fd);
1077
1078 if (s->type == SOURCE_CHILD) {
1079 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
1080
1081 if (s->child.process_owned) {
1082
1083 if (!s->child.exited) {
1084 bool sent = false;
1085
1086 if (s->child.pidfd >= 0) {
1087 if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
1088 if (errno == ESRCH) /* Already dead */
1089 sent = true;
1090 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
1091 log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
1092 s->child.pid);
1093 } else
1094 sent = true;
1095 }
1096
1097 if (!sent)
1098 if (kill(s->child.pid, SIGKILL) < 0)
1099 if (errno != ESRCH) /* Already dead */
1100 log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
1101 s->child.pid);
1102 }
1103
1104 if (!s->child.waited) {
1105 siginfo_t si = {};
1106
1107 /* Reap the child if we can */
1108 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
1109 }
1110 }
1111
1112 if (s->child.pidfd_owned)
1113 s->child.pidfd = safe_close(s->child.pidfd);
1114 }
1115
1116 if (s->type == SOURCE_MEMORY_PRESSURE) {
1117 s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
1118 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
1119 }
1120
1121 if (s->destroy_callback)
1122 s->destroy_callback(s->userdata);
1123
1124 free(s->description);
1125 return mfree(s);
1126 }
1127 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
1128
1129 static int source_set_pending(sd_event_source *s, bool b) {
1130 int r;
1131
1132 assert(s);
1133 assert(s->type != SOURCE_EXIT);
1134
1135 if (s->pending == b)
1136 return 0;
1137
1138 s->pending = b;
1139
1140 if (b) {
1141 s->pending_iteration = s->event->iteration;
1142
1143 r = prioq_put(s->event->pending, s, &s->pending_index);
1144 if (r < 0) {
1145 s->pending = false;
1146 return r;
1147 }
1148 } else
1149 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1150
1151 if (EVENT_SOURCE_IS_TIME(s->type))
1152 event_source_time_prioq_reshuffle(s);
1153
1154 if (s->type == SOURCE_SIGNAL && !b) {
1155 struct signal_data *d;
1156
1157 d = hashmap_get(s->event->signal_data, &s->priority);
1158 if (d && d->current == s)
1159 d->current = NULL;
1160 }
1161
1162 if (s->type == SOURCE_INOTIFY) {
1163
1164 assert(s->inotify.inode_data);
1165 assert(s->inotify.inode_data->inotify_data);
1166
1167 if (b)
1168 s->inotify.inode_data->inotify_data->n_pending++;
1169 else {
1170 assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1171 s->inotify.inode_data->inotify_data->n_pending--;
1172 }
1173 }
1174
1175 return 1;
1176 }
1177
1178 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1179
1180 /* Let's allocate exactly what we need. Note that the difference between the smallest event source
1181 * structure and the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
1182 * lines. */
1183 static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
1184 [SOURCE_IO] = endoffsetof_field(sd_event_source, io),
1185 [SOURCE_TIME_REALTIME] = endoffsetof_field(sd_event_source, time),
1186 [SOURCE_TIME_BOOTTIME] = endoffsetof_field(sd_event_source, time),
1187 [SOURCE_TIME_MONOTONIC] = endoffsetof_field(sd_event_source, time),
1188 [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1189 [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1190 [SOURCE_SIGNAL] = endoffsetof_field(sd_event_source, signal),
1191 [SOURCE_CHILD] = endoffsetof_field(sd_event_source, child),
1192 [SOURCE_DEFER] = endoffsetof_field(sd_event_source, defer),
1193 [SOURCE_POST] = endoffsetof_field(sd_event_source, post),
1194 [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit),
1195 [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify),
1196 [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure),
1197 };
1198
1199 sd_event_source *s;
1200
1201 assert(e);
1202 assert(type >= 0);
1203 assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
1204 assert(size_table[type] > 0);
1205
1206 s = malloc0(size_table[type]);
1207 if (!s)
1208 return NULL;
1209 /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
1210 * size, even if we only allocate the initial part we need. */
1211 s = expand_to_usable(s, sizeof(sd_event_source));
1212
1213 /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
1214 * than what we allocated here. */
1215 s->n_ref = 1;
1216 s->event = e;
1217 s->floating = floating;
1218 s->type = type;
1219 s->pending_index = PRIOQ_IDX_NULL;
1220 s->prepare_index = PRIOQ_IDX_NULL;
1221
1222 if (!floating)
1223 sd_event_ref(e);
1224
1225 LIST_PREPEND(sources, e->sources, s);
1226 e->n_sources++;
1227
1228 return s;
1229 }
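
/* Annotation on the allocation scheme above: for a given source type only the leading part of
 * sd_event_source up to and including the matching union member is allocated, e.g. for
 * SOURCE_DEFER roughly offsetof(sd_event_source, defer) + sizeof(s->defer) bytes.
 * expand_to_usable() only widens the object size the compiler is allowed to assume, so that
 * accesses to the common header fields stay well-defined while the tail a larger source type
 * would need is never allocated nor touched. */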
1230
1231 static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1232 assert(s);
1233
1234 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1235 }
1236
1237 _public_ int sd_event_add_io(
1238 sd_event *e,
1239 sd_event_source **ret,
1240 int fd,
1241 uint32_t events,
1242 sd_event_io_handler_t callback,
1243 void *userdata) {
1244
1245 _cleanup_(source_freep) sd_event_source *s = NULL;
1246 int r;
1247
1248 assert_return(e, -EINVAL);
1249 assert_return(e = event_resolve(e), -ENOPKG);
1250 assert_return(fd >= 0, -EBADF);
1251 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1252 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1253 assert_return(!event_origin_changed(e), -ECHILD);
1254
1255 if (!callback)
1256 callback = io_exit_callback;
1257
1258 s = source_new(e, !ret, SOURCE_IO);
1259 if (!s)
1260 return -ENOMEM;
1261
1262 s->wakeup = WAKEUP_EVENT_SOURCE;
1263 s->io.fd = fd;
1264 s->io.events = events;
1265 s->io.callback = callback;
1266 s->userdata = userdata;
1267 s->enabled = SD_EVENT_ON;
1268
1269 r = source_io_register(s, s->enabled, events);
1270 if (r < 0)
1271 return r;
1272
1273 if (ret)
1274 *ret = s;
1275 TAKE_PTR(s);
1276
1277 return 0;
1278 }
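
/* Illustrative usage sketch: watching a file descriptor for input with sd_event_add_io(). The
 * callback signature matches sd_event_io_handler_t; "my_fd" and "on_io" are hypothetical names.
 *
 *     static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             if (revents & EPOLLIN) {
 *                     // read from fd here; returning a negative errno aborts the event loop
 *             }
 *             return 0;
 *     }
 *
 *     ...
 *     sd_event_source *src = NULL;
 *     r = sd_event_add_io(e, &src, my_fd, EPOLLIN, on_io, NULL);
 *
 * Passing a NULL callback (see io_exit_callback() above) turns the source into an exit trigger:
 * once the fd becomes ready the loop exits with PTR_TO_INT(userdata) as its return code.
 */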
1279
1280 static void initialize_perturb(sd_event *e) {
1281 sd_id128_t id = {};
1282
1283 /* When we sleep for longer, we try to realign the wakeup to the same time within each
1284 * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
1285 * wakeup. However, let's take some system-specific randomness for this value, so that in a network
1286 * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
1287 * perturbation usec offset from the boot ID (or the machine ID if that fails, e.g. because /proc is not mounted). */
1288
1289 if (_likely_(e->perturb != USEC_INFINITY))
1290 return;
1291
1292 if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
1293 e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
1294 else
1295 e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
1296 }
1297
1298 static int event_setup_timer_fd(
1299 sd_event *e,
1300 struct clock_data *d,
1301 clockid_t clock) {
1302
1303 assert(e);
1304 assert(d);
1305
1306 if (_likely_(d->fd >= 0))
1307 return 0;
1308
1309 _cleanup_close_ int fd = -EBADF;
1310
1311 fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1312 if (fd < 0)
1313 return -errno;
1314
1315 fd = fd_move_above_stdio(fd);
1316
1317 struct epoll_event ev = {
1318 .events = EPOLLIN,
1319 .data.ptr = d,
1320 };
1321
1322 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1323 return -errno;
1324
1325 d->fd = TAKE_FD(fd);
1326 return 0;
1327 }
1328
1329 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1330 assert(s);
1331
1332 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1333 }
1334
1335 static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1336 int r;
1337
1338 assert(d);
1339
1340 if (d->fd < 0) {
1341 r = event_setup_timer_fd(e, d, clock);
1342 if (r < 0)
1343 return r;
1344 }
1345
1346 r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1347 if (r < 0)
1348 return r;
1349
1350 r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1351 if (r < 0)
1352 return r;
1353
1354 return 0;
1355 }
1356
1357 static int event_source_time_prioq_put(
1358 sd_event_source *s,
1359 struct clock_data *d) {
1360
1361 int r;
1362
1363 assert(s);
1364 assert(d);
1365 assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1366
1367 r = prioq_put(d->earliest, s, &s->earliest_index);
1368 if (r < 0)
1369 return r;
1370
1371 r = prioq_put(d->latest, s, &s->latest_index);
1372 if (r < 0) {
1373 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1374 s->earliest_index = PRIOQ_IDX_NULL;
1375 return r;
1376 }
1377
1378 d->needs_rearm = true;
1379 return 0;
1380 }
1381
1382 _public_ int sd_event_add_time(
1383 sd_event *e,
1384 sd_event_source **ret,
1385 clockid_t clock,
1386 uint64_t usec,
1387 uint64_t accuracy,
1388 sd_event_time_handler_t callback,
1389 void *userdata) {
1390
1391 EventSourceType type;
1392 _cleanup_(source_freep) sd_event_source *s = NULL;
1393 struct clock_data *d;
1394 int r;
1395
1396 assert_return(e, -EINVAL);
1397 assert_return(e = event_resolve(e), -ENOPKG);
1398 assert_return(accuracy != UINT64_MAX, -EINVAL);
1399 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1400 assert_return(!event_origin_changed(e), -ECHILD);
1401
1402 if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1403 return -EOPNOTSUPP;
1404
1405 type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1406 if (type < 0)
1407 return -EOPNOTSUPP;
1408
1409 if (!callback)
1410 callback = time_exit_callback;
1411
1412 assert_se(d = event_get_clock_data(e, type));
1413
1414 r = setup_clock_data(e, d, clock);
1415 if (r < 0)
1416 return r;
1417
1418 s = source_new(e, !ret, type);
1419 if (!s)
1420 return -ENOMEM;
1421
1422 s->time.next = usec;
1423 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1424 s->time.callback = callback;
1425 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1426 s->userdata = userdata;
1427 s->enabled = SD_EVENT_ONESHOT;
1428
1429 r = event_source_time_prioq_put(s, d);
1430 if (r < 0)
1431 return r;
1432
1433 if (ret)
1434 *ret = s;
1435 TAKE_PTR(s);
1436
1437 return 0;
1438 }
1439
1440 _public_ int sd_event_add_time_relative(
1441 sd_event *e,
1442 sd_event_source **ret,
1443 clockid_t clock,
1444 uint64_t usec,
1445 uint64_t accuracy,
1446 sd_event_time_handler_t callback,
1447 void *userdata) {
1448
1449 usec_t t;
1450 int r;
1451
1452 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1453 * checks for overflow. */
1454
1455 r = sd_event_now(e, clock, &t);
1456 if (r < 0)
1457 return r;
1458
1459 if (usec >= USEC_INFINITY - t)
1460 return -EOVERFLOW;
1461
1462 return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1463 }
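
/* Illustrative usage sketch: a one-shot timer elapsing 5s from now on CLOCK_MONOTONIC, using the
 * default accuracy (pass 0). "on_timer" is a hypothetical handler name.
 *
 *     static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *             // usec is the absolute CLOCK_MONOTONIC time the timer elapsed for
 *             return 0;
 *     }
 *
 *     r = sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC,
 *                                    5 * 1000000ULL, 0,   // 5s in usec, default accuracy
 *                                    on_timer, NULL);
 *
 * Timer sources are created in SD_EVENT_ONESHOT mode; a retained source (ret != NULL) can later
 * be re-armed via sd_event_source_set_time() plus sd_event_source_set_enabled().
 */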
1464
1465 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1466 assert(s);
1467
1468 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1469 }
1470
1471 _public_ int sd_event_add_signal(
1472 sd_event *e,
1473 sd_event_source **ret,
1474 int sig,
1475 sd_event_signal_handler_t callback,
1476 void *userdata) {
1477
1478 _cleanup_(source_freep) sd_event_source *s = NULL;
1479 struct signal_data *d;
1480 sigset_t new_ss;
1481 bool block_it;
1482 int r;
1483
1484 assert_return(e, -EINVAL);
1485 assert_return(e = event_resolve(e), -ENOPKG);
1486 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1487 assert_return(!event_origin_changed(e), -ECHILD);
1488
1489 /* Let's make sure our special flag stays outside of the valid signal range */
1490 assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);
1491
1492 if (sig & SD_EVENT_SIGNAL_PROCMASK) {
1493 sig &= ~SD_EVENT_SIGNAL_PROCMASK;
1494 assert_return(SIGNAL_VALID(sig), -EINVAL);
1495
1496 block_it = true;
1497 } else {
1498 assert_return(SIGNAL_VALID(sig), -EINVAL);
1499
1500 r = signal_is_blocked(sig);
1501 if (r < 0)
1502 return r;
1503 if (r == 0)
1504 return -EBUSY;
1505
1506 block_it = false;
1507 }
1508
1509 if (!callback)
1510 callback = signal_exit_callback;
1511
1512 if (!e->signal_sources) {
1513 e->signal_sources = new0(sd_event_source*, _NSIG);
1514 if (!e->signal_sources)
1515 return -ENOMEM;
1516 } else if (e->signal_sources[sig])
1517 return -EBUSY;
1518
1519 s = source_new(e, !ret, SOURCE_SIGNAL);
1520 if (!s)
1521 return -ENOMEM;
1522
1523 s->signal.sig = sig;
1524 s->signal.callback = callback;
1525 s->userdata = userdata;
1526 s->enabled = SD_EVENT_ON;
1527
1528 e->signal_sources[sig] = s;
1529
1530 if (block_it) {
1531 sigset_t old_ss;
1532
1533 if (sigemptyset(&new_ss) < 0)
1534 return -errno;
1535
1536 if (sigaddset(&new_ss, sig) < 0)
1537 return -errno;
1538
1539 r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
1540 if (r != 0)
1541 return -r;
1542
1543 r = sigismember(&old_ss, sig);
1544 if (r < 0)
1545 return -errno;
1546
1547 s->signal.unblock = !r;
1548 } else
1549 s->signal.unblock = false;
1550
1551 r = event_make_signal_data(e, sig, &d);
1552 if (r < 0) {
1553 if (s->signal.unblock)
1554 (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
1555
1556 return r;
1557 }
1558
1559 /* Use the signal name as description for the event source by default */
1560 (void) sd_event_source_set_description(s, signal_to_string(sig));
1561
1562 if (ret)
1563 *ret = s;
1564 TAKE_PTR(s);
1565
1566 return 0;
1567 }
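
/* Illustrative usage sketch: handling SIGTERM. Either block the signal in the calling thread
 * before adding the source, or OR the signal number with SD_EVENT_SIGNAL_PROCMASK to let the
 * event loop adjust the signal mask itself. "on_sigterm" is a hypothetical handler name.
 *
 *     static int on_sigterm(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     r = sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, on_sigterm, NULL);
 *
 * With a NULL callback the default signal_exit_callback() above is installed, which just exits
 * the loop, so sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL) is a
 * common idiom for clean shutdown.
 */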
1568
1569 static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1570 assert(s);
1571
1572 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1573 }
1574
1575 static bool shall_use_pidfd(void) {
1576 /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
1577 return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
1578 }
1579
1580 _public_ int sd_event_add_child(
1581 sd_event *e,
1582 sd_event_source **ret,
1583 pid_t pid,
1584 int options,
1585 sd_event_child_handler_t callback,
1586 void *userdata) {
1587
1588 _cleanup_(source_freep) sd_event_source *s = NULL;
1589 int r;
1590
1591 assert_return(e, -EINVAL);
1592 assert_return(e = event_resolve(e), -ENOPKG);
1593 assert_return(pid > 1, -EINVAL);
1594 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1595 assert_return(options != 0, -EINVAL);
1596 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1597 assert_return(!event_origin_changed(e), -ECHILD);
1598
1599 if (!callback)
1600 callback = child_exit_callback;
1601
1602 if (e->n_online_child_sources == 0) {
1603 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1604 * for compatibility with pre-pidfd and because we don't want to reap the child processes
1605 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1606 * take effect.
1607 *
1608 * (As an optimization we only do this check on the first child event source created.) */
1609 r = signal_is_blocked(SIGCHLD);
1610 if (r < 0)
1611 return r;
1612 if (r == 0)
1613 return -EBUSY;
1614 }
1615
1616 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1617 if (r < 0)
1618 return r;
1619
1620 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1621 return -EBUSY;
1622
1623 s = source_new(e, !ret, SOURCE_CHILD);
1624 if (!s)
1625 return -ENOMEM;
1626
1627 s->wakeup = WAKEUP_EVENT_SOURCE;
1628 s->child.options = options;
1629 s->child.callback = callback;
1630 s->userdata = userdata;
1631 s->enabled = SD_EVENT_ONESHOT;
1632
1633 /* We always take a pidfd here if we can, even if we wait for anything other than WEXITED, so that we
1634 * pin the PID, and make regular waitid() handling race-free. */
1635
1636 if (shall_use_pidfd()) {
1637 s->child.pidfd = pidfd_open(pid, 0);
1638 if (s->child.pidfd < 0) {
1639 /* Propagate errors unless the syscall is not supported or blocked */
1640 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
1641 return -errno;
1642 } else
1643 s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
1644 } else
1645 s->child.pidfd = -EBADF;
1646
1647 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1648 /* We have a pidfd and we only want to watch for exit */
1649 r = source_child_pidfd_register(s, s->enabled);
1650 if (r < 0)
1651 return r;
1652
1653 } else {
1654 /* We have no pidfd, or we shall wait for some event other than WEXITED */
1655 r = event_make_signal_data(e, SIGCHLD, NULL);
1656 if (r < 0)
1657 return r;
1658
1659 e->need_process_child = true;
1660 }
1661
1662 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1663 if (r < 0)
1664 return r;
1665
1666 /* These must be done after everything succeeds. */
1667 s->child.pid = pid;
1668 e->n_online_child_sources++;
1669
1670 if (ret)
1671 *ret = s;
1672 TAKE_PTR(s);
1673 return 0;
1674 }
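
/* Illustrative usage sketch: watching a forked child for exit. As checked above, SIGCHLD must be
 * blocked before the first child source is added. "on_child" is a hypothetical handler name;
 * error handling is abbreviated.
 *
 *     static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *             // si->si_pid, si->si_code (CLD_EXITED, ...) and si->si_status describe the result
 *             return 0;
 *     }
 *
 *     sigset_t ss;
 *     sigemptyset(&ss);
 *     sigaddset(&ss, SIGCHLD);
 *     (void) pthread_sigmask(SIG_BLOCK, &ss, NULL);
 *
 *     pid_t pid = fork();
 *     if (pid == 0)
 *             _exit(0);                            // child branch
 *
 *     r = sd_event_add_child(e, NULL, pid, WEXITED, on_child, NULL);
 */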
1675
1676 _public_ int sd_event_add_child_pidfd(
1677 sd_event *e,
1678 sd_event_source **ret,
1679 int pidfd,
1680 int options,
1681 sd_event_child_handler_t callback,
1682 void *userdata) {
1683
1684
1685 _cleanup_(source_freep) sd_event_source *s = NULL;
1686 pid_t pid;
1687 int r;
1688
1689 assert_return(e, -EINVAL);
1690 assert_return(e = event_resolve(e), -ENOPKG);
1691 assert_return(pidfd >= 0, -EBADF);
1692 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1693 assert_return(options != 0, -EINVAL);
1694 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1695 assert_return(!event_origin_changed(e), -ECHILD);
1696
1697 if (!callback)
1698 callback = child_exit_callback;
1699
1700 if (e->n_online_child_sources == 0) {
1701 r = signal_is_blocked(SIGCHLD);
1702 if (r < 0)
1703 return r;
1704 if (r == 0)
1705 return -EBUSY;
1706 }
1707
1708 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1709 if (r < 0)
1710 return r;
1711
1712 r = pidfd_get_pid(pidfd, &pid);
1713 if (r < 0)
1714 return r;
1715
1716 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1717 return -EBUSY;
1718
1719 s = source_new(e, !ret, SOURCE_CHILD);
1720 if (!s)
1721 return -ENOMEM;
1722
1723 s->wakeup = WAKEUP_EVENT_SOURCE;
1724 s->child.pidfd = pidfd;
1725 s->child.pid = pid;
1726 s->child.options = options;
1727 s->child.callback = callback;
1728 s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1729 s->userdata = userdata;
1730 s->enabled = SD_EVENT_ONESHOT;
1731
1732 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1733 if (r < 0)
1734 return r;
1735
1736 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1737 /* We only want to watch for WEXITED */
1738 r = source_child_pidfd_register(s, s->enabled);
1739 if (r < 0)
1740 return r;
1741 } else {
1742 /* We shall wait for some event other than WEXITED */
1743 r = event_make_signal_data(e, SIGCHLD, NULL);
1744 if (r < 0)
1745 return r;
1746
1747 e->need_process_child = true;
1748 }
1749
1750 e->n_online_child_sources++;
1751
1752 if (ret)
1753 *ret = s;
1754 TAKE_PTR(s);
1755 return 0;
1756 }
1757
1758 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1759 assert(s);
1760
1761 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1762 }
1763
1764 _public_ int sd_event_add_defer(
1765 sd_event *e,
1766 sd_event_source **ret,
1767 sd_event_handler_t callback,
1768 void *userdata) {
1769
1770 _cleanup_(source_freep) sd_event_source *s = NULL;
1771 int r;
1772
1773 assert_return(e, -EINVAL);
1774 assert_return(e = event_resolve(e), -ENOPKG);
1775 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1776 assert_return(!event_origin_changed(e), -ECHILD);
1777
1778 if (!callback)
1779 callback = generic_exit_callback;
1780
1781 s = source_new(e, !ret, SOURCE_DEFER);
1782 if (!s)
1783 return -ENOMEM;
1784
1785 s->defer.callback = callback;
1786 s->userdata = userdata;
1787 s->enabled = SD_EVENT_ONESHOT;
1788
1789 r = source_set_pending(s, true);
1790 if (r < 0)
1791 return r;
1792
1793 if (ret)
1794 *ret = s;
1795 TAKE_PTR(s);
1796
1797 return 0;
1798 }
1799
1800 _public_ int sd_event_add_post(
1801 sd_event *e,
1802 sd_event_source **ret,
1803 sd_event_handler_t callback,
1804 void *userdata) {
1805
1806 _cleanup_(source_freep) sd_event_source *s = NULL;
1807 int r;
1808
1809 assert_return(e, -EINVAL);
1810 assert_return(e = event_resolve(e), -ENOPKG);
1811 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1812 assert_return(!event_origin_changed(e), -ECHILD);
1813
1814 if (!callback)
1815 callback = generic_exit_callback;
1816
1817 s = source_new(e, !ret, SOURCE_POST);
1818 if (!s)
1819 return -ENOMEM;
1820
1821 s->post.callback = callback;
1822 s->userdata = userdata;
1823 s->enabled = SD_EVENT_ON;
1824
1825 r = set_ensure_put(&e->post_sources, NULL, s);
1826 if (r < 0)
1827 return r;
1828 assert(r > 0);
1829
1830 if (ret)
1831 *ret = s;
1832 TAKE_PTR(s);
1833
1834 return 0;
1835 }
1836
1837 _public_ int sd_event_add_exit(
1838 sd_event *e,
1839 sd_event_source **ret,
1840 sd_event_handler_t callback,
1841 void *userdata) {
1842
1843 _cleanup_(source_freep) sd_event_source *s = NULL;
1844 int r;
1845
1846 assert_return(e, -EINVAL);
1847 assert_return(e = event_resolve(e), -ENOPKG);
1848 assert_return(callback, -EINVAL);
1849 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1850 assert_return(!event_origin_changed(e), -ECHILD);
1851
1852 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1853 if (r < 0)
1854 return r;
1855
1856 s = source_new(e, !ret, SOURCE_EXIT);
1857 if (!s)
1858 return -ENOMEM;
1859
1860 s->exit.callback = callback;
1861 s->userdata = userdata;
1862 s->exit.prioq_index = PRIOQ_IDX_NULL;
1863 s->enabled = SD_EVENT_ONESHOT;
1864
1865 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1866 if (r < 0)
1867 return r;
1868
1869 if (ret)
1870 *ret = s;
1871 TAKE_PTR(s);
1872
1873 return 0;
1874 }
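
/* Illustrative usage sketch: defer and exit sources. A defer source is created already pending
 * and thus runs on the next loop iteration; an exit source runs once sd_event_exit() has been
 * called and the loop is shutting down, in priority order. "cleanup_handler" is a hypothetical
 * name.
 *
 *     static int cleanup_handler(sd_event_source *s, void *userdata) {
 *             // runs during loop shutdown
 *             return 0;
 *     }
 *
 *     r = sd_event_add_exit(e, NULL, cleanup_handler, NULL);
 */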
1875
1876 _public_ int sd_event_trim_memory(void) {
1877 int r;
1878
1879 /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1880 * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1881 * NULL callback parameter. */
1882
1883 log_debug("Memory pressure event, trimming malloc() memory.");
1884
1885 #if HAVE_GENERIC_MALLINFO
1886 generic_mallinfo before_mallinfo = generic_mallinfo_get();
1887 #endif
1888
1889 usec_t before_timestamp = now(CLOCK_MONOTONIC);
1890 hashmap_trim_pools();
1891 r = malloc_trim(0);
1892 usec_t after_timestamp = now(CLOCK_MONOTONIC);
1893
1894 if (r > 0)
1895 log_debug("Successfully trimmed some memory.");
1896 else
1897 log_debug("Couldn't trim any memory.");
1898
1899 usec_t period = after_timestamp - before_timestamp;
1900
1901 #if HAVE_GENERIC_MALLINFO
1902 generic_mallinfo after_mallinfo = generic_mallinfo_get();
1903 size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1904 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1905 log_struct(LOG_DEBUG,
1906 LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1907 FORMAT_TIMESPAN(period, 0),
1908 FORMAT_BYTES(l)),
1909 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1910 "TRIMMED_BYTES=%zu", l,
1911 "TRIMMED_USEC=" USEC_FMT, period);
1912 #else
1913 log_struct(LOG_DEBUG,
1914 LOG_MESSAGE("Memory trimming took %s.",
1915 FORMAT_TIMESPAN(period, 0)),
1916 "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1917 "TRIMMED_USEC=" USEC_FMT, period);
1918 #endif
1919
1920 return 0;
1921 }
1922
1923 static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1924 assert(s);
1925
1926 sd_event_trim_memory();
1927 return 0;
1928 }
1929
1930 _public_ int sd_event_add_memory_pressure(
1931 sd_event *e,
1932 sd_event_source **ret,
1933 sd_event_handler_t callback,
1934 void *userdata) {
1935
1936 _cleanup_free_ char *w = NULL;
1937 _cleanup_(source_freep) sd_event_source *s = NULL;
1938 _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
1939 _cleanup_free_ void *write_buffer = NULL;
1940 const char *watch, *watch_fallback = NULL, *env;
1941 size_t write_buffer_size = 0;
1942 struct stat st;
1943 uint32_t events;
1944 bool locked;
1945 int r;
1946
1947 assert_return(e, -EINVAL);
1948 assert_return(e = event_resolve(e), -ENOPKG);
1949 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1950 assert_return(!event_origin_changed(e), -ECHILD);
1951
1952 if (!callback)
1953 callback = memory_pressure_callback;
1954
1955 s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1956 if (!s)
1957 return -ENOMEM;
1958
1959 s->wakeup = WAKEUP_EVENT_SOURCE;
1960 s->memory_pressure.callback = callback;
1961 s->userdata = userdata;
1962 s->enabled = SD_EVENT_ON;
1963 s->memory_pressure.fd = -EBADF;
1964
1965 env = secure_getenv("MEMORY_PRESSURE_WATCH");
1966 if (env) {
1967 if (isempty(env) || path_equal(env, "/dev/null"))
1968 return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1969 "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1970
1971 if (!path_is_absolute(env) || !path_is_normalized(env))
1972 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1973 "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1974
1975 watch = env;
1976
1977 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1978 if (env) {
1979 r = unbase64mem(env, &write_buffer, &write_buffer_size);
1980 if (r < 0)
1981 return r;
1982 }
1983
1984 locked = true;
1985 } else {
1986
1987 r = is_pressure_supported();
1988 if (r < 0)
1989 return r;
1990 if (r == 0)
1991 return -EOPNOTSUPP;
1992
1993 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1994 * the system wide pressure if for some reason we cannot (which could be: memory controller
1995 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1996 * only use the system-wide logic. */
1997 r = cg_all_unified();
1998 if (r < 0)
1999 return r;
2000 if (r == 0)
2001 watch = "/proc/pressure/memory";
2002 else {
2003 _cleanup_free_ char *cg = NULL;
2004
2005 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
2006 if (r < 0)
2007 return r;
2008
2009 w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
2010 if (!w)
2011 return -ENOMEM;
2012
2013 watch = w;
2014 watch_fallback = "/proc/pressure/memory";
2015 }
2016
2017 /* Android uses three levels in its userspace low memory killer logic:
2018 * some 70000 1000000
2019 * some 100000 1000000
2020 * full 70000 1000000
2021 *
2022 * GNOME's low memory monitor uses:
2023 * some 70000 1000000
2024 * some 100000 1000000
2025 * full 100000 1000000
2026 *
2027                          * We'll default to the middle level that both agree on, i.e. "some 100000 1000000". Except we do
2028                          * it on a 2s window (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window
2029                          * duration the kernel permits for unprivileged use, now and in the future. */
2030 if (asprintf((char**) &write_buffer,
2031 "%s " USEC_FMT " " USEC_FMT,
2032 MEMORY_PRESSURE_DEFAULT_TYPE,
2033 MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2034 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2035 return -ENOMEM;
2036
2037 write_buffer_size = strlen(write_buffer) + 1;
2038 locked = false;
2039 }
2040
2041 path_fd = open(watch, O_PATH|O_CLOEXEC);
2042 if (path_fd < 0) {
2043 if (errno != ENOENT)
2044 return -errno;
2045
2046 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2047 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2048 * the PSI service apparently is not supported) */
2049 if (!watch_fallback)
2050 return locked ? -ENOENT : -EOPNOTSUPP;
2051
2052 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
2053 if (path_fd < 0) {
2054 if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2055 return -EOPNOTSUPP;
2056 return -errno;
2057 }
2058 }
2059
2060 if (fstat(path_fd, &st) < 0)
2061 return -errno;
2062
2063 if (S_ISSOCK(st.st_mode)) {
2064 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2065 if (fd < 0)
2066 return -errno;
2067
2068 r = connect_unix_path(fd, path_fd, NULL);
2069 if (r < 0)
2070 return r;
2071
2072 events = EPOLLIN;
2073
2074 } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2075 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2076 if (fd < 0)
2077 return fd;
2078
2079 if (S_ISREG(st.st_mode)) {
2080 struct statfs sfs;
2081
2082                         /* If this is a regular file, validate that it is a procfs or cgroupfs file, for which we watch for EPOLLPRI */
2083
2084 if (fstatfs(fd, &sfs) < 0)
2085 return -errno;
2086
2087 if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2088 !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2089 return -ENOTTY;
2090
2091 events = EPOLLPRI;
2092 } else
2093 /* For fifos and char devices just watch for EPOLLIN */
2094 events = EPOLLIN;
2095
2096 } else if (S_ISDIR(st.st_mode))
2097 return -EISDIR;
2098 else
2099 return -EBADF;
2100
2101 s->memory_pressure.fd = TAKE_FD(fd);
2102 s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2103 s->memory_pressure.write_buffer_size = write_buffer_size;
2104 s->memory_pressure.events = events;
2105 s->memory_pressure.locked = locked;
2106
2107                 /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2108                  * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2109                  * fd with the epoll right away. Instead, we just add the event source to a list of memory pressure
2110                  * event sources on which writes must be executed before the first event loop iteration is
2111                  * executed. (We could also write the data here, right away, but we want to give the caller the
2112                  * freedom to call sd_event_source_set_memory_pressure_type() and
2113                  * sd_event_source_set_memory_pressure_rate() before we write it.) */
2114
2115 if (s->memory_pressure.write_buffer_size > 0)
2116 source_memory_pressure_add_to_write_list(s);
2117 else {
2118 r = source_memory_pressure_register(s, s->enabled);
2119 if (r < 0)
2120 return r;
2121 }
2122
2123 if (ret)
2124 *ret = s;
2125 TAKE_PTR(s);
2126
2127 return 0;
2128 }
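
/* Illustrative sketch, not used by this file: how a caller might hook up the memory pressure logic
 * above. Passing a NULL callback selects memory_pressure_callback(), i.e. sd_event_trim_memory(), and
 * the built-in defaults (per the comment above, the shared "some" level on a 200ms/2s window) are
 * written to the PSI file on the first event loop iteration. Error handling is abbreviated; the loop
 * is assumed to be set up with the public sd_event_default()/sd_event_loop() entry points. */
#if 0
static int example_memory_pressure_setup(void) {
        _cleanup_(sd_event_unrefp) sd_event *e = NULL;
        int r;

        r = sd_event_default(&e);
        if (r < 0)
                return r;

        /* With ret=NULL the event source is "floating", i.e. owned by the event loop itself. */
        r = sd_event_add_memory_pressure(e, /* ret= */ NULL, /* callback= */ NULL, /* userdata= */ NULL);
        if (r < 0 && !IN_SET(r, -EOPNOTSUPP, -EHOSTDOWN)) /* no PSI support / explicitly disabled: ignore */
                return r;

        return sd_event_loop(e);
}
#endif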
2129
2130 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2131 assert(e);
2132
2133 if (!d)
2134 return;
2135
2136 assert(hashmap_isempty(d->inodes));
2137 assert(hashmap_isempty(d->wd));
2138
2139 if (d->buffer_filled > 0)
2140 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
2141
2142 hashmap_free(d->inodes);
2143 hashmap_free(d->wd);
2144
2145 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2146
2147 if (d->fd >= 0) {
2148 if (!event_origin_changed(e) &&
2149 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
2150 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2151
2152 safe_close(d->fd);
2153 }
2154 free(d);
2155 }
2156
2157 static int event_make_inotify_data(
2158 sd_event *e,
2159 int64_t priority,
2160 struct inotify_data **ret) {
2161
2162 _cleanup_close_ int fd = -EBADF;
2163 struct inotify_data *d;
2164 int r;
2165
2166 assert(e);
2167
2168 d = hashmap_get(e->inotify_data, &priority);
2169 if (d) {
2170 if (ret)
2171 *ret = d;
2172 return 0;
2173 }
2174
2175         fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
2176 if (fd < 0)
2177 return -errno;
2178
2179 fd = fd_move_above_stdio(fd);
2180
2181 d = new(struct inotify_data, 1);
2182 if (!d)
2183 return -ENOMEM;
2184
2185 *d = (struct inotify_data) {
2186 .wakeup = WAKEUP_INOTIFY_DATA,
2187 .fd = TAKE_FD(fd),
2188 .priority = priority,
2189 };
2190
2191 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
2192 if (r < 0) {
2193 d->fd = safe_close(d->fd);
2194 free(d);
2195 return r;
2196 }
2197
2198 struct epoll_event ev = {
2199 .events = EPOLLIN,
2200 .data.ptr = d,
2201 };
2202
2203 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2204 r = -errno;
2205 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2206 * remove the fd from the epoll first, which we don't want as we couldn't
2207 * add it in the first place. */
2208 event_free_inotify_data(e, d);
2209 return r;
2210 }
2211
2212 if (ret)
2213 *ret = d;
2214
2215 return 1;
2216 }
2217
2218 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
2219 int r;
2220
2221 assert(x);
2222 assert(y);
2223
2224 r = CMP(x->dev, y->dev);
2225 if (r != 0)
2226 return r;
2227
2228 return CMP(x->ino, y->ino);
2229 }
2230
2231 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2232 assert(d);
2233
2234 siphash24_compress_typesafe(d->dev, state);
2235 siphash24_compress_typesafe(d->ino, state);
2236 }
2237
2238 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
2239
2240 static void event_free_inode_data(
2241 sd_event *e,
2242 struct inode_data *d) {
2243
2244 assert(e);
2245
2246 if (!d)
2247 return;
2248
2249 assert(!d->event_sources);
2250
2251 if (d->fd >= 0) {
2252 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
2253 safe_close(d->fd);
2254 }
2255
2256 if (d->inotify_data) {
2257
2258 if (d->wd >= 0) {
2259 if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
2260                                 /* So here's a problem. At the time this runs the watch descriptor might already be
2261                                  * invalidated, because an IN_IGNORED event might be queued right at the moment we enter
2262                                  * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's quite
2263                                  * likely to happen. */
2264
2265 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2266 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2267 }
2268
2269 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2270 }
2271
2272 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2273 }
2274
2275 free(d);
2276 }
2277
2278 static void event_gc_inotify_data(
2279 sd_event *e,
2280 struct inotify_data *d) {
2281
2282 assert(e);
2283
2284 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2285 * any inode with it anymore, which in turn happens if no event source of this priority is interested
2286 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2287 * (under the expectation that the GC is called again once the counter is decremented). */
2288
2289 if (!d)
2290 return;
2291
2292 if (!hashmap_isempty(d->inodes))
2293 return;
2294
2295 if (d->n_busy > 0)
2296 return;
2297
2298 event_free_inotify_data(e, d);
2299 }
2300
2301 static void event_gc_inode_data(
2302 sd_event *e,
2303 struct inode_data *d) {
2304
2305 struct inotify_data *inotify_data;
2306
2307 assert(e);
2308
2309 if (!d)
2310 return;
2311
2312 if (d->event_sources)
2313 return;
2314
2315 inotify_data = d->inotify_data;
2316 event_free_inode_data(e, d);
2317
2318 event_gc_inotify_data(e, inotify_data);
2319 }
2320
2321 static int event_make_inode_data(
2322 sd_event *e,
2323 struct inotify_data *inotify_data,
2324 dev_t dev,
2325 ino_t ino,
2326 struct inode_data **ret) {
2327
2328 struct inode_data *d, key;
2329 int r;
2330
2331 assert(e);
2332 assert(inotify_data);
2333
2334 key = (struct inode_data) {
2335 .ino = ino,
2336 .dev = dev,
2337 };
2338
2339 d = hashmap_get(inotify_data->inodes, &key);
2340 if (d) {
2341 if (ret)
2342 *ret = d;
2343
2344 return 0;
2345 }
2346
2347 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2348 if (r < 0)
2349 return r;
2350
2351 d = new(struct inode_data, 1);
2352 if (!d)
2353 return -ENOMEM;
2354
2355 *d = (struct inode_data) {
2356 .dev = dev,
2357 .ino = ino,
2358 .wd = -1,
2359 .fd = -EBADF,
2360 .inotify_data = inotify_data,
2361 };
2362
2363 r = hashmap_put(inotify_data->inodes, d, d);
2364 if (r < 0) {
2365 free(d);
2366 return r;
2367 }
2368
2369 if (ret)
2370 *ret = d;
2371
2372 return 1;
2373 }
2374
2375 static uint32_t inode_data_determine_mask(struct inode_data *d) {
2376 bool excl_unlink = true;
2377 uint32_t combined = 0;
2378
2379 assert(d);
2380
2381 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2382 * the IN_EXCL_UNLINK flag is ANDed instead.
2383 *
2384 * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
2385 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
2386 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
2387 * events we don't care for client-side. */
2388
2389 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2390
2391 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2392 excl_unlink = false;
2393
2394 combined |= s->inotify.mask;
2395 }
2396
2397 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2398 }
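
/* Worked example of the combining rule above, not used by this file: if one source on an inode asks
 * for IN_CLOSE_WRITE|IN_EXCL_UNLINK and a second asks for IN_MOVED_TO without IN_EXCL_UNLINK, the
 * mask handed to the kernel is IN_CLOSE_WRITE|IN_MOVED_TO: the event bits are ORed, but
 * IN_EXCL_UNLINK is dropped because not all sources requested it. */
#if 0
static void example_combined_mask(void) {
        uint32_t a = IN_CLOSE_WRITE|IN_EXCL_UNLINK, b = IN_MOVED_TO;

        /* OR the event bits, AND the IN_EXCL_UNLINK bit, strip the per-watch control flags. */
        uint32_t combined =
                ((a|b) & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) |
                (FLAGS_SET(a, IN_EXCL_UNLINK) && FLAGS_SET(b, IN_EXCL_UNLINK) ? IN_EXCL_UNLINK : 0);

        assert(combined == (IN_CLOSE_WRITE|IN_MOVED_TO));
}
#endif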
2399
2400 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2401 uint32_t combined_mask;
2402 int wd, r;
2403
2404 assert(d);
2405 assert(d->fd >= 0);
2406
2407 combined_mask = inode_data_determine_mask(d);
2408
2409 if (d->wd >= 0 && combined_mask == d->combined_mask)
2410 return 0;
2411
2412 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2413 if (r < 0)
2414 return r;
2415
2416 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2417 if (wd < 0)
2418 return -errno;
2419
2420 if (d->wd < 0) {
2421 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2422 if (r < 0) {
2423 (void) inotify_rm_watch(d->inotify_data->fd, wd);
2424 return r;
2425 }
2426
2427 d->wd = wd;
2428
2429 } else if (d->wd != wd) {
2430
2431 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2432                         (void) inotify_rm_watch(d->inotify_data->fd, wd); /* the watch lives on the inotify fd, not the inode fd */
2433 return -EINVAL;
2434 }
2435
2436 d->combined_mask = combined_mask;
2437 return 1;
2438 }
2439
2440 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2441 assert(s);
2442
2443 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2444 }
2445
2446 static int event_add_inotify_fd_internal(
2447 sd_event *e,
2448 sd_event_source **ret,
2449 int fd,
2450 bool donate,
2451 uint32_t mask,
2452 sd_event_inotify_handler_t callback,
2453 void *userdata) {
2454
2455 _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
2456 _cleanup_(source_freep) sd_event_source *s = NULL;
2457 struct inotify_data *inotify_data = NULL;
2458 struct inode_data *inode_data = NULL;
2459 struct stat st;
2460 int r;
2461
2462 assert_return(e, -EINVAL);
2463 assert_return(e = event_resolve(e), -ENOPKG);
2464 assert_return(fd >= 0, -EBADF);
2465 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2466 assert_return(!event_origin_changed(e), -ECHILD);
2467
2468 if (!callback)
2469 callback = inotify_exit_callback;
2470
2471         /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2472          * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you
2473          * internally, hence the flag is not available to callers. */
2474 if (mask & IN_MASK_ADD)
2475 return -EINVAL;
2476
2477 if (fstat(fd, &st) < 0)
2478 return -errno;
2479
2480 s = source_new(e, !ret, SOURCE_INOTIFY);
2481 if (!s)
2482 return -ENOMEM;
2483
2484 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2485 s->inotify.mask = mask;
2486 s->inotify.callback = callback;
2487 s->userdata = userdata;
2488
2489 /* Allocate an inotify object for this priority, and an inode object within it */
2490 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2491 if (r < 0)
2492 return r;
2493
2494 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2495 if (r < 0) {
2496 event_gc_inotify_data(e, inotify_data);
2497 return r;
2498 }
2499
2500         /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2501          * the event source until then, for which we need the original inode fd. */
2502 if (inode_data->fd < 0) {
2503 if (donated_fd >= 0)
2504 inode_data->fd = TAKE_FD(donated_fd);
2505 else {
2506 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2507 if (inode_data->fd < 0) {
2508 r = -errno;
2509 event_gc_inode_data(e, inode_data);
2510 return r;
2511 }
2512 }
2513
2514 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
2515 }
2516
2517 /* Link our event source to the inode data object */
2518 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2519 s->inotify.inode_data = inode_data;
2520
2521 /* Actually realize the watch now */
2522 r = inode_data_realize_watch(e, inode_data);
2523 if (r < 0)
2524 return r;
2525
2526 if (ret)
2527 *ret = s;
2528 TAKE_PTR(s);
2529
2530 return 0;
2531 }
2532
2533 _public_ int sd_event_add_inotify_fd(
2534 sd_event *e,
2535 sd_event_source **ret,
2536 int fd,
2537 uint32_t mask,
2538 sd_event_inotify_handler_t callback,
2539 void *userdata) {
2540
2541 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2542 }
2543
2544 _public_ int sd_event_add_inotify(
2545 sd_event *e,
2546 sd_event_source **ret,
2547 const char *path,
2548 uint32_t mask,
2549 sd_event_inotify_handler_t callback,
2550 void *userdata) {
2551
2552 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2553 int fd, r;
2554
2555 assert_return(path, -EINVAL);
2556
2557 fd = open(path, O_PATH | O_CLOEXEC |
2558 (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2559 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2560 if (fd < 0)
2561 return -errno;
2562
2563 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2564 if (r < 0)
2565 return r;
2566
2567 (void) sd_event_source_set_description(s, path);
2568
2569 if (ret)
2570 *ret = s;
2571
2572 return r;
2573 }
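
/* Illustrative sketch, not used by this file: watching a (hypothetical) directory for new entries via
 * the public sd_event_add_inotify() defined above. Note that IN_ONLYDIR additionally makes the open()
 * above use O_DIRECTORY, and IN_MASK_ADD would be refused with -EINVAL. Error handling is abbreviated. */
#if 0
static int example_inotify_handler(sd_event_source *s, const struct inotify_event *event, void *userdata) {
        /* event->name is only meaningful if event->len > 0, i.e. for events on directory entries. */
        log_debug("inotify event: mask=%" PRIx32 " name=%s", event->mask, event->len > 0 ? event->name : "(none)");
        return 0;
}

static int example_watch_directory(sd_event *e) {
        return sd_event_add_inotify(e, /* ret= */ NULL, "/tmp/example-dir",
                                    IN_CREATE|IN_MOVED_TO|IN_ONLYDIR,
                                    example_inotify_handler, /* userdata= */ NULL);
}
#endif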
2574
2575 static sd_event_source* event_source_free(sd_event_source *s) {
2576 if (!s)
2577 return NULL;
2578
2579 /* Here's a special hack: when we are called from a
2580 * dispatch handler we won't free the event source
2581 * immediately, but we will detach the fd from the
2582 * epoll. This way it is safe for the caller to unref
2583 * the event source and immediately close the fd, but
2584 * we still retain a valid event source object after
2585 * the callback. */
2586
2587 if (s->dispatching)
2588 source_disconnect(s);
2589 else
2590 source_free(s);
2591
2592 return NULL;
2593 }
2594
2595 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2596
2597 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2598 assert_return(s, -EINVAL);
2599 assert_return(!event_origin_changed(s->event), -ECHILD);
2600
2601 return free_and_strdup(&s->description, description);
2602 }
2603
2604 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2605 assert_return(s, -EINVAL);
2606 assert_return(description, -EINVAL);
2607
2608 if (!s->description)
2609 return -ENXIO;
2610
2611 *description = s->description;
2612 return 0;
2613 }
2614
2615 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2616 assert_return(s, NULL);
2617 assert_return(!event_origin_changed(s->event), NULL);
2618
2619 return s->event;
2620 }
2621
2622 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2623 assert_return(s, -EINVAL);
2624 assert_return(s->type != SOURCE_EXIT, -EDOM);
2625 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2626 assert_return(!event_origin_changed(s->event), -ECHILD);
2627
2628 return s->pending;
2629 }
2630
2631 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2632 assert_return(s, -EINVAL);
2633 assert_return(s->type == SOURCE_IO, -EDOM);
2634 assert_return(!event_origin_changed(s->event), -ECHILD);
2635
2636 return s->io.fd;
2637 }
2638
2639 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2640 int r;
2641
2642 assert_return(s, -EINVAL);
2643 assert_return(fd >= 0, -EBADF);
2644 assert_return(s->type == SOURCE_IO, -EDOM);
2645 assert_return(!event_origin_changed(s->event), -ECHILD);
2646
2647 if (s->io.fd == fd)
2648 return 0;
2649
2650 if (event_source_is_offline(s)) {
2651 s->io.fd = fd;
2652 s->io.registered = false;
2653 } else {
2654 int saved_fd;
2655
2656 saved_fd = s->io.fd;
2657 assert(s->io.registered);
2658
2659 s->io.fd = fd;
2660 s->io.registered = false;
2661
2662 r = source_io_register(s, s->enabled, s->io.events);
2663 if (r < 0) {
2664 s->io.fd = saved_fd;
2665 s->io.registered = true;
2666 return r;
2667 }
2668
2669 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2670 }
2671
2672 return 0;
2673 }
2674
2675 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2676 assert_return(s, -EINVAL);
2677 assert_return(s->type == SOURCE_IO, -EDOM);
2678 assert_return(!event_origin_changed(s->event), -ECHILD);
2679
2680 return s->io.owned;
2681 }
2682
2683 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2684 assert_return(s, -EINVAL);
2685 assert_return(s->type == SOURCE_IO, -EDOM);
2686 assert_return(!event_origin_changed(s->event), -ECHILD);
2687
2688 s->io.owned = own;
2689 return 0;
2690 }
2691
2692 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2693 assert_return(s, -EINVAL);
2694 assert_return(events, -EINVAL);
2695 assert_return(s->type == SOURCE_IO, -EDOM);
2696 assert_return(!event_origin_changed(s->event), -ECHILD);
2697
2698 *events = s->io.events;
2699 return 0;
2700 }
2701
2702 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2703 int r;
2704
2705 assert_return(s, -EINVAL);
2706 assert_return(s->type == SOURCE_IO, -EDOM);
2707 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2708 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2709 assert_return(!event_origin_changed(s->event), -ECHILD);
2710
2711 /* edge-triggered updates are never skipped, so we can reset edges */
2712 if (s->io.events == events && !(events & EPOLLET))
2713 return 0;
2714
2715 r = source_set_pending(s, false);
2716 if (r < 0)
2717 return r;
2718
2719 if (event_source_is_online(s)) {
2720 r = source_io_register(s, s->enabled, events);
2721 if (r < 0)
2722 return r;
2723 }
2724
2725 s->io.events = events;
2726
2727 return 0;
2728 }
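
/* Illustrative sketch, not used by this file: a common pattern with the setter above is to subscribe
 * to EPOLLOUT only while output is queued, and to drop it again once the buffer has drained, so the
 * loop doesn't spin on an always-writable fd. */
#if 0
static int example_update_io_events(sd_event_source *io_source, bool have_pending_output) {
        return sd_event_source_set_io_events(io_source, EPOLLIN | (have_pending_output ? EPOLLOUT : 0));
}
#endif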
2729
2730 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2731 assert_return(s, -EINVAL);
2732 assert_return(revents, -EINVAL);
2733 assert_return(s->type == SOURCE_IO, -EDOM);
2734 assert_return(s->pending, -ENODATA);
2735 assert_return(!event_origin_changed(s->event), -ECHILD);
2736
2737 *revents = s->io.revents;
2738 return 0;
2739 }
2740
2741 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2742 assert_return(s, -EINVAL);
2743 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2744 assert_return(!event_origin_changed(s->event), -ECHILD);
2745
2746 return s->signal.sig;
2747 }
2748
2749 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2750 assert_return(s, -EINVAL);
2751 assert_return(!event_origin_changed(s->event), -ECHILD);
2752
2753 *priority = s->priority;
2754 return 0;
2755 }
2756
2757 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2758 bool rm_inotify = false, rm_inode = false;
2759 struct inotify_data *new_inotify_data = NULL;
2760 struct inode_data *new_inode_data = NULL;
2761 int r;
2762
2763 assert_return(s, -EINVAL);
2764 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2765 assert_return(!event_origin_changed(s->event), -ECHILD);
2766
2767 if (s->priority == priority)
2768 return 0;
2769
2770 if (s->type == SOURCE_INOTIFY) {
2771 struct inode_data *old_inode_data;
2772
2773 assert(s->inotify.inode_data);
2774 old_inode_data = s->inotify.inode_data;
2775
2776                 /* We need the original fd to change the priority. If we don't have it we can't change the priority
2777                  * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2778                  * events we allow priority changes only until the first following iteration. */
2779 if (old_inode_data->fd < 0)
2780 return -EOPNOTSUPP;
2781
2782 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2783 if (r < 0)
2784 return r;
2785 rm_inotify = r > 0;
2786
2787 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2788 if (r < 0)
2789 goto fail;
2790 rm_inode = r > 0;
2791
2792 if (new_inode_data->fd < 0) {
2793 /* Duplicate the fd for the new inode object if we don't have any yet */
2794 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2795 if (new_inode_data->fd < 0) {
2796 r = -errno;
2797 goto fail;
2798 }
2799
2800 LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
2801 }
2802
2803 /* Move the event source to the new inode data structure */
2804 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2805 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2806 s->inotify.inode_data = new_inode_data;
2807
2808 /* Now create the new watch */
2809 r = inode_data_realize_watch(s->event, new_inode_data);
2810 if (r < 0) {
2811 /* Move it back */
2812 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2813 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2814 s->inotify.inode_data = old_inode_data;
2815 goto fail;
2816 }
2817
2818 s->priority = priority;
2819
2820 event_gc_inode_data(s->event, old_inode_data);
2821
2822 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2823 struct signal_data *old, *d;
2824
2825 /* Move us from the signalfd belonging to the old
2826 * priority to the signalfd of the new priority */
2827
2828 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2829
2830 s->priority = priority;
2831
2832 r = event_make_signal_data(s->event, s->signal.sig, &d);
2833 if (r < 0) {
2834 s->priority = old->priority;
2835 return r;
2836 }
2837
2838 event_unmask_signal_data(s->event, old, s->signal.sig);
2839 } else
2840 s->priority = priority;
2841
2842 event_source_pp_prioq_reshuffle(s);
2843
2844 if (s->type == SOURCE_EXIT)
2845 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2846
2847 return 0;
2848
2849 fail:
2850 if (rm_inode)
2851 event_free_inode_data(s->event, new_inode_data);
2852
2853 if (rm_inotify)
2854 event_free_inotify_data(s->event, new_inotify_data);
2855
2856 return r;
2857 }
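
/* Illustrative sketch, not used by this file: as the -EOPNOTSUPP branch above documents, the priority
 * of an inotify event source can only be changed while the O_PATH inode fd is still around, i.e.
 * before the first event loop iteration after sd_event_add_inotify(). The path is hypothetical. */
#if 0
static int example_inotify_priority(sd_event *e, sd_event_inotify_handler_t handler) {
        sd_event_source *s;
        int r;

        r = sd_event_add_inotify(e, &s, "/tmp/example-dir", IN_CLOSE_WRITE, handler, /* userdata= */ NULL);
        if (r < 0)
                return r;

        /* Must happen before sd_event_run()/sd_event_loop() is first invoked, otherwise -EOPNOTSUPP. */
        return sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE);
}
#endif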
2858
2859 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2860 /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2861 if (!s && !ret)
2862 return false;
2863
2864 assert_return(s, -EINVAL);
2865 assert_return(!event_origin_changed(s->event), -ECHILD);
2866
2867 if (ret)
2868 *ret = s->enabled;
2869
2870 return s->enabled != SD_EVENT_OFF;
2871 }
2872
2873 static int event_source_offline(
2874 sd_event_source *s,
2875 int enabled,
2876 bool ratelimited) {
2877
2878 bool was_offline;
2879 int r;
2880
2881 assert(s);
2882 assert(enabled == SD_EVENT_OFF || ratelimited);
2883
2884 /* Unset the pending flag when this event source is disabled */
2885 if (s->enabled != SD_EVENT_OFF &&
2886 enabled == SD_EVENT_OFF &&
2887 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2888 r = source_set_pending(s, false);
2889 if (r < 0)
2890 return r;
2891 }
2892
2893 was_offline = event_source_is_offline(s);
2894 s->enabled = enabled;
2895 s->ratelimited = ratelimited;
2896
2897 switch (s->type) {
2898
2899 case SOURCE_IO:
2900 source_io_unregister(s);
2901 break;
2902
2903 case SOURCE_SIGNAL:
2904 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2905 break;
2906
2907 case SOURCE_CHILD:
2908 if (!was_offline) {
2909 assert(s->event->n_online_child_sources > 0);
2910 s->event->n_online_child_sources--;
2911 }
2912
2913 if (EVENT_SOURCE_WATCH_PIDFD(s))
2914 source_child_pidfd_unregister(s);
2915 else
2916 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2917 break;
2918
2919 case SOURCE_EXIT:
2920 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2921 break;
2922
2923 case SOURCE_MEMORY_PRESSURE:
2924 source_memory_pressure_unregister(s);
2925 break;
2926
2927 case SOURCE_TIME_REALTIME:
2928 case SOURCE_TIME_BOOTTIME:
2929 case SOURCE_TIME_MONOTONIC:
2930 case SOURCE_TIME_REALTIME_ALARM:
2931 case SOURCE_TIME_BOOTTIME_ALARM:
2932 case SOURCE_DEFER:
2933 case SOURCE_POST:
2934 case SOURCE_INOTIFY:
2935 break;
2936
2937 default:
2938 assert_not_reached();
2939 }
2940
2941 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2942 event_source_time_prioq_reshuffle(s);
2943
2944 return 1;
2945 }
2946
2947 static int event_source_online(
2948 sd_event_source *s,
2949 int enabled,
2950 bool ratelimited) {
2951
2952 bool was_online;
2953 int r;
2954
2955 assert(s);
2956 assert(enabled != SD_EVENT_OFF || !ratelimited);
2957
2958 /* Unset the pending flag when this event source is enabled */
2959 if (s->enabled == SD_EVENT_OFF &&
2960 enabled != SD_EVENT_OFF &&
2961 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2962 r = source_set_pending(s, false);
2963 if (r < 0)
2964 return r;
2965 }
2966
2967 /* Are we really ready for onlining? */
2968 if (enabled == SD_EVENT_OFF || ratelimited) {
2969 /* Nope, we are not ready for onlining, then just update the precise state and exit */
2970 s->enabled = enabled;
2971 s->ratelimited = ratelimited;
2972 return 0;
2973 }
2974
2975 was_online = event_source_is_online(s);
2976
2977 switch (s->type) {
2978 case SOURCE_IO:
2979 r = source_io_register(s, enabled, s->io.events);
2980 if (r < 0)
2981 return r;
2982 break;
2983
2984 case SOURCE_SIGNAL:
2985 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2986 if (r < 0) {
2987 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2988 return r;
2989 }
2990
2991 break;
2992
2993 case SOURCE_CHILD:
2994 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2995 /* yes, we have pidfd */
2996
2997 r = source_child_pidfd_register(s, enabled);
2998 if (r < 0)
2999 return r;
3000 } else {
3001 /* no pidfd, or something other to watch for than WEXITED */
3002
3003 r = event_make_signal_data(s->event, SIGCHLD, NULL);
3004 if (r < 0) {
3005 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
3006 return r;
3007 }
3008 }
3009
3010 if (!was_online)
3011 s->event->n_online_child_sources++;
3012 break;
3013
3014 case SOURCE_MEMORY_PRESSURE:
3015 r = source_memory_pressure_register(s, enabled);
3016 if (r < 0)
3017 return r;
3018
3019 break;
3020
3021 case SOURCE_TIME_REALTIME:
3022 case SOURCE_TIME_BOOTTIME:
3023 case SOURCE_TIME_MONOTONIC:
3024 case SOURCE_TIME_REALTIME_ALARM:
3025 case SOURCE_TIME_BOOTTIME_ALARM:
3026 case SOURCE_EXIT:
3027 case SOURCE_DEFER:
3028 case SOURCE_POST:
3029 case SOURCE_INOTIFY:
3030 break;
3031
3032 default:
3033 assert_not_reached();
3034 }
3035
3036 s->enabled = enabled;
3037 s->ratelimited = ratelimited;
3038
3039 /* Non-failing operations below */
3040 if (s->type == SOURCE_EXIT)
3041 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
3042
3043 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3044 event_source_time_prioq_reshuffle(s);
3045
3046 return 1;
3047 }
3048
3049 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3050 int r;
3051
3052 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
3053
3054 /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3055 if (m == SD_EVENT_OFF && !s)
3056 return 0;
3057
3058 assert_return(s, -EINVAL);
3059 assert_return(!event_origin_changed(s->event), -ECHILD);
3060
3061 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3062 if (s->event->state == SD_EVENT_FINISHED)
3063 return m == SD_EVENT_OFF ? 0 : -ESTALE;
3064
3065 if (s->enabled == m) /* No change? */
3066 return 0;
3067
3068 if (m == SD_EVENT_OFF)
3069 r = event_source_offline(s, m, s->ratelimited);
3070 else {
3071 if (s->enabled != SD_EVENT_OFF) {
3072 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3073 * event source is already enabled after all. */
3074 s->enabled = m;
3075 return 0;
3076 }
3077
3078 r = event_source_online(s, m, s->ratelimited);
3079 }
3080 if (r < 0)
3081 return r;
3082
3083 event_source_pp_prioq_reshuffle(s);
3084 return 0;
3085 }
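
/* Illustrative sketch, not used by this file: thanks to the NULL shortcut above, cleanup paths may
 * unconditionally switch off an event source that may or may not have been created. */
#if 0
static void example_disable_maybe(sd_event_source *maybe_source) {
        /* Safe even if maybe_source is NULL or the event loop has already finished. */
        (void) sd_event_source_set_enabled(maybe_source, SD_EVENT_OFF);
}
#endif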
3086
3087 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
3088 assert_return(s, -EINVAL);
3089 assert_return(usec, -EINVAL);
3090 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3091 assert_return(!event_origin_changed(s->event), -ECHILD);
3092
3093 *usec = s->time.next;
3094 return 0;
3095 }
3096
3097 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
3098 int r;
3099
3100 assert_return(s, -EINVAL);
3101 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3102 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3103 assert_return(!event_origin_changed(s->event), -ECHILD);
3104
3105 r = source_set_pending(s, false);
3106 if (r < 0)
3107 return r;
3108
3109 s->time.next = usec;
3110
3111 event_source_time_prioq_reshuffle(s);
3112 return 0;
3113 }
3114
3115 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3116 usec_t t;
3117 int r;
3118
3119 assert_return(s, -EINVAL);
3120 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3121 assert_return(!event_origin_changed(s->event), -ECHILD);
3122
3123 if (usec == USEC_INFINITY)
3124 return sd_event_source_set_time(s, USEC_INFINITY);
3125
3126 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3127 if (r < 0)
3128 return r;
3129
3130 usec = usec_add(t, usec);
3131 if (usec == USEC_INFINITY)
3132 return -EOVERFLOW;
3133
3134 return sd_event_source_set_time(s, usec);
3135 }
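
/* Illustrative sketch, not used by this file: re-arming an existing timer event source five seconds
 * from now with the relative helper above, and enabling it for a single shot. The source is assumed
 * to have been created earlier with sd_event_add_time() (defined elsewhere in this file). */
#if 0
static int example_rearm_timer(sd_event_source *timer_source) {
        int r;

        r = sd_event_source_set_time_relative(timer_source, 5 * USEC_PER_SEC);
        if (r < 0)
                return r;

        return sd_event_source_set_enabled(timer_source, SD_EVENT_ONESHOT);
}
#endif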
3136
3137 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
3138 assert_return(s, -EINVAL);
3139 assert_return(usec, -EINVAL);
3140 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3141 assert_return(!event_origin_changed(s->event), -ECHILD);
3142
3143 *usec = s->time.accuracy;
3144 return 0;
3145 }
3146
3147 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
3148 int r;
3149
3150 assert_return(s, -EINVAL);
3151 assert_return(usec != UINT64_MAX, -EINVAL);
3152 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3153 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3154 assert_return(!event_origin_changed(s->event), -ECHILD);
3155
3156 r = source_set_pending(s, false);
3157 if (r < 0)
3158 return r;
3159
3160 if (usec == 0)
3161 usec = DEFAULT_ACCURACY_USEC;
3162
3163 s->time.accuracy = usec;
3164
3165 event_source_time_prioq_reshuffle(s);
3166 return 0;
3167 }
3168
3169 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3170 assert_return(s, -EINVAL);
3171 assert_return(clock, -EINVAL);
3172 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3173 assert_return(!event_origin_changed(s->event), -ECHILD);
3174
3175 *clock = event_source_type_to_clock(s->type);
3176 return 0;
3177 }
3178
3179 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
3180 assert_return(s, -EINVAL);
3181 assert_return(pid, -EINVAL);
3182 assert_return(s->type == SOURCE_CHILD, -EDOM);
3183 assert_return(!event_origin_changed(s->event), -ECHILD);
3184
3185 *pid = s->child.pid;
3186 return 0;
3187 }
3188
3189 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3190 assert_return(s, -EINVAL);
3191 assert_return(s->type == SOURCE_CHILD, -EDOM);
3192 assert_return(!event_origin_changed(s->event), -ECHILD);
3193
3194 if (s->child.pidfd < 0)
3195 return -EOPNOTSUPP;
3196
3197 return s->child.pidfd;
3198 }
3199
3200 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3201 assert_return(s, -EINVAL);
3202 assert_return(s->type == SOURCE_CHILD, -EDOM);
3203 assert_return(!event_origin_changed(s->event), -ECHILD);
3204 assert_return(SIGNAL_VALID(sig), -EINVAL);
3205
3206         /* If we have already seen an indication that the process exited, refuse sending a signal early. This
3207          * way we can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3208          * available. */
3209 if (s->child.exited)
3210 return -ESRCH;
3211
3212 if (s->child.pidfd >= 0) {
3213 siginfo_t copy;
3214
3215                 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, hence let's copy the
3216                  * structure here */
3217 if (si)
3218 copy = *si;
3219
3220 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3221 /* Let's propagate the error only if the system call is not implemented or prohibited */
3222 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3223 return -errno;
3224 } else
3225 return 0;
3226 }
3227
3228 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3229 * this here. */
3230 if (flags != 0)
3231 return -EOPNOTSUPP;
3232
3233 if (si) {
3234 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3235 siginfo_t copy = *si;
3236
3237 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3238 return -errno;
3239 } else if (kill(s->child.pid, sig) < 0)
3240 return -errno;
3241
3242 return 0;
3243 }
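
/* Illustrative sketch, not used by this file: asking a watched child to terminate via the helper
 * above, which prefers pidfd_send_signal() and falls back to rt_sigqueueinfo()/kill() as described.
 * Non-zero flags are only supported on the pidfd path. */
#if 0
static int example_terminate_child(sd_event_source *child_source) {
        return sd_event_source_send_child_signal(child_source, SIGTERM, /* si= */ NULL, /* flags= */ 0);
}
#endif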
3244
3245 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3246 assert_return(s, -EINVAL);
3247 assert_return(s->type == SOURCE_CHILD, -EDOM);
3248 assert_return(!event_origin_changed(s->event), -ECHILD);
3249
3250 if (s->child.pidfd < 0)
3251 return -EOPNOTSUPP;
3252
3253 return s->child.pidfd_owned;
3254 }
3255
3256 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3257 assert_return(s, -EINVAL);
3258 assert_return(s->type == SOURCE_CHILD, -EDOM);
3259 assert_return(!event_origin_changed(s->event), -ECHILD);
3260
3261 if (s->child.pidfd < 0)
3262 return -EOPNOTSUPP;
3263
3264 s->child.pidfd_owned = own;
3265 return 0;
3266 }
3267
3268 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3269 assert_return(s, -EINVAL);
3270 assert_return(s->type == SOURCE_CHILD, -EDOM);
3271 assert_return(!event_origin_changed(s->event), -ECHILD);
3272
3273 return s->child.process_owned;
3274 }
3275
3276 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3277 assert_return(s, -EINVAL);
3278 assert_return(s->type == SOURCE_CHILD, -EDOM);
3279 assert_return(!event_origin_changed(s->event), -ECHILD);
3280
3281 s->child.process_owned = own;
3282 return 0;
3283 }
3284
3285 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
3286 assert_return(s, -EINVAL);
3287 assert_return(mask, -EINVAL);
3288 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3289 assert_return(!event_origin_changed(s->event), -ECHILD);
3290
3291 *mask = s->inotify.mask;
3292 return 0;
3293 }
3294
3295 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
3296 int r;
3297
3298 assert_return(s, -EINVAL);
3299 assert_return(s->type != SOURCE_EXIT, -EDOM);
3300 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3301 assert_return(!event_origin_changed(s->event), -ECHILD);
3302
3303 if (s->prepare == callback)
3304 return 0;
3305
3306 if (callback && s->prepare) {
3307 s->prepare = callback;
3308 return 0;
3309 }
3310
3311 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3312 if (r < 0)
3313 return r;
3314
3315 s->prepare = callback;
3316
3317 if (callback) {
3318 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3319 if (r < 0)
3320 return r;
3321 } else
3322 prioq_remove(s->event->prepare, s, &s->prepare_index);
3323
3324 return 0;
3325 }
3326
3327 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
3328 assert_return(s, NULL);
3329 assert_return(!event_origin_changed(s->event), NULL);
3330
3331 return s->userdata;
3332 }
3333
3334 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3335 void *ret;
3336
3337 assert_return(s, NULL);
3338 assert_return(!event_origin_changed(s->event), NULL);
3339
3340 ret = s->userdata;
3341 s->userdata = userdata;
3342
3343 return ret;
3344 }
3345
3346 static int event_source_enter_ratelimited(sd_event_source *s) {
3347 int r;
3348
3349 assert(s);
3350
3351 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
3352 * the end of the rate limit time window, much as if it was a timer event source. */
3353
3354 if (s->ratelimited)
3355 return 0; /* Already ratelimited, this is a NOP hence */
3356
3357 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3358 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3359 if (r < 0)
3360 return r;
3361
3362 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
3363 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
3364 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
3365 if (EVENT_SOURCE_IS_TIME(s->type))
3366 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3367
3368 /* Now, let's add the event source to the monotonic clock instead */
3369 r = event_source_time_prioq_put(s, &s->event->monotonic);
3370 if (r < 0)
3371 goto fail;
3372
3373 /* And let's take the event source officially offline */
3374 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3375 if (r < 0) {
3376 event_source_time_prioq_remove(s, &s->event->monotonic);
3377 goto fail;
3378 }
3379
3380 event_source_pp_prioq_reshuffle(s);
3381
3382 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3383 return 0;
3384
3385 fail:
3386 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
3387 * space for it should already be allocated. */
3388 if (EVENT_SOURCE_IS_TIME(s->type))
3389 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3390
3391 return r;
3392 }
3393
3394 static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
3395 int r;
3396
3397 assert(s);
3398
3399 if (!s->ratelimited)
3400 return 0;
3401
3402 /* Let's take the event source out of the monotonic prioq first. */
3403 event_source_time_prioq_remove(s, &s->event->monotonic);
3404
3405 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3406 if (EVENT_SOURCE_IS_TIME(s->type)) {
3407 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3408 if (r < 0)
3409 goto fail;
3410 }
3411
3412 /* Let's try to take it online again. */
3413 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3414 if (r < 0) {
3415 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3416 if (EVENT_SOURCE_IS_TIME(s->type))
3417 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3418
3419 goto fail;
3420 }
3421
3422 event_source_pp_prioq_reshuffle(s);
3423 ratelimit_reset(&s->rate_limit);
3424
3425 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3426
3427 if (run_callback && s->ratelimit_expire_callback) {
3428 s->dispatching = true;
3429 r = s->ratelimit_expire_callback(s, s->userdata);
3430 s->dispatching = false;
3431
3432 if (r < 0) {
3433 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3434 strna(s->description),
3435 event_source_type_to_string(s->type),
3436 s->exit_on_failure ? "exiting" : "disabling");
3437
3438 if (s->exit_on_failure)
3439 (void) sd_event_exit(s->event, r);
3440 }
3441
3442 if (s->n_ref == 0)
3443 source_free(s);
3444 else if (r < 0)
3445 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3446
3447 return 1;
3448 }
3449
3450 return 0;
3451
3452 fail:
3453         /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
3454          * simply put it back into that state, maybe we can then process it more successfully next iteration. */
3455 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3456
3457 return r;
3458 }
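
/* Illustrative sketch, not used by this file: the enter/leave helpers above back the public
 * sd_event_source_set_ratelimit() API (defined elsewhere in this file). Here a source is limited to
 * at most 10 dispatches per second; when the window ends, event_source_leave_ratelimit() runs the
 * expiry callback installed via sd_event_source_set_ratelimit_expire_callback(), if any. */
#if 0
static int example_ratelimit(sd_event_source *s, sd_event_handler_t on_expire) {
        int r;

        r = sd_event_source_set_ratelimit(s, /* interval_usec= */ 1 * USEC_PER_SEC, /* burst= */ 10);
        if (r < 0)
                return r;

        return sd_event_source_set_ratelimit_expire_callback(s, on_expire);
}
#endif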
3459
3460 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3461 usec_t c;
3462 assert(e);
3463 assert(a <= b);
3464
3465 if (a <= 0)
3466 return 0;
3467 if (a >= USEC_INFINITY)
3468 return USEC_INFINITY;
3469
3470 if (b <= a + 1)
3471 return a;
3472
3473 initialize_perturb(e);
3474
3475 /*
3476 Find a good time to wake up again between times a and b. We
3477 have two goals here:
3478
3479 a) We want to wake up as seldom as possible, hence prefer
3480 later times over earlier times.
3481
3482 b) But if we have to wake up, then let's make sure to
3483 dispatch as much as possible on the entire system.
3484
3485 We implement this by waking up everywhere at the same time
3486 within any given minute if we can, synchronised via the
3487 perturbation value determined from the boot ID. If we can't,
3488           then we try to find the same spot within every 10s, then every
3489           1s and then every 250ms step. Otherwise, we pick the last
3490           possible time to wake up.
3491 */
3492
3493 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3494 if (c >= b) {
3495 if (_unlikely_(c < USEC_PER_MINUTE))
3496 return b;
3497
3498 c -= USEC_PER_MINUTE;
3499 }
3500
3501 if (c >= a)
3502 return c;
3503
3504 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3505 if (c >= b) {
3506 if (_unlikely_(c < USEC_PER_SEC*10))
3507 return b;
3508
3509 c -= USEC_PER_SEC*10;
3510 }
3511
3512 if (c >= a)
3513 return c;
3514
3515 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3516 if (c >= b) {
3517 if (_unlikely_(c < USEC_PER_SEC))
3518 return b;
3519
3520 c -= USEC_PER_SEC;
3521 }
3522
3523 if (c >= a)
3524 return c;
3525
3526 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3527 if (c >= b) {
3528 if (_unlikely_(c < USEC_PER_MSEC*250))
3529 return b;
3530
3531 c -= USEC_PER_MSEC*250;
3532 }
3533
3534 if (c >= a)
3535 return c;
3536
3537 return b;
3538 }
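
/* Worked example of the scheme above, not used by this file, with a hypothetical per-boot
 * perturbation of 47s: for a source that may fire anywhere in [130s, 145s] of CLOCK_MONOTONIC time,
 * the per-minute slot is 120s+47s=167s which lies past b, and 167s-60s=107s lies before a, so the
 * minute granularity is skipped; the per-10s slot is 140s+(47s%10s)=147s which again lies past b,
 * but 147s-10s=137s falls inside [a, b] and is returned. Every loop on the machine whose window
 * covers 137s will therefore wake up at the same instant. */
#if 0
static void example_sleep_between(sd_event *e) {
        /* Only holds if e->perturb happens to be 47 * USEC_PER_SEC, a made-up value for illustration. */
        assert(sleep_between(e, 130 * USEC_PER_SEC, 145 * USEC_PER_SEC) == 137 * USEC_PER_SEC);
}
#endif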
3539
3540 static int event_arm_timer(
3541 sd_event *e,
3542 struct clock_data *d) {
3543
3544 struct itimerspec its = {};
3545 sd_event_source *a, *b;
3546 usec_t t;
3547
3548 assert(e);
3549 assert(d);
3550
3551 if (!d->needs_rearm)
3552 return 0;
3553
3554 d->needs_rearm = false;
3555
3556 a = prioq_peek(d->earliest);
3557 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3558 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3559
3560 if (d->fd < 0)
3561 return 0;
3562
3563 if (d->next == USEC_INFINITY)
3564 return 0;
3565
3566 /* disarm */
3567 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3568 return -errno;
3569
3570 d->next = USEC_INFINITY;
3571 return 0;
3572 }
3573
3574 b = prioq_peek(d->latest);
3575 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3576 assert(b && b->enabled != SD_EVENT_OFF);
3577
3578 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3579 if (d->next == t)
3580 return 0;
3581
3582 assert_se(d->fd >= 0);
3583
3584 if (t == 0) {
3585                 /* We don't want to disarm here, just arm it for some time looooong ago. */
3586 its.it_value.tv_sec = 0;
3587 its.it_value.tv_nsec = 1;
3588 } else
3589 timespec_store(&its.it_value, t);
3590
3591 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3592 return -errno;
3593
3594 d->next = t;
3595 return 0;
3596 }
3597
3598 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3599 assert(e);
3600 assert(s);
3601 assert(s->type == SOURCE_IO);
3602
3603 /* If the event source was already pending, we just OR in the
3604 * new revents, otherwise we reset the value. The ORing is
3605 * necessary to handle EPOLLONESHOT events properly where
3606 * readability might happen independently of writability, and
3607 * we need to keep track of both */
3608
3609 if (s->pending)
3610 s->io.revents |= revents;
3611 else
3612 s->io.revents = revents;
3613
3614 return source_set_pending(s, true);
3615 }
3616
3617 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3618 uint64_t x;
3619 ssize_t ss;
3620
3621 assert(e);
3622 assert(fd >= 0);
3623
3624 assert_return(events == EPOLLIN, -EIO);
3625
3626 ss = read(fd, &x, sizeof(x));
3627 if (ss < 0) {
3628 if (ERRNO_IS_TRANSIENT(errno))
3629 return 0;
3630
3631 return -errno;
3632 }
3633
3634 if (_unlikely_(ss != sizeof(x)))
3635 return -EIO;
3636
3637 if (next)
3638 *next = USEC_INFINITY;
3639
3640 return 0;
3641 }
3642
3643 static int process_timer(
3644 sd_event *e,
3645 usec_t n,
3646 struct clock_data *d) {
3647
3648 sd_event_source *s;
3649 bool callback_invoked = false;
3650 int r;
3651
3652 assert(e);
3653 assert(d);
3654
3655 for (;;) {
3656 s = prioq_peek(d->earliest);
3657 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3658
3659 if (!s || time_event_source_next(s) > n)
3660 break;
3661
3662 if (s->ratelimited) {
3663                         /* This is an event source whose ratelimit window has ended. Let's turn it on
3664 * again. */
3665 assert(s->ratelimited);
3666
3667 r = event_source_leave_ratelimit(s, /* run_callback */ true);
3668 if (r < 0)
3669 return r;
3670 else if (r == 1)
3671 callback_invoked = true;
3672
3673 continue;
3674 }
3675
3676 if (s->enabled == SD_EVENT_OFF || s->pending)
3677 break;
3678
3679 r = source_set_pending(s, true);
3680 if (r < 0)
3681 return r;
3682
3683 event_source_time_prioq_reshuffle(s);
3684 }
3685
3686 return callback_invoked;
3687 }
3688
3689 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3690 int64_t min_priority = threshold;
3691 bool something_new = false;
3692 sd_event_source *s;
3693 int r;
3694
3695 assert(e);
3696 assert(ret_min_priority);
3697
3698 if (!e->need_process_child) {
3699 *ret_min_priority = min_priority;
3700 return 0;
3701 }
3702
3703 e->need_process_child = false;
3704
3705 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3706 * for, instead of using P_ALL. This is because we only want to get child information of very
3707 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3708          * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3709 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3710 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3711 * to handle SIGCHLD yourself.
3712 *
3713 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3714 * source is dispatched so that the callback still sees the process as a zombie. */
3715
3716 HASHMAP_FOREACH(s, e->child_sources) {
3717 assert(s->type == SOURCE_CHILD);
3718
3719 if (s->priority > threshold)
3720 continue;
3721
3722 if (s->pending)
3723 continue;
3724
3725 if (event_source_is_offline(s))
3726 continue;
3727
3728 if (s->child.exited)
3729 continue;
3730
3731 if (EVENT_SOURCE_WATCH_PIDFD(s))
3732 /* There's a usable pidfd known for this event source? Then don't waitid() for
3733 * it here */
3734 continue;
3735
3736 zero(s->child.siginfo);
3737 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3738 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3739 return negative_errno();
3740
3741 if (s->child.siginfo.si_pid != 0) {
3742 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3743
3744 if (zombie)
3745 s->child.exited = true;
3746
3747 if (!zombie && (s->child.options & WEXITED)) {
3748 /* If the child isn't dead then let's immediately remove the state
3749 * change from the queue, since there's no benefit in leaving it
3750 * queued. */
3751
3752 assert(s->child.options & (WSTOPPED|WCONTINUED));
3753 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3754 }
3755
3756 r = source_set_pending(s, true);
3757 if (r < 0)
3758 return r;
3759 if (r > 0) {
3760 something_new = true;
3761 min_priority = MIN(min_priority, s->priority);
3762 }
3763 }
3764 }
3765
3766 *ret_min_priority = min_priority;
3767 return something_new;
3768 }
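
/* Illustrative sketch, not used by this file: because process_child() above queries with WNOWAIT, a
 * child handler still observes the process as a zombie and gets the full siginfo_t. The handler
 * signature matches sd_event_child_handler_t; registration happens via sd_event_add_child(), defined
 * elsewhere in this file. */
#if 0
static int example_child_handler(sd_event_source *s, const siginfo_t *si, void *userdata) {
        if (si->si_code == CLD_EXITED)
                log_debug("Child " PID_FMT " exited with status %i.", si->si_pid, si->si_status);
        else
                log_debug("Child " PID_FMT " was terminated by signal %i.", si->si_pid, si->si_status);

        return 0;
}
#endif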
3769
3770 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3771 assert(e);
3772 assert(s);
3773 assert(s->type == SOURCE_CHILD);
3774
3775 if (s->pending)
3776 return 0;
3777
3778 if (event_source_is_offline(s))
3779 return 0;
3780
3781 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3782 return 0;
3783
3784 zero(s->child.siginfo);
3785 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3786 return -errno;
3787
3788 if (s->child.siginfo.si_pid == 0)
3789 return 0;
3790
3791 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3792 s->child.exited = true;
3793
3794 return source_set_pending(s, true);
3795 }
3796
3797 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3798 int r;
3799
3800 assert(e);
3801 assert(d);
3802 assert_return(events == EPOLLIN, -EIO);
3803 assert(min_priority);
3804
3805         /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3806          * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3807          * per priority, and if we dequeue one while SIGCHLD is enqueued behind it we wouldn't notice it,
3808          * even though we might have higher priority children we care about, hence we need to check that
3809          * explicitly. */
3810
3811 if (sigismember(&d->sigset, SIGCHLD))
3812 e->need_process_child = true;
3813
3814 /* If there's already an event source pending for this priority we don't read another */
3815 if (d->current)
3816 return 0;
3817
3818 for (;;) {
3819 struct signalfd_siginfo si;
3820 ssize_t n;
3821 sd_event_source *s = NULL;
3822
3823 n = read(d->fd, &si, sizeof(si));
3824 if (n < 0) {
3825 if (ERRNO_IS_TRANSIENT(errno))
3826 return 0;
3827
3828 return -errno;
3829 }
3830
3831 if (_unlikely_(n != sizeof(si)))
3832 return -EIO;
3833
3834 assert(SIGNAL_VALID(si.ssi_signo));
3835
3836 if (e->signal_sources)
3837 s = e->signal_sources[si.ssi_signo];
3838 if (!s)
3839 continue;
3840 if (s->pending)
3841 continue;
3842
3843 s->signal.siginfo = si;
3844 d->current = s;
3845
3846 r = source_set_pending(s, true);
3847 if (r < 0)
3848 return r;
3849 if (r > 0 && *min_priority >= s->priority) {
3850 *min_priority = s->priority;
3851 return 1; /* an event source with smaller priority is queued. */
3852 }
3853
3854 return 0;
3855 }
3856 }
3857
3858 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3859 ssize_t n;
3860
3861 assert(e);
3862 assert(d);
3863
3864 assert_return(revents == EPOLLIN, -EIO);
3865
3866 /* If there's already an event source pending for this priority, don't read another */
3867 if (d->n_pending > 0)
3868 return 0;
3869
3870 /* Is the read buffer non-empty? If so, let's not read more */
3871 if (d->buffer_filled > 0)
3872 return 0;
3873
3874 if (d->priority > threshold)
3875 return 0;
3876
3877 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3878 if (n < 0) {
3879 if (ERRNO_IS_TRANSIENT(errno))
3880 return 0;
3881
3882 return -errno;
3883 }
3884
3885 assert(n > 0);
3886 d->buffer_filled = (size_t) n;
3887 LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
3888
3889 return 1;
3890 }
3891
3892 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3893 assert(e);
3894 assert(d);
3895 assert(sz <= d->buffer_filled);
3896
3897 if (sz == 0)
3898 return;
3899
3900         /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3901 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3902 d->buffer_filled -= sz;
3903
3904 if (d->buffer_filled == 0)
3905 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
3906 }
3907
3908 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3909 int r;
3910
3911 assert(e);
3912 assert(d);
3913
3914 /* If there's already an event source pending for this priority, don't read another */
3915 if (d->n_pending > 0)
3916 return 0;
3917
3918 while (d->buffer_filled > 0) {
3919 size_t sz;
3920
3921 /* Let's validate that the event structures are complete */
3922 if (d->buffer_filled < offsetof(struct inotify_event, name))
3923 return -EIO;
3924
3925 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3926 if (d->buffer_filled < sz)
3927 return -EIO;
3928
3929 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3930 struct inode_data *inode_data;
3931
3932 /* The queue overran, let's pass this event to all event sources connected to this inotify
3933 * object */
3934
3935 HASHMAP_FOREACH(inode_data, d->inodes)
3936 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3937
3938 if (event_source_is_offline(s))
3939 continue;
3940
3941 r = source_set_pending(s, true);
3942 if (r < 0)
3943 return r;
3944 }
3945 } else {
3946 struct inode_data *inode_data;
3947
3948 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3949 * our watch descriptor table. */
3950 if (d->buffer.ev.mask & IN_IGNORED) {
3951
3952 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3953 if (!inode_data) {
3954 event_inotify_data_drop(e, d, sz);
3955 continue;
3956 }
3957
3958 /* The watch descriptor was removed by the kernel, let's drop it here too */
3959 inode_data->wd = -1;
3960 } else {
3961 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3962 if (!inode_data) {
3963 event_inotify_data_drop(e, d, sz);
3964 continue;
3965 }
3966 }
3967
3968 /* Trigger all event sources that are interested in these events. Also trigger all event
3969 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3970 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3971
3972 if (event_source_is_offline(s))
3973 continue;
3974
3975 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3976 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3977 continue;
3978
3979 r = source_set_pending(s, true);
3980 if (r < 0)
3981 return r;
3982 }
3983 }
3984
3985 /* Something pending now? If so, let's finish, otherwise let's read more. */
3986 if (d->n_pending > 0)
3987 return 1;
3988 }
3989
3990 return 0;
3991 }
3992
3993 static int process_inotify(sd_event *e) {
3994 int r, done = 0;
3995
3996 assert(e);
3997
3998 LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
3999 r = event_inotify_data_process(e, d);
4000 if (r < 0)
4001 return r;
4002 if (r > 0)
4003 done++;
4004 }
4005
4006 return done;
4007 }
4008
4009 static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
4010 assert(s);
4011 assert(s->type == SOURCE_MEMORY_PRESSURE);
4012
4013 if (s->pending)
4014 s->memory_pressure.revents |= revents;
4015 else
4016 s->memory_pressure.revents = revents;
4017
4018 return source_set_pending(s, true);
4019 }
4020
4021 static int source_memory_pressure_write(sd_event_source *s) {
4022 ssize_t n;
4023 int r;
4024
4025 assert(s);
4026 assert(s->type == SOURCE_MEMORY_PRESSURE);
4027
4028 /* Once we start writing, the buffer is locked and we allow no further changes. */
4029 s->memory_pressure.locked = true;
4030
4031 if (s->memory_pressure.write_buffer_size > 0) {
4032 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4033 if (n < 0) {
4034 if (!ERRNO_IS_TRANSIENT(errno)) {
4035 /* If the kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
4036 * files, but then generate EOPNOTSUPP on read() and write() (instead of on
4037 * open()!). This sucks hard, since we can only detect this kind of failure
4038 * so late. Let's make the best of it, and turn off the event source like we
4039 * do for failed event source handlers. */
4040
4041 log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
4042 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4043 return 0;
4044 }
4045
4046 n = 0;
4047 }
4048 } else
4049 n = 0;
4050
4051 assert(n >= 0);
4052
4053 if ((size_t) n == s->memory_pressure.write_buffer_size) {
4054 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4055
4056 if (n > 0) {
4057 s->memory_pressure.write_buffer_size = 0;
4058
4059 /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4060 r = source_memory_pressure_register(s, s->enabled);
4061 if (r < 0)
4062 return r;
4063 }
4064 } else if (n > 0) {
4065 _cleanup_free_ void *c = NULL;
4066
4067 assert((size_t) n < s->memory_pressure.write_buffer_size);
4068
4069 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4070 if (!c)
4071 return -ENOMEM;
4072
4073 free_and_replace(s->memory_pressure.write_buffer, c);
4074 s->memory_pressure.write_buffer_size -= n;
4075 return 1;
4076 }
4077
4078 return 0;
4079 }
4080
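/* For orientation (hedged sketch, independent of this file): the write buffer flushed above carries a
 * kernel PSI trigger in the documented "some|full <threshold-usec> <window-usec>" format, and trigger
 * events are delivered as POLLPRI. A hand-rolled equivalent without sd-event, with example numbers: */
#if 0
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

static int watch_memory_pressure_raw(void) {
        /* Ask for a notification when at least 150ms of "some" memory stall accumulates per 1s window. */
        const char trigger[] = "some 150000 1000000";

        int fd = open("/proc/pressure/memory", O_RDWR|O_NONBLOCK|O_CLOEXEC);
        if (fd < 0)
                return -errno;

        if (write(fd, trigger, strlen(trigger) + 1) < 0) {
                int r = -errno;
                close(fd);
                return r;
        }

        /* The kernel signals trigger events via POLLPRI on the same fd. */
        struct pollfd p = { .fd = fd, .events = POLLPRI };
        (void) poll(&p, 1, -1);

        close(fd);
        return 0;
}
#endif
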
4081 static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4082 int r;
4083
4084 assert(s);
4085 assert(s->type == SOURCE_MEMORY_PRESSURE);
4086
4087 r = source_memory_pressure_write(s);
4088 if (r < 0)
4089 return r;
4090 if (r > 0)
4091 return 1; /* If we wrote something, don't continue with dispatching the user callback.
4092 * Instead, shortcut it so that we wait for the next EPOLLOUT immediately. */
4093
4094 /* No pending incoming IO? Then let's not continue further */
4095 if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4096
4097 /* Treat IO errors on the notifier the same way as errors returned from a callback */
4098 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4099 return -EIO;
4100
4101 return 1; /* leave dispatch, we already processed everything */
4102 }
4103
4104 if (s->memory_pressure.revents & EPOLLIN) {
4105 uint8_t pipe_buf[PIPE_BUF];
4106 ssize_t n;
4107
4108 /* If the fd is readable, then flush out anything that might be queued */
4109
4110 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4111 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4112 return -errno;
4113 }
4114
4115 return 0; /* go on, dispatch to user callback */
4116 }
4117
4118 static int source_dispatch(sd_event_source *s) {
4119 EventSourceType saved_type;
4120 sd_event *saved_event;
4121 int r = 0;
4122
4123 assert(s);
4124 assert(s->pending || s->type == SOURCE_EXIT);
4125
4126 /* Save the event source type here, so that we still know it after the event callback, which might
4127 * invalidate the event. */
4128 saved_type = s->type;
4129
4130 /* Similarly, store a reference to the event loop object, so that we can still access it after the
4131 * callback might have invalidated/disconnected the event source. */
4132 saved_event = s->event;
4133 PROTECT_EVENT(saved_event);
4134
4135 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
4136 assert(!s->ratelimited);
4137 if (!ratelimit_below(&s->rate_limit)) {
4138 r = event_source_enter_ratelimited(s);
4139 if (r < 0)
4140 return r;
4141
4142 return 1;
4143 }
4144
4145 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
4146 r = source_set_pending(s, false);
4147 if (r < 0)
4148 return r;
4149 }
4150
4151 if (s->type != SOURCE_POST) {
4152 sd_event_source *z;
4153
4154 /* If we execute a non-post source, let's mark all post sources as pending. */
4155
4156 SET_FOREACH(z, s->event->post_sources) {
4157 if (event_source_is_offline(z))
4158 continue;
4159
4160 r = source_set_pending(z, true);
4161 if (r < 0)
4162 return r;
4163 }
4164 }
4165
4166 if (s->type == SOURCE_MEMORY_PRESSURE) {
4167 r = source_memory_pressure_initiate_dispatch(s);
4168 if (r == -EIO) /* handle EIO errors similar to callback errors */
4169 goto finish;
4170 if (r < 0)
4171 return r;
4172 if (r > 0) /* already handled */
4173 return 1;
4174 }
4175
4176 if (s->enabled == SD_EVENT_ONESHOT) {
4177 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
4178 if (r < 0)
4179 return r;
4180 }
4181
4182 s->dispatching = true;
4183
4184 switch (s->type) {
4185
4186 case SOURCE_IO:
4187 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4188 break;
4189
4190 case SOURCE_TIME_REALTIME:
4191 case SOURCE_TIME_BOOTTIME:
4192 case SOURCE_TIME_MONOTONIC:
4193 case SOURCE_TIME_REALTIME_ALARM:
4194 case SOURCE_TIME_BOOTTIME_ALARM:
4195 r = s->time.callback(s, s->time.next, s->userdata);
4196 break;
4197
4198 case SOURCE_SIGNAL:
4199 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4200 break;
4201
4202 case SOURCE_CHILD: {
4203 bool zombie;
4204
4205 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
4206
4207 r = s->child.callback(s, &s->child.siginfo, s->userdata);
4208
4209 /* Now, reap the PID for good. */
4210 if (zombie) {
4211 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
4212 s->child.waited = true;
4213 }
4214
4215 break;
4216 }
4217
4218 case SOURCE_DEFER:
4219 r = s->defer.callback(s, s->userdata);
4220 break;
4221
4222 case SOURCE_POST:
4223 r = s->post.callback(s, s->userdata);
4224 break;
4225
4226 case SOURCE_EXIT:
4227 r = s->exit.callback(s, s->userdata);
4228 break;
4229
4230 case SOURCE_INOTIFY: {
4231 struct sd_event *e = s->event;
4232 struct inotify_data *d;
4233 size_t sz;
4234
4235 assert(s->inotify.inode_data);
4236 assert_se(d = s->inotify.inode_data->inotify_data);
4237
4238 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4239 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4240 assert(d->buffer_filled >= sz);
4241
4242 /* If the inotify callback destroys the event source then this likely means we don't need to
4243 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4244 * free it immediately, then we couldn't drop the event from the inotify event queue without
4245 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
4246 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4247 * explicitly GC it after we are done dropping the inotify event from the buffer. */
4248 d->n_busy++;
4249 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
4250 d->n_busy--;
4251
4252 /* When no event is pending anymore on this inotify object, then let's drop the event from
4253 * the inotify event queue buffer. */
4254 if (d->n_pending == 0)
4255 event_inotify_data_drop(e, d, sz);
4256
4257 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4258 event_gc_inotify_data(e, d);
4259 break;
4260 }
4261
4262 case SOURCE_MEMORY_PRESSURE:
4263 r = s->memory_pressure.callback(s, s->userdata);
4264 break;
4265
4266 case SOURCE_WATCHDOG:
4267 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
4268 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
4269 assert_not_reached();
4270 }
4271
4272 s->dispatching = false;
4273
4274 finish:
4275 if (r < 0) {
4276 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4277 strna(s->description),
4278 event_source_type_to_string(saved_type),
4279 s->exit_on_failure ? "exiting" : "disabling");
4280
4281 if (s->exit_on_failure)
4282 (void) sd_event_exit(saved_event, r);
4283 }
4284
4285 if (s->n_ref == 0)
4286 source_free(s);
4287 else if (r < 0)
4288 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4289
4290 return 1;
4291 }
4292
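/* Hedged sketch (not part of this file): as the error path above shows, a negative return from a handler
 * is logged and either disables the source or, if exit_on_failure is set, ends the whole loop via
 * sd_event_exit(). A callback relying on that convention; the fd handling is invented for the example: */
#if 0
#include <errno.h>
#include <unistd.h>
#include <systemd/sd-event.h>

static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        char buf[4096];

        ssize_t n = read(fd, buf, sizeof(buf));
        if (n < 0)
                /* Returning a negative errno lets source_dispatch() disable this source, or exit the
                 * loop if sd_event_source_set_exit_on_failure() was enabled for it. */
                return -errno;

        return 0;
}
#endif
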
4293 static int event_prepare(sd_event *e) {
4294 int r;
4295
4296 assert(e);
4297
4298 for (;;) {
4299 sd_event_source *s;
4300
4301 s = prioq_peek(e->prepare);
4302 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
4303 break;
4304
4305 s->prepare_iteration = e->iteration;
4306 prioq_reshuffle(e->prepare, s, &s->prepare_index);
4307
4308 assert(s->prepare);
4309 s->dispatching = true;
4310 r = s->prepare(s, s->userdata);
4311 s->dispatching = false;
4312
4313 if (r < 0) {
4314 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4315 strna(s->description),
4316 event_source_type_to_string(s->type),
4317 s->exit_on_failure ? "exiting" : "disabling");
4318
4319 if (s->exit_on_failure)
4320 (void) sd_event_exit(e, r);
4321 }
4322
4323 if (s->n_ref == 0)
4324 source_free(s);
4325 else if (r < 0)
4326 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4327 }
4328
4329 return 0;
4330 }
4331
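/* Usage sketch (illustrative): event_prepare() runs, once per iteration and ordered by the prepare
 * priority queue, the callbacks registered with the public sd_event_source_set_prepare() call. One
 * common pattern is gating a defer source on application state; all names below are invented: */
#if 0
#include <stdbool.h>
#include <systemd/sd-event.h>

static bool have_work = false; /* hypothetical application state */

static int on_prepare(sd_event_source *s, void *userdata) {
        bool *flag = userdata;
        /* Before each poll, enable the defer source for one dispatch only if work is queued. */
        return sd_event_source_set_enabled(s, *flag ? SD_EVENT_ONESHOT : SD_EVENT_OFF);
}

static int on_defer(sd_event_source *s, void *userdata) {
        bool *flag = userdata;
        *flag = false; /* drain the queued work here */
        return 0;
}

static int add_prepared_source(sd_event *e, sd_event_source **ret) {
        int r = sd_event_add_defer(e, ret, on_defer, &have_work);
        if (r < 0)
                return r;
        return sd_event_source_set_prepare(*ret, on_prepare);
}
#endif
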
4332 static int dispatch_exit(sd_event *e) {
4333 sd_event_source *p;
4334 int r;
4335
4336 assert(e);
4337
4338 p = prioq_peek(e->exit);
4339 assert(!p || p->type == SOURCE_EXIT);
4340
4341 if (!p || event_source_is_offline(p)) {
4342 e->state = SD_EVENT_FINISHED;
4343 return 0;
4344 }
4345
4346 PROTECT_EVENT(e);
4347 e->iteration++;
4348 e->state = SD_EVENT_EXITING;
4349 r = source_dispatch(p);
4350 e->state = SD_EVENT_INITIAL;
4351 return r;
4352 }
4353
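/* Usage sketch (illustrative): dispatch_exit() is what runs sources registered with the public
 * sd_event_add_exit() call, in priority order, once sd_event_exit() has been requested. Names invented: */
#if 0
#include <stdio.h>
#include <systemd/sd-event.h>

static int on_exit_cleanup(sd_event_source *s, void *userdata) {
        /* Runs during the SD_EVENT_EXITING phase, i.e. after sd_event_exit() was called. */
        fputs("shutting down\n", stderr);
        return 0;
}

static int install_cleanup(sd_event *e) {
        return sd_event_add_exit(e, NULL, on_exit_cleanup, NULL);
}
#endif
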
4354 static sd_event_source* event_next_pending(sd_event *e) {
4355 sd_event_source *p;
4356
4357 assert(e);
4358
4359 p = prioq_peek(e->pending);
4360 if (!p)
4361 return NULL;
4362
4363 if (event_source_is_offline(p))
4364 return NULL;
4365
4366 return p;
4367 }
4368
4369 static int arm_watchdog(sd_event *e) {
4370 struct itimerspec its = {};
4371 usec_t t;
4372
4373 assert(e);
4374 assert(e->watchdog_fd >= 0);
4375
4376 t = sleep_between(e,
4377 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4378 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
4379
4380 timespec_store(&its.it_value, t);
4381
4382 /* Make sure we never set the watchdog to 0, which tells the
4383 * kernel to disable it. */
4384 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4385 its.it_value.tv_nsec = 1;
4386
4387 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
4388 }
4389
4390 static int process_watchdog(sd_event *e) {
4391 assert(e);
4392
4393 if (!e->watchdog)
4394 return 0;
4395
4396 /* Don't notify the watchdog too often */
4397 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4398 return 0;
4399
4400 sd_notify(false, "WATCHDOG=1");
4401 e->watchdog_last = e->timestamp.monotonic;
4402
4403 return arm_watchdog(e);
4404 }
4405
4406 static void event_close_inode_data_fds(sd_event *e) {
4407 struct inode_data *d;
4408
4409 assert(e);
4410
4411 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4412 * filesystems. But we can't close them right away as we need them as long as the user still wants to make
4413 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
4414 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4415 * compromise. */
4416
4417 while ((d = e->inode_data_to_close_list)) {
4418 assert(d->fd >= 0);
4419 d->fd = safe_close(d->fd);
4420
4421 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
4422 }
4423 }
4424
4425 static int event_memory_pressure_write_list(sd_event *e) {
4426 int r;
4427
4428 assert(e);
4429
4430 for (;;) {
4431 sd_event_source *s;
4432
4433 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4434 if (!s)
4435 break;
4436
4437 assert(s->type == SOURCE_MEMORY_PRESSURE);
4438 assert(s->memory_pressure.write_buffer_size > 0);
4439 s->memory_pressure.in_write_list = false;
4440
4441 r = source_memory_pressure_write(s);
4442 if (r < 0)
4443 return r;
4444 }
4445
4446 return 0;
4447 }
4448
4449 _public_ int sd_event_prepare(sd_event *e) {
4450 int r;
4451
4452 assert_return(e, -EINVAL);
4453 assert_return(e = event_resolve(e), -ENOPKG);
4454 assert_return(!event_origin_changed(e), -ECHILD);
4455 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4456 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4457
4458 /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
4459 * this check here once, since gettid() is typically not cached, and we thus want to minimize
4460 * syscalls. */
4461 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4462
4463 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4464 PROTECT_EVENT(e);
4465
4466 if (e->exit_requested)
4467 goto pending;
4468
4469 e->iteration++;
4470
4471 e->state = SD_EVENT_PREPARING;
4472 r = event_prepare(e);
4473 e->state = SD_EVENT_INITIAL;
4474 if (r < 0)
4475 return r;
4476
4477 r = event_memory_pressure_write_list(e);
4478 if (r < 0)
4479 return r;
4480
4481 r = event_arm_timer(e, &e->realtime);
4482 if (r < 0)
4483 return r;
4484
4485 r = event_arm_timer(e, &e->boottime);
4486 if (r < 0)
4487 return r;
4488
4489 r = event_arm_timer(e, &e->monotonic);
4490 if (r < 0)
4491 return r;
4492
4493 r = event_arm_timer(e, &e->realtime_alarm);
4494 if (r < 0)
4495 return r;
4496
4497 r = event_arm_timer(e, &e->boottime_alarm);
4498 if (r < 0)
4499 return r;
4500
4501 event_close_inode_data_fds(e);
4502
4503 if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
4504 goto pending;
4505
4506 e->state = SD_EVENT_ARMED;
4507
4508 return 0;
4509
4510 pending:
4511 e->state = SD_EVENT_ARMED;
4512 r = sd_event_wait(e, 0);
4513 if (r == 0)
4514 e->state = SD_EVENT_ARMED;
4515
4516 return r;
4517 }
4518
4519 static int epoll_wait_usec(
4520 int fd,
4521 struct epoll_event *events,
4522 int maxevents,
4523 usec_t timeout) {
4524
4525 int msec;
4526 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4527
4528 #if HAVE_EPOLL_PWAIT2
4529 static bool epoll_pwait2_absent = false;
4530 int r;
4531
4532 /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4533 * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4534 * is not that obvious to implement given the libc and kernel definitions differ in the last
4535 * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
4536 * biggie); let's hence rely on glibc's definitions, and fall back to epoll_wait() when that's
4537 * missing. */
4538
4539 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
4540 r = epoll_pwait2(fd,
4541 events,
4542 maxevents,
4543 TIMESPEC_STORE(timeout),
4544 NULL);
4545 if (r >= 0)
4546 return r;
4547 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
4548 return -errno; /* Only fall back to the old epoll_wait() if the syscall is masked or not
4549 * supported. */
4550
4551 epoll_pwait2_absent = true;
4552 }
4553 #endif
4554
4555 if (timeout == USEC_INFINITY)
4556 msec = -1;
4557 else {
4558 usec_t k;
4559
4560 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4561 if (k >= INT_MAX)
4562 msec = INT_MAX; /* Saturate */
4563 else
4564 msec = (int) k;
4565 }
4566
4567 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
4568 }
4569
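/* Worked example (hedged, standalone): when epoll_pwait2() is unavailable the microsecond timeout is
 * rounded up to whole milliseconds and saturated, so e.g. a 1 usec timeout becomes 1 ms rather than a
 * busy-polling 0 ms. The helper name below is invented; USEC_INFINITY is UINT64_MAX in systemd: */
#if 0
#include <limits.h>
#include <stdint.h>

static int usec_to_epoll_msec(uint64_t timeout_usec) {
        if (timeout_usec == UINT64_MAX) /* "infinite" maps to epoll's -1 */
                return -1;

        uint64_t msec = (timeout_usec + 999) / 1000; /* round up */
        return msec >= INT_MAX ? INT_MAX : (int) msec; /* saturate */
}
#endif
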
4570 static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
4571 size_t n_event_queue, m, n_event_max;
4572 int64_t min_priority = threshold;
4573 bool something_new = false;
4574 int r;
4575
4576 assert(e);
4577 assert(ret_min_priority);
4578
4579 n_event_queue = MAX(e->n_sources, 1u);
4580 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
4581 return -ENOMEM;
4582
4583 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4584
4585 /* If we still have inotify data buffered, then query the other fds, but don't block waiting */
4586 if (e->buffered_inotify_data_list)
4587 timeout = 0;
4588
4589 for (;;) {
4590 r = epoll_wait_usec(
4591 e->epoll_fd,
4592 e->event_queue,
4593 n_event_max,
4594 timeout);
4595 if (r < 0)
4596 return r;
4597
4598 m = (size_t) r;
4599
4600 if (m < n_event_max)
4601 break;
4602
4603 if (n_event_max >= n_event_queue * 10)
4604 break;
4605
4606 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
4607 return -ENOMEM;
4608
4609 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4610 timeout = 0;
4611 }
4612
4613 /* Set the timestamp only when this is called for the first time. */
4614 if (threshold == INT64_MAX)
4615 triple_timestamp_now(&e->timestamp);
4616
4617 for (size_t i = 0; i < m; i++) {
4618
4619 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4620 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
4621 else {
4622 WakeupType *t = e->event_queue[i].data.ptr;
4623
4624 switch (*t) {
4625
4626 case WAKEUP_EVENT_SOURCE: {
4627 sd_event_source *s = e->event_queue[i].data.ptr;
4628
4629 assert(s);
4630
4631 if (s->priority > threshold)
4632 continue;
4633
4634 min_priority = MIN(min_priority, s->priority);
4635
4636 switch (s->type) {
4637
4638 case SOURCE_IO:
4639 r = process_io(e, s, e->event_queue[i].events);
4640 break;
4641
4642 case SOURCE_CHILD:
4643 r = process_pidfd(e, s, e->event_queue[i].events);
4644 break;
4645
4646 case SOURCE_MEMORY_PRESSURE:
4647 r = process_memory_pressure(s, e->event_queue[i].events);
4648 break;
4649
4650 default:
4651 assert_not_reached();
4652 }
4653
4654 break;
4655 }
4656
4657 case WAKEUP_CLOCK_DATA: {
4658 struct clock_data *d = e->event_queue[i].data.ptr;
4659
4660 assert(d);
4661
4662 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4663 break;
4664 }
4665
4666 case WAKEUP_SIGNAL_DATA:
4667 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4668 break;
4669
4670 case WAKEUP_INOTIFY_DATA:
4671 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4672 break;
4673
4674 default:
4675 assert_not_reached();
4676 }
4677 }
4678 if (r < 0)
4679 return r;
4680 if (r > 0)
4681 something_new = true;
4682 }
4683
4684 *ret_min_priority = min_priority;
4685 return something_new;
4686 }
4687
4688 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4689 int r;
4690
4691 assert_return(e, -EINVAL);
4692 assert_return(e = event_resolve(e), -ENOPKG);
4693 assert_return(!event_origin_changed(e), -ECHILD);
4694 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4695 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4696
4697 if (e->exit_requested) {
4698 e->state = SD_EVENT_PENDING;
4699 return 1;
4700 }
4701
4702 for (int64_t threshold = INT64_MAX; ; threshold--) {
4703 int64_t epoll_min_priority, child_min_priority;
4704
4705 /* It is possible that new epoll (especially IO) and child events are
4706 * triggered just after the process_epoll() call but before process_child(), and the new IO
4707 * events may have higher priority than the child events. To salvage these events,
4708 * let's call epoll_wait() again, but accept only events with higher priority than the
4709 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
4710 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4711 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4712
4713 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4714 if (r == -EINTR) {
4715 e->state = SD_EVENT_PENDING;
4716 return 1;
4717 }
4718 if (r < 0)
4719 goto finish;
4720 if (r == 0 && threshold < INT64_MAX)
4721 /* No new epoll event. */
4722 break;
4723
4724 r = process_child(e, threshold, &child_min_priority);
4725 if (r < 0)
4726 goto finish;
4727 if (r == 0)
4728 /* No new child event. */
4729 break;
4730
4731 threshold = MIN(epoll_min_priority, child_min_priority);
4732 if (threshold == INT64_MIN)
4733 break;
4734
4735 timeout = 0;
4736 }
4737
4738 r = process_watchdog(e);
4739 if (r < 0)
4740 goto finish;
4741
4742 r = process_inotify(e);
4743 if (r < 0)
4744 goto finish;
4745
4746 r = process_timer(e, e->timestamp.realtime, &e->realtime);
4747 if (r < 0)
4748 goto finish;
4749
4750 r = process_timer(e, e->timestamp.boottime, &e->boottime);
4751 if (r < 0)
4752 goto finish;
4753
4754 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4755 if (r < 0)
4756 goto finish;
4757
4758 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4759 if (r < 0)
4760 goto finish;
4761
4762 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4763 if (r < 0)
4764 goto finish;
4765 else if (r == 1) {
4766 /* A ratelimit expiry callback was called. Let's postpone processing pending sources and
4767 * put the loop in the initial state in order to also evaluate (in the next iteration) sources
4768 * that were potentially re-enabled by the callback.
4769 *
4770 * Wondering why we treat only this invocation of process_timer() differently? Once an event
4771 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4772 * ratelimit expiry callback is never called for any other timer type. */
4773 r = 0;
4774 goto finish;
4775 }
4776
4777 if (event_next_pending(e)) {
4778 e->state = SD_EVENT_PENDING;
4779 return 1;
4780 }
4781
4782 r = 0;
4783
4784 finish:
4785 e->state = SD_EVENT_INITIAL;
4786
4787 return r;
4788 }
4789
4790 _public_ int sd_event_dispatch(sd_event *e) {
4791 sd_event_source *p;
4792 int r;
4793
4794 assert_return(e, -EINVAL);
4795 assert_return(e = event_resolve(e), -ENOPKG);
4796 assert_return(!event_origin_changed(e), -ECHILD);
4797 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4798 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4799
4800 if (e->exit_requested)
4801 return dispatch_exit(e);
4802
4803 p = event_next_pending(e);
4804 if (p) {
4805 PROTECT_EVENT(e);
4806
4807 e->state = SD_EVENT_RUNNING;
4808 r = source_dispatch(p);
4809 e->state = SD_EVENT_INITIAL;
4810 return r;
4811 }
4812
4813 e->state = SD_EVENT_INITIAL;
4814
4815 return 1;
4816 }
4817
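/* Usage sketch (illustrative): sd_event_prepare(), sd_event_wait() and sd_event_dispatch() are the
 * building blocks that sd_event_run() combines. Driving them by hand looks roughly like this: */
#if 0
#include <systemd/sd-event.h>

static int run_one_iteration(sd_event *e, uint64_t timeout_usec) {
        int r = sd_event_prepare(e);
        if (r < 0)
                return r;
        if (r == 0) {
                /* Nothing pending yet: wait for events, or until the timeout elapses. */
                r = sd_event_wait(e, timeout_usec);
                if (r <= 0)
                        return r;
        }

        /* Something is pending now: dispatch exactly one event source. */
        return sd_event_dispatch(e);
}
#endif
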
4818 static void event_log_delays(sd_event *e) {
4819 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4820 size_t l, i;
4821
4822 p = b;
4823 l = sizeof(b);
4824 for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4825 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4826 e->delays[i] = 0;
4827 }
4828 log_debug("Event loop iterations: %s", b);
4829 }
4830
4831 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4832 int r;
4833
4834 assert_return(e, -EINVAL);
4835 assert_return(e = event_resolve(e), -ENOPKG);
4836 assert_return(!event_origin_changed(e), -ECHILD);
4837 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4838 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4839
4840 if (e->profile_delays && e->last_run_usec != 0) {
4841 usec_t this_run;
4842 unsigned l;
4843
4844 this_run = now(CLOCK_MONOTONIC);
4845
4846 l = log2u64(this_run - e->last_run_usec);
4847 assert(l < ELEMENTSOF(e->delays));
4848 e->delays[l]++;
4849
4850 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4851 event_log_delays(e);
4852 e->last_log_usec = this_run;
4853 }
4854 }
4855
4856 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4857 PROTECT_EVENT(e);
4858
4859 r = sd_event_prepare(e);
4860 if (r == 0)
4861 /* There was nothing? Then wait... */
4862 r = sd_event_wait(e, timeout);
4863
4864 if (e->profile_delays)
4865 e->last_run_usec = now(CLOCK_MONOTONIC);
4866
4867 if (r > 0) {
4868 /* There's something now, then let's dispatch it */
4869 r = sd_event_dispatch(e);
4870 if (r < 0)
4871 return r;
4872
4873 return 1;
4874 }
4875
4876 return r;
4877 }
4878
4879 _public_ int sd_event_loop(sd_event *e) {
4880 int r;
4881
4882 assert_return(e, -EINVAL);
4883 assert_return(e = event_resolve(e), -ENOPKG);
4884 assert_return(!event_origin_changed(e), -ECHILD);
4885 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4886
4887
4888 PROTECT_EVENT(e);
4889
4890 while (e->state != SD_EVENT_FINISHED) {
4891 r = sd_event_run(e, UINT64_MAX);
4892 if (r < 0)
4893 return r;
4894 }
4895
4896 return e->exit_code;
4897 }
4898
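/* Usage sketch (illustrative): a minimal program built on the default event loop. */
#if 0
#include <systemd/sd-event.h>

int main(void) {
        sd_event *e = NULL;

        int r = sd_event_default(&e);
        if (r < 0)
                return 1;

        /* Let SIGINT/SIGTERM terminate the loop cleanly (see sd_event_set_signal_exit() below). */
        (void) sd_event_set_signal_exit(e, 1);

        r = sd_event_loop(e); /* returns the code passed to sd_event_exit(), or a negative error */
        sd_event_unref(e);
        return r < 0 ? 1 : 0;
}
#endif
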
4899 _public_ int sd_event_get_fd(sd_event *e) {
4900 assert_return(e, -EINVAL);
4901 assert_return(e = event_resolve(e), -ENOPKG);
4902 assert_return(!event_origin_changed(e), -ECHILD);
4903
4904 return e->epoll_fd;
4905 }
4906
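/* Usage sketch (illustrative): the epoll fd returned here is what allows embedding this loop into a
 * foreign one; poll it, and when it becomes readable run one iteration with a zero timeout: */
#if 0
#include <poll.h>
#include <systemd/sd-event.h>

static int pump_from_foreign_loop(sd_event *e) {
        int fd = sd_event_get_fd(e);
        if (fd < 0)
                return fd;

        struct pollfd p = { .fd = fd, .events = POLLIN };
        if (poll(&p, 1, -1) < 0)
                return -1;

        /* The loop fd is ready: process whatever is pending without blocking. */
        return sd_event_run(e, 0);
}
#endif
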
4907 _public_ int sd_event_get_state(sd_event *e) {
4908 assert_return(e, -EINVAL);
4909 assert_return(e = event_resolve(e), -ENOPKG);
4910 assert_return(!event_origin_changed(e), -ECHILD);
4911
4912 return e->state;
4913 }
4914
4915 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4916 assert_return(e, -EINVAL);
4917 assert_return(e = event_resolve(e), -ENOPKG);
4918 assert_return(code, -EINVAL);
4919 assert_return(!event_origin_changed(e), -ECHILD);
4920
4921 if (!e->exit_requested)
4922 return -ENODATA;
4923
4924 *code = e->exit_code;
4925 return 0;
4926 }
4927
4928 _public_ int sd_event_exit(sd_event *e, int code) {
4929 assert_return(e, -EINVAL);
4930 assert_return(e = event_resolve(e), -ENOPKG);
4931 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4932 assert_return(!event_origin_changed(e), -ECHILD);
4933
4934 e->exit_requested = true;
4935 e->exit_code = code;
4936
4937 return 0;
4938 }
4939
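/* Usage sketch (illustrative): sd_event_exit() is typically requested from within a handler; here from
 * a SIGTERM callback (names invented): */
#if 0
#include <signal.h>
#include <sys/signalfd.h>
#include <systemd/sd-event.h>

static int on_sigterm(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        /* Request loop termination; sd_event_loop() will eventually return this code. */
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

static int install_sigterm_exit(sd_event *e) {
        return sd_event_add_signal(e, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, on_sigterm, NULL);
}
#endif
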
4940 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4941 assert_return(e, -EINVAL);
4942 assert_return(e = event_resolve(e), -ENOPKG);
4943 assert_return(usec, -EINVAL);
4944 assert_return(!event_origin_changed(e), -ECHILD);
4945
4946 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4947 return -EOPNOTSUPP;
4948
4949 if (!triple_timestamp_is_set(&e->timestamp)) {
4950 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4951 *usec = now(clock);
4952 return 1;
4953 }
4954
4955 *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4956 return 0;
4957 }
4958
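/* Usage sketch (illustrative): sd_event_now() returns the timestamp cached at the last wakeup (falling
 * back to now() before the first iteration); the usual way to arm a relative timer is to add an offset
 * to it. Handler and helper names are invented: */
#if 0
#include <time.h>
#include <systemd/sd-event.h>

static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
        return 0;
}

static int add_timer_in_5s(sd_event *e, sd_event_source **ret) {
        uint64_t now_usec;

        int r = sd_event_now(e, CLOCK_MONOTONIC, &now_usec);
        if (r < 0)
                return r;

        /* Fire once, 5s from the loop's current timestamp, with 100ms accuracy. */
        return sd_event_add_time(e, ret, CLOCK_MONOTONIC,
                                 now_usec + 5 * 1000000ULL, 100 * 1000ULL,
                                 on_timer, NULL);
}
#endif
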
4959 _public_ int sd_event_default(sd_event **ret) {
4960 sd_event *e = NULL;
4961 int r;
4962
4963 if (!ret)
4964 return !!default_event;
4965
4966 if (default_event) {
4967 *ret = sd_event_ref(default_event);
4968 return 0;
4969 }
4970
4971 r = sd_event_new(&e);
4972 if (r < 0)
4973 return r;
4974
4975 e->default_event_ptr = &default_event;
4976 e->tid = gettid();
4977 default_event = e;
4978
4979 *ret = e;
4980 return 1;
4981 }
4982
4983 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
4984 assert_return(e, -EINVAL);
4985 assert_return(e = event_resolve(e), -ENOPKG);
4986 assert_return(tid, -EINVAL);
4987 assert_return(!event_origin_changed(e), -ECHILD);
4988
4989 if (e->tid != 0) {
4990 *tid = e->tid;
4991 return 0;
4992 }
4993
4994 return -ENXIO;
4995 }
4996
4997 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
4998 int r;
4999
5000 assert_return(e, -EINVAL);
5001 assert_return(e = event_resolve(e), -ENOPKG);
5002 assert_return(!event_origin_changed(e), -ECHILD);
5003
5004 if (e->watchdog == !!b)
5005 return e->watchdog;
5006
5007 if (b) {
5008 r = sd_watchdog_enabled(false, &e->watchdog_period);
5009 if (r <= 0)
5010 return r;
5011
5012 /* Issue first ping immediately */
5013 sd_notify(false, "WATCHDOG=1");
5014 e->watchdog_last = now(CLOCK_MONOTONIC);
5015
5016 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
5017 if (e->watchdog_fd < 0)
5018 return -errno;
5019
5020 r = arm_watchdog(e);
5021 if (r < 0)
5022 goto fail;
5023
5024 struct epoll_event ev = {
5025 .events = EPOLLIN,
5026 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
5027 };
5028
5029 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
5030 r = -errno;
5031 goto fail;
5032 }
5033
5034 } else {
5035 if (e->watchdog_fd >= 0) {
5036 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
5037 e->watchdog_fd = safe_close(e->watchdog_fd);
5038 }
5039 }
5040
5041 e->watchdog = b;
5042 return e->watchdog;
5043
5044 fail:
5045 e->watchdog_fd = safe_close(e->watchdog_fd);
5046 return r;
5047 }
5048
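/* Usage sketch (illustrative): in a service whose unit sets WatchdogSec=, enabling the loop-level
 * watchdog makes arm_watchdog()/process_watchdog() above send the WATCHDOG=1 keep-alives automatically: */
#if 0
#include <systemd/sd-event.h>

static int enable_keepalive(sd_event *e) {
        /* Returns > 0 and starts pinging only if the service manager set WATCHDOG_USEC (i.e. the unit
         * configures WatchdogSec=); returns 0 if no watchdog was requested. */
        return sd_event_set_watchdog(e, 1);
}
#endif
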
5049 _public_ int sd_event_get_watchdog(sd_event *e) {
5050 assert_return(e, -EINVAL);
5051 assert_return(e = event_resolve(e), -ENOPKG);
5052 assert_return(!event_origin_changed(e), -ECHILD);
5053
5054 return e->watchdog;
5055 }
5056
5057 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5058 assert_return(e, -EINVAL);
5059 assert_return(e = event_resolve(e), -ENOPKG);
5060 assert_return(!event_origin_changed(e), -ECHILD);
5061
5062 *ret = e->iteration;
5063 return 0;
5064 }
5065
5066 _public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5067 assert_return(s, -EINVAL);
5068 assert_return(s->event, -EINVAL);
5069 assert_return(!event_origin_changed(s->event), -ECHILD);
5070
5071 s->destroy_callback = callback;
5072 return 0;
5073 }
5074
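/* Usage sketch (illustrative): a destroy callback is the usual way to tie the lifetime of heap-allocated
 * userdata to the source that uses it. Names invented: */
#if 0
#include <errno.h>
#include <stdlib.h>
#include <systemd/sd-event.h>

struct ctx {
        int counter;
};

static int on_defer_count(sd_event_source *s, void *userdata) {
        struct ctx *c = userdata;
        c->counter++;
        return 0;
}

static int add_source_with_owned_userdata(sd_event *e, sd_event_source **ret) {
        struct ctx *c = calloc(1, sizeof(*c));
        if (!c)
                return -ENOMEM;

        int r = sd_event_add_defer(e, ret, on_defer_count, c);
        if (r < 0) {
                free(c);
                return r;
        }

        /* free() the context automatically whenever the source is destroyed. */
        return sd_event_source_set_destroy_callback(*ret, free);
}
#endif
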
5075 _public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5076 assert_return(s, -EINVAL);
5077 assert_return(!event_origin_changed(s->event), -ECHILD);
5078
5079 if (ret)
5080 *ret = s->destroy_callback;
5081
5082 return !!s->destroy_callback;
5083 }
5084
5085 _public_ int sd_event_source_get_floating(sd_event_source *s) {
5086 assert_return(s, -EINVAL);
5087 assert_return(!event_origin_changed(s->event), -ECHILD);
5088
5089 return s->floating;
5090 }
5091
5092 _public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5093 assert_return(s, -EINVAL);
5094 assert_return(!event_origin_changed(s->event), -ECHILD);
5095
5096 if (s->floating == !!b)
5097 return 0;
5098
5099 if (!s->event) /* Already disconnected */
5100 return -ESTALE;
5101
5102 s->floating = b;
5103
5104 if (b) {
5105 sd_event_source_ref(s);
5106 sd_event_unref(s->event);
5107 } else {
5108 sd_event_ref(s->event);
5109 sd_event_source_unref(s);
5110 }
5111
5112 return 1;
5113 }
5114
5115 _public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5116 assert_return(s, -EINVAL);
5117 assert_return(s->type != SOURCE_EXIT, -EDOM);
5118 assert_return(!event_origin_changed(s->event), -ECHILD);
5119
5120 return s->exit_on_failure;
5121 }
5122
5123 _public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5124 assert_return(s, -EINVAL);
5125 assert_return(s->type != SOURCE_EXIT, -EDOM);
5126 assert_return(!event_origin_changed(s->event), -ECHILD);
5127
5128 if (s->exit_on_failure == !!b)
5129 return 0;
5130
5131 s->exit_on_failure = b;
5132 return 1;
5133 }
5134
5135 _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5136 int r;
5137
5138 assert_return(s, -EINVAL);
5139 assert_return(!event_origin_changed(s->event), -ECHILD);
5140
5141 /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
5142 * so is a programming error. */
5143 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5144
5145 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5146 * non-ratelimited. */
5147 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5148 if (r < 0)
5149 return r;
5150
5151 s->rate_limit = (RateLimit) { interval, burst };
5152 return 0;
5153 }
5154
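/* Usage sketch (illustrative): a typical use is capping how often a busy source may wake the loop; once
 * the burst is exceeded within the interval the source is taken offline until the interval passes, and
 * the optional expiry callback fires when it comes back. Names invented: */
#if 0
#include <systemd/sd-event.h>

static int on_ratelimit_expired(sd_event_source *s, void *userdata) {
        /* Called once when the source leaves the ratelimited state again. */
        return 0;
}

static int cap_source_rate(sd_event_source *s) {
        /* Allow at most 100 dispatches per 1s interval for this source. */
        int r = sd_event_source_set_ratelimit(s, 1000000ULL, 100);
        if (r < 0)
                return r;

        return sd_event_source_set_ratelimit_expire_callback(s, on_ratelimit_expired);
}
#endif
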
5155 _public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5156 assert_return(s, -EINVAL);
5157 assert_return(!event_origin_changed(s->event), -ECHILD);
5158
5159 s->ratelimit_expire_callback = callback;
5160 return 0;
5161 }
5162
5163 _public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5164 assert_return(s, -EINVAL);
5165 assert_return(!event_origin_changed(s->event), -ECHILD);
5166
5167 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5168 * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
5169 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5170 return -EDOM;
5171
5172 if (!ratelimit_configured(&s->rate_limit))
5173 return -ENOEXEC;
5174
5175 if (ret_interval)
5176 *ret_interval = s->rate_limit.interval;
5177 if (ret_burst)
5178 *ret_burst = s->rate_limit.burst;
5179
5180 return 0;
5181 }
5182
5183 _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5184 assert_return(s, -EINVAL);
5185 assert_return(!event_origin_changed(s->event), -ECHILD);
5186
5187 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5188 return false;
5189
5190 if (!ratelimit_configured(&s->rate_limit))
5191 return false;
5192
5193 return s->ratelimited;
5194 }
5195
5196 _public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
5197 int r;
5198
5199 assert_return(s, -EINVAL);
5200
5201 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5202 return 0;
5203
5204 if (!ratelimit_configured(&s->rate_limit))
5205 return 0;
5206
5207 if (!s->ratelimited)
5208 return 0;
5209
5210 r = event_source_leave_ratelimit(s, /* run_callback */ false);
5211 if (r < 0)
5212 return r;
5213
5214 return 1; /* tell caller that we indeed just left the ratelimit state */
5215 }
5216
5217 _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5218 bool change = false;
5219 int r;
5220
5221 assert_return(e, -EINVAL);
5222
5223 if (b) {
5224 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5225 * to. But we also don't want them to pin the event loop itself. Hence we mark them as
5226 * floating after creation (and undo this before deleting them again). */
5227
5228 if (!e->sigint_event_source) {
5229 r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5230 if (r < 0)
5231 return r;
5232
5233 assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5234 change = true;
5235 }
5236
5237 if (!e->sigterm_event_source) {
5238 r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5239 if (r < 0) {
5240 if (change) {
5241 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5242 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5243 }
5244
5245 return r;
5246 }
5247
5248 assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5249 change = true;
5250 }
5251
5252 } else {
5253 if (e->sigint_event_source) {
5254 assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5255 e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5256 change = true;
5257 }
5258
5259 if (e->sigterm_event_source) {
5260 assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5261 e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5262 change = true;
5263 }
5264 }
5265
5266 return change;
5267 }
5268
5269 _public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5270 _cleanup_free_ char *b = NULL;
5271 _cleanup_free_ void *w = NULL;
5272
5273 assert_return(s, -EINVAL);
5274 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5275 assert_return(ty, -EINVAL);
5276 assert_return(!event_origin_changed(s->event), -ECHILD);
5277
5278 if (!STR_IN_SET(ty, "some", "full"))
5279 return -EINVAL;
5280
5281 if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5282 return -EBUSY;
5283
5284 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5285 if (!space)
5286 return -EINVAL;
5287
5288 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5289 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5290 if (!b)
5291 return -ENOMEM;
5292 if (!STR_IN_SET(b, "some", "full"))
5293 return -EINVAL;
5294
5295 if (streq(b, ty))
5296 return 0;
5297
5298 size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5299 w = new(char, nl);
5300 if (!w)
5301 return -ENOMEM;
5302
5303 memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5304
5305 free_and_replace(s->memory_pressure.write_buffer, w);
5306 s->memory_pressure.write_buffer_size = nl;
5307 s->memory_pressure.locked = false;
5308
5309 return 1;
5310 }
5311
5312 _public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5313 _cleanup_free_ char *b = NULL;
5314 _cleanup_free_ void *w = NULL;
5315
5316 assert_return(s, -EINVAL);
5317 assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5318 assert_return(!event_origin_changed(s->event), -ECHILD);
5319
5320 if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5321 return -ERANGE;
5322 if (window_usec <= 0 || window_usec >= UINT64_MAX)
5323 return -ERANGE;
5324 if (threshold_usec > window_usec)
5325 return -EINVAL;
5326
5327 if (s->memory_pressure.locked) /* Refuse adjusting parameters if the caller already told us how to watch for events */
5328 return -EBUSY;
5329
5330 char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5331 if (!space)
5332 return -EINVAL;
5333
5334 size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5335 b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5336 if (!b)
5337 return -ENOMEM;
5338 if (!STR_IN_SET(b, "some", "full"))
5339 return -EINVAL;
5340
5341 if (asprintf((char**) &w,
5342 "%s " USEC_FMT " " USEC_FMT "",
5343 b,
5344 threshold_usec,
5345 window_usec) < 0)
5346 return -ENOMEM;
5347
5348 l = strlen(w) + 1;
5349 if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5350 return 0;
5351
5352 free_and_replace(s->memory_pressure.write_buffer, w);
5353 s->memory_pressure.write_buffer_size = l;
5354 s->memory_pressure.locked = false;
5355
5356 return 1;
5357 }
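
/* Usage sketch (illustrative): putting the two setters together with sd_event_add_memory_pressure().
 * The sketch below registers a "full" trigger of 100ms per 1s window; the callback and helper names are
 * invented, and the parameters must be adjusted before the first write locks them: */
#if 0
#include <systemd/sd-event.h>

static int on_pressure(sd_event_source *s, void *userdata) {
        /* Invoked when the configured pressure trigger fires: shrink caches, trim buffers, etc. */
        return 0;
}

static int add_pressure_source(sd_event *e, sd_event_source **ret) {
        int r = sd_event_add_memory_pressure(e, ret, on_pressure, NULL);
        if (r < 0)
                return r;

        r = sd_event_source_set_memory_pressure_type(*ret, "full");
        if (r < 0)
                return r;

        /* "full" stalls of at least 100ms within each 1s window. */
        return sd_event_source_set_memory_pressure_period(*ret, 100 * 1000ULL, 1000 * 1000ULL);
}
#endif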