src/libsystemd/sd-event/sd-event.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <sys/epoll.h>
   4 #include <sys/timerfd.h>
   5 #include <sys/wait.h>
   6
   7 #include "sd-daemon.h"
   8 #include "sd-event.h"
   9 #include "sd-id128.h"
  10 #include "sd-messages.h"
  11
  12 #include "alloc-util.h"
  13 #include "env-util.h"
  14 #include "event-source.h"
  15 #include "fd-util.h"
  16 #include "fs-util.h"
  17 #include "glyph-util.h"
  18 #include "hashmap.h"
  19 #include "hexdecoct.h"
  20 #include "list.h"
  21 #include "logarithm.h"
  22 #include "macro.h"
  23 #include "mallinfo-util.h"
  24 #include "memory-util.h"
  25 #include "missing_magic.h"
  26 #include "missing_syscall.h"
  27 #include "missing_threads.h"
  28 #include "origin-id.h"
  29 #include "path-util.h"
  30 #include "prioq.h"
  31 #include "process-util.h"
  32 #include "psi-util.h"
  33 #include "set.h"
  34 #include "signal-util.h"
  35 #include "socket-util.h"
  36 #include "stat-util.h"
  37 #include "string-table.h"
  38 #include "string-util.h"
  39 #include "strxcpyx.h"
  40 #include "time-util.h"
  41
/* Default accuracy value, 250ms expressed in microseconds.
 * NOTE(review): presumably applied to timer event sources that do not specify an accuracy of their
 * own; the user of this constant is not visible in this part of the file — confirm. */
#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
  43
  44 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
  45         /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
  46         return s &&
  47                 s->type == SOURCE_CHILD &&
  48                 s->child.pidfd >= 0 &&
  49                 s->child.options == WEXITED;
  50 }
  51
  52 static bool event_source_is_online(sd_event_source *s) {
  53         assert(s);
  54         return s->enabled != SD_EVENT_OFF && !s->ratelimited;
  55 }
  56
  57 static bool event_source_is_offline(sd_event_source *s) {
  58         assert(s);
  59         return s->enabled == SD_EVENT_OFF || s->ratelimited;
  60 }
  61
  62 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
  63         [SOURCE_IO]                  = "io",
  64         [SOURCE_TIME_REALTIME]       = "realtime",
  65         [SOURCE_TIME_BOOTTIME]       = "bootime",
  66         [SOURCE_TIME_MONOTONIC]      = "monotonic",
  67         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
  68         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
  69         [SOURCE_SIGNAL]              = "signal",
  70         [SOURCE_CHILD]               = "child",
  71         [SOURCE_DEFER]               = "defer",
  72         [SOURCE_POST]                = "post",
  73         [SOURCE_EXIT]                = "exit",
  74         [SOURCE_WATCHDOG]            = "watchdog",
  75         [SOURCE_INOTIFY]             = "inotify",
  76         [SOURCE_MEMORY_PRESSURE]     = "memory-pressure",
  77 };
  78
  79 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
  80
/* Evaluates to true if the event source type t is one of the five clock based (timer) source
 * types. */
#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)
  88
/* Evaluates to true if the event source type t is one of the types for which rate limiting is
 * supported: I/O, all five timer types, signal, defer, inotify and memory pressure sources. */
#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_IO,                       \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_SIGNAL,                   \
               SOURCE_DEFER,                    \
               SOURCE_INOTIFY,                  \
               SOURCE_MEMORY_PRESSURE)
 101
/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Both time sources and rate limited sources may be passed there, and every time source type can also
 * be rate limited, so effectively this is the same set of types as the EVENT_SOURCE_CAN_RATE_LIMIT()
 * macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
 106
/* The event loop object itself. Allocated and initialized by sd_event_new(), reference counted via
 * sd_event_ref()/sd_event_unref(), and released by event_free() once the last reference is gone. */
struct sd_event {
        unsigned n_ref; /* reference counter, see sd_event_ref() and sd_event_unref() */

        int epoll_fd; /* the epoll instance the fds of event sources and signalfds are registered with */
        int watchdog_fd;

        Prioq *pending; /* ordered by pending_prioq_compare() */
        Prioq *prepare; /* NOTE(review): presumably ordered by prepare_prioq_compare() — allocation not visible here */

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic; /* also used for sources that are currently rate limited */
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb; /* initialized to USEC_INFINITY by sd_event_new() */

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources; /* while this is > 0, SIGCHLD is kept in the signal mask by event_gc_signal_data() */

        Set *post_sources;

        Prioq *exit;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close_list);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, buffered_inotify_data_list);

        /* A list of memory pressure event sources that still need their subscription string written */
        LIST_HEAD(sd_event_source, memory_pressure_write_list);

        uint64_t origin_id; /* set from origin_id_query() at creation time */

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1; /* turned on by sd_event_new() if $SD_EVENT_PROFILE_DELAYS is set */

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr; /* if set, the pointed-to variable is reset to NULL by event_free() */

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources; /* must have dropped to zero by the time event_free() released all sources */

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        sd_event_source *sigint_event_source, *sigterm_event_source; /* both are unreferenced by event_free() */

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8]; /* one slot per bit of usec_t; NOTE(review): presumably the logarithmic delay histogram used when profile_delays is on — confirm */
};
 177
/* NOTE(review): presumably this generates the event_origin_changed() helper used throughout this
 * file, based on the origin_id field of sd_event — confirm against origin-id.h. */
DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);

/* The per-thread default event loop; this is what SD_EVENT_DEFAULT resolves to in event_resolve(). */
static thread_local sd_event *default_event = NULL;

/* Forward declarations of helpers that are used before they are defined. */
static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);
 184
 185 static sd_event *event_resolve(sd_event *e) {
 186         return e == SD_EVENT_DEFAULT ? default_event : e;
 187 }
 188
 189 static int pending_prioq_compare(const void *a, const void *b) {
 190         const sd_event_source *x = a, *y = b;
 191         int r;
 192
 193         assert(x->pending);
 194         assert(y->pending);
 195
 196         /* Enabled ones first */
 197         r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
 198         if (r != 0)
 199                 return r;
 200
 201         /* Non rate-limited ones first. */
 202         r = CMP(!!x->ratelimited, !!y->ratelimited);
 203         if (r != 0)
 204                 return r;
 205
 206         /* Lower priority values first */
 207         r = CMP(x->priority, y->priority);
 208         if (r != 0)
 209                 return r;
 210
 211         /* Older entries first */
 212         return CMP(x->pending_iteration, y->pending_iteration);
 213 }
 214
 215 static int prepare_prioq_compare(const void *a, const void *b) {
 216         const sd_event_source *x = a, *y = b;
 217         int r;
 218
 219         assert(x->prepare);
 220         assert(y->prepare);
 221
 222         /* Enabled ones first */
 223         r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
 224         if (r != 0)
 225                 return r;
 226
 227         /* Non rate-limited ones first. */
 228         r = CMP(!!x->ratelimited, !!y->ratelimited);
 229         if (r != 0)
 230                 return r;
 231
 232         /* Move most recently prepared ones last, so that we can stop
 233          * preparing as soon as we hit one that has already been
 234          * prepared in the current iteration */
 235         r = CMP(x->prepare_iteration, y->prepare_iteration);
 236         if (r != 0)
 237                 return r;
 238
 239         /* Lower priority values first */
 240         return CMP(x->priority, y->priority);
 241 }
 242
 243 static usec_t time_event_source_next(const sd_event_source *s) {
 244         assert(s);
 245
 246         /* We have two kinds of event sources that have elapsation times associated with them: the actual
 247          * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
 248          * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
 249          * looking at here. */
 250
 251         if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
 252                 assert(s->rate_limit.begin != 0);
 253                 assert(s->rate_limit.interval != 0);
 254                 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
 255         }
 256
 257         /* Otherwise this must be a time event source, if not ratelimited */
 258         if (EVENT_SOURCE_IS_TIME(s->type))
 259                 return s->time.next;
 260
 261         return USEC_INFINITY;
 262 }
 263
 264 static usec_t time_event_source_latest(const sd_event_source *s) {
 265         assert(s);
 266
 267         if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
 268                                * same, as we should avoid adding additional inaccuracy on an inaccuracy time
 269                                * window */
 270                 assert(s->rate_limit.begin != 0);
 271                 assert(s->rate_limit.interval != 0);
 272                 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
 273         }
 274
 275         /* Must be a time event source, if not ratelimited */
 276         if (EVENT_SOURCE_IS_TIME(s->type))
 277                 return usec_add(s->time.next, s->time.accuracy);
 278
 279         return USEC_INFINITY;
 280 }
 281
 282 static bool event_source_timer_candidate(const sd_event_source *s) {
 283         assert(s);
 284
 285         /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
 286          * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
 287         return !s->pending || s->ratelimited;
 288 }
 289
 290 static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
 291         const sd_event_source *x = a, *y = b;
 292         int r;
 293
 294         /* Enabled ones first */
 295         r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
 296         if (r != 0)
 297                 return r;
 298
 299         /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
 300         r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
 301         if (r != 0)
 302                 return r;
 303
 304         /* Order by time */
 305         return CMP(time_func(x), time_func(y));
 306 }
 307
 308 static int earliest_time_prioq_compare(const void *a, const void *b) {
 309         return time_prioq_compare(a, b, time_event_source_next);
 310 }
 311
 312 static int latest_time_prioq_compare(const void *a, const void *b) {
 313         return time_prioq_compare(a, b, time_event_source_latest);
 314 }
 315
 316 static int exit_prioq_compare(const void *a, const void *b) {
 317         const sd_event_source *x = a, *y = b;
 318         int r;
 319
 320         assert(x->type == SOURCE_EXIT);
 321         assert(y->type == SOURCE_EXIT);
 322
 323         /* Enabled ones first */
 324         r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
 325         if (r != 0)
 326                 return r;
 327
 328         /* Lower priority values first */
 329         return CMP(x->priority, y->priority);
 330 }
 331
 332 static void free_clock_data(struct clock_data *d) {
 333         assert(d);
 334         assert(d->wakeup == WAKEUP_CLOCK_DATA);
 335
 336         safe_close(d->fd);
 337         prioq_free(d->earliest);
 338         prioq_free(d->latest);
 339 }
 340
 341 static sd_event *event_free(sd_event *e) {
 342         sd_event_source *s;
 343
 344         assert(e);
 345
 346         e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
 347         e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
 348
 349         while ((s = e->sources)) {
 350                 assert(s->floating);
 351                 source_disconnect(s);
 352                 sd_event_source_unref(s);
 353         }
 354
 355         assert(e->n_sources == 0);
 356
 357         if (e->default_event_ptr)
 358                 *(e->default_event_ptr) = NULL;
 359
 360         safe_close(e->epoll_fd);
 361         safe_close(e->watchdog_fd);
 362
 363         free_clock_data(&e->realtime);
 364         free_clock_data(&e->boottime);
 365         free_clock_data(&e->monotonic);
 366         free_clock_data(&e->realtime_alarm);
 367         free_clock_data(&e->boottime_alarm);
 368
 369         prioq_free(e->pending);
 370         prioq_free(e->prepare);
 371         prioq_free(e->exit);
 372
 373         free(e->signal_sources);
 374         hashmap_free(e->signal_data);
 375
 376         hashmap_free(e->inotify_data);
 377
 378         hashmap_free(e->child_sources);
 379         set_free(e->post_sources);
 380
 381         free(e->event_queue);
 382
 383         return mfree(e);
 384 }
 385
 386 _public_ int sd_event_new(sd_event** ret) {
 387         sd_event *e;
 388         int r;
 389
 390         assert_return(ret, -EINVAL);
 391
 392         e = new(sd_event, 1);
 393         if (!e)
 394                 return -ENOMEM;
 395
 396         *e = (sd_event) {
 397                 .n_ref = 1,
 398                 .epoll_fd = -EBADF,
 399                 .watchdog_fd = -EBADF,
 400                 .realtime.wakeup = WAKEUP_CLOCK_DATA,
 401                 .realtime.fd = -EBADF,
 402                 .realtime.next = USEC_INFINITY,
 403                 .boottime.wakeup = WAKEUP_CLOCK_DATA,
 404                 .boottime.fd = -EBADF,
 405                 .boottime.next = USEC_INFINITY,
 406                 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
 407                 .monotonic.fd = -EBADF,
 408                 .monotonic.next = USEC_INFINITY,
 409                 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
 410                 .realtime_alarm.fd = -EBADF,
 411                 .realtime_alarm.next = USEC_INFINITY,
 412                 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
 413                 .boottime_alarm.fd = -EBADF,
 414                 .boottime_alarm.next = USEC_INFINITY,
 415                 .perturb = USEC_INFINITY,
 416                 .origin_id = origin_id_query(),
 417         };
 418
 419         r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
 420         if (r < 0)
 421                 goto fail;
 422
 423         e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
 424         if (e->epoll_fd < 0) {
 425                 r = -errno;
 426                 goto fail;
 427         }
 428
 429         e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
 430
 431         if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
 432                 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
 433                           special_glyph(SPECIAL_GLYPH_ELLIPSIS));
 434                 e->profile_delays = true;
 435         }
 436
 437         *ret = e;
 438         return 0;
 439
 440 fail:
 441         event_free(e);
 442         return r;
 443 }
 444
 445 /* Define manually so we can add the origin check */
 446 _public_ sd_event *sd_event_ref(sd_event *e) {
 447         if (!e)
 448                 return NULL;
 449         if (event_origin_changed(e))
 450                 return NULL;
 451
 452         e->n_ref++;
 453
 454         return e;
 455 }
 456
 457 _public_ sd_event* sd_event_unref(sd_event *e) {
 458         if (!e)
 459                 return NULL;
 460         if (event_origin_changed(e))
 461                 return NULL;
 462
 463         assert(e->n_ref > 0);
 464         if (--e->n_ref > 0)
 465                 return NULL;
 466
 467         return event_free(e);
 468 }
 469
/* Takes a reference on the event loop e for the remainder of the enclosing scope: the reference is
 * stored in a local variable named _ref, whose cleanup handler (sd_event_unrefp) is run when the
 * scope is left. Since the variable name is fixed, this can be used only once per scope. */
#define PROTECT_EVENT(e)                                                \
        _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
 472
 473 _public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
 474         if (s)
 475                 (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
 476         return sd_event_source_unref(s);
 477 }
 478
 479 static void source_io_unregister(sd_event_source *s) {
 480         assert(s);
 481         assert(s->type == SOURCE_IO);
 482
 483         if (event_origin_changed(s->event))
 484                 return;
 485
 486         if (!s->io.registered)
 487                 return;
 488
 489         if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
 490                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
 491                                 strna(s->description), event_source_type_to_string(s->type));
 492
 493         s->io.registered = false;
 494 }
 495
 496 static int source_io_register(
 497                 sd_event_source *s,
 498                 int enabled,
 499                 uint32_t events) {
 500
 501         assert(s);
 502         assert(s->type == SOURCE_IO);
 503         assert(enabled != SD_EVENT_OFF);
 504
 505         struct epoll_event ev = {
 506                 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
 507                 .data.ptr = s,
 508         };
 509
 510         if (epoll_ctl(s->event->epoll_fd,
 511                       s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
 512                       s->io.fd, &ev) < 0)
 513                 return -errno;
 514
 515         s->io.registered = true;
 516
 517         return 0;
 518 }
 519
 520 static void source_child_pidfd_unregister(sd_event_source *s) {
 521         assert(s);
 522         assert(s->type == SOURCE_CHILD);
 523
 524         if (event_origin_changed(s->event))
 525                 return;
 526
 527         if (!s->child.registered)
 528                 return;
 529
 530         if (EVENT_SOURCE_WATCH_PIDFD(s))
 531                 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
 532                         log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
 533                                         strna(s->description), event_source_type_to_string(s->type));
 534
 535         s->child.registered = false;
 536 }
 537
 538 static int source_child_pidfd_register(sd_event_source *s, int enabled) {
 539         assert(s);
 540         assert(s->type == SOURCE_CHILD);
 541         assert(enabled != SD_EVENT_OFF);
 542
 543         if (EVENT_SOURCE_WATCH_PIDFD(s)) {
 544                 struct epoll_event ev = {
 545                         .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
 546                         .data.ptr = s,
 547                 };
 548
 549                 if (epoll_ctl(s->event->epoll_fd,
 550                               s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
 551                               s->child.pidfd, &ev) < 0)
 552                         return -errno;
 553         }
 554
 555         s->child.registered = true;
 556         return 0;
 557 }
 558
 559 static void source_memory_pressure_unregister(sd_event_source *s) {
 560         assert(s);
 561         assert(s->type == SOURCE_MEMORY_PRESSURE);
 562
 563         if (event_origin_changed(s->event))
 564                 return;
 565
 566         if (!s->memory_pressure.registered)
 567                 return;
 568
 569         if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
 570                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
 571                                 strna(s->description), event_source_type_to_string(s->type));
 572
 573         s->memory_pressure.registered = false;
 574 }
 575
 576 static int source_memory_pressure_register(sd_event_source *s, int enabled) {
 577         assert(s);
 578         assert(s->type == SOURCE_MEMORY_PRESSURE);
 579         assert(enabled != SD_EVENT_OFF);
 580
 581         struct epoll_event ev = {
 582                 .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
 583                           (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
 584                 .data.ptr = s,
 585         };
 586
 587         if (epoll_ctl(s->event->epoll_fd,
 588                       s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
 589                       s->memory_pressure.fd, &ev) < 0)
 590                 return -errno;
 591
 592         s->memory_pressure.registered = true;
 593         return 0;
 594 }
 595
 596 static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
 597         assert(s);
 598         assert(s->type == SOURCE_MEMORY_PRESSURE);
 599
 600         if (s->memory_pressure.in_write_list)
 601                 return;
 602
 603         LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
 604         s->memory_pressure.in_write_list = true;
 605 }
 606
 607 static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
 608         assert(s);
 609         assert(s->type == SOURCE_MEMORY_PRESSURE);
 610
 611         if (!s->memory_pressure.in_write_list)
 612                 return;
 613
 614         LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
 615         s->memory_pressure.in_write_list = false;
 616 }
 617
 618 static clockid_t event_source_type_to_clock(EventSourceType t) {
 619
 620         switch (t) {
 621
 622         case SOURCE_TIME_REALTIME:
 623                 return CLOCK_REALTIME;
 624
 625         case SOURCE_TIME_BOOTTIME:
 626                 return CLOCK_BOOTTIME;
 627
 628         case SOURCE_TIME_MONOTONIC:
 629                 return CLOCK_MONOTONIC;
 630
 631         case SOURCE_TIME_REALTIME_ALARM:
 632                 return CLOCK_REALTIME_ALARM;
 633
 634         case SOURCE_TIME_BOOTTIME_ALARM:
 635                 return CLOCK_BOOTTIME_ALARM;
 636
 637         default:
 638                 return (clockid_t) -1;
 639         }
 640 }
 641
 642 static EventSourceType clock_to_event_source_type(clockid_t clock) {
 643
 644         switch (clock) {
 645
 646         case CLOCK_REALTIME:
 647                 return SOURCE_TIME_REALTIME;
 648
 649         case CLOCK_BOOTTIME:
 650                 return SOURCE_TIME_BOOTTIME;
 651
 652         case CLOCK_MONOTONIC:
 653                 return SOURCE_TIME_MONOTONIC;
 654
 655         case CLOCK_REALTIME_ALARM:
 656                 return SOURCE_TIME_REALTIME_ALARM;
 657
 658         case CLOCK_BOOTTIME_ALARM:
 659                 return SOURCE_TIME_BOOTTIME_ALARM;
 660
 661         default:
 662                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
 663         }
 664 }
 665
 666 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
 667         assert(e);
 668
 669         switch (t) {
 670
 671         case SOURCE_TIME_REALTIME:
 672                 return &e->realtime;
 673
 674         case SOURCE_TIME_BOOTTIME:
 675                 return &e->boottime;
 676
 677         case SOURCE_TIME_MONOTONIC:
 678                 return &e->monotonic;
 679
 680         case SOURCE_TIME_REALTIME_ALARM:
 681                 return &e->realtime_alarm;
 682
 683         case SOURCE_TIME_BOOTTIME_ALARM:
 684                 return &e->boottime_alarm;
 685
 686         default:
 687                 return NULL;
 688         }
 689 }
 690
 691 static void event_free_signal_data(sd_event *e, struct signal_data *d) {
 692         assert(e);
 693
 694         if (!d)
 695                 return;
 696
 697         hashmap_remove(e->signal_data, &d->priority);
 698         safe_close(d->fd);
 699         free(d);
 700 }
 701
 702 static int event_make_signal_data(
 703                 sd_event *e,
 704                 int sig,
 705                 struct signal_data **ret) {
 706
 707         struct signal_data *d;
 708         bool added = false;
 709         sigset_t ss_copy;
 710         int64_t priority;
 711         int r;
 712
 713         assert(e);
 714
 715         if (event_origin_changed(e))
 716                 return -ECHILD;
 717
 718         if (e->signal_sources && e->signal_sources[sig])
 719                 priority = e->signal_sources[sig]->priority;
 720         else
 721                 priority = SD_EVENT_PRIORITY_NORMAL;
 722
 723         d = hashmap_get(e->signal_data, &priority);
 724         if (d) {
 725                 if (sigismember(&d->sigset, sig) > 0) {
 726                         if (ret)
 727                                 *ret = d;
 728                         return 0;
 729                 }
 730         } else {
 731                 d = new(struct signal_data, 1);
 732                 if (!d)
 733                         return -ENOMEM;
 734
 735                 *d = (struct signal_data) {
 736                         .wakeup = WAKEUP_SIGNAL_DATA,
 737                         .fd = -EBADF,
 738                         .priority = priority,
 739                 };
 740
 741                 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
 742                 if (r < 0) {
 743                         free(d);
 744                         return r;
 745                 }
 746
 747                 added = true;
 748         }
 749
 750         ss_copy = d->sigset;
 751         assert_se(sigaddset(&ss_copy, sig) >= 0);
 752
 753         r = signalfd(d->fd >= 0 ? d->fd : -1,   /* the first arg must be -1 or a valid signalfd */
 754                      &ss_copy,
 755                      SFD_NONBLOCK|SFD_CLOEXEC);
 756         if (r < 0) {
 757                 r = -errno;
 758                 goto fail;
 759         }
 760
 761         d->sigset = ss_copy;
 762
 763         if (d->fd >= 0) {
 764                 if (ret)
 765                         *ret = d;
 766                 return 0;
 767         }
 768
 769         d->fd = fd_move_above_stdio(r);
 770
 771         struct epoll_event ev = {
 772                 .events = EPOLLIN,
 773                 .data.ptr = d,
 774         };
 775
 776         if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
 777                 r = -errno;
 778                 goto fail;
 779         }
 780
 781         if (ret)
 782                 *ret = d;
 783
 784         return 0;
 785
 786 fail:
 787         if (added)
 788                 event_free_signal_data(e, d);
 789
 790         return r;
 791 }
 792
 793 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
 794         assert(e);
 795         assert(d);
 796
 797         /* Turns off the specified signal in the signal data
 798          * object. If the signal mask of the object becomes empty that
 799          * way removes it. */
 800
 801         if (sigismember(&d->sigset, sig) == 0)
 802                 return;
 803
 804         assert_se(sigdelset(&d->sigset, sig) >= 0);
 805
 806         if (sigisemptyset(&d->sigset)) {
 807                 /* If all the mask is all-zero we can get rid of the structure */
 808                 event_free_signal_data(e, d);
 809                 return;
 810         }
 811
 812         if (event_origin_changed(e))
 813                 return;
 814
 815         assert(d->fd >= 0);
 816
 817         if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
 818                 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
 819 }
 820
 821 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
 822         struct signal_data *d;
 823         static const int64_t zero_priority = 0;
 824
 825         assert(e);
 826
 827         /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
 828          * and possibly drop the signalfd for it. */
 829
 830         if (sig == SIGCHLD &&
 831             e->n_online_child_sources > 0)
 832                 return;
 833
 834         if (e->signal_sources &&
 835             e->signal_sources[sig] &&
 836             event_source_is_online(e->signal_sources[sig]))
 837                 return;
 838
 839         /*
 840          * The specified signal might be enabled in three different queues:
 841          *
 842          * 1) the one that belongs to the priority passed (if it is non-NULL)
 843          * 2) the one that belongs to the priority of the event source of the signal (if there is one)
 844          * 3) the 0 priority (to cover the SIGCHLD case)
 845          *
 846          * Hence, let's remove it from all three here.
 847          */
 848
 849         if (priority) {
 850                 d = hashmap_get(e->signal_data, priority);
 851                 if (d)
 852                         event_unmask_signal_data(e, d, sig);
 853         }
 854
 855         if (e->signal_sources && e->signal_sources[sig]) {
 856                 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
 857                 if (d)
 858                         event_unmask_signal_data(e, d, sig);
 859         }
 860
 861         d = hashmap_get(e->signal_data, &zero_priority);
 862         if (d)
 863                 event_unmask_signal_data(e, d, sig);
 864 }
 865
 866 static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
 867         assert(s);
 868
 869         /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
 870          * they are enabled/disabled or marked pending and such. */
 871
 872         if (s->pending)
 873                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
 874
 875         if (s->prepare)
 876                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
 877 }
 878
 879 static void event_source_time_prioq_reshuffle(sd_event_source *s) {
 880         struct clock_data *d;
 881
 882         assert(s);
 883
 884         /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
 885          * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
 886          * properly again. */
 887
 888         if (s->ratelimited)
 889                 d = &s->event->monotonic;
 890         else if (EVENT_SOURCE_IS_TIME(s->type))
 891                 assert_se(d = event_get_clock_data(s->event, s->type));
 892         else
 893                 return; /* no-op for an event source which is neither a timer nor ratelimited. */
 894
 895         prioq_reshuffle(d->earliest, s, &s->earliest_index);
 896         prioq_reshuffle(d->latest, s, &s->latest_index);
 897         d->needs_rearm = true;
 898 }
 899
 900 static void event_source_time_prioq_remove(
 901                 sd_event_source *s,
 902                 struct clock_data *d) {
 903
 904         assert(s);
 905         assert(d);
 906
 907         prioq_remove(d->earliest, s, &s->earliest_index);
 908         prioq_remove(d->latest, s, &s->latest_index);
 909         s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
 910         d->needs_rearm = true;
 911 }
 912
/* Detaches an event source from the event loop it is attached to: undoes the type-specific registration,
 * removes the source from the pending/prepare/ratelimit queues, unlinks it from the loop's list of
 * sources and, for non-floating sources, drops the reference the source held on the loop. Does nothing
 * if the source is already detached (s->event is NULL). The source object itself and any fd/pidfd it
 * carries are not released here. */
static void source_disconnect(sd_event_source *s) {
        sd_event *event;
        int r;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from the time event source here if it is not ratelimited. If
                 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        /* Release the per-signal slot so that the signal can be subscribed to again */
                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);

                        /* If this source was flagged to unblock the signal on removal, do so now. Failures
                         * are only logged, since there's nothing sensible we could do about them here. */
                        if (s->signal.unblock) {
                                sigset_t new_ss;

                                if (sigemptyset(&new_ss) < 0)
                                        log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
                                else if (sigaddset(&new_ss, s->signal.sig) < 0)
                                        log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
                                else {
                                        /* pthread_sigmask() returns the error number directly, not via errno */
                                        r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
                                        if (r != 0)
                                                log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
                                }
                        }
                }

                break;

        case SOURCE_CHILD:
                /* If the origin of the event loop changed, give up ownership of the process, so that
                 * source_free() won't kill or reap it. */
                if (event_origin_changed(s->event))
                        s->child.process_owned = false;

                /* A PID of zero or less means the source was never fully registered, in which case it is
                 * neither counted as online nor listed in the child_sources hashmap. */
                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        /* Keep the per-inotify count of pending event sources in sync */
                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
                         * continues to be watched. That's because inotify doesn't really have an API for that: we
                         * can only change watch masks with access to the original inode either by fd or by path. But
                         * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
                         * continuously and keeping the mount busy which we can't really do. We could reconstruct the
                         * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
                         * there), but given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way, leave the
                         * mask set, even if it is not minimized now, and ignore all events we aren't interested in
                         * anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_remove_from_write_list(s);
                source_memory_pressure_unregister(s);
                break;

        default:
                assert_not_reached();
        }

        /* The following queues are shared by all source types */

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        /* Ratelimited sources of any type are queued on the monotonic clock, see above */
        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        /* Clear s->event before unlinking, so that the source is marked as detached from here on */
        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        /* Only non-floating sources took a reference on the event loop in source_new() */
        if (!s->floating)
                sd_event_unref(event);
}
1069
1070 static sd_event_source* source_free(sd_event_source *s) {
1071         assert(s);
1072
1073         source_disconnect(s);
1074
1075         if (s->type == SOURCE_IO && s->io.owned)
1076                 s->io.fd = safe_close(s->io.fd);
1077
1078         if (s->type == SOURCE_CHILD) {
1079                 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
1080
1081                 if (s->child.process_owned) {
1082
1083                         if (!s->child.exited) {
1084                                 bool sent = false;
1085
1086                                 if (s->child.pidfd >= 0) {
1087                                         if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
1088                                                 if (errno == ESRCH) /* Already dead */
1089                                                         sent = true;
1090                                                 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
1091                                                         log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
1092                                                                         s->child.pid);
1093                                         } else
1094                                                 sent = true;
1095                                 }
1096
1097                                 if (!sent)
1098                                         if (kill(s->child.pid, SIGKILL) < 0)
1099                                                 if (errno != ESRCH) /* Already dead */
1100                                                         log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
1101                                                                         s->child.pid);
1102                         }
1103
1104                         if (!s->child.waited) {
1105                                 siginfo_t si = {};
1106
1107                                 /* Reap the child if we can */
1108                                 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
1109                         }
1110                 }
1111
1112                 if (s->child.pidfd_owned)
1113                         s->child.pidfd = safe_close(s->child.pidfd);
1114         }
1115
1116         if (s->type == SOURCE_MEMORY_PRESSURE) {
1117                 s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
1118                 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
1119         }
1120
1121         if (s->destroy_callback)
1122                 s->destroy_callback(s->userdata);
1123
1124         free(s->description);
1125         return mfree(s);
1126 }
1127 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
1128
1129 static int source_set_pending(sd_event_source *s, bool b) {
1130         int r;
1131
1132         assert(s);
1133         assert(s->type != SOURCE_EXIT);
1134
1135         if (s->pending == b)
1136                 return 0;
1137
1138         s->pending = b;
1139
1140         if (b) {
1141                 s->pending_iteration = s->event->iteration;
1142
1143                 r = prioq_put(s->event->pending, s, &s->pending_index);
1144                 if (r < 0) {
1145                         s->pending = false;
1146                         return r;
1147                 }
1148         } else
1149                 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1150
1151         if (EVENT_SOURCE_IS_TIME(s->type))
1152                 event_source_time_prioq_reshuffle(s);
1153
1154         if (s->type == SOURCE_SIGNAL && !b) {
1155                 struct signal_data *d;
1156
1157                 d = hashmap_get(s->event->signal_data, &s->priority);
1158                 if (d && d->current == s)
1159                         d->current = NULL;
1160         }
1161
1162         if (s->type == SOURCE_INOTIFY) {
1163
1164                 assert(s->inotify.inode_data);
1165                 assert(s->inotify.inode_data->inotify_data);
1166
1167                 if (b)
1168                         s->inotify.inode_data->inotify_data->n_pending ++;
1169                 else {
1170                         assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1171                         s->inotify.inode_data->inotify_data->n_pending --;
1172                 }
1173         }
1174
1175         return 1;
1176 }
1177
1178 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1179
1180         /* Let's allocate exactly what we need. Note that the difference of the smallest event source
1181          * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
1182          * lines. */
1183         static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
1184                 [SOURCE_IO]                  = endoffsetof_field(sd_event_source, io),
1185                 [SOURCE_TIME_REALTIME]       = endoffsetof_field(sd_event_source, time),
1186                 [SOURCE_TIME_BOOTTIME]       = endoffsetof_field(sd_event_source, time),
1187                 [SOURCE_TIME_MONOTONIC]      = endoffsetof_field(sd_event_source, time),
1188                 [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1189                 [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1190                 [SOURCE_SIGNAL]              = endoffsetof_field(sd_event_source, signal),
1191                 [SOURCE_CHILD]               = endoffsetof_field(sd_event_source, child),
1192                 [SOURCE_DEFER]               = endoffsetof_field(sd_event_source, defer),
1193                 [SOURCE_POST]                = endoffsetof_field(sd_event_source, post),
1194                 [SOURCE_EXIT]                = endoffsetof_field(sd_event_source, exit),
1195                 [SOURCE_INOTIFY]             = endoffsetof_field(sd_event_source, inotify),
1196                 [SOURCE_MEMORY_PRESSURE]     = endoffsetof_field(sd_event_source, memory_pressure),
1197         };
1198
1199         sd_event_source *s;
1200
1201         assert(e);
1202         assert(type >= 0);
1203         assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
1204         assert(size_table[type] > 0);
1205
1206         /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
1207          * size, even if we only allocate the initial part we need. */
1208         s = expand_to_usable(malloc0(size_table[type]), sizeof(sd_event_source));
1209         if (!s)
1210                 return NULL;
1211
1212         /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
1213          * than what we allocated here. */
1214         s->n_ref = 1;
1215         s->event = e;
1216         s->floating = floating;
1217         s->type = type;
1218         s->pending_index = PRIOQ_IDX_NULL;
1219         s->prepare_index = PRIOQ_IDX_NULL;
1220
1221         if (!floating)
1222                 sd_event_ref(e);
1223
1224         LIST_PREPEND(sources, e->sources, s);
1225         e->n_sources++;
1226
1227         return s;
1228 }
1229
1230 static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1231         assert(s);
1232
1233         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1234 }
1235
1236 _public_ int sd_event_add_io(
1237                 sd_event *e,
1238                 sd_event_source **ret,
1239                 int fd,
1240                 uint32_t events,
1241                 sd_event_io_handler_t callback,
1242                 void *userdata) {
1243
1244         _cleanup_(source_freep) sd_event_source *s = NULL;
1245         int r;
1246
1247         assert_return(e, -EINVAL);
1248         assert_return(e = event_resolve(e), -ENOPKG);
1249         assert_return(fd >= 0, -EBADF);
1250         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1251         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1252         assert_return(!event_origin_changed(e), -ECHILD);
1253
1254         if (!callback)
1255                 callback = io_exit_callback;
1256
1257         s = source_new(e, !ret, SOURCE_IO);
1258         if (!s)
1259                 return -ENOMEM;
1260
1261         s->wakeup = WAKEUP_EVENT_SOURCE;
1262         s->io.fd = fd;
1263         s->io.events = events;
1264         s->io.callback = callback;
1265         s->userdata = userdata;
1266         s->enabled = SD_EVENT_ON;
1267
1268         r = source_io_register(s, s->enabled, events);
1269         if (r < 0)
1270                 return r;
1271
1272         if (ret)
1273                 *ret = s;
1274         TAKE_PTR(s);
1275
1276         return 0;
1277 }
1278
1279 static void initialize_perturb(sd_event *e) {
1280         sd_id128_t id = {};
1281
1282         /* When we sleep for longer, we try to realign the wakeup to the same time within each
1283          * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
1284          * wakeup. However, let's take some system-specific randomness for this value, so that in a network
1285          * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
1286          * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */
1287
1288         if (_likely_(e->perturb != USEC_INFINITY))
1289                 return;
1290
1291         if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
1292                 e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
1293         else
1294                 e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
1295 }
1296
1297 static int event_setup_timer_fd(
1298                 sd_event *e,
1299                 struct clock_data *d,
1300                 clockid_t clock) {
1301
1302         assert(e);
1303         assert(d);
1304
1305         if (_likely_(d->fd >= 0))
1306                 return 0;
1307
1308         _cleanup_close_ int fd = -EBADF;
1309
1310         fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1311         if (fd < 0)
1312                 return -errno;
1313
1314         fd = fd_move_above_stdio(fd);
1315
1316         struct epoll_event ev = {
1317                 .events = EPOLLIN,
1318                 .data.ptr = d,
1319         };
1320
1321         if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1322                 return -errno;
1323
1324         d->fd = TAKE_FD(fd);
1325         return 0;
1326 }
1327
1328 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1329         assert(s);
1330
1331         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1332 }
1333
1334 static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1335         int r;
1336
1337         assert(d);
1338
1339         if (d->fd < 0) {
1340                 r = event_setup_timer_fd(e, d, clock);
1341                 if (r < 0)
1342                         return r;
1343         }
1344
1345         r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1346         if (r < 0)
1347                 return r;
1348
1349         r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1350         if (r < 0)
1351                 return r;
1352
1353         return 0;
1354 }
1355
1356 static int event_source_time_prioq_put(
1357                 sd_event_source *s,
1358                 struct clock_data *d) {
1359
1360         int r;
1361
1362         assert(s);
1363         assert(d);
1364         assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1365
1366         r = prioq_put(d->earliest, s, &s->earliest_index);
1367         if (r < 0)
1368                 return r;
1369
1370         r = prioq_put(d->latest, s, &s->latest_index);
1371         if (r < 0) {
1372                 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1373                 s->earliest_index = PRIOQ_IDX_NULL;
1374                 return r;
1375         }
1376
1377         d->needs_rearm = true;
1378         return 0;
1379 }
1380
1381 _public_ int sd_event_add_time(
1382                 sd_event *e,
1383                 sd_event_source **ret,
1384                 clockid_t clock,
1385                 uint64_t usec,
1386                 uint64_t accuracy,
1387                 sd_event_time_handler_t callback,
1388                 void *userdata) {
1389
1390         EventSourceType type;
1391         _cleanup_(source_freep) sd_event_source *s = NULL;
1392         struct clock_data *d;
1393         int r;
1394
1395         assert_return(e, -EINVAL);
1396         assert_return(e = event_resolve(e), -ENOPKG);
1397         assert_return(accuracy != UINT64_MAX, -EINVAL);
1398         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1399         assert_return(!event_origin_changed(e), -ECHILD);
1400
1401         if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1402                 return -EOPNOTSUPP;
1403
1404         type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1405         if (type < 0)
1406                 return -EOPNOTSUPP;
1407
1408         if (!callback)
1409                 callback = time_exit_callback;
1410
1411         assert_se(d = event_get_clock_data(e, type));
1412
1413         r = setup_clock_data(e, d, clock);
1414         if (r < 0)
1415                 return r;
1416
1417         s = source_new(e, !ret, type);
1418         if (!s)
1419                 return -ENOMEM;
1420
1421         s->time.next = usec;
1422         s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1423         s->time.callback = callback;
1424         s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1425         s->userdata = userdata;
1426         s->enabled = SD_EVENT_ONESHOT;
1427
1428         r = event_source_time_prioq_put(s, d);
1429         if (r < 0)
1430                 return r;
1431
1432         if (ret)
1433                 *ret = s;
1434         TAKE_PTR(s);
1435
1436         return 0;
1437 }
1438
1439 _public_ int sd_event_add_time_relative(
1440                 sd_event *e,
1441                 sd_event_source **ret,
1442                 clockid_t clock,
1443                 uint64_t usec,
1444                 uint64_t accuracy,
1445                 sd_event_time_handler_t callback,
1446                 void *userdata) {
1447
1448         usec_t t;
1449         int r;
1450
1451         /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1452          * checks for overflow. */
1453
1454         r = sd_event_now(e, clock, &t);
1455         if (r < 0)
1456                 return r;
1457
1458         if (usec >= USEC_INFINITY - t)
1459                 return -EOVERFLOW;
1460
1461         return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1462 }
1463
1464 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1465         assert(s);
1466
1467         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1468 }
1469
/* Adds an event source for the signal 'sig'. If SD_EVENT_SIGNAL_PROCMASK is ORed into 'sig' the signal is
 * blocked via pthread_sigmask() here; otherwise the signal must already be blocked, and -EBUSY is
 * returned if it is not. Only one event source per signal may exist in an event loop, a second
 * registration fails with -EBUSY too. If 'callback' is NULL a default handler is installed that exits the
 * event loop with 'userdata' as exit code. If 'ret' is NULL the source is created "floating"; otherwise
 * the new source is returned in it. The source starts out enabled (SD_EVENT_ON), with the signal name as
 * description. Returns 0 on success, a negative errno-style error otherwise. */
_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        sigset_t new_ss;
        bool block_it;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        /* Let's make sure our special flag stays outside of the valid signal range */
        assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);

        if (sig & SD_EVENT_SIGNAL_PROCMASK) {
                /* Strip the flag, so that from here on 'sig' is a plain signal number */
                sig &= ~SD_EVENT_SIGNAL_PROCMASK;
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                block_it = true;
        } else {
                assert_return(SIGNAL_VALID(sig), -EINVAL);

                /* Without the flag it's the caller's job to have blocked the signal beforehand */
                r = signal_is_blocked(sig);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;

                block_it = false;
        }

        if (!callback)
                callback = signal_exit_callback;

        /* The per-signal lookup table is allocated lazily, on first use */
        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        /* From here on the cleanup handler (via source_disconnect()) clears this slot again on failure */
        e->signal_sources[sig] = s;

        if (block_it) {
                sigset_t old_ss;

                if (sigemptyset(&new_ss) < 0)
                        return -errno;

                if (sigaddset(&new_ss, sig) < 0)
                        return -errno;

                /* pthread_sigmask() returns the error number directly, not via errno */
                r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
                if (r != 0)
                        return -r;

                r = sigismember(&old_ss, sig);
                if (r < 0)
                        return -errno;

                /* Remember to unblock the signal later only if it was us who blocked it */
                s->signal.unblock = !r;
        } else
                s->signal.unblock = false;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0) {
                /* new_ss is only read here if s->signal.unblock is set, i.e. if it was initialized above */
                if (s->signal.unblock)
                        (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);

                return r;
        }

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
1567
1568 static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1569         assert(s);
1570
1571         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1572 }
1573
/* Tells whether pidfds shall be used for watching child processes, as controlled by the $SYSTEMD_PIDFD
 * environment variable. Only a lookup result of exactly zero turns pidfd use off, any other result
 * (positive or negative) leaves it on. This is mostly relevant for debugging: test-event.c uses it to
 * test the event loop once with and once without pidfd. */
static bool shall_use_pidfd(void) {
        int r;

        r = getenv_bool_secure("SYSTEMD_PIDFD");
        return r != 0;
}
1578
/* Adds an event source watching the child process 'pid' for the state changes selected in 'options' (a
 * non-empty combination of WEXITED, WSTOPPED, WCONTINUED). Only one source per PID may exist in an event
 * loop (-EBUSY otherwise). If 'callback' is NULL a default handler is installed that exits the event loop
 * with 'userdata' as exit code. If 'ret' is NULL the source is created "floating"; otherwise the new
 * source is returned in it. The source starts out as SD_EVENT_ONESHOT. Returns 0 on success, a negative
 * errno-style error otherwise. */
_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd and because we don't want to reap the child processes
                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
                 * take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        /* Note that s->child.pid is deliberately left unset for now, see the end of this function */
        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
         * pin the PID, and make regular waitid() handling race-free. */

        if (shall_use_pidfd()) {
                s->child.pidfd = pidfd_open(pid, 0);
                if (s->child.pidfd < 0) {
                        /* Propagate errors unless the syscall is not supported or blocked */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
        } else
                s->child.pidfd = -EBADF;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We have a pidfd and we only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We have no pidfd or we shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. (On the error paths above the cleanup handler
         * runs source_disconnect(), which for a source with a PID set decrements the online counter
         * and removes the hashmap entry, i.e. would undo things that have not been done yet.) */
        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}
1674
1675 _public_ int sd_event_add_child_pidfd(
1676                 sd_event *e,
1677                 sd_event_source **ret,
1678                 int pidfd,
1679                 int options,
1680                 sd_event_child_handler_t callback,
1681                 void *userdata) {
1682
1683
1684         _cleanup_(source_freep) sd_event_source *s = NULL;
1685         pid_t pid;
1686         int r;
1687
1688         assert_return(e, -EINVAL);
1689         assert_return(e = event_resolve(e), -ENOPKG);
1690         assert_return(pidfd >= 0, -EBADF);
1691         assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1692         assert_return(options != 0, -EINVAL);
1693         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1694         assert_return(!event_origin_changed(e), -ECHILD);
1695
1696         if (!callback)
1697                 callback = child_exit_callback;
1698
1699         if (e->n_online_child_sources == 0) {
1700                 r = signal_is_blocked(SIGCHLD);
1701                 if (r < 0)
1702                         return r;
1703                 if (r == 0)
1704                         return -EBUSY;
1705         }
1706
1707         r = hashmap_ensure_allocated(&e->child_sources, NULL);
1708         if (r < 0)
1709                 return r;
1710
1711         r = pidfd_get_pid(pidfd, &pid);
1712         if (r < 0)
1713                 return r;
1714
1715         if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1716                 return -EBUSY;
1717
1718         s = source_new(e, !ret, SOURCE_CHILD);
1719         if (!s)
1720                 return -ENOMEM;
1721
1722         s->wakeup = WAKEUP_EVENT_SOURCE;
1723         s->child.pidfd = pidfd;
1724         s->child.pid = pid;
1725         s->child.options = options;
1726         s->child.callback = callback;
1727         s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1728         s->userdata = userdata;
1729         s->enabled = SD_EVENT_ONESHOT;
1730
1731         r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1732         if (r < 0)
1733                 return r;
1734
1735         if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1736                 /* We only want to watch for WEXITED */
1737                 r = source_child_pidfd_register(s, s->enabled);
1738                 if (r < 0)
1739                         return r;
1740         } else {
1741                 /* We shall wait for some other event than WEXITED */
1742                 r = event_make_signal_data(e, SIGCHLD, NULL);
1743                 if (r < 0)
1744                         return r;
1745
1746                 e->need_process_child = true;
1747         }
1748
1749         e->n_online_child_sources++;
1750
1751         if (ret)
1752                 *ret = s;
1753         TAKE_PTR(s);
1754         return 0;
1755 }
1756
1757 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1758         assert(s);
1759
1760         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1761 }
1762
1763 _public_ int sd_event_add_defer(
1764                 sd_event *e,
1765                 sd_event_source **ret,
1766                 sd_event_handler_t callback,
1767                 void *userdata) {
1768
1769         _cleanup_(source_freep) sd_event_source *s = NULL;
1770         int r;
1771
1772         assert_return(e, -EINVAL);
1773         assert_return(e = event_resolve(e), -ENOPKG);
1774         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1775         assert_return(!event_origin_changed(e), -ECHILD);
1776
1777         if (!callback)
1778                 callback = generic_exit_callback;
1779
1780         s = source_new(e, !ret, SOURCE_DEFER);
1781         if (!s)
1782                 return -ENOMEM;
1783
1784         s->defer.callback = callback;
1785         s->userdata = userdata;
1786         s->enabled = SD_EVENT_ONESHOT;
1787
1788         r = source_set_pending(s, true);
1789         if (r < 0)
1790                 return r;
1791
1792         if (ret)
1793                 *ret = s;
1794         TAKE_PTR(s);
1795
1796         return 0;
1797 }
1798
1799 _public_ int sd_event_add_post(
1800                 sd_event *e,
1801                 sd_event_source **ret,
1802                 sd_event_handler_t callback,
1803                 void *userdata) {
1804
1805         _cleanup_(source_freep) sd_event_source *s = NULL;
1806         int r;
1807
1808         assert_return(e, -EINVAL);
1809         assert_return(e = event_resolve(e), -ENOPKG);
1810         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1811         assert_return(!event_origin_changed(e), -ECHILD);
1812
1813         if (!callback)
1814                 callback = generic_exit_callback;
1815
1816         s = source_new(e, !ret, SOURCE_POST);
1817         if (!s)
1818                 return -ENOMEM;
1819
1820         s->post.callback = callback;
1821         s->userdata = userdata;
1822         s->enabled = SD_EVENT_ON;
1823
1824         r = set_ensure_put(&e->post_sources, NULL, s);
1825         if (r < 0)
1826                 return r;
1827         assert(r > 0);
1828
1829         if (ret)
1830                 *ret = s;
1831         TAKE_PTR(s);
1832
1833         return 0;
1834 }
1835
1836 _public_ int sd_event_add_exit(
1837                 sd_event *e,
1838                 sd_event_source **ret,
1839                 sd_event_handler_t callback,
1840                 void *userdata) {
1841
1842         _cleanup_(source_freep) sd_event_source *s = NULL;
1843         int r;
1844
1845         assert_return(e, -EINVAL);
1846         assert_return(e = event_resolve(e), -ENOPKG);
1847         assert_return(callback, -EINVAL);
1848         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1849         assert_return(!event_origin_changed(e), -ECHILD);
1850
1851         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1852         if (r < 0)
1853                 return r;
1854
1855         s = source_new(e, !ret, SOURCE_EXIT);
1856         if (!s)
1857                 return -ENOMEM;
1858
1859         s->exit.callback = callback;
1860         s->userdata = userdata;
1861         s->exit.prioq_index = PRIOQ_IDX_NULL;
1862         s->enabled = SD_EVENT_ONESHOT;
1863
1864         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1865         if (r < 0)
1866                 return r;
1867
1868         if (ret)
1869                 *ret = s;
1870         TAKE_PTR(s);
1871
1872         return 0;
1873 }
1874
1875 _public_ int sd_event_trim_memory(void) {
1876         int r;
1877
1878         /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1879          * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1880          * NULL callback parameter. */
1881
1882         log_debug("Memory pressure event, trimming malloc() memory.");
1883
1884 #if HAVE_GENERIC_MALLINFO
1885         generic_mallinfo before_mallinfo = generic_mallinfo_get();
1886 #endif
1887
1888         usec_t before_timestamp = now(CLOCK_MONOTONIC);
1889         hashmap_trim_pools();
1890         r = malloc_trim(0);
1891         usec_t after_timestamp = now(CLOCK_MONOTONIC);
1892
1893         if (r > 0)
1894                 log_debug("Successfully trimmed some memory.");
1895         else
1896                 log_debug("Couldn't trim any memory.");
1897
1898         usec_t period = after_timestamp - before_timestamp;
1899
1900 #if HAVE_GENERIC_MALLINFO
1901         generic_mallinfo after_mallinfo = generic_mallinfo_get();
1902         size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1903                 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1904         log_struct(LOG_DEBUG,
1905                    LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1906                                FORMAT_TIMESPAN(period, 0),
1907                                FORMAT_BYTES(l)),
1908                    "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1909                    "TRIMMED_BYTES=%zu", l,
1910                    "TRIMMED_USEC=" USEC_FMT, period);
1911 #else
1912         log_struct(LOG_DEBUG,
1913                    LOG_MESSAGE("Memory trimming took %s.",
1914                                FORMAT_TIMESPAN(period, 0)),
1915                    "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1916                    "TRIMMED_USEC=" USEC_FMT, period);
1917 #endif
1918
1919         return 0;
1920 }
1921
1922 static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1923         assert(s);
1924
1925         sd_event_trim_memory();
1926         return 0;
1927 }
1928
/* Adds an event source that triggers on memory pressure.
 *
 * What to watch is determined as follows: if $MEMORY_PRESSURE_WATCH is set it names the inode to watch
 * (and $MEMORY_PRESSURE_WRITE, if set, carries a base64 encoded blob to write into it first). Otherwise
 * the PSI memory pressure file of the cgroup (unified hierarchy) or /proc/pressure/memory is used, with
 * a default threshold string as the data to write.
 *
 * e        - event loop to add the source to (passed through event_resolve() first)
 * ret      - where to store the new event source; may be NULL
 * callback - handler to invoke; if NULL, memory_pressure_callback is used
 * userdata - opaque pointer stored in the event source
 *
 * Returns 0 on success, or a negative errno-style value, among them: -EHOSTDOWN if the logic is disabled
 * via the environment, -EBADMSG if $MEMORY_PRESSURE_WATCH is not an absolute, normalized path,
 * -EOPNOTSUPP if pressure information is not available, -ENOTTY if the path is a regular file that is
 * on neither procfs nor cgroupfs, -EISDIR for directories and -EBADF for any other unsupported inode
 * type. */
_public_ int sd_event_add_memory_pressure(
                sd_event *e,
                sd_event_source **ret,
                sd_event_handler_t callback,
                void *userdata) {

        _cleanup_free_ char *w = NULL;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
        _cleanup_free_ void *write_buffer = NULL;
        const char *watch, *watch_fallback = NULL, *env;
        size_t write_buffer_size = 0;
        struct stat st;
        uint32_t events;
        bool locked;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = memory_pressure_callback;

        s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->memory_pressure.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;
        s->memory_pressure.fd = -EBADF; /* no fd yet; it is only filled in once everything below succeeded */

        env = secure_getenv("MEMORY_PRESSURE_WATCH");
        if (env) {
                /* The environment tells us what to watch: an empty string or /dev/null turns the logic off */
                if (isempty(env) || path_equal(env, "/dev/null"))
                        return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
                                               "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");

                if (!path_is_absolute(env) || !path_is_normalized(env))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);

                watch = env;

                /* Optional, base64 encoded data to write to the watched file */
                env = secure_getenv("MEMORY_PRESSURE_WRITE");
                if (env) {
                        r = unbase64mem(env, SIZE_MAX, &write_buffer, &write_buffer_size);
                        if (r < 0)
                                return r;
                }

                /* The configuration came from the environment; this is recorded in the event source below */
                locked = true;
        } else {

                r = is_pressure_supported();
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EOPNOTSUPP;

                /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
                 * the system wide pressure if for some reason we cannot (which could be: memory controller
                 * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
                 * only use the system-wide logic. */
                r = cg_all_unified();
                if (r < 0)
                        return r;
                if (r == 0)
                        watch = "/proc/pressure/memory";
                else {
                        _cleanup_free_ char *cg = NULL;

                        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
                        if (r < 0)
                                return r;

                        w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
                        if (!w)
                                return -ENOMEM;

                        watch = w;
                        watch_fallback = "/proc/pressure/memory";
                }

                /* Android uses three levels in its userspace low memory killer logic:
                 *     some  70000 1000000
                 *     some 100000 1000000
                 *     full  70000 1000000
                 *
                 * GNOME's low memory monitor uses:
                 *     some  70000 1000000
                 *     some 100000 1000000
                 *     full 100000 1000000
                 *
                 * We'll default to the middle level that both agree on. Except we do it on a 2s window
                 * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration the
                 * kernel will allow us to do unprivileged, also in the future. */
                if (asprintf((char**) &write_buffer,
                             "%s " USEC_FMT " " USEC_FMT,
                             MEMORY_PRESSURE_DEFAULT_TYPE,
                             MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
                             MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
                        return -ENOMEM;

                /* Note that the size includes the trailing NUL byte */
                write_buffer_size = strlen(write_buffer) + 1;
                locked = false;
        }

        /* First open the inode with O_PATH only, so that we can check what kind of inode it is before we
         * open it for real. */
        path_fd = open(watch, O_PATH|O_CLOEXEC);
        if (path_fd < 0) {
                if (errno != ENOENT)
                        return -errno;

                /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
                 * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
                 * the PSI service apparently is not supported) */
                if (!watch_fallback)
                        return locked ? -ENOENT : -EOPNOTSUPP;

                path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
                if (path_fd < 0) {
                        if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
                                return -EOPNOTSUPP;
                        return -errno;
                }
        }

        if (fstat(path_fd, &st) < 0)
                return -errno;

        if (S_ISSOCK(st.st_mode)) {
                /* A socket inode: connect a stream socket to it and wait for it to become readable */
                fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
                if (fd < 0)
                        return -errno;

                r = connect_unix_path(fd, path_fd, NULL);
                if (r < 0)
                        return r;

                events = EPOLLIN;

        } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
                /* Open for writing too only if we actually have something to write */
                fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
                if (fd < 0)
                        return fd;

                if (S_ISREG(st.st_mode)) {
                        struct statfs sfs;

                        /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */

                        if (fstatfs(fd, &sfs) < 0)
                                return -errno;

                        if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
                            !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
                                return -ENOTTY;

                        events = EPOLLPRI;
                } else
                        /* For fifos and char devices just watch for EPOLLIN */
                        events = EPOLLIN;

        } else if (S_ISDIR(st.st_mode))
                return -EISDIR;
        else
                return -EBADF;

        /* Hand the fd and the write buffer over to the event source; the local cleanup handlers are
         * disarmed by TAKE_FD()/TAKE_PTR(). */
        s->memory_pressure.fd = TAKE_FD(fd);
        s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
        s->memory_pressure.write_buffer_size = write_buffer_size;
        s->memory_pressure.events = events;
        s->memory_pressure.locked = locked;

        /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
         * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
         * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
         * event sources on which writes must be executed before the first event loop iteration is
         * executed. (We could also write the data here, right away, but we want to give the caller the
         * freedom to call sd_event_source_set_memory_pressure_type() and
         * sd_event_source_set_memory_pressure_rate() before we write it.) */

        if (s->memory_pressure.write_buffer_size > 0)
                source_memory_pressure_add_to_write_list(s);
        else {
                /* Nothing to write, hence we can register the fd right away */
                r = source_memory_pressure_register(s, s->enabled);
                if (r < 0)
                        return r;
        }

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
2128
2129 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2130         assert(e);
2131
2132         if (!d)
2133                 return;
2134
2135         assert(hashmap_isempty(d->inodes));
2136         assert(hashmap_isempty(d->wd));
2137
2138         if (d->buffer_filled > 0)
2139                 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
2140
2141         hashmap_free(d->inodes);
2142         hashmap_free(d->wd);
2143
2144         assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2145
2146         if (d->fd >= 0) {
2147                 if (!event_origin_changed(e) &&
2148                     epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
2149                         log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2150
2151                 safe_close(d->fd);
2152         }
2153         free(d);
2154 }
2155
2156 static int event_make_inotify_data(
2157                 sd_event *e,
2158                 int64_t priority,
2159                 struct inotify_data **ret) {
2160
2161         _cleanup_close_ int fd = -EBADF;
2162         struct inotify_data *d;
2163         int r;
2164
2165         assert(e);
2166
2167         d = hashmap_get(e->inotify_data, &priority);
2168         if (d) {
2169                 if (ret)
2170                         *ret = d;
2171                 return 0;
2172         }
2173
2174         fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC);
2175         if (fd < 0)
2176                 return -errno;
2177
2178         fd = fd_move_above_stdio(fd);
2179
2180         d = new(struct inotify_data, 1);
2181         if (!d)
2182                 return -ENOMEM;
2183
2184         *d = (struct inotify_data) {
2185                 .wakeup = WAKEUP_INOTIFY_DATA,
2186                 .fd = TAKE_FD(fd),
2187                 .priority = priority,
2188         };
2189
2190         r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
2191         if (r < 0) {
2192                 d->fd = safe_close(d->fd);
2193                 free(d);
2194                 return r;
2195         }
2196
2197         struct epoll_event ev = {
2198                 .events = EPOLLIN,
2199                 .data.ptr = d,
2200         };
2201
2202         if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2203                 r = -errno;
2204                 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2205                                             * remove the fd from the epoll first, which we don't want as we couldn't
2206                                             * add it in the first place. */
2207                 event_free_inotify_data(e, d);
2208                 return r;
2209         }
2210
2211         if (ret)
2212                 *ret = d;
2213
2214         return 1;
2215 }
2216
2217 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
2218         int r;
2219
2220         assert(x);
2221         assert(y);
2222
2223         r = CMP(x->dev, y->dev);
2224         if (r != 0)
2225                 return r;
2226
2227         return CMP(x->ino, y->ino);
2228 }
2229
2230 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2231         assert(d);
2232
2233         siphash24_compress(&d->dev, sizeof(d->dev), state);
2234         siphash24_compress(&d->ino, sizeof(d->ino), state);
2235 }
2236
/* Hash operations for hashmaps keyed by struct inode_data, combining inode_data_hash_func() and
 * inode_data_compare(). Used for the per-inotify-object "inodes" hashmap. */
DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
2238
2239 static void event_free_inode_data(
2240                 sd_event *e,
2241                 struct inode_data *d) {
2242
2243         assert(e);
2244
2245         if (!d)
2246                 return;
2247
2248         assert(!d->event_sources);
2249
2250         if (d->fd >= 0) {
2251                 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
2252                 safe_close(d->fd);
2253         }
2254
2255         if (d->inotify_data) {
2256
2257                 if (d->wd >= 0) {
2258                         if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
2259                                 /* So here's a problem. At the time this runs the watch descriptor might already be
2260                                  * invalidated, because an IN_IGNORED event might be queued right the moment we enter
2261                                  * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
2262                                  * likely case to happen. */
2263
2264                                 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2265                                         log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2266                         }
2267
2268                         assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2269                 }
2270
2271                 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2272         }
2273
2274         free(d);
2275 }
2276
2277 static void event_gc_inotify_data(
2278                 sd_event *e,
2279                 struct inotify_data *d) {
2280
2281         assert(e);
2282
2283         /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2284          * any inode with it anymore, which in turn happens if no event source of this priority is interested
2285          * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2286          * (under the expectation that the GC is called again once the counter is decremented). */
2287
2288         if (!d)
2289                 return;
2290
2291         if (!hashmap_isempty(d->inodes))
2292                 return;
2293
2294         if (d->n_busy > 0)
2295                 return;
2296
2297         event_free_inotify_data(e, d);
2298 }
2299
2300 static void event_gc_inode_data(
2301                 sd_event *e,
2302                 struct inode_data *d) {
2303
2304         struct inotify_data *inotify_data;
2305
2306         assert(e);
2307
2308         if (!d)
2309                 return;
2310
2311         if (d->event_sources)
2312                 return;
2313
2314         inotify_data = d->inotify_data;
2315         event_free_inode_data(e, d);
2316
2317         event_gc_inotify_data(e, inotify_data);
2318 }
2319
2320 static int event_make_inode_data(
2321                 sd_event *e,
2322                 struct inotify_data *inotify_data,
2323                 dev_t dev,
2324                 ino_t ino,
2325                 struct inode_data **ret) {
2326
2327         struct inode_data *d, key;
2328         int r;
2329
2330         assert(e);
2331         assert(inotify_data);
2332
2333         key = (struct inode_data) {
2334                 .ino = ino,
2335                 .dev = dev,
2336         };
2337
2338         d = hashmap_get(inotify_data->inodes, &key);
2339         if (d) {
2340                 if (ret)
2341                         *ret = d;
2342
2343                 return 0;
2344         }
2345
2346         r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2347         if (r < 0)
2348                 return r;
2349
2350         d = new(struct inode_data, 1);
2351         if (!d)
2352                 return -ENOMEM;
2353
2354         *d = (struct inode_data) {
2355                 .dev = dev,
2356                 .ino = ino,
2357                 .wd = -1,
2358                 .fd = -EBADF,
2359                 .inotify_data = inotify_data,
2360         };
2361
2362         r = hashmap_put(inotify_data->inodes, d, d);
2363         if (r < 0) {
2364                 free(d);
2365                 return r;
2366         }
2367
2368         if (ret)
2369                 *ret = d;
2370
2371         return 1;
2372 }
2373
2374 static uint32_t inode_data_determine_mask(struct inode_data *d) {
2375         bool excl_unlink = true;
2376         uint32_t combined = 0;
2377
2378         assert(d);
2379
2380         /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2381          * the IN_EXCL_UNLINK flag is ANDed instead.
2382          *
2383          * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
2384          * because we cannot change the mask anymore after the event source was created once, since the kernel has no
2385          * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
2386          * events we don't care for client-side. */
2387
2388         LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2389
2390                 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2391                         excl_unlink = false;
2392
2393                 combined |= s->inotify.mask;
2394         }
2395
2396         return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2397 }
2398
2399 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2400         uint32_t combined_mask;
2401         int wd, r;
2402
2403         assert(d);
2404         assert(d->fd >= 0);
2405
2406         combined_mask = inode_data_determine_mask(d);
2407
2408         if (d->wd >= 0 && combined_mask == d->combined_mask)
2409                 return 0;
2410
2411         r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2412         if (r < 0)
2413                 return r;
2414
2415         wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2416         if (wd < 0)
2417                 return -errno;
2418
2419         if (d->wd < 0) {
2420                 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2421                 if (r < 0) {
2422                         (void) inotify_rm_watch(d->inotify_data->fd, wd);
2423                         return r;
2424                 }
2425
2426                 d->wd = wd;
2427
2428         } else if (d->wd != wd) {
2429
2430                 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2431                 (void) inotify_rm_watch(d->fd, wd);
2432                 return -EINVAL;
2433         }
2434
2435         d->combined_mask = combined_mask;
2436         return 1;
2437 }
2438
2439 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2440         assert(s);
2441
2442         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2443 }
2444
/* Common implementation of the inotify event source constructors: adds an inotify event source for the
 * inode the specified fd refers to.
 *
 * e        - event loop to add the source to (passed through event_resolve() first)
 * ret      - where to store the new event source; may be NULL
 * fd       - fd referring to the inode to watch
 * donate   - if true the fd is taken over: it is either stored in the inode data object or closed
 *            when the function returns (via the _cleanup_close_ handler), on success as well as on
 *            failure. If false the fd is duplicated when needed and the original is left untouched.
 * mask     - inotify mask; IN_MASK_ADD is refused, IN_ONESHOT makes the source one-shot
 * callback - handler to invoke; if NULL, inotify_exit_callback is used
 * userdata - opaque pointer stored in the event source
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int event_add_inotify_fd_internal(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                bool donate,
                uint32_t mask,
                sd_event_inotify_handler_t callback,
                void *userdata) {

        _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct inotify_data *inotify_data = NULL;
        struct inode_data *inode_data = NULL;
        struct stat st;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(e), -ECHILD);

        if (!callback)
                callback = inotify_exit_callback;

        /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
         * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
         * the user can't use them for us. */
        if (mask & IN_MASK_ADD)
                return -EINVAL;

        /* We need the device and inode number to find out whether we are watching this inode already */
        if (fstat(fd, &st) < 0)
                return -errno;

        s = source_new(e, !ret, SOURCE_INOTIFY);
        if (!s)
                return -ENOMEM;

        s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
        s->inotify.mask = mask;
        s->inotify.callback = callback;
        s->userdata = userdata;

        /* Allocate an inotify object for this priority, and an inode object within it */
        r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
        if (r < 0)
                return r;

        r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
        if (r < 0) {
                /* The inotify object might have been created just for us, release it again if unused */
                event_gc_inotify_data(e, inotify_data);
                return r;
        }

        /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
         * the event source, until then, for which we need the original inode. */
        if (inode_data->fd < 0) {
                if (donated_fd >= 0)
                        inode_data->fd = TAKE_FD(donated_fd);
                else {
                        inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
                        if (inode_data->fd < 0) {
                                r = -errno;
                                /* Our source is not linked to the inode object yet, hence release it explicitly */
                                event_gc_inode_data(e, inode_data);
                                return r;
                        }
                }

                LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
        }

        /* Link our event source to the inode data object */
        LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
        s->inotify.inode_data = inode_data;

        /* Actually realize the watch now */
        r = inode_data_realize_watch(e, inode_data);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
2531
2532 _public_ int sd_event_add_inotify_fd(
2533                 sd_event *e,
2534                 sd_event_source **ret,
2535                 int fd,
2536                 uint32_t mask,
2537                 sd_event_inotify_handler_t callback,
2538                 void *userdata) {
2539
2540         return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2541 }
2542
2543 _public_ int sd_event_add_inotify(
2544                 sd_event *e,
2545                 sd_event_source **ret,
2546                 const char *path,
2547                 uint32_t mask,
2548                 sd_event_inotify_handler_t callback,
2549                 void *userdata) {
2550
2551         sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2552         int fd, r;
2553
2554         assert_return(path, -EINVAL);
2555
2556         fd = open(path, O_PATH | O_CLOEXEC |
2557                         (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2558                         (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2559         if (fd < 0)
2560                 return -errno;
2561
2562         r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2563         if (r < 0)
2564                 return r;
2565
2566         (void) sd_event_source_set_description(s, path);
2567
2568         if (ret)
2569                 *ret = s;
2570
2571         return r;
2572 }
2573
2574 static sd_event_source* event_source_free(sd_event_source *s) {
2575         if (!s)
2576                 return NULL;
2577
2578         /* Here's a special hack: when we are called from a
2579          * dispatch handler we won't free the event source
2580          * immediately, but we will detach the fd from the
2581          * epoll. This way it is safe for the caller to unref
2582          * the event source and immediately close the fd, but
2583          * we still retain a valid event source object after
2584          * the callback. */
2585
2586         if (s->dispatching)
2587                 source_disconnect(s);
2588         else
2589                 source_free(s);
2590
2591         return NULL;
2592 }
2593
/* Instantiates the public reference counting functions for sd_event_source, using event_source_free()
 * as the release function. NOTE(review): presumably this expands to sd_event_source_ref() and
 * sd_event_source_unref() — confirm against the macro definition. */
DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2595
2596 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2597         assert_return(s, -EINVAL);
2598         assert_return(!event_origin_changed(s->event), -ECHILD);
2599
2600         return free_and_strdup(&s->description, description);
2601 }
2602
2603 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2604         assert_return(s, -EINVAL);
2605         assert_return(description, -EINVAL);
2606
2607         if (!s->description)
2608                 return -ENXIO;
2609
2610         *description = s->description;
2611         return 0;
2612 }
2613
2614 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2615         assert_return(s, NULL);
2616         assert_return(!event_origin_changed(s->event), NULL);
2617
2618         return s->event;
2619 }
2620
2621 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2622         assert_return(s, -EINVAL);
2623         assert_return(s->type != SOURCE_EXIT, -EDOM);
2624         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2625         assert_return(!event_origin_changed(s->event), -ECHILD);
2626
2627         return s->pending;
2628 }
2629
2630 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2631         assert_return(s, -EINVAL);
2632         assert_return(s->type == SOURCE_IO, -EDOM);
2633         assert_return(!event_origin_changed(s->event), -ECHILD);
2634
2635         return s->io.fd;
2636 }
2637
2638 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2639         int r;
2640
2641         assert_return(s, -EINVAL);
2642         assert_return(fd >= 0, -EBADF);
2643         assert_return(s->type == SOURCE_IO, -EDOM);
2644         assert_return(!event_origin_changed(s->event), -ECHILD);
2645
2646         if (s->io.fd == fd)
2647                 return 0;
2648
2649         if (event_source_is_offline(s)) {
2650                 s->io.fd = fd;
2651                 s->io.registered = false;
2652         } else {
2653                 int saved_fd;
2654
2655                 saved_fd = s->io.fd;
2656                 assert(s->io.registered);
2657
2658                 s->io.fd = fd;
2659                 s->io.registered = false;
2660
2661                 r = source_io_register(s, s->enabled, s->io.events);
2662                 if (r < 0) {
2663                         s->io.fd = saved_fd;
2664                         s->io.registered = true;
2665                         return r;
2666                 }
2667
2668                 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2669         }
2670
2671         return 0;
2672 }
2673
2674 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2675         assert_return(s, -EINVAL);
2676         assert_return(s->type == SOURCE_IO, -EDOM);
2677         assert_return(!event_origin_changed(s->event), -ECHILD);
2678
2679         return s->io.owned;
2680 }
2681
2682 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2683         assert_return(s, -EINVAL);
2684         assert_return(s->type == SOURCE_IO, -EDOM);
2685         assert_return(!event_origin_changed(s->event), -ECHILD);
2686
2687         s->io.owned = own;
2688         return 0;
2689 }
2690
2691 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2692         assert_return(s, -EINVAL);
2693         assert_return(events, -EINVAL);
2694         assert_return(s->type == SOURCE_IO, -EDOM);
2695         assert_return(!event_origin_changed(s->event), -ECHILD);
2696
2697         *events = s->io.events;
2698         return 0;
2699 }
2700
2701 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2702         int r;
2703
2704         assert_return(s, -EINVAL);
2705         assert_return(s->type == SOURCE_IO, -EDOM);
2706         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2707         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2708         assert_return(!event_origin_changed(s->event), -ECHILD);
2709
2710         /* edge-triggered updates are never skipped, so we can reset edges */
2711         if (s->io.events == events && !(events & EPOLLET))
2712                 return 0;
2713
2714         r = source_set_pending(s, false);
2715         if (r < 0)
2716                 return r;
2717
2718         if (event_source_is_online(s)) {
2719                 r = source_io_register(s, s->enabled, events);
2720                 if (r < 0)
2721                         return r;
2722         }
2723
2724         s->io.events = events;
2725
2726         return 0;
2727 }
2728
2729 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2730         assert_return(s, -EINVAL);
2731         assert_return(revents, -EINVAL);
2732         assert_return(s->type == SOURCE_IO, -EDOM);
2733         assert_return(s->pending, -ENODATA);
2734         assert_return(!event_origin_changed(s->event), -ECHILD);
2735
2736         *revents = s->io.revents;
2737         return 0;
2738 }
2739
2740 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2741         assert_return(s, -EINVAL);
2742         assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2743         assert_return(!event_origin_changed(s->event), -ECHILD);
2744
2745         return s->signal.sig;
2746 }
2747
2748 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2749         assert_return(s, -EINVAL);
2750         assert_return(!event_origin_changed(s->event), -ECHILD);
2751
2752         *priority = s->priority;
2753         return 0;
2754 }
2755
/* Changes the priority of an event source.
 *
 * Three cases are distinguished:
 *
 *   - inotify sources: the source is moved from its current inode data object to one belonging to the
 *     inotify data object of the new priority, and the watch is realized there. On failure the source
 *     is moved back and objects created on the way are released again.
 *   - online signal sources: signal data for the new priority is set up via event_make_signal_data(),
 *     then the signal is unmasked in the signal data of the old priority.
 *   - everything else: only the priority field is updated.
 *
 * Afterwards the pending/prepare priority queues (and for exit sources the exit queue) are reshuffled.
 *
 * Returns 0 on success (also if the priority is unchanged), -EINVAL if s is NULL, -ESTALE if the event
 * loop is finished, -ECHILD if event_origin_changed() reports a changed origin, -EOPNOTSUPP if an
 * inotify source no longer has its original fd, or an error from the helpers called. */
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        /* Track whether the new inotify/inode data objects need to be released again on failure */
        bool rm_inotify = false, rm_inode = false;
        struct inotify_data *new_inotify_data = NULL;
        struct inode_data *new_inode_data = NULL;
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_origin_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_INOTIFY) {
                struct inode_data *old_inode_data;

                assert(s->inotify.inode_data);
                old_inode_data = s->inotify.inode_data;

                /* We need the original fd to change the priority. If we don't have it we can't change the priority,
                 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
                 * events we allow priority changes only until the first following iteration. */
                if (old_inode_data->fd < 0)
                        return -EOPNOTSUPP;

                /* NOTE(review): r > 0 presumably means the object was newly allocated rather than an
                 * existing one being reused — confirm against event_make_inotify_data() */
                r = event_make_inotify_data(s->event, priority, &new_inotify_data);
                if (r < 0)
                        return r;
                rm_inotify = r > 0;

                r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
                if (r < 0)
                        goto fail;
                rm_inode = r > 0;

                if (new_inode_data->fd < 0) {
                        /* Duplicate the fd for the new inode object if we don't have any yet */
                        new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
                        if (new_inode_data->fd < 0) {
                                r = -errno;
                                goto fail;
                        }

                        /* Queue the new inode data object on the event loop's to-close list */
                        LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
                }

                /* Move the event source to the new inode data structure */
                LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
                LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
                s->inotify.inode_data = new_inode_data;

                /* Now create the new watch */
                r = inode_data_realize_watch(s->event, new_inode_data);
                if (r < 0) {
                        /* Move it back */
                        LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
                        LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
                        s->inotify.inode_data = old_inode_data;
                        goto fail;
                }

                s->priority = priority;

                /* The old inode data object lost one user, garbage collect it if appropriate */
                event_gc_inode_data(s->event, old_inode_data);

        } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                /* The priority is updated before event_make_signal_data() is called, and restored
                 * from the old signal data object if that call fails */
                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        s->priority = old->priority;
                        return r;
                }

                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        event_source_pp_prioq_reshuffle(s);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;

fail:
        /* Release only what was flagged above as to be removed */
        if (rm_inode)
                event_free_inode_data(s->event, new_inode_data);

        if (rm_inotify)
                event_free_inotify_data(s->event, new_inotify_data);

        return r;
}
2857
2858 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2859         /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2860         if (!s && !ret)
2861                 return false;
2862
2863         assert_return(s, -EINVAL);
2864         assert_return(!event_origin_changed(s->event), -ECHILD);
2865
2866         if (ret)
2867                 *ret = s->enabled;
2868
2869         return s->enabled != SD_EVENT_OFF;
2870 }
2871
/* Puts an event source into an offline state, i.e. one where it is either disabled or ratelimited (the
 * requested combination must satisfy this, which is asserted).
 *
 * The new enabled/ratelimited values are stored in the source, and then the type-specific runtime
 * resources are released. When the source goes from enabled to SD_EVENT_OFF its pending flag is cleared
 * first (except for defer and exit sources).
 *
 * Returns 1 on success, or the negative error returned by source_set_pending(). */
static int event_source_offline(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        bool was_offline;
        int r;

        assert(s);
        assert(enabled == SD_EVENT_OFF || ratelimited);

        /* Unset the pending flag when this event source is disabled */
        if (s->enabled != SD_EVENT_OFF &&
            enabled == SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        /* Remember the previous state before overwriting it, the child source accounting below
         * depends on it */
        was_offline = event_source_is_offline(s);
        s->enabled = enabled;
        s->ratelimited = ratelimited;

        switch (s->type) {

        case SOURCE_IO:
                source_io_unregister(s);
                break;

        case SOURCE_SIGNAL:
                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                break;

        case SOURCE_CHILD:
                /* Only decrease the counter if the source was actually counted as online before */
                if (!was_offline) {
                        assert(s->event->n_online_child_sources > 0);
                        s->event->n_online_child_sources--;
                }

                /* Child sources are either watched via their pidfd or via SIGCHLD */
                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                break;

        case SOURCE_EXIT:
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_MEMORY_PRESSURE:
                source_memory_pressure_unregister(s);
                break;

        /* Nothing type-specific to tear down for the remaining types */
        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
        case SOURCE_DEFER:
        case SOURCE_POST:
        case SOURCE_INOTIFY:
                break;

        default:
                assert_not_reached();
        }

        /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
        event_source_time_prioq_reshuffle(s);

        return 1;
}
2945
/* Applies a new enabled/ratelimited state to an event source, taking it online if that state permits it.
 *
 * If the requested state is still an offline one (disabled or ratelimited) only the two fields are
 * updated and 0 is returned. Otherwise the type-specific runtime resources are set up first, and the new
 * state is stored only after that succeeded, so that a failure leaves the stored state untouched. When
 * the source goes from SD_EVENT_OFF to an enabled state its pending flag is cleared first (except for
 * defer and exit sources).
 *
 * Returns 1 if the source was taken online, 0 if only the state was updated, or a negative error. */
static int event_source_online(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        bool was_online;
        int r;

        assert(s);
        assert(enabled != SD_EVENT_OFF || !ratelimited);

        /* Unset the pending flag when this event source is enabled */
        if (s->enabled == SD_EVENT_OFF &&
            enabled != SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        /* Are we really ready for onlining? */
        if (enabled == SD_EVENT_OFF || ratelimited) {
                /* Nope, we are not ready for onlining, then just update the precise state and exit */
                s->enabled = enabled;
                s->ratelimited = ratelimited;
                return 0;
        }

        /* Evaluated against the state still stored in the source, needed for the child source
         * accounting below */
        was_online = event_source_is_online(s);

        switch (s->type) {
        case SOURCE_IO:
                r = source_io_register(s, enabled, s->io.events);
                if (r < 0)
                        return r;
                break;

        case SOURCE_SIGNAL:
                r = event_make_signal_data(s->event, s->signal.sig, NULL);
                if (r < 0) {
                        /* Clean up signal data again on failure */
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        return r;
                }

                break;

        case SOURCE_CHILD:
                if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                        /* yes, we have pidfd */

                        r = source_child_pidfd_register(s, enabled);
                        if (r < 0)
                                return r;
                } else {
                        /* no pidfd, or something other to watch for than WEXITED */

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }
                }

                /* Only count the source once, even if we are called for an already online source */
                if (!was_online)
                        s->event->n_online_child_sources++;
                break;

        case SOURCE_MEMORY_PRESSURE:
                r = source_memory_pressure_register(s, enabled);
                if (r < 0)
                        return r;

                break;

        /* Nothing type-specific to set up for the remaining types */
        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
        case SOURCE_EXIT:
        case SOURCE_DEFER:
        case SOURCE_POST:
        case SOURCE_INOTIFY:
                break;

        default:
                assert_not_reached();
        }

        /* Everything that may fail is done, commit the new state */
        s->enabled = enabled;
        s->ratelimited = ratelimited;

        /* Non-failing operations below */
        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
        event_source_time_prioq_reshuffle(s);

        return 1;
}
3047
3048 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3049         int r;
3050
3051         assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
3052
3053         /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3054         if (m == SD_EVENT_OFF && !s)
3055                 return 0;
3056
3057         assert_return(s, -EINVAL);
3058         assert_return(!event_origin_changed(s->event), -ECHILD);
3059
3060         /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3061         if (s->event->state == SD_EVENT_FINISHED)
3062                 return m == SD_EVENT_OFF ? 0 : -ESTALE;
3063
3064         if (s->enabled == m) /* No change? */
3065                 return 0;
3066
3067         if (m == SD_EVENT_OFF)
3068                 r = event_source_offline(s, m, s->ratelimited);
3069         else {
3070                 if (s->enabled != SD_EVENT_OFF) {
3071                         /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3072                          * event source is already enabled after all. */
3073                         s->enabled = m;
3074                         return 0;
3075                 }
3076
3077                 r = event_source_online(s, m, s->ratelimited);
3078         }
3079         if (r < 0)
3080                 return r;
3081
3082         event_source_pp_prioq_reshuffle(s);
3083         return 0;
3084 }
3085
3086 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
3087         assert_return(s, -EINVAL);
3088         assert_return(usec, -EINVAL);
3089         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3090         assert_return(!event_origin_changed(s->event), -ECHILD);
3091
3092         *usec = s->time.next;
3093         return 0;
3094 }
3095
3096 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
3097         int r;
3098
3099         assert_return(s, -EINVAL);
3100         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3101         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3102         assert_return(!event_origin_changed(s->event), -ECHILD);
3103
3104         r = source_set_pending(s, false);
3105         if (r < 0)
3106                 return r;
3107
3108         s->time.next = usec;
3109
3110         event_source_time_prioq_reshuffle(s);
3111         return 0;
3112 }
3113
3114 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3115         usec_t t;
3116         int r;
3117
3118         assert_return(s, -EINVAL);
3119         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3120         assert_return(!event_origin_changed(s->event), -ECHILD);
3121
3122         if (usec == USEC_INFINITY)
3123                 return sd_event_source_set_time(s, USEC_INFINITY);
3124
3125         r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3126         if (r < 0)
3127                 return r;
3128
3129         usec = usec_add(t, usec);
3130         if (usec == USEC_INFINITY)
3131                 return -EOVERFLOW;
3132
3133         return sd_event_source_set_time(s, usec);
3134 }
3135
3136 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
3137         assert_return(s, -EINVAL);
3138         assert_return(usec, -EINVAL);
3139         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3140         assert_return(!event_origin_changed(s->event), -ECHILD);
3141
3142         *usec = s->time.accuracy;
3143         return 0;
3144 }
3145
3146 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
3147         int r;
3148
3149         assert_return(s, -EINVAL);
3150         assert_return(usec != UINT64_MAX, -EINVAL);
3151         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3152         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3153         assert_return(!event_origin_changed(s->event), -ECHILD);
3154
3155         r = source_set_pending(s, false);
3156         if (r < 0)
3157                 return r;
3158
3159         if (usec == 0)
3160                 usec = DEFAULT_ACCURACY_USEC;
3161
3162         s->time.accuracy = usec;
3163
3164         event_source_time_prioq_reshuffle(s);
3165         return 0;
3166 }
3167
3168 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3169         assert_return(s, -EINVAL);
3170         assert_return(clock, -EINVAL);
3171         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3172         assert_return(!event_origin_changed(s->event), -ECHILD);
3173
3174         *clock = event_source_type_to_clock(s->type);
3175         return 0;
3176 }
3177
3178 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
3179         assert_return(s, -EINVAL);
3180         assert_return(pid, -EINVAL);
3181         assert_return(s->type == SOURCE_CHILD, -EDOM);
3182         assert_return(!event_origin_changed(s->event), -ECHILD);
3183
3184         *pid = s->child.pid;
3185         return 0;
3186 }
3187
3188 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3189         assert_return(s, -EINVAL);
3190         assert_return(s->type == SOURCE_CHILD, -EDOM);
3191         assert_return(!event_origin_changed(s->event), -ECHILD);
3192
3193         if (s->child.pidfd < 0)
3194                 return -EOPNOTSUPP;
3195
3196         return s->child.pidfd;
3197 }
3198
3199 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3200         assert_return(s, -EINVAL);
3201         assert_return(s->type == SOURCE_CHILD, -EDOM);
3202         assert_return(!event_origin_changed(s->event), -ECHILD);
3203         assert_return(SIGNAL_VALID(sig), -EINVAL);
3204
3205         /* If we already have seen indication the process exited refuse sending a signal early. This way we
3206          * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3207          * available. */
3208         if (s->child.exited)
3209                 return -ESRCH;
3210
3211         if (s->child.pidfd >= 0) {
3212                 siginfo_t copy;
3213
3214                 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
3215                  * structure here */
3216                 if (si)
3217                         copy = *si;
3218
3219                 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3220                         /* Let's propagate the error only if the system call is not implemented or prohibited */
3221                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3222                                 return -errno;
3223                 } else
3224                         return 0;
3225         }
3226
3227         /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3228          * this here. */
3229         if (flags != 0)
3230                 return -EOPNOTSUPP;
3231
3232         if (si) {
3233                 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3234                 siginfo_t copy = *si;
3235
3236                 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3237                         return -errno;
3238         } else if (kill(s->child.pid, sig) < 0)
3239                 return -errno;
3240
3241         return 0;
3242 }
3243
3244 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3245         assert_return(s, -EINVAL);
3246         assert_return(s->type == SOURCE_CHILD, -EDOM);
3247         assert_return(!event_origin_changed(s->event), -ECHILD);
3248
3249         if (s->child.pidfd < 0)
3250                 return -EOPNOTSUPP;
3251
3252         return s->child.pidfd_owned;
3253 }
3254
3255 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3256         assert_return(s, -EINVAL);
3257         assert_return(s->type == SOURCE_CHILD, -EDOM);
3258         assert_return(!event_origin_changed(s->event), -ECHILD);
3259
3260         if (s->child.pidfd < 0)
3261                 return -EOPNOTSUPP;
3262
3263         s->child.pidfd_owned = own;
3264         return 0;
3265 }
3266
3267 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3268         assert_return(s, -EINVAL);
3269         assert_return(s->type == SOURCE_CHILD, -EDOM);
3270         assert_return(!event_origin_changed(s->event), -ECHILD);
3271
3272         return s->child.process_owned;
3273 }
3274
3275 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3276         assert_return(s, -EINVAL);
3277         assert_return(s->type == SOURCE_CHILD, -EDOM);
3278         assert_return(!event_origin_changed(s->event), -ECHILD);
3279
3280         s->child.process_owned = own;
3281         return 0;
3282 }
3283
3284 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
3285         assert_return(s, -EINVAL);
3286         assert_return(mask, -EINVAL);
3287         assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3288         assert_return(!event_origin_changed(s->event), -ECHILD);
3289
3290         *mask = s->inotify.mask;
3291         return 0;
3292 }
3293
3294 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
3295         int r;
3296
3297         assert_return(s, -EINVAL);
3298         assert_return(s->type != SOURCE_EXIT, -EDOM);
3299         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3300         assert_return(!event_origin_changed(s->event), -ECHILD);
3301
3302         if (s->prepare == callback)
3303                 return 0;
3304
3305         if (callback && s->prepare) {
3306                 s->prepare = callback;
3307                 return 0;
3308         }
3309
3310         r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3311         if (r < 0)
3312                 return r;
3313
3314         s->prepare = callback;
3315
3316         if (callback) {
3317                 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3318                 if (r < 0)
3319                         return r;
3320         } else
3321                 prioq_remove(s->event->prepare, s, &s->prepare_index);
3322
3323         return 0;
3324 }
3325
3326 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
3327         assert_return(s, NULL);
3328         assert_return(!event_origin_changed(s->event), NULL);
3329
3330         return s->userdata;
3331 }
3332
3333 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3334         void *ret;
3335
3336         assert_return(s, NULL);
3337         assert_return(!event_origin_changed(s->event), NULL);
3338
3339         ret = s->userdata;
3340         s->userdata = userdata;
3341
3342         return ret;
3343 }
3344
/* Puts an event source into rate limited state.
 *
 * The source is added to the CLOCK_MONOTONIC priority queues (timer sources are removed from the queues
 * of their own clock first) and then taken offline with the ratelimited flag set, keeping its enabled
 * state. If anything fails, timer sources are put back into the queues of their own clock.
 *
 * Returns 0 on success, also if the source is already rate limited, or a negative error. */
static int event_source_enter_ratelimited(sd_event_source *s) {
        int r;

        assert(s);

        /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
         * the end of the rate limit time window, much as if it was a timer event source. */

        if (s->ratelimited)
                return 0; /* Already ratelimited, this is a NOP hence */

        /* Make sure we can install a CLOCK_MONOTONIC event further down. */
        r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
        if (r < 0)
                return r;

        /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
         * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
         * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));

        /* Now, let's add the event source to the monotonic clock instead */
        r = event_source_time_prioq_put(s, &s->event->monotonic);
        if (r < 0)
                goto fail;

        /* And let's take the event source officially offline */
        r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
        if (r < 0) {
                /* Undo the insertion into the monotonic prioq before restoring the original one below */
                event_source_time_prioq_remove(s, &s->event->monotonic);
                goto fail;
        }

        event_source_pp_prioq_reshuffle(s);

        log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
        return 0;

fail:
        /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
         * space for it should already be allocated. */
        if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);

        return r;
}
3392
3393 static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
3394         int r;
3395
3396         assert(s);
3397
3398         if (!s->ratelimited)
3399                 return 0;
3400
3401         /* Let's take the event source out of the monotonic prioq first. */
3402         event_source_time_prioq_remove(s, &s->event->monotonic);
3403
3404         /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3405         if (EVENT_SOURCE_IS_TIME(s->type)) {
3406                 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3407                 if (r < 0)
3408                         goto fail;
3409         }
3410
3411         /* Let's try to take it online again.  */
3412         r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3413         if (r < 0) {
3414                 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3415                 if (EVENT_SOURCE_IS_TIME(s->type))
3416                         event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3417
3418                 goto fail;
3419         }
3420
3421         event_source_pp_prioq_reshuffle(s);
3422         ratelimit_reset(&s->rate_limit);
3423
3424         log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3425
3426         if (run_callback && s->ratelimit_expire_callback) {
3427                 s->dispatching = true;
3428                 r = s->ratelimit_expire_callback(s, s->userdata);
3429                 s->dispatching = false;
3430
3431                 if (r < 0) {
3432                         log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3433                                         strna(s->description),
3434                                         event_source_type_to_string(s->type),
3435                                         s->exit_on_failure ? "exiting" : "disabling");
3436
3437                         if (s->exit_on_failure)
3438                                 (void) sd_event_exit(s->event, r);
3439                 }
3440
3441                 if (s->n_ref == 0)
3442                         source_free(s);
3443                 else if (r < 0)
3444                         assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3445
3446                 return 1;
3447         }
3448
3449         return 0;
3450
3451 fail:
3452         /* Do something somewhat reasonable when we cannot move an event sources out of ratelimited mode:
3453          * simply put it back in it, maybe we can then process it more successfully next iteration. */
3454         assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3455
3456         return r;
3457 }
3458
3459 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3460         usec_t c;
3461         assert(e);
3462         assert(a <= b);
3463
3464         if (a <= 0)
3465                 return 0;
3466         if (a >= USEC_INFINITY)
3467                 return USEC_INFINITY;
3468
3469         if (b <= a + 1)
3470                 return a;
3471
3472         initialize_perturb(e);
3473
3474         /*
3475           Find a good time to wake up again between times a and b. We
3476           have two goals here:
3477
3478           a) We want to wake up as seldom as possible, hence prefer
3479              later times over earlier times.
3480
3481           b) But if we have to wake up, then let's make sure to
3482              dispatch as much as possible on the entire system.
3483
3484           We implement this by waking up everywhere at the same time
3485           within any given minute if we can, synchronised via the
3486           perturbation value determined from the boot ID. If we can't,
3487           then we try to find the same spot in every 10s, then 1s and
3488           then 250ms step. Otherwise, we pick the last possible time
3489           to wake up.
3490         */
3491
3492         c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3493         if (c >= b) {
3494                 if (_unlikely_(c < USEC_PER_MINUTE))
3495                         return b;
3496
3497                 c -= USEC_PER_MINUTE;
3498         }
3499
3500         if (c >= a)
3501                 return c;
3502
3503         c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3504         if (c >= b) {
3505                 if (_unlikely_(c < USEC_PER_SEC*10))
3506                         return b;
3507
3508                 c -= USEC_PER_SEC*10;
3509         }
3510
3511         if (c >= a)
3512                 return c;
3513
3514         c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3515         if (c >= b) {
3516                 if (_unlikely_(c < USEC_PER_SEC))
3517                         return b;
3518
3519                 c -= USEC_PER_SEC;
3520         }
3521
3522         if (c >= a)
3523                 return c;
3524
3525         c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3526         if (c >= b) {
3527                 if (_unlikely_(c < USEC_PER_MSEC*250))
3528                         return b;
3529
3530                 c -= USEC_PER_MSEC*250;
3531         }
3532
3533         if (c >= a)
3534                 return c;
3535
3536         return b;
3537 }
3538
3539 static int event_arm_timer(
3540                 sd_event *e,
3541                 struct clock_data *d) {
3542
3543         struct itimerspec its = {};
3544         sd_event_source *a, *b;
3545         usec_t t;
3546
3547         assert(e);
3548         assert(d);
3549
3550         if (!d->needs_rearm)
3551                 return 0;
3552
3553         d->needs_rearm = false;
3554
3555         a = prioq_peek(d->earliest);
3556         assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3557         if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3558
3559                 if (d->fd < 0)
3560                         return 0;
3561
3562                 if (d->next == USEC_INFINITY)
3563                         return 0;
3564
3565                 /* disarm */
3566                 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3567                         return -errno;
3568
3569                 d->next = USEC_INFINITY;
3570                 return 0;
3571         }
3572
3573         b = prioq_peek(d->latest);
3574         assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3575         assert(b && b->enabled != SD_EVENT_OFF);
3576
3577         t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3578         if (d->next == t)
3579                 return 0;
3580
3581         assert_se(d->fd >= 0);
3582
3583         if (t == 0) {
3584                 /* We don't want to disarm here, just mean some time looooong ago. */
3585                 its.it_value.tv_sec = 0;
3586                 its.it_value.tv_nsec = 1;
3587         } else
3588                 timespec_store(&its.it_value, t);
3589
3590         if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3591                 return -errno;
3592
3593         d->next = t;
3594         return 0;
3595 }
3596
3597 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3598         assert(e);
3599         assert(s);
3600         assert(s->type == SOURCE_IO);
3601
3602         /* If the event source was already pending, we just OR in the
3603          * new revents, otherwise we reset the value. The ORing is
3604          * necessary to handle EPOLLONESHOT events properly where
3605          * readability might happen independently of writability, and
3606          * we need to keep track of both */
3607
3608         if (s->pending)
3609                 s->io.revents |= revents;
3610         else
3611                 s->io.revents = revents;
3612
3613         return source_set_pending(s, true);
3614 }
3615
3616 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3617         uint64_t x;
3618         ssize_t ss;
3619
3620         assert(e);
3621         assert(fd >= 0);
3622
3623         assert_return(events == EPOLLIN, -EIO);
3624
3625         ss = read(fd, &x, sizeof(x));
3626         if (ss < 0) {
3627                 if (ERRNO_IS_TRANSIENT(errno))
3628                         return 0;
3629
3630                 return -errno;
3631         }
3632
3633         if (_unlikely_(ss != sizeof(x)))
3634                 return -EIO;
3635
3636         if (next)
3637                 *next = USEC_INFINITY;
3638
3639         return 0;
3640 }
3641
3642 static int process_timer(
3643                 sd_event *e,
3644                 usec_t n,
3645                 struct clock_data *d) {
3646
3647         sd_event_source *s;
3648         bool callback_invoked = false;
3649         int r;
3650
3651         assert(e);
3652         assert(d);
3653
3654         for (;;) {
3655                 s = prioq_peek(d->earliest);
3656                 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3657
3658                 if (!s || time_event_source_next(s) > n)
3659                         break;
3660
3661                 if (s->ratelimited) {
3662                         /* This is an event sources whose ratelimit window has ended. Let's turn it on
3663                          * again. */
3664                         assert(s->ratelimited);
3665
3666                         r = event_source_leave_ratelimit(s, /* run_callback */ true);
3667                         if (r < 0)
3668                                 return r;
3669                         else if (r == 1)
3670                                 callback_invoked = true;
3671
3672                         continue;
3673                 }
3674
3675                 if (s->enabled == SD_EVENT_OFF || s->pending)
3676                         break;
3677
3678                 r = source_set_pending(s, true);
3679                 if (r < 0)
3680                         return r;
3681
3682                 event_source_time_prioq_reshuffle(s);
3683         }
3684
3685         return callback_invoked;
3686 }
3687
3688 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3689         int64_t min_priority = threshold;
3690         bool something_new = false;
3691         sd_event_source *s;
3692         int r;
3693
3694         assert(e);
3695         assert(ret_min_priority);
3696
3697         if (!e->need_process_child) {
3698                 *ret_min_priority = min_priority;
3699                 return 0;
3700         }
3701
3702         e->need_process_child = false;
3703
3704         /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3705          * for, instead of using P_ALL. This is because we only want to get child information of very
3706          * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3707          * of a previous invocation and we don't want to maintain a unbounded *per-child* event queue,
3708          * hence we really don't want anything flushed out of the kernel's queue that we don't care
3709          * about. Since this is O(n) this means that if you have a lot of processes you probably want
3710          * to handle SIGCHLD yourself.
3711          *
3712          * We do not reap the children here (by using WNOWAIT), this is only done after the event
3713          * source is dispatched so that the callback still sees the process as a zombie. */
3714
3715         HASHMAP_FOREACH(s, e->child_sources) {
3716                 assert(s->type == SOURCE_CHILD);
3717
3718                 if (s->priority > threshold)
3719                         continue;
3720
3721                 if (s->pending)
3722                         continue;
3723
3724                 if (event_source_is_offline(s))
3725                         continue;
3726
3727                 if (s->child.exited)
3728                         continue;
3729
3730                 if (EVENT_SOURCE_WATCH_PIDFD(s))
3731                         /* There's a usable pidfd known for this event source? Then don't waitid() for
3732                          * it here */
3733                         continue;
3734
3735                 zero(s->child.siginfo);
3736                 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3737                            WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3738                         return negative_errno();
3739
3740                 if (s->child.siginfo.si_pid != 0) {
3741                         bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3742
3743                         if (zombie)
3744                                 s->child.exited = true;
3745
3746                         if (!zombie && (s->child.options & WEXITED)) {
3747                                 /* If the child isn't dead then let's immediately remove the state
3748                                  * change from the queue, since there's no benefit in leaving it
3749                                  * queued. */
3750
3751                                 assert(s->child.options & (WSTOPPED|WCONTINUED));
3752                                 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3753                         }
3754
3755                         r = source_set_pending(s, true);
3756                         if (r < 0)
3757                                 return r;
3758                         if (r > 0) {
3759                                 something_new = true;
3760                                 min_priority = MIN(min_priority, s->priority);
3761                         }
3762                 }
3763         }
3764
3765         *ret_min_priority = min_priority;
3766         return something_new;
3767 }
3768
3769 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3770         assert(e);
3771         assert(s);
3772         assert(s->type == SOURCE_CHILD);
3773
3774         if (s->pending)
3775                 return 0;
3776
3777         if (event_source_is_offline(s))
3778                 return 0;
3779
3780         if (!EVENT_SOURCE_WATCH_PIDFD(s))
3781                 return 0;
3782
3783         zero(s->child.siginfo);
3784         if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3785                 return -errno;
3786
3787         if (s->child.siginfo.si_pid == 0)
3788                 return 0;
3789
3790         if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3791                 s->child.exited = true;
3792
3793         return source_set_pending(s, true);
3794 }
3795
3796 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3797         int r;
3798
3799         assert(e);
3800         assert(d);
3801         assert_return(events == EPOLLIN, -EIO);
3802         assert(min_priority);
3803
3804         /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3805          * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3806          * per priority, and if we dequeue one, and SIGCHLD might be enqueued later we wouldn't know,
3807          * but we might have higher priority children we care about hence we need to check that
3808          * explicitly. */
3809
3810         if (sigismember(&d->sigset, SIGCHLD))
3811                 e->need_process_child = true;
3812
3813         /* If there's already an event source pending for this priority we don't read another */
3814         if (d->current)
3815                 return 0;
3816
3817         for (;;) {
3818                 struct signalfd_siginfo si;
3819                 ssize_t n;
3820                 sd_event_source *s = NULL;
3821
3822                 n = read(d->fd, &si, sizeof(si));
3823                 if (n < 0) {
3824                         if (ERRNO_IS_TRANSIENT(errno))
3825                                 return 0;
3826
3827                         return -errno;
3828                 }
3829
3830                 if (_unlikely_(n != sizeof(si)))
3831                         return -EIO;
3832
3833                 assert(SIGNAL_VALID(si.ssi_signo));
3834
3835                 if (e->signal_sources)
3836                         s = e->signal_sources[si.ssi_signo];
3837                 if (!s)
3838                         continue;
3839                 if (s->pending)
3840                         continue;
3841
3842                 s->signal.siginfo = si;
3843                 d->current = s;
3844
3845                 r = source_set_pending(s, true);
3846                 if (r < 0)
3847                         return r;
3848                 if (r > 0 && *min_priority >= s->priority) {
3849                         *min_priority = s->priority;
3850                         return 1; /* an event source with smaller priority is queued. */
3851                 }
3852
3853                 return 0;
3854         }
3855 }
3856
3857 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3858         ssize_t n;
3859
3860         assert(e);
3861         assert(d);
3862
3863         assert_return(revents == EPOLLIN, -EIO);
3864
3865         /* If there's already an event source pending for this priority, don't read another */
3866         if (d->n_pending > 0)
3867                 return 0;
3868
3869         /* Is the read buffer non-empty? If so, let's not read more */
3870         if (d->buffer_filled > 0)
3871                 return 0;
3872
3873         if (d->priority > threshold)
3874                 return 0;
3875
3876         n = read(d->fd, &d->buffer, sizeof(d->buffer));
3877         if (n < 0) {
3878                 if (ERRNO_IS_TRANSIENT(errno))
3879                         return 0;
3880
3881                 return -errno;
3882         }
3883
3884         assert(n > 0);
3885         d->buffer_filled = (size_t) n;
3886         LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
3887
3888         return 1;
3889 }
3890
3891 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3892         assert(e);
3893         assert(d);
3894         assert(sz <= d->buffer_filled);
3895
3896         if (sz == 0)
3897                 return;
3898
3899         /* Move the rest to the buffer to the front, in order to get things properly aligned again */
3900         memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3901         d->buffer_filled -= sz;
3902
3903         if (d->buffer_filled == 0)
3904                 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
3905 }
3906
3907 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3908         int r;
3909
3910         assert(e);
3911         assert(d);
3912
3913         /* If there's already an event source pending for this priority, don't read another */
3914         if (d->n_pending > 0)
3915                 return 0;
3916
3917         while (d->buffer_filled > 0) {
3918                 size_t sz;
3919
3920                 /* Let's validate that the event structures are complete */
3921                 if (d->buffer_filled < offsetof(struct inotify_event, name))
3922                         return -EIO;
3923
3924                 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3925                 if (d->buffer_filled < sz)
3926                         return -EIO;
3927
3928                 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3929                         struct inode_data *inode_data;
3930
3931                         /* The queue overran, let's pass this event to all event sources connected to this inotify
3932                          * object */
3933
3934                         HASHMAP_FOREACH(inode_data, d->inodes)
3935                                 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3936
3937                                         if (event_source_is_offline(s))
3938                                                 continue;
3939
3940                                         r = source_set_pending(s, true);
3941                                         if (r < 0)
3942                                                 return r;
3943                                 }
3944                 } else {
3945                         struct inode_data *inode_data;
3946
3947                         /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3948                          * our watch descriptor table. */
3949                         if (d->buffer.ev.mask & IN_IGNORED) {
3950
3951                                 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3952                                 if (!inode_data) {
3953                                         event_inotify_data_drop(e, d, sz);
3954                                         continue;
3955                                 }
3956
3957                                 /* The watch descriptor was removed by the kernel, let's drop it here too */
3958                                 inode_data->wd = -1;
3959                         } else {
3960                                 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3961                                 if (!inode_data) {
3962                                         event_inotify_data_drop(e, d, sz);
3963                                         continue;
3964                                 }
3965                         }
3966
3967                         /* Trigger all event sources that are interested in these events. Also trigger all event
3968                          * sources if IN_IGNORED or IN_UNMOUNT is set. */
3969                         LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3970
3971                                 if (event_source_is_offline(s))
3972                                         continue;
3973
3974                                 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3975                                     (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3976                                         continue;
3977
3978                                 r = source_set_pending(s, true);
3979                                 if (r < 0)
3980                                         return r;
3981                         }
3982                 }
3983
3984                 /* Something pending now? If so, let's finish, otherwise let's read more. */
3985                 if (d->n_pending > 0)
3986                         return 1;
3987         }
3988
3989         return 0;
3990 }
3991
3992 static int process_inotify(sd_event *e) {
3993         int r, done = 0;
3994
3995         assert(e);
3996
3997         LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
3998                 r = event_inotify_data_process(e, d);
3999                 if (r < 0)
4000                         return r;
4001                 if (r > 0)
4002                         done ++;
4003         }
4004
4005         return done;
4006 }
4007
4008 static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
4009         assert(s);
4010         assert(s->type == SOURCE_MEMORY_PRESSURE);
4011
4012         if (s->pending)
4013                 s->memory_pressure.revents |= revents;
4014         else
4015                 s->memory_pressure.revents = revents;
4016
4017         return source_set_pending(s, true);
4018 }
4019
4020 static int source_memory_pressure_write(sd_event_source *s) {
4021         ssize_t n;
4022         int r;
4023
4024         assert(s);
4025         assert(s->type == SOURCE_MEMORY_PRESSURE);
4026
4027         /* once we start writing, the buffer is locked, we allow no further changes. */
4028         s->memory_pressure.locked = true;
4029
4030         if (s->memory_pressure.write_buffer_size > 0) {
4031                 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4032                 if (n < 0) {
4033                         if (!ERRNO_IS_TRANSIENT(errno)) {
4034                                 /* If kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
4035                                  * files, but then generates EOPNOSUPP on read() and write() (instead of on
4036                                  * open()!). This sucks hard, since we can only detect this kind of failure
4037                                  * so late. Let's make the best of it, and turn off the event source like we
4038                                  * do for failed event source handlers. */
4039
4040                                 log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
4041                                 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4042                                 return 0;
4043                         }
4044
4045                         n = 0;
4046                 }
4047         } else
4048                 n = 0;
4049
4050         assert(n >= 0);
4051
4052         if ((size_t) n == s->memory_pressure.write_buffer_size) {
4053                 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4054
4055                 if (n > 0) {
4056                         s->memory_pressure.write_buffer_size = 0;
4057
4058                         /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4059                         r = source_memory_pressure_register(s, s->enabled);
4060                         if (r < 0)
4061                                 return r;
4062                 }
4063         } else if (n > 0) {
4064                 _cleanup_free_ void *c = NULL;
4065
4066                 assert((size_t) n < s->memory_pressure.write_buffer_size);
4067
4068                 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4069                 if (!c)
4070                         return -ENOMEM;
4071
4072                 free_and_replace(s->memory_pressure.write_buffer, c);
4073                 s->memory_pressure.write_buffer_size -= n;
4074                 return 1;
4075         }
4076
4077         return 0;
4078 }
4079
4080 static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4081         int r;
4082
4083         assert(s);
4084         assert(s->type == SOURCE_MEMORY_PRESSURE);
4085
4086         r = source_memory_pressure_write(s);
4087         if (r < 0)
4088                 return r;
4089         if (r > 0)
4090                 return 1; /* if we wrote something, then don't continue with dispatching user dispatch
4091                            * function. Instead, shortcut it so that we wait for next EPOLLOUT immediately. */
4092
4093         /* No pending incoming IO? Then let's not continue further */
4094         if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4095
4096                 /* Treat IO errors on the notifier the same ways errors returned from a callback */
4097                 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4098                         return -EIO;
4099
4100                 return 1; /* leave dispatch, we already processed everything */
4101         }
4102
4103         if (s->memory_pressure.revents & EPOLLIN) {
4104                 uint8_t pipe_buf[PIPE_BUF];
4105                 ssize_t n;
4106
4107                 /* If the fd is readable, then flush out anything that might be queued */
4108
4109                 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4110                 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4111                         return -errno;
4112         }
4113
4114         return 0; /* go on, dispatch to user callback */
4115 }
4116
4117 static int source_dispatch(sd_event_source *s) {
4118         EventSourceType saved_type;
4119         sd_event *saved_event;
4120         int r = 0;
4121
4122         assert(s);
4123         assert(s->pending || s->type == SOURCE_EXIT);
4124
4125         /* Save the event source type, here, so that we still know it after the event callback which might
4126          * invalidate the event. */
4127         saved_type = s->type;
4128
4129         /* Similarly, store a reference to the event loop object, so that we can still access it after the
4130          * callback might have invalidated/disconnected the event source. */
4131         saved_event = s->event;
4132         PROTECT_EVENT(saved_event);
4133
4134         /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
4135         assert(!s->ratelimited);
4136         if (!ratelimit_below(&s->rate_limit)) {
4137                 r = event_source_enter_ratelimited(s);
4138                 if (r < 0)
4139                         return r;
4140
4141                 return 1;
4142         }
4143
4144         if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
4145                 r = source_set_pending(s, false);
4146                 if (r < 0)
4147                         return r;
4148         }
4149
4150         if (s->type != SOURCE_POST) {
4151                 sd_event_source *z;
4152
4153                 /* If we execute a non-post source, let's mark all post sources as pending. */
4154
4155                 SET_FOREACH(z, s->event->post_sources) {
4156                         if (event_source_is_offline(z))
4157                                 continue;
4158
4159                         r = source_set_pending(z, true);
4160                         if (r < 0)
4161                                 return r;
4162                 }
4163         }
4164
4165         if (s->type == SOURCE_MEMORY_PRESSURE) {
4166                 r = source_memory_pressure_initiate_dispatch(s);
4167                 if (r == -EIO) /* handle EIO errors similar to callback errors */
4168                         goto finish;
4169                 if (r < 0)
4170                         return r;
4171                 if (r > 0) /* already handled */
4172                         return 1;
4173         }
4174
4175         if (s->enabled == SD_EVENT_ONESHOT) {
4176                 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
4177                 if (r < 0)
4178                         return r;
4179         }
4180
4181         s->dispatching = true;
4182
4183         switch (s->type) {
4184
4185         case SOURCE_IO:
4186                 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4187                 break;
4188
4189         case SOURCE_TIME_REALTIME:
4190         case SOURCE_TIME_BOOTTIME:
4191         case SOURCE_TIME_MONOTONIC:
4192         case SOURCE_TIME_REALTIME_ALARM:
4193         case SOURCE_TIME_BOOTTIME_ALARM:
4194                 r = s->time.callback(s, s->time.next, s->userdata);
4195                 break;
4196
4197         case SOURCE_SIGNAL:
4198                 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4199                 break;
4200
4201         case SOURCE_CHILD: {
4202                 bool zombie;
4203
4204                 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
4205
4206                 r = s->child.callback(s, &s->child.siginfo, s->userdata);
4207
4208                 /* Now, reap the PID for good. */
4209                 if (zombie) {
4210                         (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
4211                         s->child.waited = true;
4212                 }
4213
4214                 break;
4215         }
4216
4217         case SOURCE_DEFER:
4218                 r = s->defer.callback(s, s->userdata);
4219                 break;
4220
4221         case SOURCE_POST:
4222                 r = s->post.callback(s, s->userdata);
4223                 break;
4224
4225         case SOURCE_EXIT:
4226                 r = s->exit.callback(s, s->userdata);
4227                 break;
4228
4229         case SOURCE_INOTIFY: {
4230                 struct sd_event *e = s->event;
4231                 struct inotify_data *d;
4232                 size_t sz;
4233
4234                 assert(s->inotify.inode_data);
4235                 assert_se(d = s->inotify.inode_data->inotify_data);
4236
4237                 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4238                 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4239                 assert(d->buffer_filled >= sz);
4240
4241                 /* If the inotify callback destroys the event source then this likely means we don't need to
4242                  * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4243                  * free it immediately, then we couldn't drop the event from the inotify event queue without
4244                  * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
4245                  * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4246                  * explicitly GC it after we are done dropping the inotify event from the buffer. */
4247                 d->n_busy++;
4248                 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
4249                 d->n_busy--;
4250
4251                 /* When no event is pending anymore on this inotify object, then let's drop the event from
4252                  * the inotify event queue buffer. */
4253                 if (d->n_pending == 0)
4254                         event_inotify_data_drop(e, d, sz);
4255
4256                 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4257                 event_gc_inotify_data(e, d);
4258                 break;
4259         }
4260
4261         case SOURCE_MEMORY_PRESSURE:
4262                 r = s->memory_pressure.callback(s, s->userdata);
4263                 break;
4264
4265         case SOURCE_WATCHDOG:
4266         case _SOURCE_EVENT_SOURCE_TYPE_MAX:
4267         case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
4268                 assert_not_reached();
4269         }
4270
4271         s->dispatching = false;
4272
4273 finish:
4274         if (r < 0) {
4275                 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4276                                 strna(s->description),
4277                                 event_source_type_to_string(saved_type),
4278                                 s->exit_on_failure ? "exiting" : "disabling");
4279
4280                 if (s->exit_on_failure)
4281                         (void) sd_event_exit(saved_event, r);
4282         }
4283
4284         if (s->n_ref == 0)
4285                 source_free(s);
4286         else if (r < 0)
4287                 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4288
4289         return 1;
4290 }
4291
4292 static int event_prepare(sd_event *e) {
4293         int r;
4294
4295         assert(e);
4296
4297         for (;;) {
4298                 sd_event_source *s;
4299
4300                 s = prioq_peek(e->prepare);
4301                 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
4302                         break;
4303
4304                 s->prepare_iteration = e->iteration;
4305                 prioq_reshuffle(e->prepare, s, &s->prepare_index);
4306
4307                 assert(s->prepare);
4308                 s->dispatching = true;
4309                 r = s->prepare(s, s->userdata);
4310                 s->dispatching = false;
4311
4312                 if (r < 0) {
4313                         log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4314                                         strna(s->description),
4315                                         event_source_type_to_string(s->type),
4316                                         s->exit_on_failure ? "exiting" : "disabling");
4317
4318                         if (s->exit_on_failure)
4319                                 (void) sd_event_exit(e, r);
4320                 }
4321
4322                 if (s->n_ref == 0)
4323                         source_free(s);
4324                 else if (r < 0)
4325                         assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4326         }
4327
4328         return 0;
4329 }
4330
4331 static int dispatch_exit(sd_event *e) {
4332         sd_event_source *p;
4333         int r;
4334
4335         assert(e);
4336
4337         p = prioq_peek(e->exit);
4338         assert(!p || p->type == SOURCE_EXIT);
4339
4340         if (!p || event_source_is_offline(p)) {
4341                 e->state = SD_EVENT_FINISHED;
4342                 return 0;
4343         }
4344
4345         PROTECT_EVENT(e);
4346         e->iteration++;
4347         e->state = SD_EVENT_EXITING;
4348         r = source_dispatch(p);
4349         e->state = SD_EVENT_INITIAL;
4350         return r;
4351 }
4352
4353 static sd_event_source* event_next_pending(sd_event *e) {
4354         sd_event_source *p;
4355
4356         assert(e);
4357
4358         p = prioq_peek(e->pending);
4359         if (!p)
4360                 return NULL;
4361
4362         if (event_source_is_offline(p))
4363                 return NULL;
4364
4365         return p;
4366 }
4367
4368 static int arm_watchdog(sd_event *e) {
4369         struct itimerspec its = {};
4370         usec_t t;
4371
4372         assert(e);
4373         assert(e->watchdog_fd >= 0);
4374
4375         t = sleep_between(e,
4376                           usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4377                           usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
4378
4379         timespec_store(&its.it_value, t);
4380
4381         /* Make sure we never set the watchdog to 0, which tells the
4382          * kernel to disable it. */
4383         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4384                 its.it_value.tv_nsec = 1;
4385
4386         return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
4387 }
4388
4389 static int process_watchdog(sd_event *e) {
4390         assert(e);
4391
4392         if (!e->watchdog)
4393                 return 0;
4394
4395         /* Don't notify watchdog too often */
4396         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4397                 return 0;
4398
4399         sd_notify(false, "WATCHDOG=1");
4400         e->watchdog_last = e->timestamp.monotonic;
4401
4402         return arm_watchdog(e);
4403 }
4404
4405 static void event_close_inode_data_fds(sd_event *e) {
4406         struct inode_data *d;
4407
4408         assert(e);
4409
4410         /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4411          * filesystems. But we can't close them right-away as we need them as long as the user still wants to make
4412          * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
4413          * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4414          * compromise. */
4415
4416         while ((d = e->inode_data_to_close_list)) {
4417                 assert(d->fd >= 0);
4418                 d->fd = safe_close(d->fd);
4419
4420                 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
4421         }
4422 }
4423
4424 static int event_memory_pressure_write_list(sd_event *e) {
4425         int r;
4426
4427         assert(e);
4428
4429         for (;;) {
4430                 sd_event_source *s;
4431
4432                 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4433                 if (!s)
4434                         break;
4435
4436                 assert(s->type == SOURCE_MEMORY_PRESSURE);
4437                 assert(s->memory_pressure.write_buffer_size > 0);
4438                 s->memory_pressure.in_write_list = false;
4439
4440                 r = source_memory_pressure_write(s);
4441                 if (r < 0)
4442                         return r;
4443         }
4444
4445         return 0;
4446 }
4447
4448 _public_ int sd_event_prepare(sd_event *e) {
4449         int r;
4450
4451         assert_return(e, -EINVAL);
4452         assert_return(e = event_resolve(e), -ENOPKG);
4453         assert_return(!event_origin_changed(e), -ECHILD);
4454         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4455         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4456
4457         /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
4458          * this check here once, since gettid() is typically not cached, and thus want to minimize
4459          * syscalls */
4460         assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4461
4462         /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4463         PROTECT_EVENT(e);
4464
4465         if (e->exit_requested)
4466                 goto pending;
4467
4468         e->iteration++;
4469
4470         e->state = SD_EVENT_PREPARING;
4471         r = event_prepare(e);
4472         e->state = SD_EVENT_INITIAL;
4473         if (r < 0)
4474                 return r;
4475
4476         r = event_memory_pressure_write_list(e);
4477         if (r < 0)
4478                 return r;
4479
4480         r = event_arm_timer(e, &e->realtime);
4481         if (r < 0)
4482                 return r;
4483
4484         r = event_arm_timer(e, &e->boottime);
4485         if (r < 0)
4486                 return r;
4487
4488         r = event_arm_timer(e, &e->monotonic);
4489         if (r < 0)
4490                 return r;
4491
4492         r = event_arm_timer(e, &e->realtime_alarm);
4493         if (r < 0)
4494                 return r;
4495
4496         r = event_arm_timer(e, &e->boottime_alarm);
4497         if (r < 0)
4498                 return r;
4499
4500         event_close_inode_data_fds(e);
4501
4502         if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
4503                 goto pending;
4504
4505         e->state = SD_EVENT_ARMED;
4506
4507         return 0;
4508
4509 pending:
4510         e->state = SD_EVENT_ARMED;
4511         r = sd_event_wait(e, 0);
4512         if (r == 0)
4513                 e->state = SD_EVENT_ARMED;
4514
4515         return r;
4516 }
4517
4518 static int epoll_wait_usec(
4519                 int fd,
4520                 struct epoll_event *events,
4521                 int maxevents,
4522                 usec_t timeout) {
4523
4524         int msec;
4525         /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4526
4527 #if HAVE_EPOLL_PWAIT2
4528         static bool epoll_pwait2_absent = false;
4529         int r;
4530
4531         /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4532          * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4533          * is not that obvious to implement given the libc and kernel definitions differ in the last
4534          * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
4535          * biggie), let's hence rely on glibc's definitions, and fallback to epoll_pwait() when that's
4536          * missing. */
4537
4538         if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
4539                 r = epoll_pwait2(fd,
4540                                  events,
4541                                  maxevents,
4542                                  TIMESPEC_STORE(timeout),
4543                                  NULL);
4544                 if (r >= 0)
4545                         return r;
4546                 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
4547                         return -errno; /* Only fallback to old epoll_wait() if the syscall is masked or not
4548                                         * supported. */
4549
4550                 epoll_pwait2_absent = true;
4551         }
4552 #endif
4553
4554         if (timeout == USEC_INFINITY)
4555                 msec = -1;
4556         else {
4557                 usec_t k;
4558
4559                 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4560                 if (k >= INT_MAX)
4561                         msec = INT_MAX; /* Saturate */
4562                 else
4563                         msec = (int) k;
4564         }
4565
4566         return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
4567 }
4568
4569 static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
4570         size_t n_event_queue, m, n_event_max;
4571         int64_t min_priority = threshold;
4572         bool something_new = false;
4573         int r;
4574
4575         assert(e);
4576         assert(ret_min_priority);
4577
4578         n_event_queue = MAX(e->n_sources, 1u);
4579         if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
4580                 return -ENOMEM;
4581
4582         n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4583
4584         /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
4585         if (e->buffered_inotify_data_list)
4586                 timeout = 0;
4587
4588         for (;;) {
4589                 r = epoll_wait_usec(
4590                                 e->epoll_fd,
4591                                 e->event_queue,
4592                                 n_event_max,
4593                                 timeout);
4594                 if (r < 0)
4595                         return r;
4596
4597                 m = (size_t) r;
4598
4599                 if (m < n_event_max)
4600                         break;
4601
4602                 if (n_event_max >= n_event_queue * 10)
4603                         break;
4604
4605                 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
4606                         return -ENOMEM;
4607
4608                 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4609                 timeout = 0;
4610         }
4611
4612         /* Set timestamp only when this is called first time. */
4613         if (threshold == INT64_MAX)
4614                 triple_timestamp_get(&e->timestamp);
4615
4616         for (size_t i = 0; i < m; i++) {
4617
4618                 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4619                         r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
4620                 else {
4621                         WakeupType *t = e->event_queue[i].data.ptr;
4622
4623                         switch (*t) {
4624
4625                         case WAKEUP_EVENT_SOURCE: {
4626                                 sd_event_source *s = e->event_queue[i].data.ptr;
4627
4628                                 assert(s);
4629
4630                                 if (s->priority > threshold)
4631                                         continue;
4632
4633                                 min_priority = MIN(min_priority, s->priority);
4634
4635                                 switch (s->type) {
4636
4637                                 case SOURCE_IO:
4638                                         r = process_io(e, s, e->event_queue[i].events);
4639                                         break;
4640
4641                                 case SOURCE_CHILD:
4642                                         r = process_pidfd(e, s, e->event_queue[i].events);
4643                                         break;
4644
4645                                 case SOURCE_MEMORY_PRESSURE:
4646                                         r = process_memory_pressure(s, e->event_queue[i].events);
4647                                         break;
4648
4649                                 default:
4650                                         assert_not_reached();
4651                                 }
4652
4653                                 break;
4654                         }
4655
4656                         case WAKEUP_CLOCK_DATA: {
4657                                 struct clock_data *d = e->event_queue[i].data.ptr;
4658
4659                                 assert(d);
4660
4661                                 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4662                                 break;
4663                         }
4664
4665                         case WAKEUP_SIGNAL_DATA:
4666                                 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4667                                 break;
4668
4669                         case WAKEUP_INOTIFY_DATA:
4670                                 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4671                                 break;
4672
4673                         default:
4674                                 assert_not_reached();
4675                         }
4676                 }
4677                 if (r < 0)
4678                         return r;
4679                 if (r > 0)
4680                         something_new = true;
4681         }
4682
4683         *ret_min_priority = min_priority;
4684         return something_new;
4685 }
4686
4687 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4688         int r;
4689
4690         assert_return(e, -EINVAL);
4691         assert_return(e = event_resolve(e), -ENOPKG);
4692         assert_return(!event_origin_changed(e), -ECHILD);
4693         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4694         assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4695
4696         if (e->exit_requested) {
4697                 e->state = SD_EVENT_PENDING;
4698                 return 1;
4699         }
4700
4701         for (int64_t threshold = INT64_MAX; ; threshold--) {
4702                 int64_t epoll_min_priority, child_min_priority;
4703
4704                 /* There may be a possibility that new epoll (especially IO) and child events are
4705                  * triggered just after process_epoll() call but before process_child(), and the new IO
4706                  * events may have higher priority than the child events. To salvage these events,
4707                  * let's call epoll_wait() again, but accepts only events with higher priority than the
4708                  * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
4709                  * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4710                  * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4711
4712                 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4713                 if (r == -EINTR) {
4714                         e->state = SD_EVENT_PENDING;
4715                         return 1;
4716                 }
4717                 if (r < 0)
4718                         goto finish;
4719                 if (r == 0 && threshold < INT64_MAX)
4720                         /* No new epoll event. */
4721                         break;
4722
4723                 r = process_child(e, threshold, &child_min_priority);
4724                 if (r < 0)
4725                         goto finish;
4726                 if (r == 0)
4727                         /* No new child event. */
4728                         break;
4729
4730                 threshold = MIN(epoll_min_priority, child_min_priority);
4731                 if (threshold == INT64_MIN)
4732                         break;
4733
4734                 timeout = 0;
4735         }
4736
4737         r = process_watchdog(e);
4738         if (r < 0)
4739                 goto finish;
4740
4741         r = process_inotify(e);
4742         if (r < 0)
4743                 goto finish;
4744
4745         r = process_timer(e, e->timestamp.realtime, &e->realtime);
4746         if (r < 0)
4747                 goto finish;
4748
4749         r = process_timer(e, e->timestamp.boottime, &e->boottime);
4750         if (r < 0)
4751                 goto finish;
4752
4753         r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4754         if (r < 0)
4755                 goto finish;
4756
4757         r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4758         if (r < 0)
4759                 goto finish;
4760
4761         r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4762         if (r < 0)
4763                 goto finish;
4764         else if (r == 1) {
4765                 /* Ratelimit expiry callback was called. Let's postpone processing pending sources and
4766                  * put loop in the initial state in order to evaluate (in the next iteration) also sources
4767                  * there were potentially re-enabled by the callback.
4768                  *
4769                  * Wondering why we treat only this invocation of process_timer() differently? Once event
4770                  * source is ratelimited we essentially transform it into CLOCK_MONOTONIC timer hence
4771                  * ratelimit expiry callback is never called for any other timer type. */
4772                 r = 0;
4773                 goto finish;
4774         }
4775
4776         if (event_next_pending(e)) {
4777                 e->state = SD_EVENT_PENDING;
4778                 return 1;
4779         }
4780
4781         r = 0;
4782
4783 finish:
4784         e->state = SD_EVENT_INITIAL;
4785
4786         return r;
4787 }
4788
4789 _public_ int sd_event_dispatch(sd_event *e) {
4790         sd_event_source *p;
4791         int r;
4792
4793         assert_return(e, -EINVAL);
4794         assert_return(e = event_resolve(e), -ENOPKG);
4795         assert_return(!event_origin_changed(e), -ECHILD);
4796         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4797         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4798
4799         if (e->exit_requested)
4800                 return dispatch_exit(e);
4801
4802         p = event_next_pending(e);
4803         if (p) {
4804                 PROTECT_EVENT(e);
4805
4806                 e->state = SD_EVENT_RUNNING;
4807                 r = source_dispatch(p);
4808                 e->state = SD_EVENT_INITIAL;
4809                 return r;
4810         }
4811
4812         e->state = SD_EVENT_INITIAL;
4813
4814         return 1;
4815 }
4816
4817 static void event_log_delays(sd_event *e) {
4818         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4819         size_t l, i;
4820
4821         p = b;
4822         l = sizeof(b);
4823         for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4824                 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4825                 e->delays[i] = 0;
4826         }
4827         log_debug("Event loop iterations: %s", b);
4828 }
4829
4830 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4831         int r;
4832
4833         assert_return(e, -EINVAL);
4834         assert_return(e = event_resolve(e), -ENOPKG);
4835         assert_return(!event_origin_changed(e), -ECHILD);
4836         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4837         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4838
4839         if (e->profile_delays && e->last_run_usec != 0) {
4840                 usec_t this_run;
4841                 unsigned l;
4842
4843                 this_run = now(CLOCK_MONOTONIC);
4844
4845                 l = log2u64(this_run - e->last_run_usec);
4846                 assert(l < ELEMENTSOF(e->delays));
4847                 e->delays[l]++;
4848
4849                 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4850                         event_log_delays(e);
4851                         e->last_log_usec = this_run;
4852                 }
4853         }
4854
4855         /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4856         PROTECT_EVENT(e);
4857
4858         r = sd_event_prepare(e);
4859         if (r == 0)
4860                 /* There was nothing? Then wait... */
4861                 r = sd_event_wait(e, timeout);
4862
4863         if (e->profile_delays)
4864                 e->last_run_usec = now(CLOCK_MONOTONIC);
4865
4866         if (r > 0) {
4867                 /* There's something now, then let's dispatch it */
4868                 r = sd_event_dispatch(e);
4869                 if (r < 0)
4870                         return r;
4871
4872                 return 1;
4873         }
4874
4875         return r;
4876 }
4877
4878 _public_ int sd_event_loop(sd_event *e) {
4879         int r;
4880
4881         assert_return(e, -EINVAL);
4882         assert_return(e = event_resolve(e), -ENOPKG);
4883         assert_return(!event_origin_changed(e), -ECHILD);
4884         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4885
4886
4887         PROTECT_EVENT(e);
4888
4889         while (e->state != SD_EVENT_FINISHED) {
4890                 r = sd_event_run(e, UINT64_MAX);
4891                 if (r < 0)
4892                         return r;
4893         }
4894
4895         return e->exit_code;
4896 }
4897
4898 _public_ int sd_event_get_fd(sd_event *e) {
4899         assert_return(e, -EINVAL);
4900         assert_return(e = event_resolve(e), -ENOPKG);
4901         assert_return(!event_origin_changed(e), -ECHILD);
4902
4903         return e->epoll_fd;
4904 }
4905
4906 _public_ int sd_event_get_state(sd_event *e) {
4907         assert_return(e, -EINVAL);
4908         assert_return(e = event_resolve(e), -ENOPKG);
4909         assert_return(!event_origin_changed(e), -ECHILD);
4910
4911         return e->state;
4912 }
4913
4914 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4915         assert_return(e, -EINVAL);
4916         assert_return(e = event_resolve(e), -ENOPKG);
4917         assert_return(code, -EINVAL);
4918         assert_return(!event_origin_changed(e), -ECHILD);
4919
4920         if (!e->exit_requested)
4921                 return -ENODATA;
4922
4923         *code = e->exit_code;
4924         return 0;
4925 }
4926
4927 _public_ int sd_event_exit(sd_event *e, int code) {
4928         assert_return(e, -EINVAL);
4929         assert_return(e = event_resolve(e), -ENOPKG);
4930         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4931         assert_return(!event_origin_changed(e), -ECHILD);
4932
4933         e->exit_requested = true;
4934         e->exit_code = code;
4935
4936         return 0;
4937 }
4938
4939 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4940         assert_return(e, -EINVAL);
4941         assert_return(e = event_resolve(e), -ENOPKG);
4942         assert_return(usec, -EINVAL);
4943         assert_return(!event_origin_changed(e), -ECHILD);
4944
4945         if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4946                 return -EOPNOTSUPP;
4947
4948         if (!triple_timestamp_is_set(&e->timestamp)) {
4949                 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4950                 *usec = now(clock);
4951                 return 1;
4952         }
4953
4954         *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4955         return 0;
4956 }
4957
4958 _public_ int sd_event_default(sd_event **ret) {
4959         sd_event *e = NULL;
4960         int r;
4961
4962         if (!ret)
4963                 return !!default_event;
4964
4965         if (default_event) {
4966                 *ret = sd_event_ref(default_event);
4967                 return 0;
4968         }
4969
4970         r = sd_event_new(&e);
4971         if (r < 0)
4972                 return r;
4973
4974         e->default_event_ptr = &default_event;
4975         e->tid = gettid();
4976         default_event = e;
4977
4978         *ret = e;
4979         return 1;
4980 }
4981
4982 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
4983         assert_return(e, -EINVAL);
4984         assert_return(e = event_resolve(e), -ENOPKG);
4985         assert_return(tid, -EINVAL);
4986         assert_return(!event_origin_changed(e), -ECHILD);
4987
4988         if (e->tid != 0) {
4989                 *tid = e->tid;
4990                 return 0;
4991         }
4992
4993         return -ENXIO;
4994 }
4995
4996 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
4997         int r;
4998
4999         assert_return(e, -EINVAL);
5000         assert_return(e = event_resolve(e), -ENOPKG);
5001         assert_return(!event_origin_changed(e), -ECHILD);
5002
5003         if (e->watchdog == !!b)
5004                 return e->watchdog;
5005
5006         if (b) {
5007                 r = sd_watchdog_enabled(false, &e->watchdog_period);
5008                 if (r <= 0)
5009                         return r;
5010
5011                 /* Issue first ping immediately */
5012                 sd_notify(false, "WATCHDOG=1");
5013                 e->watchdog_last = now(CLOCK_MONOTONIC);
5014
5015                 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
5016                 if (e->watchdog_fd < 0)
5017                         return -errno;
5018
5019                 r = arm_watchdog(e);
5020                 if (r < 0)
5021                         goto fail;
5022
5023                 struct epoll_event ev = {
5024                         .events = EPOLLIN,
5025                         .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
5026                 };
5027
5028                 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
5029                         r = -errno;
5030                         goto fail;
5031                 }
5032
5033         } else {
5034                 if (e->watchdog_fd >= 0) {
5035                         (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
5036                         e->watchdog_fd = safe_close(e->watchdog_fd);
5037                 }
5038         }
5039
5040         e->watchdog = !!b;
5041         return e->watchdog;
5042
5043 fail:
5044         e->watchdog_fd = safe_close(e->watchdog_fd);
5045         return r;
5046 }
5047
5048 _public_ int sd_event_get_watchdog(sd_event *e) {
5049         assert_return(e, -EINVAL);
5050         assert_return(e = event_resolve(e), -ENOPKG);
5051         assert_return(!event_origin_changed(e), -ECHILD);
5052
5053         return e->watchdog;
5054 }
5055
5056 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5057         assert_return(e, -EINVAL);
5058         assert_return(e = event_resolve(e), -ENOPKG);
5059         assert_return(!event_origin_changed(e), -ECHILD);
5060
5061         *ret = e->iteration;
5062         return 0;
5063 }
5064
5065 _public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5066         assert_return(s, -EINVAL);
5067         assert_return(s->event, -EINVAL);
5068         assert_return(!event_origin_changed(s->event), -ECHILD);
5069
5070         s->destroy_callback = callback;
5071         return 0;
5072 }
5073
5074 _public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5075         assert_return(s, -EINVAL);
5076         assert_return(!event_origin_changed(s->event), -ECHILD);
5077
5078         if (ret)
5079                 *ret = s->destroy_callback;
5080
5081         return !!s->destroy_callback;
5082 }
5083
5084 _public_ int sd_event_source_get_floating(sd_event_source *s) {
5085         assert_return(s, -EINVAL);
5086         assert_return(!event_origin_changed(s->event), -ECHILD);
5087
5088         return s->floating;
5089 }
5090
5091 _public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5092         assert_return(s, -EINVAL);
5093         assert_return(!event_origin_changed(s->event), -ECHILD);
5094
5095         if (s->floating == !!b)
5096                 return 0;
5097
5098         if (!s->event) /* Already disconnected */
5099                 return -ESTALE;
5100
5101         s->floating = b;
5102
5103         if (b) {
5104                 sd_event_source_ref(s);
5105                 sd_event_unref(s->event);
5106         } else {
5107                 sd_event_ref(s->event);
5108                 sd_event_source_unref(s);
5109         }
5110
5111         return 1;
5112 }
5113
5114 _public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5115         assert_return(s, -EINVAL);
5116         assert_return(s->type != SOURCE_EXIT, -EDOM);
5117         assert_return(!event_origin_changed(s->event), -ECHILD);
5118
5119         return s->exit_on_failure;
5120 }
5121
5122 _public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5123         assert_return(s, -EINVAL);
5124         assert_return(s->type != SOURCE_EXIT, -EDOM);
5125         assert_return(!event_origin_changed(s->event), -ECHILD);
5126
5127         if (s->exit_on_failure == !!b)
5128                 return 0;
5129
5130         s->exit_on_failure = b;
5131         return 1;
5132 }
5133
5134 _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5135         int r;
5136
5137         assert_return(s, -EINVAL);
5138         assert_return(!event_origin_changed(s->event), -ECHILD);
5139
5140         /* Turning on ratelimiting on event source types that don't support it, is a loggable offense. Doing
5141          * so is a programming error. */
5142         assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5143
5144         /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5145          * non-ratelimited. */
5146         r = event_source_leave_ratelimit(s, /* run_callback */ false);
5147         if (r < 0)
5148                 return r;
5149
5150         s->rate_limit = (RateLimit) { interval, burst };
5151         return 0;
5152 }
5153
5154 _public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5155         assert_return(s, -EINVAL);
5156         assert_return(!event_origin_changed(s->event), -ECHILD);
5157
5158         s->ratelimit_expire_callback = callback;
5159         return 0;
5160 }
5161
5162 _public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5163         assert_return(s, -EINVAL);
5164         assert_return(!event_origin_changed(s->event), -ECHILD);
5165
5166         /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5167          * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
5168         if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5169                 return -EDOM;
5170
5171         if (!ratelimit_configured(&s->rate_limit))
5172                 return -ENOEXEC;
5173
5174         if (ret_interval)
5175                 *ret_interval = s->rate_limit.interval;
5176         if (ret_burst)
5177                 *ret_burst = s->rate_limit.burst;
5178
5179         return 0;
5180 }
5181
5182 _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5183         assert_return(s, -EINVAL);
5184         assert_return(!event_origin_changed(s->event), -ECHILD);
5185
5186         if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5187                 return false;
5188
5189         if (!ratelimit_configured(&s->rate_limit))
5190                 return false;
5191
5192         return s->ratelimited;
5193 }
5194
5195 _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5196         bool change = false;
5197         int r;
5198
5199         assert_return(e, -EINVAL);
5200
5201         if (b) {
5202                 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5203                  * so. But we also don't want them to pin the event loop itself. Hence we mark them as
5204                  * floating after creation (and undo this before deleting them again). */
5205
5206                 if (!e->sigint_event_source) {
5207                         r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5208                         if (r < 0)
5209                                 return r;
5210
5211                         assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5212                         change = true;
5213                 }
5214
5215                 if (!e->sigterm_event_source) {
5216                         r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5217                         if (r < 0) {
5218                                 if (change) {
5219                                         assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5220                                         e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5221                                 }
5222
5223                                 return r;
5224                         }
5225
5226                         assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5227                         change = true;
5228                 }
5229
5230         } else {
5231                 if (e->sigint_event_source) {
5232                         assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5233                         e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5234                         change = true;
5235                 }
5236
5237                 if (e->sigterm_event_source) {
5238                         assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5239                         e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5240                         change = true;
5241                 }
5242         }
5243
5244         return change;
5245 }
5246
5247 _public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5248         _cleanup_free_ char *b = NULL;
5249         _cleanup_free_ void *w = NULL;
5250
5251         assert_return(s, -EINVAL);
5252         assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5253         assert_return(ty, -EINVAL);
5254         assert_return(!event_origin_changed(s->event), -ECHILD);
5255
5256         if (!STR_IN_SET(ty, "some", "full"))
5257                 return -EINVAL;
5258
5259         if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5260                 return -EBUSY;
5261
5262         char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5263         if (!space)
5264                 return -EINVAL;
5265
5266         size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5267         b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5268         if (!b)
5269                 return -ENOMEM;
5270         if (!STR_IN_SET(b, "some", "full"))
5271                 return -EINVAL;
5272
5273         if (streq(b, ty))
5274                 return 0;
5275
5276         size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5277         w = new(char, nl);
5278         if (!w)
5279                 return -ENOMEM;
5280
5281         memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5282
5283         free_and_replace(s->memory_pressure.write_buffer, w);
5284         s->memory_pressure.write_buffer_size = nl;
5285         s->memory_pressure.locked = false;
5286
5287         return 1;
5288 }
5289
5290 _public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5291         _cleanup_free_ char *b = NULL;
5292         _cleanup_free_ void *w = NULL;
5293
5294         assert_return(s, -EINVAL);
5295         assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5296         assert_return(!event_origin_changed(s->event), -ECHILD);
5297
5298         if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5299                 return -ERANGE;
5300         if (window_usec <= 0 || window_usec >= UINT64_MAX)
5301                 return -ERANGE;
5302         if (threshold_usec > window_usec)
5303                 return -EINVAL;
5304
5305         if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5306                 return -EBUSY;
5307
5308         char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5309         if (!space)
5310                 return -EINVAL;
5311
5312         size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5313         b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5314         if (!b)
5315                 return -ENOMEM;
5316         if (!STR_IN_SET(b, "some", "full"))
5317                 return -EINVAL;
5318
5319         if (asprintf((char**) &w,
5320                      "%s " USEC_FMT " " USEC_FMT "",
5321                      b,
5322                      threshold_usec,
5323                      window_usec) < 0)
5324                 return -EINVAL;
5325
5326         l = strlen(w) + 1;
5327         if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5328                 return 0;
5329
5330         free_and_replace(s->memory_pressure.write_buffer, w);
5331         s->memory_pressure.write_buffer_size = l;
5332         s->memory_pressure.locked = false;
5333
5334         return 1;
5335 }