src/libsystemd/sd-event/sd-event.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <sys/epoll.h>
   4 #include <sys/timerfd.h>
   5 #include <sys/wait.h>
   6
   7 #include "sd-daemon.h"
   8 #include "sd-event.h"
   9 #include "sd-id128.h"
  10 #include "sd-messages.h"
  11
  12 #include "alloc-util.h"
  13 #include "env-util.h"
  14 #include "event-source.h"
  15 #include "fd-util.h"
  16 #include "fs-util.h"
  17 #include "glyph-util.h"
  18 #include "hashmap.h"
  19 #include "hexdecoct.h"
  20 #include "list.h"
  21 #include "logarithm.h"
  22 #include "macro.h"
  23 #include "mallinfo-util.h"
  24 #include "memory-util.h"
  25 #include "missing_magic.h"
  26 #include "missing_syscall.h"
  27 #include "missing_threads.h"
  28 #include "origin-id.h"
  29 #include "path-util.h"
  30 #include "prioq.h"
  31 #include "process-util.h"
  32 #include "psi-util.h"
  33 #include "set.h"
  34 #include "signal-util.h"
  35 #include "socket-util.h"
  36 #include "stat-util.h"
  37 #include "string-table.h"
  38 #include "string-util.h"
  39 #include "strxcpyx.h"
  40 #include "time-util.h"
  41
  42 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
  43
  44 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
  45         /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
  46         return s &&
  47                 s->type == SOURCE_CHILD &&
  48                 s->child.pidfd >= 0 &&
  49                 s->child.options == WEXITED;
  50 }
  51
  52 static bool event_source_is_online(sd_event_source *s) {
  53         assert(s);
  54         return s->enabled != SD_EVENT_OFF && !s->ratelimited;
  55 }
  56
  57 static bool event_source_is_offline(sd_event_source *s) {
  58         assert(s);
  59         return s->enabled == SD_EVENT_OFF || s->ratelimited;
  60 }
  61
  62 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
  63         [SOURCE_IO]                  = "io",
  64         [SOURCE_TIME_REALTIME]       = "realtime",
  65         [SOURCE_TIME_BOOTTIME]       = "boottime",
  66         [SOURCE_TIME_MONOTONIC]      = "monotonic",
  67         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
  68         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
  69         [SOURCE_SIGNAL]              = "signal",
  70         [SOURCE_CHILD]               = "child",
  71         [SOURCE_DEFER]               = "defer",
  72         [SOURCE_POST]                = "post",
  73         [SOURCE_EXIT]                = "exit",
  74         [SOURCE_WATCHDOG]            = "watchdog",
  75         [SOURCE_INOTIFY]             = "inotify",
  76         [SOURCE_MEMORY_PRESSURE]     = "memory-pressure",
  77 };
  78
  79 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
  80
  81 #define EVENT_SOURCE_IS_TIME(t)                 \
  82         IN_SET((t),                             \
  83                SOURCE_TIME_REALTIME,            \
  84                SOURCE_TIME_BOOTTIME,            \
  85                SOURCE_TIME_MONOTONIC,           \
  86                SOURCE_TIME_REALTIME_ALARM,      \
  87                SOURCE_TIME_BOOTTIME_ALARM)
  88
  89 #define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
  90         IN_SET((t),                             \
  91                SOURCE_IO,                       \
  92                SOURCE_TIME_REALTIME,            \
  93                SOURCE_TIME_BOOTTIME,            \
  94                SOURCE_TIME_MONOTONIC,           \
  95                SOURCE_TIME_REALTIME_ALARM,      \
  96                SOURCE_TIME_BOOTTIME_ALARM,      \
  97                SOURCE_SIGNAL,                   \
  98                SOURCE_DEFER,                    \
  99                SOURCE_INOTIFY,                  \
 100                SOURCE_MEMORY_PRESSURE)
 101
 102 /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 103  * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 104  * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
 105 #define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
 106
 107 struct sd_event {
 108         unsigned n_ref;
 109
 110         int epoll_fd;
 111         int watchdog_fd;
 112
 113         Prioq *pending;
 114         Prioq *prepare;
 115
 116         /* timerfd_create() only supports these five clocks so far. We
 117          * can add support for more clocks when the kernel learns to
 118          * deal with them, too. */
 119         struct clock_data realtime;
 120         struct clock_data boottime;
 121         struct clock_data monotonic;
 122         struct clock_data realtime_alarm;
 123         struct clock_data boottime_alarm;
 124
 125         usec_t perturb;
 126
 127         sd_event_source **signal_sources; /* indexed by signal number */
 128         Hashmap *signal_data; /* indexed by priority */
 129
 130         Hashmap *child_sources;
 131         unsigned n_online_child_sources;
 132
 133         Set *post_sources;
 134
 135         Prioq *exit;
 136
 137         Hashmap *inotify_data; /* indexed by priority */
 138
 139         /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
 140         LIST_HEAD(struct inode_data, inode_data_to_close_list);
 141
 142         /* A list of inotify objects that already have events buffered which aren't processed yet */
 143         LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
 144
 145         /* A list of memory pressure event sources that still need their subscription string written */
 146         LIST_HEAD(sd_event_source, memory_pressure_write_list);
 147
 148         uint64_t origin_id;
 149
 150         uint64_t iteration;
 151         triple_timestamp timestamp;
 152         int state;
 153
 154         bool exit_requested:1;
 155         bool need_process_child:1;
 156         bool watchdog:1;
 157         bool profile_delays:1;
 158
 159         int exit_code;
 160
 161         pid_t tid;
 162         sd_event **default_event_ptr;
 163
 164         usec_t watchdog_last, watchdog_period;
 165
 166         unsigned n_sources;
 167
 168         struct epoll_event *event_queue;
 169
 170         LIST_HEAD(sd_event_source, sources);
 171
 172         sd_event_source *sigint_event_source, *sigterm_event_source;
 173
 174         usec_t last_run_usec, last_log_usec;
 175         unsigned delays[sizeof(usec_t) * 8];
 176 };
 177
 178 DEFINE_PRIVATE_ORIGIN_ID_HELPERS(sd_event, event);
 179
 180 static thread_local sd_event *default_event = NULL;
 181
 182 static void source_disconnect(sd_event_source *s);
 183 static void event_gc_inode_data(sd_event *e, struct inode_data *d);
 184
 185 static sd_event *event_resolve(sd_event *e) {
 186         return e == SD_EVENT_DEFAULT ? default_event : e;
 187 }
 188
 189 static int pending_prioq_compare(const void *a, const void *b) {
 190         const sd_event_source *x = a, *y = b;
 191         int r;
 192
 193         assert(x->pending);
 194         assert(y->pending);
 195
 196         /* Enabled ones first */
 197         r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
 198         if (r != 0)
 199                 return r;
 200
 201         /* Non rate-limited ones first. */
 202         r = CMP(!!x->ratelimited, !!y->ratelimited);
 203         if (r != 0)
 204                 return r;
 205
 206         /* Lower priority values first */
 207         r = CMP(x->priority, y->priority);
 208         if (r != 0)
 209                 return r;
 210
 211         /* Older entries first */
 212         return CMP(x->pending_iteration, y->pending_iteration);
 213 }
 214
 215 static int prepare_prioq_compare(const void *a, const void *b) {
 216         const sd_event_source *x = a, *y = b;
 217         int r;
 218
 219         assert(x->prepare);
 220         assert(y->prepare);
 221
 222         /* Enabled ones first */
 223         r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
 224         if (r != 0)
 225                 return r;
 226
 227         /* Non rate-limited ones first. */
 228         r = CMP(!!x->ratelimited, !!y->ratelimited);
 229         if (r != 0)
 230                 return r;
 231
 232         /* Move most recently prepared ones last, so that we can stop
 233          * preparing as soon as we hit one that has already been
 234          * prepared in the current iteration */
 235         r = CMP(x->prepare_iteration, y->prepare_iteration);
 236         if (r != 0)
 237                 return r;
 238
 239         /* Lower priority values first */
 240         return CMP(x->priority, y->priority);
 241 }
 242
 243 static usec_t time_event_source_next(const sd_event_source *s) {
 244         assert(s);
 245
 246         /* We have two kinds of event sources that have elapsation times associated with them: the actual
 247          * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
 248          * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
 249          * looking at here. */
 250
 251         if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
 252                 assert(s->rate_limit.begin != 0);
 253                 assert(s->rate_limit.interval != 0);
 254                 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
 255         }
 256
 257         /* Otherwise this must be a time event source, if not ratelimited */
 258         if (EVENT_SOURCE_IS_TIME(s->type))
 259                 return s->time.next;
 260
 261         return USEC_INFINITY;
 262 }
 263
 264 static usec_t time_event_source_latest(const sd_event_source *s) {
 265         assert(s);
 266
 267         if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
 268                                * same, as we should avoid adding additional inaccuracy on an inaccuracy time
 269                                * window */
 270                 assert(s->rate_limit.begin != 0);
 271                 assert(s->rate_limit.interval != 0);
 272                 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
 273         }
 274
 275         /* Must be a time event source, if not ratelimited */
 276         if (EVENT_SOURCE_IS_TIME(s->type))
 277                 return usec_add(s->time.next, s->time.accuracy);
 278
 279         return USEC_INFINITY;
 280 }
 281
 282 static bool event_source_timer_candidate(const sd_event_source *s) {
 283         assert(s);
 284
 285         /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
 286          * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
 287         return !s->pending || s->ratelimited;
 288 }
 289
 290 static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
 291         const sd_event_source *x = a, *y = b;
 292         int r;
 293
 294         /* Enabled ones first */
 295         r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
 296         if (r != 0)
 297                 return r;
 298
 299         /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
 300         r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
 301         if (r != 0)
 302                 return r;
 303
 304         /* Order by time */
 305         return CMP(time_func(x), time_func(y));
 306 }
 307
 308 static int earliest_time_prioq_compare(const void *a, const void *b) {
 309         return time_prioq_compare(a, b, time_event_source_next);
 310 }
 311
 312 static int latest_time_prioq_compare(const void *a, const void *b) {
 313         return time_prioq_compare(a, b, time_event_source_latest);
 314 }
 315
 316 static int exit_prioq_compare(const void *a, const void *b) {
 317         const sd_event_source *x = a, *y = b;
 318         int r;
 319
 320         assert(x->type == SOURCE_EXIT);
 321         assert(y->type == SOURCE_EXIT);
 322
 323         /* Enabled ones first */
 324         r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
 325         if (r != 0)
 326                 return r;
 327
 328         /* Lower priority values first */
 329         return CMP(x->priority, y->priority);
 330 }
 331
 332 static void free_clock_data(struct clock_data *d) {
 333         assert(d);
 334         assert(d->wakeup == WAKEUP_CLOCK_DATA);
 335
 336         safe_close(d->fd);
 337         prioq_free(d->earliest);
 338         prioq_free(d->latest);
 339 }
 340
 341 static sd_event *event_free(sd_event *e) {
 342         sd_event_source *s;
 343
 344         assert(e);
 345
 346         e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
 347         e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
 348
 349         while ((s = e->sources)) {
 350                 assert(s->floating);
 351                 source_disconnect(s);
 352                 sd_event_source_unref(s);
 353         }
 354
 355         assert(e->n_sources == 0);
 356
 357         if (e->default_event_ptr)
 358                 *(e->default_event_ptr) = NULL;
 359
 360         safe_close(e->epoll_fd);
 361         safe_close(e->watchdog_fd);
 362
 363         free_clock_data(&e->realtime);
 364         free_clock_data(&e->boottime);
 365         free_clock_data(&e->monotonic);
 366         free_clock_data(&e->realtime_alarm);
 367         free_clock_data(&e->boottime_alarm);
 368
 369         prioq_free(e->pending);
 370         prioq_free(e->prepare);
 371         prioq_free(e->exit);
 372
 373         free(e->signal_sources);
 374         hashmap_free(e->signal_data);
 375
 376         hashmap_free(e->inotify_data);
 377
 378         hashmap_free(e->child_sources);
 379         set_free(e->post_sources);
 380
 381         free(e->event_queue);
 382
 383         return mfree(e);
 384 }
 385
 386 _public_ int sd_event_new(sd_event** ret) {
 387         sd_event *e;
 388         int r;
 389
 390         assert_return(ret, -EINVAL);
 391
 392         e = new(sd_event, 1);
 393         if (!e)
 394                 return -ENOMEM;
 395
 396         *e = (sd_event) {
 397                 .n_ref = 1,
 398                 .epoll_fd = -EBADF,
 399                 .watchdog_fd = -EBADF,
 400                 .realtime.wakeup = WAKEUP_CLOCK_DATA,
 401                 .realtime.fd = -EBADF,
 402                 .realtime.next = USEC_INFINITY,
 403                 .boottime.wakeup = WAKEUP_CLOCK_DATA,
 404                 .boottime.fd = -EBADF,
 405                 .boottime.next = USEC_INFINITY,
 406                 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
 407                 .monotonic.fd = -EBADF,
 408                 .monotonic.next = USEC_INFINITY,
 409                 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
 410                 .realtime_alarm.fd = -EBADF,
 411                 .realtime_alarm.next = USEC_INFINITY,
 412                 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
 413                 .boottime_alarm.fd = -EBADF,
 414                 .boottime_alarm.next = USEC_INFINITY,
 415                 .perturb = USEC_INFINITY,
 416                 .origin_id = origin_id_query(),
 417         };
 418
 419         r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
 420         if (r < 0)
 421                 goto fail;
 422
 423         e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
 424         if (e->epoll_fd < 0) {
 425                 r = -errno;
 426                 goto fail;
 427         }
 428
 429         e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
 430
 431         if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
 432                 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
 433                           special_glyph(SPECIAL_GLYPH_ELLIPSIS));
 434                 e->profile_delays = true;
 435         }
 436
 437         *ret = e;
 438         return 0;
 439
 440 fail:
 441         event_free(e);
 442         return r;
 443 }
 444
 445 /* Define manually so we can add the origin check */
 446 _public_ sd_event *sd_event_ref(sd_event *e) {
 447         if (!e)
 448                 return NULL;
 449         if (event_origin_changed(e))
 450                 return NULL;
 451
 452         e->n_ref++;
 453
 454         return e;
 455 }
 456
 457 _public_ sd_event* sd_event_unref(sd_event *e) {
 458         if (!e)
 459                 return NULL;
 460         if (event_origin_changed(e))
 461                 return NULL;
 462
 463         assert(e->n_ref > 0);
 464         if (--e->n_ref > 0)
 465                 return NULL;
 466
 467         return event_free(e);
 468 }
 469
 470 #define PROTECT_EVENT(e)                                                \
 471         _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
 472
 473 _public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
 474         if (s)
 475                 (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
 476         return sd_event_source_unref(s);
 477 }
 478
 479 static void source_io_unregister(sd_event_source *s) {
 480         assert(s);
 481         assert(s->type == SOURCE_IO);
 482
 483         if (event_origin_changed(s->event))
 484                 return;
 485
 486         if (!s->io.registered)
 487                 return;
 488
 489         if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
 490                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
 491                                 strna(s->description), event_source_type_to_string(s->type));
 492
 493         s->io.registered = false;
 494 }
 495
 496 static int source_io_register(
 497                 sd_event_source *s,
 498                 int enabled,
 499                 uint32_t events) {
 500
 501         assert(s);
 502         assert(s->type == SOURCE_IO);
 503         assert(enabled != SD_EVENT_OFF);
 504
 505         struct epoll_event ev = {
 506                 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
 507                 .data.ptr = s,
 508         };
 509
 510         if (epoll_ctl(s->event->epoll_fd,
 511                       s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
 512                       s->io.fd, &ev) < 0)
 513                 return -errno;
 514
 515         s->io.registered = true;
 516
 517         return 0;
 518 }
 519
 520 static void source_child_pidfd_unregister(sd_event_source *s) {
 521         assert(s);
 522         assert(s->type == SOURCE_CHILD);
 523
 524         if (event_origin_changed(s->event))
 525                 return;
 526
 527         if (!s->child.registered)
 528                 return;
 529
 530         if (EVENT_SOURCE_WATCH_PIDFD(s))
 531                 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
 532                         log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
 533                                         strna(s->description), event_source_type_to_string(s->type));
 534
 535         s->child.registered = false;
 536 }
 537
 538 static int source_child_pidfd_register(sd_event_source *s, int enabled) {
 539         assert(s);
 540         assert(s->type == SOURCE_CHILD);
 541         assert(enabled != SD_EVENT_OFF);
 542
 543         if (EVENT_SOURCE_WATCH_PIDFD(s)) {
 544                 struct epoll_event ev = {
 545                         .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
 546                         .data.ptr = s,
 547                 };
 548
 549                 if (epoll_ctl(s->event->epoll_fd,
 550                               s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
 551                               s->child.pidfd, &ev) < 0)
 552                         return -errno;
 553         }
 554
 555         s->child.registered = true;
 556         return 0;
 557 }
 558
 559 static void source_memory_pressure_unregister(sd_event_source *s) {
 560         assert(s);
 561         assert(s->type == SOURCE_MEMORY_PRESSURE);
 562
 563         if (event_origin_changed(s->event))
 564                 return;
 565
 566         if (!s->memory_pressure.registered)
 567                 return;
 568
 569         if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0)
 570                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
 571                                 strna(s->description), event_source_type_to_string(s->type));
 572
 573         s->memory_pressure.registered = false;
 574 }
 575
 576 static int source_memory_pressure_register(sd_event_source *s, int enabled) {
 577         assert(s);
 578         assert(s->type == SOURCE_MEMORY_PRESSURE);
 579         assert(enabled != SD_EVENT_OFF);
 580
 581         struct epoll_event ev = {
 582                 .events = s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT :
 583                           (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)),
 584                 .data.ptr = s,
 585         };
 586
 587         if (epoll_ctl(s->event->epoll_fd,
 588                       s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
 589                       s->memory_pressure.fd, &ev) < 0)
 590                 return -errno;
 591
 592         s->memory_pressure.registered = true;
 593         return 0;
 594 }
 595
 596 static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
 597         assert(s);
 598         assert(s->type == SOURCE_MEMORY_PRESSURE);
 599
 600         if (s->memory_pressure.in_write_list)
 601                 return;
 602
 603         LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
 604         s->memory_pressure.in_write_list = true;
 605 }
 606
 607 static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
 608         assert(s);
 609         assert(s->type == SOURCE_MEMORY_PRESSURE);
 610
 611         if (!s->memory_pressure.in_write_list)
 612                 return;
 613
 614         LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
 615         s->memory_pressure.in_write_list = false;
 616 }
 617
 618 static clockid_t event_source_type_to_clock(EventSourceType t) {
 619
 620         switch (t) {
 621
 622         case SOURCE_TIME_REALTIME:
 623                 return CLOCK_REALTIME;
 624
 625         case SOURCE_TIME_BOOTTIME:
 626                 return CLOCK_BOOTTIME;
 627
 628         case SOURCE_TIME_MONOTONIC:
 629                 return CLOCK_MONOTONIC;
 630
 631         case SOURCE_TIME_REALTIME_ALARM:
 632                 return CLOCK_REALTIME_ALARM;
 633
 634         case SOURCE_TIME_BOOTTIME_ALARM:
 635                 return CLOCK_BOOTTIME_ALARM;
 636
 637         default:
 638                 return (clockid_t) -1;
 639         }
 640 }
 641
 642 static EventSourceType clock_to_event_source_type(clockid_t clock) {
 643
 644         switch (clock) {
 645
 646         case CLOCK_REALTIME:
 647                 return SOURCE_TIME_REALTIME;
 648
 649         case CLOCK_BOOTTIME:
 650                 return SOURCE_TIME_BOOTTIME;
 651
 652         case CLOCK_MONOTONIC:
 653                 return SOURCE_TIME_MONOTONIC;
 654
 655         case CLOCK_REALTIME_ALARM:
 656                 return SOURCE_TIME_REALTIME_ALARM;
 657
 658         case CLOCK_BOOTTIME_ALARM:
 659                 return SOURCE_TIME_BOOTTIME_ALARM;
 660
 661         default:
 662                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
 663         }
 664 }
 665
 666 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
 667         assert(e);
 668
 669         switch (t) {
 670
 671         case SOURCE_TIME_REALTIME:
 672                 return &e->realtime;
 673
 674         case SOURCE_TIME_BOOTTIME:
 675                 return &e->boottime;
 676
 677         case SOURCE_TIME_MONOTONIC:
 678                 return &e->monotonic;
 679
 680         case SOURCE_TIME_REALTIME_ALARM:
 681                 return &e->realtime_alarm;
 682
 683         case SOURCE_TIME_BOOTTIME_ALARM:
 684                 return &e->boottime_alarm;
 685
 686         default:
 687                 return NULL;
 688         }
 689 }
 690
 691 static void event_free_signal_data(sd_event *e, struct signal_data *d) {
 692         assert(e);
 693
 694         if (!d)
 695                 return;
 696
 697         hashmap_remove(e->signal_data, &d->priority);
 698         safe_close(d->fd);
 699         free(d);
 700 }
 701
 702 static int event_make_signal_data(
 703                 sd_event *e,
 704                 int sig,
 705                 struct signal_data **ret) {
 706
 707         struct signal_data *d;
 708         bool added = false;
 709         sigset_t ss_copy;
 710         int64_t priority;
 711         int r;
 712
 713         assert(e);
 714
 715         if (event_origin_changed(e))
 716                 return -ECHILD;
 717
 718         if (e->signal_sources && e->signal_sources[sig])
 719                 priority = e->signal_sources[sig]->priority;
 720         else
 721                 priority = SD_EVENT_PRIORITY_NORMAL;
 722
 723         d = hashmap_get(e->signal_data, &priority);
 724         if (d) {
 725                 if (sigismember(&d->sigset, sig) > 0) {
 726                         if (ret)
 727                                 *ret = d;
 728                         return 0;
 729                 }
 730         } else {
 731                 d = new(struct signal_data, 1);
 732                 if (!d)
 733                         return -ENOMEM;
 734
 735                 *d = (struct signal_data) {
 736                         .wakeup = WAKEUP_SIGNAL_DATA,
 737                         .fd = -EBADF,
 738                         .priority = priority,
 739                 };
 740
 741                 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
 742                 if (r < 0) {
 743                         free(d);
 744                         return r;
 745                 }
 746
 747                 added = true;
 748         }
 749
 750         ss_copy = d->sigset;
 751         assert_se(sigaddset(&ss_copy, sig) >= 0);
 752
 753         r = signalfd(d->fd >= 0 ? d->fd : -1,   /* the first arg must be -1 or a valid signalfd */
 754                      &ss_copy,
 755                      SFD_NONBLOCK|SFD_CLOEXEC);
 756         if (r < 0) {
 757                 r = -errno;
 758                 goto fail;
 759         }
 760
 761         d->sigset = ss_copy;
 762
 763         if (d->fd >= 0) {
 764                 if (ret)
 765                         *ret = d;
 766                 return 0;
 767         }
 768
 769         d->fd = fd_move_above_stdio(r);
 770
 771         struct epoll_event ev = {
 772                 .events = EPOLLIN,
 773                 .data.ptr = d,
 774         };
 775
 776         if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
 777                 r = -errno;
 778                 goto fail;
 779         }
 780
 781         if (ret)
 782                 *ret = d;
 783
 784         return 0;
 785
 786 fail:
 787         if (added)
 788                 event_free_signal_data(e, d);
 789
 790         return r;
 791 }
 792
 793 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
 794         assert(e);
 795         assert(d);
 796
 797         /* Turns off the specified signal in the signal data
 798          * object. If the signal mask of the object becomes empty that
 799          * way removes it. */
 800
 801         if (sigismember(&d->sigset, sig) == 0)
 802                 return;
 803
 804         assert_se(sigdelset(&d->sigset, sig) >= 0);
 805
 806         if (sigisemptyset(&d->sigset)) {
 807                 /* If all the mask is all-zero we can get rid of the structure */
 808                 event_free_signal_data(e, d);
 809                 return;
 810         }
 811
 812         if (event_origin_changed(e))
 813                 return;
 814
 815         assert(d->fd >= 0);
 816
 817         if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
 818                 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
 819 }
 820
 821 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
 822         struct signal_data *d;
 823         static const int64_t zero_priority = 0;
 824
 825         assert(e);
 826
 827         /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
 828          * and possibly drop the signalfd for it. */
 829
 830         if (sig == SIGCHLD &&
 831             e->n_online_child_sources > 0)
 832                 return;
 833
 834         if (e->signal_sources &&
 835             e->signal_sources[sig] &&
 836             event_source_is_online(e->signal_sources[sig]))
 837                 return;
 838
 839         /*
 840          * The specified signal might be enabled in three different queues:
 841          *
 842          * 1) the one that belongs to the priority passed (if it is non-NULL)
 843          * 2) the one that belongs to the priority of the event source of the signal (if there is one)
 844          * 3) the 0 priority (to cover the SIGCHLD case)
 845          *
 846          * Hence, let's remove it from all three here.
 847          */
 848
 849         if (priority) {
 850                 d = hashmap_get(e->signal_data, priority);
 851                 if (d)
 852                         event_unmask_signal_data(e, d, sig);
 853         }
 854
 855         if (e->signal_sources && e->signal_sources[sig]) {
 856                 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
 857                 if (d)
 858                         event_unmask_signal_data(e, d, sig);
 859         }
 860
 861         d = hashmap_get(e->signal_data, &zero_priority);
 862         if (d)
 863                 event_unmask_signal_data(e, d, sig);
 864 }
 865
 866 static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
 867         assert(s);
 868
 869         /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
 870          * they are enabled/disabled or marked pending and such. */
 871
 872         if (s->pending)
 873                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
 874
 875         if (s->prepare)
 876                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
 877 }
 878
 879 static void event_source_time_prioq_reshuffle(sd_event_source *s) {
 880         struct clock_data *d;
 881
 882         assert(s);
 883
 884         /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
 885          * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
 886          * properly again. */
 887
 888         if (s->ratelimited)
 889                 d = &s->event->monotonic;
 890         else if (EVENT_SOURCE_IS_TIME(s->type))
 891                 assert_se(d = event_get_clock_data(s->event, s->type));
 892         else
 893                 return; /* no-op for an event source which is neither a timer nor ratelimited. */
 894
 895         prioq_reshuffle(d->earliest, s, &s->earliest_index);
 896         prioq_reshuffle(d->latest, s, &s->latest_index);
 897         d->needs_rearm = true;
 898 }
 899
 900 static void event_source_time_prioq_remove(
 901                 sd_event_source *s,
 902                 struct clock_data *d) {
 903
 904         assert(s);
 905         assert(d);
 906
 907         prioq_remove(d->earliest, s, &s->earliest_index);
 908         prioq_remove(d->latest, s, &s->latest_index);
 909         s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
 910         d->needs_rearm = true;
 911 }
 912
 913 static void source_disconnect(sd_event_source *s) {
 914         sd_event *event;
 915         int r;
 916
 917         assert(s);
 918
 919         if (!s->event)
 920                 return;
 921
 922         assert(s->event->n_sources > 0);
 923
 924         switch (s->type) {
 925
 926         case SOURCE_IO:
 927                 if (s->io.fd >= 0)
 928                         source_io_unregister(s);
 929
 930                 break;
 931
 932         case SOURCE_TIME_REALTIME:
 933         case SOURCE_TIME_BOOTTIME:
 934         case SOURCE_TIME_MONOTONIC:
 935         case SOURCE_TIME_REALTIME_ALARM:
 936         case SOURCE_TIME_BOOTTIME_ALARM:
 937                 /* Only remove this event source from the time event source here if it is not ratelimited. If
 938                  * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
 939                  * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
 940
 941                 if (!s->ratelimited) {
 942                         struct clock_data *d;
 943                         assert_se(d = event_get_clock_data(s->event, s->type));
 944                         event_source_time_prioq_remove(s, d);
 945                 }
 946
 947                 break;
 948
 949         case SOURCE_SIGNAL:
 950                 if (s->signal.sig > 0) {
 951
 952                         if (s->event->signal_sources)
 953                                 s->event->signal_sources[s->signal.sig] = NULL;
 954
 955                         event_gc_signal_data(s->event, &s->priority, s->signal.sig);
 956
 957                         if (s->signal.unblock) {
 958                                 sigset_t new_ss;
 959
 960                                 if (sigemptyset(&new_ss) < 0)
 961                                         log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
 962                                 else if (sigaddset(&new_ss, s->signal.sig) < 0)
 963                                         log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
 964                                 else {
 965                                         r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
 966                                         if (r != 0)
 967                                                 log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
 968                                 }
 969                         }
 970                 }
 971
 972                 break;
 973
 974         case SOURCE_CHILD:
 975                 if (event_origin_changed(s->event))
 976                         s->child.process_owned = false;
 977
 978                 if (s->child.pid > 0) {
 979                         if (event_source_is_online(s)) {
 980                                 assert(s->event->n_online_child_sources > 0);
 981                                 s->event->n_online_child_sources--;
 982                         }
 983
 984                         (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
 985                 }
 986
 987                 if (EVENT_SOURCE_WATCH_PIDFD(s))
 988                         source_child_pidfd_unregister(s);
 989                 else
 990                         event_gc_signal_data(s->event, &s->priority, SIGCHLD);
 991
 992                 break;
 993
 994         case SOURCE_DEFER:
 995                 /* nothing */
 996                 break;
 997
 998         case SOURCE_POST:
 999                 set_remove(s->event->post_sources, s);
1000                 break;
1001
1002         case SOURCE_EXIT:
1003                 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
1004                 break;
1005
1006         case SOURCE_INOTIFY: {
1007                 struct inode_data *inode_data;
1008
1009                 inode_data = s->inotify.inode_data;
1010                 if (inode_data) {
1011                         struct inotify_data *inotify_data;
1012                         assert_se(inotify_data = inode_data->inotify_data);
1013
1014                         /* Detach this event source from the inode object */
1015                         LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
1016                         s->inotify.inode_data = NULL;
1017
1018                         if (s->pending) {
1019                                 assert(inotify_data->n_pending > 0);
1020                                 inotify_data->n_pending--;
1021                         }
1022
1023                         /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is
1024                          * continued to being watched. That's because inotify doesn't really have an API for that: we
1025                          * can only change watch masks with access to the original inode either by fd or by path. But
1026                          * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
1027                          * continuously and keeping the mount busy which we can't really do. We could reconstruct the
1028                          * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
1029                          * there), but given the need for open_by_handle_at() which is privileged and not universally
1030                          * available this would be quite an incomplete solution. Hence we go the other way, leave the
1031                          * mask set, even if it is not minimized now, and ignore all events we aren't interested in
1032                          * anymore after reception. Yes, this sucks, but … Linux … */
1033
1034                         /* Maybe release the inode data (and its inotify) */
1035                         event_gc_inode_data(s->event, inode_data);
1036                 }
1037
1038                 break;
1039         }
1040
1041         case SOURCE_MEMORY_PRESSURE:
1042                 source_memory_pressure_remove_from_write_list(s);
1043                 source_memory_pressure_unregister(s);
1044                 break;
1045
1046         default:
1047                 assert_not_reached();
1048         }
1049
1050         if (s->pending)
1051                 prioq_remove(s->event->pending, s, &s->pending_index);
1052
1053         if (s->prepare)
1054                 prioq_remove(s->event->prepare, s, &s->prepare_index);
1055
1056         if (s->ratelimited)
1057                 event_source_time_prioq_remove(s, &s->event->monotonic);
1058
1059         event = TAKE_PTR(s->event);
1060         LIST_REMOVE(sources, event->sources, s);
1061         event->n_sources--;
1062
1063         /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
1064          * pidfd associated with this event source, which we'll do only on source_free(). */
1065
1066         if (!s->floating)
1067                 sd_event_unref(event);
1068 }
1069
1070 static sd_event_source* source_free(sd_event_source *s) {
1071         assert(s);
1072
1073         source_disconnect(s);
1074
1075         if (s->type == SOURCE_IO && s->io.owned)
1076                 s->io.fd = safe_close(s->io.fd);
1077
1078         if (s->type == SOURCE_CHILD) {
1079                 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
1080
1081                 if (s->child.process_owned) {
1082
1083                         if (!s->child.exited) {
1084                                 bool sent = false;
1085
1086                                 if (s->child.pidfd >= 0) {
1087                                         if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
1088                                                 if (errno == ESRCH) /* Already dead */
1089                                                         sent = true;
1090                                                 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
1091                                                         log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
1092                                                                         s->child.pid);
1093                                         } else
1094                                                 sent = true;
1095                                 }
1096
1097                                 if (!sent)
1098                                         if (kill(s->child.pid, SIGKILL) < 0)
1099                                                 if (errno != ESRCH) /* Already dead */
1100                                                         log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
1101                                                                         s->child.pid);
1102                         }
1103
1104                         if (!s->child.waited) {
1105                                 siginfo_t si = {};
1106
1107                                 /* Reap the child if we can */
1108                                 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
1109                         }
1110                 }
1111
1112                 if (s->child.pidfd_owned)
1113                         s->child.pidfd = safe_close(s->child.pidfd);
1114         }
1115
1116         if (s->type == SOURCE_MEMORY_PRESSURE) {
1117                 s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
1118                 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
1119         }
1120
1121         if (s->destroy_callback)
1122                 s->destroy_callback(s->userdata);
1123
1124         free(s->description);
1125         return mfree(s);
1126 }
1127 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
1128
1129 static int source_set_pending(sd_event_source *s, bool b) {
1130         int r;
1131
1132         assert(s);
1133         assert(s->type != SOURCE_EXIT);
1134
1135         if (s->pending == b)
1136                 return 0;
1137
1138         s->pending = b;
1139
1140         if (b) {
1141                 s->pending_iteration = s->event->iteration;
1142
1143                 r = prioq_put(s->event->pending, s, &s->pending_index);
1144                 if (r < 0) {
1145                         s->pending = false;
1146                         return r;
1147                 }
1148         } else
1149                 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1150
1151         if (EVENT_SOURCE_IS_TIME(s->type))
1152                 event_source_time_prioq_reshuffle(s);
1153
1154         if (s->type == SOURCE_SIGNAL && !b) {
1155                 struct signal_data *d;
1156
1157                 d = hashmap_get(s->event->signal_data, &s->priority);
1158                 if (d && d->current == s)
1159                         d->current = NULL;
1160         }
1161
1162         if (s->type == SOURCE_INOTIFY) {
1163
1164                 assert(s->inotify.inode_data);
1165                 assert(s->inotify.inode_data->inotify_data);
1166
1167                 if (b)
1168                         s->inotify.inode_data->inotify_data->n_pending++;
1169                 else {
1170                         assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1171                         s->inotify.inode_data->inotify_data->n_pending--;
1172                 }
1173         }
1174
1175         return 1;
1176 }
1177
1178 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1179
1180         /* Let's allocate exactly what we need. Note that the difference of the smallest event source
1181          * structure to the largest is 144 bytes on x86-64 at the time of writing, i.e. more than two cache
1182          * lines. */
1183         static const size_t size_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
1184                 [SOURCE_IO]                  = endoffsetof_field(sd_event_source, io),
1185                 [SOURCE_TIME_REALTIME]       = endoffsetof_field(sd_event_source, time),
1186                 [SOURCE_TIME_BOOTTIME]       = endoffsetof_field(sd_event_source, time),
1187                 [SOURCE_TIME_MONOTONIC]      = endoffsetof_field(sd_event_source, time),
1188                 [SOURCE_TIME_REALTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1189                 [SOURCE_TIME_BOOTTIME_ALARM] = endoffsetof_field(sd_event_source, time),
1190                 [SOURCE_SIGNAL]              = endoffsetof_field(sd_event_source, signal),
1191                 [SOURCE_CHILD]               = endoffsetof_field(sd_event_source, child),
1192                 [SOURCE_DEFER]               = endoffsetof_field(sd_event_source, defer),
1193                 [SOURCE_POST]                = endoffsetof_field(sd_event_source, post),
1194                 [SOURCE_EXIT]                = endoffsetof_field(sd_event_source, exit),
1195                 [SOURCE_INOTIFY]             = endoffsetof_field(sd_event_source, inotify),
1196                 [SOURCE_MEMORY_PRESSURE]     = endoffsetof_field(sd_event_source, memory_pressure),
1197         };
1198
1199         sd_event_source *s;
1200
1201         assert(e);
1202         assert(type >= 0);
1203         assert(type < _SOURCE_EVENT_SOURCE_TYPE_MAX);
1204         assert(size_table[type] > 0);
1205
1206         s = malloc0(size_table[type]);
1207         if (!s)
1208                 return NULL;
1209         /* We use expand_to_usable() here to tell gcc that it should consider this an object of the full
1210          * size, even if we only allocate the initial part we need. */
1211         s = expand_to_usable(s, sizeof(sd_event_source));
1212
1213         /* Note: we cannot use compound initialization here, because sizeof(sd_event_source) is likely larger
1214          * than what we allocated here. */
1215         s->n_ref = 1;
1216         s->event = e;
1217         s->floating = floating;
1218         s->type = type;
1219         s->pending_index = PRIOQ_IDX_NULL;
1220         s->prepare_index = PRIOQ_IDX_NULL;
1221
1222         if (!floating)
1223                 sd_event_ref(e);
1224
1225         LIST_PREPEND(sources, e->sources, s);
1226         e->n_sources++;
1227
1228         return s;
1229 }
1230
1231 static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1232         assert(s);
1233
1234         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1235 }
1236
1237 _public_ int sd_event_add_io(
1238                 sd_event *e,
1239                 sd_event_source **ret,
1240                 int fd,
1241                 uint32_t events,
1242                 sd_event_io_handler_t callback,
1243                 void *userdata) {
1244
1245         _cleanup_(source_freep) sd_event_source *s = NULL;
1246         int r;
1247
1248         assert_return(e, -EINVAL);
1249         assert_return(e = event_resolve(e), -ENOPKG);
1250         assert_return(fd >= 0, -EBADF);
1251         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1252         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1253         assert_return(!event_origin_changed(e), -ECHILD);
1254
1255         if (!callback)
1256                 callback = io_exit_callback;
1257
1258         s = source_new(e, !ret, SOURCE_IO);
1259         if (!s)
1260                 return -ENOMEM;
1261
1262         s->wakeup = WAKEUP_EVENT_SOURCE;
1263         s->io.fd = fd;
1264         s->io.events = events;
1265         s->io.callback = callback;
1266         s->userdata = userdata;
1267         s->enabled = SD_EVENT_ON;
1268
1269         r = source_io_register(s, s->enabled, events);
1270         if (r < 0)
1271                 return r;
1272
1273         if (ret)
1274                 *ret = s;
1275         TAKE_PTR(s);
1276
1277         return 0;
1278 }
1279
1280 static void initialize_perturb(sd_event *e) {
1281         sd_id128_t id = {};
1282
1283         /* When we sleep for longer, we try to realign the wakeup to the same time within each
1284          * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
1285          * wakeup. However, let's take some system-specific randomness for this value, so that in a network
1286          * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
1287          * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */
1288
1289         if (_likely_(e->perturb != USEC_INFINITY))
1290                 return;
1291
1292         if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
1293                 e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
1294         else
1295                 e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
1296 }
1297
1298 static int event_setup_timer_fd(
1299                 sd_event *e,
1300                 struct clock_data *d,
1301                 clockid_t clock) {
1302
1303         assert(e);
1304         assert(d);
1305
1306         if (_likely_(d->fd >= 0))
1307                 return 0;
1308
1309         _cleanup_close_ int fd = -EBADF;
1310
1311         fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1312         if (fd < 0)
1313                 return -errno;
1314
1315         fd = fd_move_above_stdio(fd);
1316
1317         struct epoll_event ev = {
1318                 .events = EPOLLIN,
1319                 .data.ptr = d,
1320         };
1321
1322         if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1323                 return -errno;
1324
1325         d->fd = TAKE_FD(fd);
1326         return 0;
1327 }
1328
1329 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1330         assert(s);
1331
1332         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1333 }
1334
1335 static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1336         int r;
1337
1338         assert(d);
1339
1340         if (d->fd < 0) {
1341                 r = event_setup_timer_fd(e, d, clock);
1342                 if (r < 0)
1343                         return r;
1344         }
1345
1346         r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1347         if (r < 0)
1348                 return r;
1349
1350         r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1351         if (r < 0)
1352                 return r;
1353
1354         return 0;
1355 }
1356
1357 static int event_source_time_prioq_put(
1358                 sd_event_source *s,
1359                 struct clock_data *d) {
1360
1361         int r;
1362
1363         assert(s);
1364         assert(d);
1365         assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1366
1367         r = prioq_put(d->earliest, s, &s->earliest_index);
1368         if (r < 0)
1369                 return r;
1370
1371         r = prioq_put(d->latest, s, &s->latest_index);
1372         if (r < 0) {
1373                 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1374                 s->earliest_index = PRIOQ_IDX_NULL;
1375                 return r;
1376         }
1377
1378         d->needs_rearm = true;
1379         return 0;
1380 }
1381
1382 _public_ int sd_event_add_time(
1383                 sd_event *e,
1384                 sd_event_source **ret,
1385                 clockid_t clock,
1386                 uint64_t usec,
1387                 uint64_t accuracy,
1388                 sd_event_time_handler_t callback,
1389                 void *userdata) {
1390
1391         EventSourceType type;
1392         _cleanup_(source_freep) sd_event_source *s = NULL;
1393         struct clock_data *d;
1394         int r;
1395
1396         assert_return(e, -EINVAL);
1397         assert_return(e = event_resolve(e), -ENOPKG);
1398         assert_return(accuracy != UINT64_MAX, -EINVAL);
1399         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1400         assert_return(!event_origin_changed(e), -ECHILD);
1401
1402         if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1403                 return -EOPNOTSUPP;
1404
1405         type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1406         if (type < 0)
1407                 return -EOPNOTSUPP;
1408
1409         if (!callback)
1410                 callback = time_exit_callback;
1411
1412         assert_se(d = event_get_clock_data(e, type));
1413
1414         r = setup_clock_data(e, d, clock);
1415         if (r < 0)
1416                 return r;
1417
1418         s = source_new(e, !ret, type);
1419         if (!s)
1420                 return -ENOMEM;
1421
1422         s->time.next = usec;
1423         s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1424         s->time.callback = callback;
1425         s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1426         s->userdata = userdata;
1427         s->enabled = SD_EVENT_ONESHOT;
1428
1429         r = event_source_time_prioq_put(s, d);
1430         if (r < 0)
1431                 return r;
1432
1433         if (ret)
1434                 *ret = s;
1435         TAKE_PTR(s);
1436
1437         return 0;
1438 }
1439
1440 _public_ int sd_event_add_time_relative(
1441                 sd_event *e,
1442                 sd_event_source **ret,
1443                 clockid_t clock,
1444                 uint64_t usec,
1445                 uint64_t accuracy,
1446                 sd_event_time_handler_t callback,
1447                 void *userdata) {
1448
1449         usec_t t;
1450         int r;
1451
1452         /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1453          * checks for overflow. */
1454
1455         r = sd_event_now(e, clock, &t);
1456         if (r < 0)
1457                 return r;
1458
1459         if (usec >= USEC_INFINITY - t)
1460                 return -EOVERFLOW;
1461
1462         return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1463 }
1464
1465 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1466         assert(s);
1467
1468         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1469 }
1470
1471 _public_ int sd_event_add_signal(
1472                 sd_event *e,
1473                 sd_event_source **ret,
1474                 int sig,
1475                 sd_event_signal_handler_t callback,
1476                 void *userdata) {
1477
1478         _cleanup_(source_freep) sd_event_source *s = NULL;
1479         struct signal_data *d;
1480         sigset_t new_ss;
1481         bool block_it;
1482         int r;
1483
1484         assert_return(e, -EINVAL);
1485         assert_return(e = event_resolve(e), -ENOPKG);
1486         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1487         assert_return(!event_origin_changed(e), -ECHILD);
1488
1489         /* Let's make sure our special flag stays outside of the valid signal range */
1490         assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);
1491
1492         if (sig & SD_EVENT_SIGNAL_PROCMASK) {
1493                 sig &= ~SD_EVENT_SIGNAL_PROCMASK;
1494                 assert_return(SIGNAL_VALID(sig), -EINVAL);
1495
1496                 block_it = true;
1497         } else {
1498                 assert_return(SIGNAL_VALID(sig), -EINVAL);
1499
1500                 r = signal_is_blocked(sig);
1501                 if (r < 0)
1502                         return r;
1503                 if (r == 0)
1504                         return -EBUSY;
1505
1506                 block_it = false;
1507         }
1508
1509         if (!callback)
1510                 callback = signal_exit_callback;
1511
1512         if (!e->signal_sources) {
1513                 e->signal_sources = new0(sd_event_source*, _NSIG);
1514                 if (!e->signal_sources)
1515                         return -ENOMEM;
1516         } else if (e->signal_sources[sig])
1517                 return -EBUSY;
1518
1519         s = source_new(e, !ret, SOURCE_SIGNAL);
1520         if (!s)
1521                 return -ENOMEM;
1522
1523         s->signal.sig = sig;
1524         s->signal.callback = callback;
1525         s->userdata = userdata;
1526         s->enabled = SD_EVENT_ON;
1527
1528         e->signal_sources[sig] = s;
1529
1530         if (block_it) {
1531                 sigset_t old_ss;
1532
1533                 if (sigemptyset(&new_ss) < 0)
1534                         return -errno;
1535
1536                 if (sigaddset(&new_ss, sig) < 0)
1537                         return -errno;
1538
1539                 r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
1540                 if (r != 0)
1541                         return -r;
1542
1543                 r = sigismember(&old_ss, sig);
1544                 if (r < 0)
1545                         return -errno;
1546
1547                 s->signal.unblock = !r;
1548         } else
1549                 s->signal.unblock = false;
1550
1551         r = event_make_signal_data(e, sig, &d);
1552         if (r < 0) {
1553                 if (s->signal.unblock)
1554                         (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
1555
1556                 return r;
1557         }
1558
1559         /* Use the signal name as description for the event source by default */
1560         (void) sd_event_source_set_description(s, signal_to_string(sig));
1561
1562         if (ret)
1563                 *ret = s;
1564         TAKE_PTR(s);
1565
1566         return 0;
1567 }
1568
1569 static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1570         assert(s);
1571
1572         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1573 }
1574
1575 static bool shall_use_pidfd(void) {
1576         /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
1577         return secure_getenv_bool("SYSTEMD_PIDFD") != 0;
1578 }
1579
1580 _public_ int sd_event_add_child(
1581                 sd_event *e,
1582                 sd_event_source **ret,
1583                 pid_t pid,
1584                 int options,
1585                 sd_event_child_handler_t callback,
1586                 void *userdata) {
1587
1588         _cleanup_(source_freep) sd_event_source *s = NULL;
1589         int r;
1590
1591         assert_return(e, -EINVAL);
1592         assert_return(e = event_resolve(e), -ENOPKG);
1593         assert_return(pid > 1, -EINVAL);
1594         assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1595         assert_return(options != 0, -EINVAL);
1596         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1597         assert_return(!event_origin_changed(e), -ECHILD);
1598
1599         if (!callback)
1600                 callback = child_exit_callback;
1601
1602         if (e->n_online_child_sources == 0) {
1603                 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1604                  * for compatibility with pre-pidfd and because we don't want the reap the child processes
1605                  * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1606                  * take effect.
1607                  *
1608                  * (As an optimization we only do this check on the first child event source created.) */
1609                 r = signal_is_blocked(SIGCHLD);
1610                 if (r < 0)
1611                         return r;
1612                 if (r == 0)
1613                         return -EBUSY;
1614         }
1615
1616         r = hashmap_ensure_allocated(&e->child_sources, NULL);
1617         if (r < 0)
1618                 return r;
1619
1620         if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1621                 return -EBUSY;
1622
1623         s = source_new(e, !ret, SOURCE_CHILD);
1624         if (!s)
1625                 return -ENOMEM;
1626
1627         s->wakeup = WAKEUP_EVENT_SOURCE;
1628         s->child.options = options;
1629         s->child.callback = callback;
1630         s->userdata = userdata;
1631         s->enabled = SD_EVENT_ONESHOT;
1632
1633         /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
1634          * pin the PID, and make regular waitid() handling race-free. */
1635
1636         if (shall_use_pidfd()) {
1637                 s->child.pidfd = pidfd_open(pid, 0);
1638                 if (s->child.pidfd < 0) {
1639                         /* Propagate errors unless the syscall is not supported or blocked */
1640                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
1641                                 return -errno;
1642                 } else
1643                         s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
1644         } else
1645                 s->child.pidfd = -EBADF;
1646
1647         if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1648                 /* We have a pidfd and we only want to watch for exit */
1649                 r = source_child_pidfd_register(s, s->enabled);
1650                 if (r < 0)
1651                         return r;
1652
1653         } else {
1654                 /* We have no pidfd or we shall wait for some other event than WEXITED */
1655                 r = event_make_signal_data(e, SIGCHLD, NULL);
1656                 if (r < 0)
1657                         return r;
1658
1659                 e->need_process_child = true;
1660         }
1661
1662         r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1663         if (r < 0)
1664                 return r;
1665
1666         /* These must be done after everything succeeds. */
1667         s->child.pid = pid;
1668         e->n_online_child_sources++;
1669
1670         if (ret)
1671                 *ret = s;
1672         TAKE_PTR(s);
1673         return 0;
1674 }
1675
1676 _public_ int sd_event_add_child_pidfd(
1677                 sd_event *e,
1678                 sd_event_source **ret,
1679                 int pidfd,
1680                 int options,
1681                 sd_event_child_handler_t callback,
1682                 void *userdata) {
1683
1684
1685         _cleanup_(source_freep) sd_event_source *s = NULL;
1686         pid_t pid;
1687         int r;
1688
1689         assert_return(e, -EINVAL);
1690         assert_return(e = event_resolve(e), -ENOPKG);
1691         assert_return(pidfd >= 0, -EBADF);
1692         assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1693         assert_return(options != 0, -EINVAL);
1694         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1695         assert_return(!event_origin_changed(e), -ECHILD);
1696
1697         if (!callback)
1698                 callback = child_exit_callback;
1699
1700         if (e->n_online_child_sources == 0) {
1701                 r = signal_is_blocked(SIGCHLD);
1702                 if (r < 0)
1703                         return r;
1704                 if (r == 0)
1705                         return -EBUSY;
1706         }
1707
1708         r = hashmap_ensure_allocated(&e->child_sources, NULL);
1709         if (r < 0)
1710                 return r;
1711
1712         r = pidfd_get_pid(pidfd, &pid);
1713         if (r < 0)
1714                 return r;
1715
1716         if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1717                 return -EBUSY;
1718
1719         s = source_new(e, !ret, SOURCE_CHILD);
1720         if (!s)
1721                 return -ENOMEM;
1722
1723         s->wakeup = WAKEUP_EVENT_SOURCE;
1724         s->child.pidfd = pidfd;
1725         s->child.pid = pid;
1726         s->child.options = options;
1727         s->child.callback = callback;
1728         s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1729         s->userdata = userdata;
1730         s->enabled = SD_EVENT_ONESHOT;
1731
1732         r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1733         if (r < 0)
1734                 return r;
1735
1736         if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1737                 /* We only want to watch for WEXITED */
1738                 r = source_child_pidfd_register(s, s->enabled);
1739                 if (r < 0)
1740                         return r;
1741         } else {
1742                 /* We shall wait for some other event than WEXITED */
1743                 r = event_make_signal_data(e, SIGCHLD, NULL);
1744                 if (r < 0)
1745                         return r;
1746
1747                 e->need_process_child = true;
1748         }
1749
1750         e->n_online_child_sources++;
1751
1752         if (ret)
1753                 *ret = s;
1754         TAKE_PTR(s);
1755         return 0;
1756 }
1757
1758 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1759         assert(s);
1760
1761         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1762 }
1763
1764 _public_ int sd_event_add_defer(
1765                 sd_event *e,
1766                 sd_event_source **ret,
1767                 sd_event_handler_t callback,
1768                 void *userdata) {
1769
1770         _cleanup_(source_freep) sd_event_source *s = NULL;
1771         int r;
1772
1773         assert_return(e, -EINVAL);
1774         assert_return(e = event_resolve(e), -ENOPKG);
1775         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1776         assert_return(!event_origin_changed(e), -ECHILD);
1777
1778         if (!callback)
1779                 callback = generic_exit_callback;
1780
1781         s = source_new(e, !ret, SOURCE_DEFER);
1782         if (!s)
1783                 return -ENOMEM;
1784
1785         s->defer.callback = callback;
1786         s->userdata = userdata;
1787         s->enabled = SD_EVENT_ONESHOT;
1788
1789         r = source_set_pending(s, true);
1790         if (r < 0)
1791                 return r;
1792
1793         if (ret)
1794                 *ret = s;
1795         TAKE_PTR(s);
1796
1797         return 0;
1798 }
1799
1800 _public_ int sd_event_add_post(
1801                 sd_event *e,
1802                 sd_event_source **ret,
1803                 sd_event_handler_t callback,
1804                 void *userdata) {
1805
1806         _cleanup_(source_freep) sd_event_source *s = NULL;
1807         int r;
1808
1809         assert_return(e, -EINVAL);
1810         assert_return(e = event_resolve(e), -ENOPKG);
1811         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1812         assert_return(!event_origin_changed(e), -ECHILD);
1813
1814         if (!callback)
1815                 callback = generic_exit_callback;
1816
1817         s = source_new(e, !ret, SOURCE_POST);
1818         if (!s)
1819                 return -ENOMEM;
1820
1821         s->post.callback = callback;
1822         s->userdata = userdata;
1823         s->enabled = SD_EVENT_ON;
1824
1825         r = set_ensure_put(&e->post_sources, NULL, s);
1826         if (r < 0)
1827                 return r;
1828         assert(r > 0);
1829
1830         if (ret)
1831                 *ret = s;
1832         TAKE_PTR(s);
1833
1834         return 0;
1835 }
1836
1837 _public_ int sd_event_add_exit(
1838                 sd_event *e,
1839                 sd_event_source **ret,
1840                 sd_event_handler_t callback,
1841                 void *userdata) {
1842
1843         _cleanup_(source_freep) sd_event_source *s = NULL;
1844         int r;
1845
1846         assert_return(e, -EINVAL);
1847         assert_return(e = event_resolve(e), -ENOPKG);
1848         assert_return(callback, -EINVAL);
1849         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1850         assert_return(!event_origin_changed(e), -ECHILD);
1851
1852         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1853         if (r < 0)
1854                 return r;
1855
1856         s = source_new(e, !ret, SOURCE_EXIT);
1857         if (!s)
1858                 return -ENOMEM;
1859
1860         s->exit.callback = callback;
1861         s->userdata = userdata;
1862         s->exit.prioq_index = PRIOQ_IDX_NULL;
1863         s->enabled = SD_EVENT_ONESHOT;
1864
1865         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1866         if (r < 0)
1867                 return r;
1868
1869         if (ret)
1870                 *ret = s;
1871         TAKE_PTR(s);
1872
1873         return 0;
1874 }
1875
1876 _public_ int sd_event_trim_memory(void) {
1877         int r;
1878
1879         /* A default implementation of a memory pressure callback. Simply releases our own allocation caches
1880          * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
1881          * NULL callback parameter. */
1882
1883         log_debug("Memory pressure event, trimming malloc() memory.");
1884
1885 #if HAVE_GENERIC_MALLINFO
1886         generic_mallinfo before_mallinfo = generic_mallinfo_get();
1887 #endif
1888
1889         usec_t before_timestamp = now(CLOCK_MONOTONIC);
1890         hashmap_trim_pools();
1891         r = malloc_trim(0);
1892         usec_t after_timestamp = now(CLOCK_MONOTONIC);
1893
1894         if (r > 0)
1895                 log_debug("Successfully trimmed some memory.");
1896         else
1897                 log_debug("Couldn't trim any memory.");
1898
1899         usec_t period = after_timestamp - before_timestamp;
1900
1901 #if HAVE_GENERIC_MALLINFO
1902         generic_mallinfo after_mallinfo = generic_mallinfo_get();
1903         size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
1904                 LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
1905         log_struct(LOG_DEBUG,
1906                    LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
1907                                FORMAT_TIMESPAN(period, 0),
1908                                FORMAT_BYTES(l)),
1909                    "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1910                    "TRIMMED_BYTES=%zu", l,
1911                    "TRIMMED_USEC=" USEC_FMT, period);
1912 #else
1913         log_struct(LOG_DEBUG,
1914                    LOG_MESSAGE("Memory trimming took %s.",
1915                                FORMAT_TIMESPAN(period, 0)),
1916                    "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
1917                    "TRIMMED_USEC=" USEC_FMT, period);
1918 #endif
1919
1920         return 0;
1921 }
1922
1923 static int memory_pressure_callback(sd_event_source *s, void *userdata) {
1924         assert(s);
1925
1926         sd_event_trim_memory();
1927         return 0;
1928 }
1929
1930 _public_ int sd_event_add_memory_pressure(
1931                 sd_event *e,
1932                 sd_event_source **ret,
1933                 sd_event_handler_t callback,
1934                 void *userdata) {
1935
1936         _cleanup_free_ char *w = NULL;
1937         _cleanup_(source_freep) sd_event_source *s = NULL;
1938         _cleanup_close_ int path_fd = -EBADF, fd = -EBADF;
1939         _cleanup_free_ void *write_buffer = NULL;
1940         const char *watch, *watch_fallback = NULL, *env;
1941         size_t write_buffer_size = 0;
1942         struct stat st;
1943         uint32_t events;
1944         bool locked;
1945         int r;
1946
1947         assert_return(e, -EINVAL);
1948         assert_return(e = event_resolve(e), -ENOPKG);
1949         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1950         assert_return(!event_origin_changed(e), -ECHILD);
1951
1952         if (!callback)
1953                 callback = memory_pressure_callback;
1954
1955         s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
1956         if (!s)
1957                 return -ENOMEM;
1958
1959         s->wakeup = WAKEUP_EVENT_SOURCE;
1960         s->memory_pressure.callback = callback;
1961         s->userdata = userdata;
1962         s->enabled = SD_EVENT_ON;
1963         s->memory_pressure.fd = -EBADF;
1964
1965         env = secure_getenv("MEMORY_PRESSURE_WATCH");
1966         if (env) {
1967                 if (isempty(env) || path_equal(env, "/dev/null"))
1968                         return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
1969                                                "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
1970
1971                 if (!path_is_absolute(env) || !path_is_normalized(env))
1972                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1973                                                "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
1974
1975                 watch = env;
1976
1977                 env = secure_getenv("MEMORY_PRESSURE_WRITE");
1978                 if (env) {
1979                         r = unbase64mem(env, &write_buffer, &write_buffer_size);
1980                         if (r < 0)
1981                                 return r;
1982                 }
1983
1984                 locked = true;
1985         } else {
1986
1987                 r = is_pressure_supported();
1988                 if (r < 0)
1989                         return r;
1990                 if (r == 0)
1991                         return -EOPNOTSUPP;
1992
1993                 /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
1994                  * the system wide pressure if for some reason we cannot (which could be: memory controller
1995                  * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
1996                  * only use the system-wide logic. */
1997                 r = cg_all_unified();
1998                 if (r < 0)
1999                         return r;
2000                 if (r == 0)
2001                         watch = "/proc/pressure/memory";
2002                 else {
2003                         _cleanup_free_ char *cg = NULL;
2004
2005                         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
2006                         if (r < 0)
2007                                 return r;
2008
2009                         w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
2010                         if (!w)
2011                                 return -ENOMEM;
2012
2013                         watch = w;
2014                         watch_fallback = "/proc/pressure/memory";
2015                 }
2016
2017                 /* Android uses three levels in its userspace low memory killer logic:
2018                  *     some  70000 1000000
2019                  *     some 100000 1000000
2020                  *     full  70000 1000000
2021                  *
2022                  * GNOME's low memory monitor uses:
2023                  *     some  70000 1000000
2024                  *     some 100000 1000000
2025                  *     full 100000 1000000
2026                  *
2027                  * We'll default to the middle level that both agree on. Except we do it on a 2s window
2028                  * (i.e. 200ms per 2s, rather than 100ms per 1s), because that's the window duration the
2029                  * kernel will allow us to do unprivileged, also in the future. */
2030                 if (asprintf((char**) &write_buffer,
2031                              "%s " USEC_FMT " " USEC_FMT,
2032                              MEMORY_PRESSURE_DEFAULT_TYPE,
2033                              MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
2034                              MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2035                         return -ENOMEM;
2036
2037                 write_buffer_size = strlen(write_buffer) + 1;
2038                 locked = false;
2039         }
2040
2041         path_fd = open(watch, O_PATH|O_CLOEXEC);
2042         if (path_fd < 0) {
2043                 if (errno != ENOENT)
2044                         return -errno;
2045
2046                 /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
2047                  * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
2048                  * the PSI service apparently is not supported) */
2049                 if (!watch_fallback)
2050                         return locked ? -ENOENT : -EOPNOTSUPP;
2051
2052                 path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
2053                 if (path_fd < 0) {
2054                         if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
2055                                 return -EOPNOTSUPP;
2056                         return -errno;
2057                 }
2058         }
2059
2060         if (fstat(path_fd, &st) < 0)
2061                 return -errno;
2062
2063         if (S_ISSOCK(st.st_mode)) {
2064                 fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
2065                 if (fd < 0)
2066                         return -errno;
2067
2068                 r = connect_unix_path(fd, path_fd, NULL);
2069                 if (r < 0)
2070                         return r;
2071
2072                 events = EPOLLIN;
2073
2074         } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
2075                 fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
2076                 if (fd < 0)
2077                         return fd;
2078
2079                 if (S_ISREG(st.st_mode)) {
2080                         struct statfs sfs;
2081
2082                         /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
2083
2084                         if (fstatfs(fd, &sfs) < 0)
2085                                 return -errno;
2086
2087                         if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
2088                             !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
2089                                 return -ENOTTY;
2090
2091                         events = EPOLLPRI;
2092                 } else
2093                         /* For fifos and char devices just watch for EPOLLIN */
2094                         events = EPOLLIN;
2095
2096         } else if (S_ISDIR(st.st_mode))
2097                 return -EISDIR;
2098         else
2099                 return -EBADF;
2100
2101         s->memory_pressure.fd = TAKE_FD(fd);
2102         s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
2103         s->memory_pressure.write_buffer_size = write_buffer_size;
2104         s->memory_pressure.events = events;
2105         s->memory_pressure.locked = locked;
2106
2107         /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
2108          * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
2109          * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
2110          * event sources on which writes must be executed before the first event loop iteration is
2111          * executed. (We could also write the data here, right away, but we want to give the caller the
2112          * freedom to call sd_event_source_set_memory_pressure_type() and
2113          * sd_event_source_set_memory_pressure_rate() before we write it. */
2114
2115         if (s->memory_pressure.write_buffer_size > 0)
2116                 source_memory_pressure_add_to_write_list(s);
2117         else {
2118                 r = source_memory_pressure_register(s, s->enabled);
2119                 if (r < 0)
2120                         return r;
2121         }
2122
2123         if (ret)
2124                 *ret = s;
2125         TAKE_PTR(s);
2126
2127         return 0;
2128 }
2129
2130 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
2131         assert(e);
2132
2133         if (!d)
2134                 return;
2135
2136         assert(hashmap_isempty(d->inodes));
2137         assert(hashmap_isempty(d->wd));
2138
2139         if (d->buffer_filled > 0)
2140                 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
2141
2142         hashmap_free(d->inodes);
2143         hashmap_free(d->wd);
2144
2145         assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
2146
2147         if (d->fd >= 0) {
2148                 if (!event_origin_changed(e) &&
2149                     epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
2150                         log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
2151
2152                 safe_close(d->fd);
2153         }
2154         free(d);
2155 }
2156
2157 static int event_make_inotify_data(
2158                 sd_event *e,
2159                 int64_t priority,
2160                 struct inotify_data **ret) {
2161
2162         _cleanup_close_ int fd = -EBADF;
2163         struct inotify_data *d;
2164         int r;
2165
2166         assert(e);
2167
2168         d = hashmap_get(e->inotify_data, &priority);
2169         if (d) {
2170                 if (ret)
2171                         *ret = d;
2172                 return 0;
2173         }
2174
2175         fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC);
2176         if (fd < 0)
2177                 return -errno;
2178
2179         fd = fd_move_above_stdio(fd);
2180
2181         d = new(struct inotify_data, 1);
2182         if (!d)
2183                 return -ENOMEM;
2184
2185         *d = (struct inotify_data) {
2186                 .wakeup = WAKEUP_INOTIFY_DATA,
2187                 .fd = TAKE_FD(fd),
2188                 .priority = priority,
2189         };
2190
2191         r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
2192         if (r < 0) {
2193                 d->fd = safe_close(d->fd);
2194                 free(d);
2195                 return r;
2196         }
2197
2198         struct epoll_event ev = {
2199                 .events = EPOLLIN,
2200                 .data.ptr = d,
2201         };
2202
2203         if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
2204                 r = -errno;
2205                 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
2206                                             * remove the fd from the epoll first, which we don't want as we couldn't
2207                                             * add it in the first place. */
2208                 event_free_inotify_data(e, d);
2209                 return r;
2210         }
2211
2212         if (ret)
2213                 *ret = d;
2214
2215         return 1;
2216 }
2217
2218 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
2219         int r;
2220
2221         assert(x);
2222         assert(y);
2223
2224         r = CMP(x->dev, y->dev);
2225         if (r != 0)
2226                 return r;
2227
2228         return CMP(x->ino, y->ino);
2229 }
2230
2231 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
2232         assert(d);
2233
2234         siphash24_compress_typesafe(d->dev, state);
2235         siphash24_compress_typesafe(d->ino, state);
2236 }
2237
2238 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
2239
2240 static void event_free_inode_data(
2241                 sd_event *e,
2242                 struct inode_data *d) {
2243
2244         assert(e);
2245
2246         if (!d)
2247                 return;
2248
2249         assert(!d->event_sources);
2250
2251         if (d->fd >= 0) {
2252                 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
2253                 safe_close(d->fd);
2254         }
2255
2256         if (d->inotify_data) {
2257
2258                 if (d->wd >= 0) {
2259                         if (d->inotify_data->fd >= 0 && !event_origin_changed(e)) {
2260                                 /* So here's a problem. At the time this runs the watch descriptor might already be
2261                                  * invalidated, because an IN_IGNORED event might be queued right the moment we enter
2262                                  * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
2263                                  * likely case to happen. */
2264
2265                                 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
2266                                         log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
2267                         }
2268
2269                         assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
2270                 }
2271
2272                 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
2273         }
2274
2275         free(d->path);
2276         free(d);
2277 }
2278
2279 static void event_gc_inotify_data(
2280                 sd_event *e,
2281                 struct inotify_data *d) {
2282
2283         assert(e);
2284
2285         /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
2286          * any inode with it anymore, which in turn happens if no event source of this priority is interested
2287          * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
2288          * (under the expectation that the GC is called again once the counter is decremented). */
2289
2290         if (!d)
2291                 return;
2292
2293         if (!hashmap_isempty(d->inodes))
2294                 return;
2295
2296         if (d->n_busy > 0)
2297                 return;
2298
2299         event_free_inotify_data(e, d);
2300 }
2301
2302 static void event_gc_inode_data(
2303                 sd_event *e,
2304                 struct inode_data *d) {
2305
2306         struct inotify_data *inotify_data;
2307
2308         assert(e);
2309
2310         if (!d)
2311                 return;
2312
2313         if (d->event_sources)
2314                 return;
2315
2316         inotify_data = d->inotify_data;
2317         event_free_inode_data(e, d);
2318
2319         event_gc_inotify_data(e, inotify_data);
2320 }
2321
2322 static int event_make_inode_data(
2323                 sd_event *e,
2324                 struct inotify_data *inotify_data,
2325                 dev_t dev,
2326                 ino_t ino,
2327                 struct inode_data **ret) {
2328
2329         struct inode_data *d, key;
2330         int r;
2331
2332         assert(e);
2333         assert(inotify_data);
2334
2335         key = (struct inode_data) {
2336                 .ino = ino,
2337                 .dev = dev,
2338         };
2339
2340         d = hashmap_get(inotify_data->inodes, &key);
2341         if (d) {
2342                 if (ret)
2343                         *ret = d;
2344
2345                 return 0;
2346         }
2347
2348         r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
2349         if (r < 0)
2350                 return r;
2351
2352         d = new(struct inode_data, 1);
2353         if (!d)
2354                 return -ENOMEM;
2355
2356         *d = (struct inode_data) {
2357                 .dev = dev,
2358                 .ino = ino,
2359                 .wd = -1,
2360                 .fd = -EBADF,
2361                 .inotify_data = inotify_data,
2362         };
2363
2364         r = hashmap_put(inotify_data->inodes, d, d);
2365         if (r < 0) {
2366                 free(d);
2367                 return r;
2368         }
2369
2370         if (ret)
2371                 *ret = d;
2372
2373         return 1;
2374 }
2375
2376 static uint32_t inode_data_determine_mask(struct inode_data *d) {
2377         bool excl_unlink = true;
2378         uint32_t combined = 0;
2379
2380         assert(d);
2381
2382         /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
2383          * the IN_EXCL_UNLINK flag is ANDed instead.
2384          *
2385          * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
2386          * because we cannot change the mask anymore after the event source was created once, since the kernel has no
2387          * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
2388          * events we don't care for client-side. */
2389
2390         LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
2391
2392                 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
2393                         excl_unlink = false;
2394
2395                 combined |= s->inotify.mask;
2396         }
2397
2398         return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
2399 }
2400
2401 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
2402         uint32_t combined_mask;
2403         int wd, r;
2404
2405         assert(d);
2406         assert(d->fd >= 0);
2407
2408         combined_mask = inode_data_determine_mask(d);
2409
2410         if (d->wd >= 0 && combined_mask == d->combined_mask)
2411                 return 0;
2412
2413         r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
2414         if (r < 0)
2415                 return r;
2416
2417         wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
2418         if (wd < 0)
2419                 return wd;
2420
2421         if (d->wd < 0) {
2422                 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
2423                 if (r < 0) {
2424                         (void) inotify_rm_watch(d->inotify_data->fd, wd);
2425                         return r;
2426                 }
2427
2428                 d->wd = wd;
2429
2430         } else if (d->wd != wd) {
2431
2432                 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
2433                 (void) inotify_rm_watch(d->fd, wd);
2434                 return -EINVAL;
2435         }
2436
2437         d->combined_mask = combined_mask;
2438         return 1;
2439 }
2440
2441 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
2442         assert(s);
2443
2444         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
2445 }
2446
2447 static int event_add_inotify_fd_internal(
2448                 sd_event *e,
2449                 sd_event_source **ret,
2450                 int fd,
2451                 bool donate,
2452                 uint32_t mask,
2453                 sd_event_inotify_handler_t callback,
2454                 void *userdata) {
2455
2456         _cleanup_close_ int donated_fd = donate ? fd : -EBADF;
2457         _cleanup_(source_freep) sd_event_source *s = NULL;
2458         struct inotify_data *inotify_data = NULL;
2459         struct inode_data *inode_data = NULL;
2460         struct stat st;
2461         int r;
2462
2463         assert_return(e, -EINVAL);
2464         assert_return(e = event_resolve(e), -ENOPKG);
2465         assert_return(fd >= 0, -EBADF);
2466         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2467         assert_return(!event_origin_changed(e), -ECHILD);
2468
2469         if (!callback)
2470                 callback = inotify_exit_callback;
2471
2472         /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2473          * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
2474          * the user can't use them for us. */
2475         if (mask & IN_MASK_ADD)
2476                 return -EINVAL;
2477
2478         if (fstat(fd, &st) < 0)
2479                 return -errno;
2480
2481         s = source_new(e, !ret, SOURCE_INOTIFY);
2482         if (!s)
2483                 return -ENOMEM;
2484
2485         s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2486         s->inotify.mask = mask;
2487         s->inotify.callback = callback;
2488         s->userdata = userdata;
2489
2490         /* Allocate an inotify object for this priority, and an inode object within it */
2491         r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2492         if (r < 0)
2493                 return r;
2494
2495         r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2496         if (r < 0) {
2497                 event_gc_inotify_data(e, inotify_data);
2498                 return r;
2499         }
2500
2501         /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2502          * the event source, until then, for which we need the original inode. */
2503         if (inode_data->fd < 0) {
2504                 if (donated_fd >= 0)
2505                         inode_data->fd = TAKE_FD(donated_fd);
2506                 else {
2507                         inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2508                         if (inode_data->fd < 0) {
2509                                 r = -errno;
2510                                 event_gc_inode_data(e, inode_data);
2511                                 return r;
2512                         }
2513                 }
2514
2515                 LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
2516
2517                 _cleanup_free_ char *path = NULL;
2518                 r = fd_get_path(inode_data->fd, &path);
2519                 if (r < 0 && r != -ENOSYS) { /* The path is optional, hence ignore -ENOSYS. */
2520                         event_gc_inode_data(e, inode_data);
2521                         return r;
2522                 }
2523
2524                 free_and_replace(inode_data->path, path);
2525         }
2526
2527         /* Link our event source to the inode data object */
2528         LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2529         s->inotify.inode_data = inode_data;
2530
2531         /* Actually realize the watch now */
2532         r = inode_data_realize_watch(e, inode_data);
2533         if (r < 0)
2534                 return r;
2535
2536         if (ret)
2537                 *ret = s;
2538         TAKE_PTR(s);
2539
2540         return 0;
2541 }
2542
2543 _public_ int sd_event_add_inotify_fd(
2544                 sd_event *e,
2545                 sd_event_source **ret,
2546                 int fd,
2547                 uint32_t mask,
2548                 sd_event_inotify_handler_t callback,
2549                 void *userdata) {
2550
2551         return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2552 }
2553
2554 _public_ int sd_event_add_inotify(
2555                 sd_event *e,
2556                 sd_event_source **ret,
2557                 const char *path,
2558                 uint32_t mask,
2559                 sd_event_inotify_handler_t callback,
2560                 void *userdata) {
2561
2562         sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2563         int fd, r;
2564
2565         assert_return(path, -EINVAL);
2566
2567         fd = open(path, O_PATH | O_CLOEXEC |
2568                         (mask & IN_ONLYDIR ? O_DIRECTORY : 0) |
2569                         (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2570         if (fd < 0)
2571                 return -errno;
2572
2573         r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2574         if (r < 0)
2575                 return r;
2576
2577         (void) sd_event_source_set_description(s, path);
2578
2579         if (ret)
2580                 *ret = s;
2581
2582         return r;
2583 }
2584
2585 static sd_event_source* event_source_free(sd_event_source *s) {
2586         if (!s)
2587                 return NULL;
2588
2589         /* Here's a special hack: when we are called from a
2590          * dispatch handler we won't free the event source
2591          * immediately, but we will detach the fd from the
2592          * epoll. This way it is safe for the caller to unref
2593          * the event source and immediately close the fd, but
2594          * we still retain a valid event source object after
2595          * the callback. */
2596
2597         if (s->dispatching)
2598                 source_disconnect(s);
2599         else
2600                 source_free(s);
2601
2602         return NULL;
2603 }
2604
2605 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2606
2607 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2608         assert_return(s, -EINVAL);
2609         assert_return(!event_origin_changed(s->event), -ECHILD);
2610
2611         return free_and_strdup(&s->description, description);
2612 }
2613
2614 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2615         assert_return(s, -EINVAL);
2616         assert_return(description, -EINVAL);
2617
2618         if (!s->description)
2619                 return -ENXIO;
2620
2621         *description = s->description;
2622         return 0;
2623 }
2624
2625 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2626         assert_return(s, NULL);
2627         assert_return(!event_origin_changed(s->event), NULL);
2628
2629         return s->event;
2630 }
2631
2632 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2633         assert_return(s, -EINVAL);
2634         assert_return(s->type != SOURCE_EXIT, -EDOM);
2635         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2636         assert_return(!event_origin_changed(s->event), -ECHILD);
2637
2638         return s->pending;
2639 }
2640
2641 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2642         assert_return(s, -EINVAL);
2643         assert_return(s->type == SOURCE_IO, -EDOM);
2644         assert_return(!event_origin_changed(s->event), -ECHILD);
2645
2646         return s->io.fd;
2647 }
2648
2649 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2650         int saved_fd, r;
2651
2652         assert_return(s, -EINVAL);
2653         assert_return(fd >= 0, -EBADF);
2654         assert_return(s->type == SOURCE_IO, -EDOM);
2655         assert_return(!event_origin_changed(s->event), -ECHILD);
2656
2657         if (s->io.fd == fd)
2658                 return 0;
2659
2660         saved_fd = s->io.fd;
2661         s->io.fd = fd;
2662
2663         assert(event_source_is_offline(s) == !s->io.registered);
2664
2665         if (s->io.registered) {
2666                 s->io.registered = false;
2667
2668                 r = source_io_register(s, s->enabled, s->io.events);
2669                 if (r < 0) {
2670                         s->io.fd = saved_fd;
2671                         s->io.registered = true;
2672                         return r;
2673                 }
2674
2675                 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2676         }
2677
2678         if (s->io.owned)
2679                 safe_close(saved_fd);
2680
2681         return 0;
2682 }
2683
2684 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2685         assert_return(s, -EINVAL);
2686         assert_return(s->type == SOURCE_IO, -EDOM);
2687         assert_return(!event_origin_changed(s->event), -ECHILD);
2688
2689         return s->io.owned;
2690 }
2691
2692 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2693         assert_return(s, -EINVAL);
2694         assert_return(s->type == SOURCE_IO, -EDOM);
2695         assert_return(!event_origin_changed(s->event), -ECHILD);
2696
2697         s->io.owned = own;
2698         return 0;
2699 }
2700
2701 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2702         assert_return(s, -EINVAL);
2703         assert_return(events, -EINVAL);
2704         assert_return(s->type == SOURCE_IO, -EDOM);
2705         assert_return(!event_origin_changed(s->event), -ECHILD);
2706
2707         *events = s->io.events;
2708         return 0;
2709 }
2710
2711 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2712         int r;
2713
2714         assert_return(s, -EINVAL);
2715         assert_return(s->type == SOURCE_IO, -EDOM);
2716         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2717         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2718         assert_return(!event_origin_changed(s->event), -ECHILD);
2719
2720         /* edge-triggered updates are never skipped, so we can reset edges */
2721         if (s->io.events == events && !(events & EPOLLET))
2722                 return 0;
2723
2724         r = source_set_pending(s, false);
2725         if (r < 0)
2726                 return r;
2727
2728         if (event_source_is_online(s)) {
2729                 r = source_io_register(s, s->enabled, events);
2730                 if (r < 0)
2731                         return r;
2732         }
2733
2734         s->io.events = events;
2735
2736         return 0;
2737 }
2738
2739 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2740         assert_return(s, -EINVAL);
2741         assert_return(revents, -EINVAL);
2742         assert_return(s->type == SOURCE_IO, -EDOM);
2743         assert_return(s->pending, -ENODATA);
2744         assert_return(!event_origin_changed(s->event), -ECHILD);
2745
2746         *revents = s->io.revents;
2747         return 0;
2748 }
2749
2750 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2751         assert_return(s, -EINVAL);
2752         assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2753         assert_return(!event_origin_changed(s->event), -ECHILD);
2754
2755         return s->signal.sig;
2756 }
2757
2758 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2759         assert_return(s, -EINVAL);
2760         assert_return(!event_origin_changed(s->event), -ECHILD);
2761
2762         *priority = s->priority;
2763         return 0;
2764 }
2765
2766 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2767         bool rm_inotify = false, rm_inode = false;
2768         struct inotify_data *new_inotify_data = NULL;
2769         struct inode_data *new_inode_data = NULL;
2770         int r;
2771
2772         assert_return(s, -EINVAL);
2773         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2774         assert_return(!event_origin_changed(s->event), -ECHILD);
2775
2776         if (s->priority == priority)
2777                 return 0;
2778
2779         if (s->type == SOURCE_INOTIFY) {
2780                 struct inode_data *old_inode_data;
2781
2782                 assert(s->inotify.inode_data);
2783                 old_inode_data = s->inotify.inode_data;
2784
2785                 /* We need the original fd to change the priority. If we don't have it we can't change the priority,
2786                  * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2787                  * events we allow priority changes only until the first following iteration. */
2788                 if (old_inode_data->fd < 0)
2789                         return -EOPNOTSUPP;
2790
2791                 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2792                 if (r < 0)
2793                         return r;
2794                 rm_inotify = r > 0;
2795
2796                 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2797                 if (r < 0)
2798                         goto fail;
2799                 rm_inode = r > 0;
2800
2801                 if (new_inode_data->fd < 0) {
2802                         /* Duplicate the fd for the new inode object if we don't have any yet */
2803                         new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2804                         if (new_inode_data->fd < 0) {
2805                                 r = -errno;
2806                                 goto fail;
2807                         }
2808
2809                         LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
2810
2811                         _cleanup_free_ char *path = NULL;
2812                         r = fd_get_path(new_inode_data->fd, &path);
2813                         if (r < 0 && r != -ENOSYS)
2814                                 goto fail;
2815
2816                         free_and_replace(new_inode_data->path, path);
2817                 }
2818
2819                 /* Move the event source to the new inode data structure */
2820                 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2821                 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2822                 s->inotify.inode_data = new_inode_data;
2823
2824                 /* Now create the new watch */
2825                 r = inode_data_realize_watch(s->event, new_inode_data);
2826                 if (r < 0) {
2827                         /* Move it back */
2828                         LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2829                         LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2830                         s->inotify.inode_data = old_inode_data;
2831                         goto fail;
2832                 }
2833
2834                 s->priority = priority;
2835
2836                 event_gc_inode_data(s->event, old_inode_data);
2837
2838         } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2839                 struct signal_data *old, *d;
2840
2841                 /* Move us from the signalfd belonging to the old
2842                  * priority to the signalfd of the new priority */
2843
2844                 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2845
2846                 s->priority = priority;
2847
2848                 r = event_make_signal_data(s->event, s->signal.sig, &d);
2849                 if (r < 0) {
2850                         s->priority = old->priority;
2851                         return r;
2852                 }
2853
2854                 event_unmask_signal_data(s->event, old, s->signal.sig);
2855         } else
2856                 s->priority = priority;
2857
2858         event_source_pp_prioq_reshuffle(s);
2859
2860         if (s->type == SOURCE_EXIT)
2861                 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2862
2863         return 0;
2864
2865 fail:
2866         if (rm_inode)
2867                 event_free_inode_data(s->event, new_inode_data);
2868
2869         if (rm_inotify)
2870                 event_free_inotify_data(s->event, new_inotify_data);
2871
2872         return r;
2873 }
2874
2875 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2876         /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
2877         if (!s && !ret)
2878                 return false;
2879
2880         assert_return(s, -EINVAL);
2881         assert_return(!event_origin_changed(s->event), -ECHILD);
2882
2883         if (ret)
2884                 *ret = s->enabled;
2885
2886         return s->enabled != SD_EVENT_OFF;
2887 }
2888
2889 static int event_source_offline(
2890                 sd_event_source *s,
2891                 int enabled,
2892                 bool ratelimited) {
2893
2894         bool was_offline;
2895         int r;
2896
2897         assert(s);
2898         assert(enabled == SD_EVENT_OFF || ratelimited);
2899
2900         /* Unset the pending flag when this event source is disabled */
2901         if (s->enabled != SD_EVENT_OFF &&
2902             enabled == SD_EVENT_OFF &&
2903             !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2904                 r = source_set_pending(s, false);
2905                 if (r < 0)
2906                         return r;
2907         }
2908
2909         was_offline = event_source_is_offline(s);
2910         s->enabled = enabled;
2911         s->ratelimited = ratelimited;
2912
2913         switch (s->type) {
2914
2915         case SOURCE_IO:
2916                 source_io_unregister(s);
2917                 break;
2918
2919         case SOURCE_SIGNAL:
2920                 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2921                 break;
2922
2923         case SOURCE_CHILD:
2924                 if (!was_offline) {
2925                         assert(s->event->n_online_child_sources > 0);
2926                         s->event->n_online_child_sources--;
2927                 }
2928
2929                 if (EVENT_SOURCE_WATCH_PIDFD(s))
2930                         source_child_pidfd_unregister(s);
2931                 else
2932                         event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2933                 break;
2934
2935         case SOURCE_EXIT:
2936                 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2937                 break;
2938
2939         case SOURCE_MEMORY_PRESSURE:
2940                 source_memory_pressure_unregister(s);
2941                 break;
2942
2943         case SOURCE_TIME_REALTIME:
2944         case SOURCE_TIME_BOOTTIME:
2945         case SOURCE_TIME_MONOTONIC:
2946         case SOURCE_TIME_REALTIME_ALARM:
2947         case SOURCE_TIME_BOOTTIME_ALARM:
2948         case SOURCE_DEFER:
2949         case SOURCE_POST:
2950         case SOURCE_INOTIFY:
2951                 break;
2952
2953         default:
2954                 assert_not_reached();
2955         }
2956
2957         /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2958         event_source_time_prioq_reshuffle(s);
2959
2960         return 1;
2961 }
2962
2963 static int event_source_online(
2964                 sd_event_source *s,
2965                 int enabled,
2966                 bool ratelimited) {
2967
2968         bool was_online;
2969         int r;
2970
2971         assert(s);
2972         assert(enabled != SD_EVENT_OFF || !ratelimited);
2973
2974         /* Unset the pending flag when this event source is enabled */
2975         if (s->enabled == SD_EVENT_OFF &&
2976             enabled != SD_EVENT_OFF &&
2977             !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2978                 r = source_set_pending(s, false);
2979                 if (r < 0)
2980                         return r;
2981         }
2982
2983         /* Are we really ready for onlining? */
2984         if (enabled == SD_EVENT_OFF || ratelimited) {
2985                 /* Nope, we are not ready for onlining, then just update the precise state and exit */
2986                 s->enabled = enabled;
2987                 s->ratelimited = ratelimited;
2988                 return 0;
2989         }
2990
2991         was_online = event_source_is_online(s);
2992
2993         switch (s->type) {
2994         case SOURCE_IO:
2995                 r = source_io_register(s, enabled, s->io.events);
2996                 if (r < 0)
2997                         return r;
2998                 break;
2999
3000         case SOURCE_SIGNAL:
3001                 r = event_make_signal_data(s->event, s->signal.sig, NULL);
3002                 if (r < 0) {
3003                         event_gc_signal_data(s->event, &s->priority, s->signal.sig);
3004                         return r;
3005                 }
3006
3007                 break;
3008
3009         case SOURCE_CHILD:
3010                 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
3011                         /* yes, we have pidfd */
3012
3013                         r = source_child_pidfd_register(s, enabled);
3014                         if (r < 0)
3015                                 return r;
3016                 } else {
3017                         /* no pidfd, or something other to watch for than WEXITED */
3018
3019                         r = event_make_signal_data(s->event, SIGCHLD, NULL);
3020                         if (r < 0) {
3021                                 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
3022                                 return r;
3023                         }
3024                 }
3025
3026                 if (!was_online)
3027                         s->event->n_online_child_sources++;
3028                 break;
3029
3030         case SOURCE_MEMORY_PRESSURE:
3031                 r = source_memory_pressure_register(s, enabled);
3032                 if (r < 0)
3033                         return r;
3034
3035                 break;
3036
3037         case SOURCE_TIME_REALTIME:
3038         case SOURCE_TIME_BOOTTIME:
3039         case SOURCE_TIME_MONOTONIC:
3040         case SOURCE_TIME_REALTIME_ALARM:
3041         case SOURCE_TIME_BOOTTIME_ALARM:
3042         case SOURCE_EXIT:
3043         case SOURCE_DEFER:
3044         case SOURCE_POST:
3045         case SOURCE_INOTIFY:
3046                 break;
3047
3048         default:
3049                 assert_not_reached();
3050         }
3051
3052         s->enabled = enabled;
3053         s->ratelimited = ratelimited;
3054
3055         /* Non-failing operations below */
3056         if (s->type == SOURCE_EXIT)
3057                 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
3058
3059         /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
3060         event_source_time_prioq_reshuffle(s);
3061
3062         return 1;
3063 }
3064
3065 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
3066         int r;
3067
3068         assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
3069
3070         /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
3071         if (m == SD_EVENT_OFF && !s)
3072                 return 0;
3073
3074         assert_return(s, -EINVAL);
3075         assert_return(!event_origin_changed(s->event), -ECHILD);
3076
3077         /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
3078         if (s->event->state == SD_EVENT_FINISHED)
3079                 return m == SD_EVENT_OFF ? 0 : -ESTALE;
3080
3081         if (s->enabled == m) /* No change? */
3082                 return 0;
3083
3084         if (m == SD_EVENT_OFF)
3085                 r = event_source_offline(s, m, s->ratelimited);
3086         else {
3087                 if (s->enabled != SD_EVENT_OFF) {
3088                         /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
3089                          * event source is already enabled after all. */
3090                         s->enabled = m;
3091                         return 0;
3092                 }
3093
3094                 r = event_source_online(s, m, s->ratelimited);
3095         }
3096         if (r < 0)
3097                 return r;
3098
3099         event_source_pp_prioq_reshuffle(s);
3100         return 0;
3101 }
3102
3103 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
3104         assert_return(s, -EINVAL);
3105         assert_return(usec, -EINVAL);
3106         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3107         assert_return(!event_origin_changed(s->event), -ECHILD);
3108
3109         *usec = s->time.next;
3110         return 0;
3111 }
3112
3113 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
3114         int r;
3115
3116         assert_return(s, -EINVAL);
3117         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3118         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3119         assert_return(!event_origin_changed(s->event), -ECHILD);
3120
3121         r = source_set_pending(s, false);
3122         if (r < 0)
3123                 return r;
3124
3125         s->time.next = usec;
3126
3127         event_source_time_prioq_reshuffle(s);
3128         return 0;
3129 }
3130
3131 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
3132         usec_t t;
3133         int r;
3134
3135         assert_return(s, -EINVAL);
3136         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3137         assert_return(!event_origin_changed(s->event), -ECHILD);
3138
3139         if (usec == USEC_INFINITY)
3140                 return sd_event_source_set_time(s, USEC_INFINITY);
3141
3142         r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
3143         if (r < 0)
3144                 return r;
3145
3146         usec = usec_add(t, usec);
3147         if (usec == USEC_INFINITY)
3148                 return -EOVERFLOW;
3149
3150         return sd_event_source_set_time(s, usec);
3151 }
3152
3153 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
3154         assert_return(s, -EINVAL);
3155         assert_return(usec, -EINVAL);
3156         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3157         assert_return(!event_origin_changed(s->event), -ECHILD);
3158
3159         *usec = s->time.accuracy;
3160         return 0;
3161 }
3162
3163 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
3164         int r;
3165
3166         assert_return(s, -EINVAL);
3167         assert_return(usec != UINT64_MAX, -EINVAL);
3168         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3169         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3170         assert_return(!event_origin_changed(s->event), -ECHILD);
3171
3172         r = source_set_pending(s, false);
3173         if (r < 0)
3174                 return r;
3175
3176         if (usec == 0)
3177                 usec = DEFAULT_ACCURACY_USEC;
3178
3179         s->time.accuracy = usec;
3180
3181         event_source_time_prioq_reshuffle(s);
3182         return 0;
3183 }
3184
3185 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
3186         assert_return(s, -EINVAL);
3187         assert_return(clock, -EINVAL);
3188         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
3189         assert_return(!event_origin_changed(s->event), -ECHILD);
3190
3191         *clock = event_source_type_to_clock(s->type);
3192         return 0;
3193 }
3194
3195 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
3196         assert_return(s, -EINVAL);
3197         assert_return(pid, -EINVAL);
3198         assert_return(s->type == SOURCE_CHILD, -EDOM);
3199         assert_return(!event_origin_changed(s->event), -ECHILD);
3200
3201         *pid = s->child.pid;
3202         return 0;
3203 }
3204
3205 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
3206         assert_return(s, -EINVAL);
3207         assert_return(s->type == SOURCE_CHILD, -EDOM);
3208         assert_return(!event_origin_changed(s->event), -ECHILD);
3209
3210         if (s->child.pidfd < 0)
3211                 return -EOPNOTSUPP;
3212
3213         return s->child.pidfd;
3214 }
3215
3216 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
3217         assert_return(s, -EINVAL);
3218         assert_return(s->type == SOURCE_CHILD, -EDOM);
3219         assert_return(!event_origin_changed(s->event), -ECHILD);
3220         assert_return(SIGNAL_VALID(sig), -EINVAL);
3221
3222         /* If we already have seen indication the process exited refuse sending a signal early. This way we
3223          * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
3224          * available. */
3225         if (s->child.exited)
3226                 return -ESRCH;
3227
3228         if (s->child.pidfd >= 0) {
3229                 siginfo_t copy;
3230
3231                 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
3232                  * structure here */
3233                 if (si)
3234                         copy = *si;
3235
3236                 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
3237                         /* Let's propagate the error only if the system call is not implemented or prohibited */
3238                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3239                                 return -errno;
3240                 } else
3241                         return 0;
3242         }
3243
3244         /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
3245          * this here. */
3246         if (flags != 0)
3247                 return -EOPNOTSUPP;
3248
3249         if (si) {
3250                 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
3251                 siginfo_t copy = *si;
3252
3253                 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
3254                         return -errno;
3255         } else if (kill(s->child.pid, sig) < 0)
3256                 return -errno;
3257
3258         return 0;
3259 }
3260
3261 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
3262         assert_return(s, -EINVAL);
3263         assert_return(s->type == SOURCE_CHILD, -EDOM);
3264         assert_return(!event_origin_changed(s->event), -ECHILD);
3265
3266         if (s->child.pidfd < 0)
3267                 return -EOPNOTSUPP;
3268
3269         return s->child.pidfd_owned;
3270 }
3271
3272 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
3273         assert_return(s, -EINVAL);
3274         assert_return(s->type == SOURCE_CHILD, -EDOM);
3275         assert_return(!event_origin_changed(s->event), -ECHILD);
3276
3277         if (s->child.pidfd < 0)
3278                 return -EOPNOTSUPP;
3279
3280         s->child.pidfd_owned = own;
3281         return 0;
3282 }
3283
3284 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
3285         assert_return(s, -EINVAL);
3286         assert_return(s->type == SOURCE_CHILD, -EDOM);
3287         assert_return(!event_origin_changed(s->event), -ECHILD);
3288
3289         return s->child.process_owned;
3290 }
3291
3292 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
3293         assert_return(s, -EINVAL);
3294         assert_return(s->type == SOURCE_CHILD, -EDOM);
3295         assert_return(!event_origin_changed(s->event), -ECHILD);
3296
3297         s->child.process_owned = own;
3298         return 0;
3299 }
3300
3301 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret) {
3302         assert_return(s, -EINVAL);
3303         assert_return(ret, -EINVAL);
3304         assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3305         assert_return(!event_origin_changed(s->event), -ECHILD);
3306
3307         *ret = s->inotify.mask;
3308         return 0;
3309 }
3310
3311 _public_ int sd_event_source_get_inotify_path(sd_event_source *s, const char **ret) {
3312         assert_return(s, -EINVAL);
3313         assert_return(ret, -EINVAL);
3314         assert_return(s->type == SOURCE_INOTIFY, -EDOM);
3315         assert_return(!event_origin_changed(s->event), -ECHILD);
3316
3317         if (!s->inotify.inode_data)
3318                 return -ESTALE; /* already disconnected. */
3319
3320         if (!s->inotify.inode_data->path)
3321                 return -ENOSYS; /* /proc was not mounted? */
3322
3323         *ret = s->inotify.inode_data->path;
3324         return 0;
3325 }
3326
3327 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
3328         int r;
3329
3330         assert_return(s, -EINVAL);
3331         assert_return(s->type != SOURCE_EXIT, -EDOM);
3332         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
3333         assert_return(!event_origin_changed(s->event), -ECHILD);
3334
3335         if (s->prepare == callback)
3336                 return 0;
3337
3338         if (callback && s->prepare) {
3339                 s->prepare = callback;
3340                 return 0;
3341         }
3342
3343         r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
3344         if (r < 0)
3345                 return r;
3346
3347         s->prepare = callback;
3348
3349         if (callback) {
3350                 r = prioq_put(s->event->prepare, s, &s->prepare_index);
3351                 if (r < 0)
3352                         return r;
3353         } else
3354                 prioq_remove(s->event->prepare, s, &s->prepare_index);
3355
3356         return 0;
3357 }
3358
3359 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
3360         assert_return(s, NULL);
3361         assert_return(!event_origin_changed(s->event), NULL);
3362
3363         return s->userdata;
3364 }
3365
3366 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
3367         void *ret;
3368
3369         assert_return(s, NULL);
3370         assert_return(!event_origin_changed(s->event), NULL);
3371
3372         ret = s->userdata;
3373         s->userdata = userdata;
3374
3375         return ret;
3376 }
3377
3378 static int event_source_enter_ratelimited(sd_event_source *s) {
3379         int r;
3380
3381         assert(s);
3382
3383         /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
3384          * the end of the rate limit time window, much as if it was a timer event source. */
3385
3386         if (s->ratelimited)
3387                 return 0; /* Already ratelimited, this is a NOP hence */
3388
3389         /* Make sure we can install a CLOCK_MONOTONIC event further down. */
3390         r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
3391         if (r < 0)
3392                 return r;
3393
3394         /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
3395          * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
3396          * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
3397         if (EVENT_SOURCE_IS_TIME(s->type))
3398                 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3399
3400         /* Now, let's add the event source to the monotonic clock instead */
3401         r = event_source_time_prioq_put(s, &s->event->monotonic);
3402         if (r < 0)
3403                 goto fail;
3404
3405         /* And let's take the event source officially offline */
3406         r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
3407         if (r < 0) {
3408                 event_source_time_prioq_remove(s, &s->event->monotonic);
3409                 goto fail;
3410         }
3411
3412         event_source_pp_prioq_reshuffle(s);
3413
3414         log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
3415         return 0;
3416
3417 fail:
3418         /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
3419          * space for it should already be allocated. */
3420         if (EVENT_SOURCE_IS_TIME(s->type))
3421                 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
3422
3423         return r;
3424 }
3425
3426 static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
3427         int r;
3428
3429         assert(s);
3430
3431         if (!s->ratelimited)
3432                 return 0;
3433
3434         /* Let's take the event source out of the monotonic prioq first. */
3435         event_source_time_prioq_remove(s, &s->event->monotonic);
3436
3437         /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
3438         if (EVENT_SOURCE_IS_TIME(s->type)) {
3439                 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
3440                 if (r < 0)
3441                         goto fail;
3442         }
3443
3444         /* Let's try to take it online again.  */
3445         r = event_source_online(s, s->enabled, /* ratelimited= */ false);
3446         if (r < 0) {
3447                 /* Do something roughly sensible when this failed: undo the two prioq ops above */
3448                 if (EVENT_SOURCE_IS_TIME(s->type))
3449                         event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
3450
3451                 goto fail;
3452         }
3453
3454         event_source_pp_prioq_reshuffle(s);
3455         ratelimit_reset(&s->rate_limit);
3456
3457         log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
3458
3459         if (run_callback && s->ratelimit_expire_callback) {
3460                 s->dispatching = true;
3461                 r = s->ratelimit_expire_callback(s, s->userdata);
3462                 s->dispatching = false;
3463
3464                 if (r < 0) {
3465                         log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
3466                                         strna(s->description),
3467                                         event_source_type_to_string(s->type),
3468                                         s->exit_on_failure ? "exiting" : "disabling");
3469
3470                         if (s->exit_on_failure)
3471                                 (void) sd_event_exit(s->event, r);
3472                 }
3473
3474                 if (s->n_ref == 0)
3475                         source_free(s);
3476                 else if (r < 0)
3477                         assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3478
3479                 return 1;
3480         }
3481
3482         return 0;
3483
3484 fail:
3485         /* Do something somewhat reasonable when we cannot move an event sources out of ratelimited mode:
3486          * simply put it back in it, maybe we can then process it more successfully next iteration. */
3487         assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
3488
3489         return r;
3490 }
3491
3492 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
3493         usec_t c;
3494         assert(e);
3495         assert(a <= b);
3496
3497         if (a <= 0)
3498                 return 0;
3499         if (a >= USEC_INFINITY)
3500                 return USEC_INFINITY;
3501
3502         if (b <= a + 1)
3503                 return a;
3504
3505         initialize_perturb(e);
3506
3507         /*
3508           Find a good time to wake up again between times a and b. We
3509           have two goals here:
3510
3511           a) We want to wake up as seldom as possible, hence prefer
3512              later times over earlier times.
3513
3514           b) But if we have to wake up, then let's make sure to
3515              dispatch as much as possible on the entire system.
3516
3517           We implement this by waking up everywhere at the same time
3518           within any given minute if we can, synchronised via the
3519           perturbation value determined from the boot ID. If we can't,
3520           then we try to find the same spot in every 10s, then 1s and
3521           then 250ms step. Otherwise, we pick the last possible time
3522           to wake up.
3523         */
3524
3525         c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3526         if (c >= b) {
3527                 if (_unlikely_(c < USEC_PER_MINUTE))
3528                         return b;
3529
3530                 c -= USEC_PER_MINUTE;
3531         }
3532
3533         if (c >= a)
3534                 return c;
3535
3536         c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3537         if (c >= b) {
3538                 if (_unlikely_(c < USEC_PER_SEC*10))
3539                         return b;
3540
3541                 c -= USEC_PER_SEC*10;
3542         }
3543
3544         if (c >= a)
3545                 return c;
3546
3547         c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3548         if (c >= b) {
3549                 if (_unlikely_(c < USEC_PER_SEC))
3550                         return b;
3551
3552                 c -= USEC_PER_SEC;
3553         }
3554
3555         if (c >= a)
3556                 return c;
3557
3558         c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3559         if (c >= b) {
3560                 if (_unlikely_(c < USEC_PER_MSEC*250))
3561                         return b;
3562
3563                 c -= USEC_PER_MSEC*250;
3564         }
3565
3566         if (c >= a)
3567                 return c;
3568
3569         return b;
3570 }
3571
3572 static int event_arm_timer(
3573                 sd_event *e,
3574                 struct clock_data *d) {
3575
3576         struct itimerspec its = {};
3577         sd_event_source *a, *b;
3578         usec_t t;
3579
3580         assert(e);
3581         assert(d);
3582
3583         if (!d->needs_rearm)
3584                 return 0;
3585
3586         d->needs_rearm = false;
3587
3588         a = prioq_peek(d->earliest);
3589         assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3590         if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3591
3592                 if (d->fd < 0)
3593                         return 0;
3594
3595                 if (d->next == USEC_INFINITY)
3596                         return 0;
3597
3598                 /* disarm */
3599                 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3600                         return -errno;
3601
3602                 d->next = USEC_INFINITY;
3603                 return 0;
3604         }
3605
3606         b = prioq_peek(d->latest);
3607         assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3608         assert(b && b->enabled != SD_EVENT_OFF);
3609
3610         t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3611         if (d->next == t)
3612                 return 0;
3613
3614         assert_se(d->fd >= 0);
3615
3616         if (t == 0) {
3617                 /* We don't want to disarm here, just mean some time looooong ago. */
3618                 its.it_value.tv_sec = 0;
3619                 its.it_value.tv_nsec = 1;
3620         } else
3621                 timespec_store(&its.it_value, t);
3622
3623         if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3624                 return -errno;
3625
3626         d->next = t;
3627         return 0;
3628 }
3629
3630 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3631         assert(e);
3632         assert(s);
3633         assert(s->type == SOURCE_IO);
3634
3635         /* If the event source was already pending, we just OR in the
3636          * new revents, otherwise we reset the value. The ORing is
3637          * necessary to handle EPOLLONESHOT events properly where
3638          * readability might happen independently of writability, and
3639          * we need to keep track of both */
3640
3641         if (s->pending)
3642                 s->io.revents |= revents;
3643         else
3644                 s->io.revents = revents;
3645
3646         return source_set_pending(s, true);
3647 }
3648
3649 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3650         uint64_t x;
3651         ssize_t ss;
3652
3653         assert(e);
3654         assert(fd >= 0);
3655
3656         assert_return(events == EPOLLIN, -EIO);
3657
3658         ss = read(fd, &x, sizeof(x));
3659         if (ss < 0) {
3660                 if (ERRNO_IS_TRANSIENT(errno))
3661                         return 0;
3662
3663                 return -errno;
3664         }
3665
3666         if (_unlikely_(ss != sizeof(x)))
3667                 return -EIO;
3668
3669         if (next)
3670                 *next = USEC_INFINITY;
3671
3672         return 0;
3673 }
3674
3675 static int process_timer(
3676                 sd_event *e,
3677                 usec_t n,
3678                 struct clock_data *d) {
3679
3680         sd_event_source *s;
3681         bool callback_invoked = false;
3682         int r;
3683
3684         assert(e);
3685         assert(d);
3686
3687         for (;;) {
3688                 s = prioq_peek(d->earliest);
3689                 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3690
3691                 if (!s || time_event_source_next(s) > n)
3692                         break;
3693
3694                 if (s->ratelimited) {
3695                         /* This is an event sources whose ratelimit window has ended. Let's turn it on
3696                          * again. */
3697                         assert(s->ratelimited);
3698
3699                         r = event_source_leave_ratelimit(s, /* run_callback */ true);
3700                         if (r < 0)
3701                                 return r;
3702                         else if (r == 1)
3703                                 callback_invoked = true;
3704
3705                         continue;
3706                 }
3707
3708                 if (s->enabled == SD_EVENT_OFF || s->pending)
3709                         break;
3710
3711                 r = source_set_pending(s, true);
3712                 if (r < 0)
3713                         return r;
3714
3715                 event_source_time_prioq_reshuffle(s);
3716         }
3717
3718         return callback_invoked;
3719 }
3720
3721 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3722         int64_t min_priority = threshold;
3723         bool something_new = false;
3724         sd_event_source *s;
3725         int r;
3726
3727         assert(e);
3728         assert(ret_min_priority);
3729
3730         if (!e->need_process_child) {
3731                 *ret_min_priority = min_priority;
3732                 return 0;
3733         }
3734
3735         e->need_process_child = false;
3736
3737         /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3738          * for, instead of using P_ALL. This is because we only want to get child information of very
3739          * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3740          * of a previous invocation and we don't want to maintain a unbounded *per-child* event queue,
3741          * hence we really don't want anything flushed out of the kernel's queue that we don't care
3742          * about. Since this is O(n) this means that if you have a lot of processes you probably want
3743          * to handle SIGCHLD yourself.
3744          *
3745          * We do not reap the children here (by using WNOWAIT), this is only done after the event
3746          * source is dispatched so that the callback still sees the process as a zombie. */
3747
3748         HASHMAP_FOREACH(s, e->child_sources) {
3749                 assert(s->type == SOURCE_CHILD);
3750
3751                 if (s->priority > threshold)
3752                         continue;
3753
3754                 if (s->pending)
3755                         continue;
3756
3757                 if (event_source_is_offline(s))
3758                         continue;
3759
3760                 if (s->child.exited)
3761                         continue;
3762
3763                 if (EVENT_SOURCE_WATCH_PIDFD(s))
3764                         /* There's a usable pidfd known for this event source? Then don't waitid() for
3765                          * it here */
3766                         continue;
3767
3768                 zero(s->child.siginfo);
3769                 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3770                            WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3771                         return negative_errno();
3772
3773                 if (s->child.siginfo.si_pid != 0) {
3774                         bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3775
3776                         if (zombie)
3777                                 s->child.exited = true;
3778
3779                         if (!zombie && (s->child.options & WEXITED)) {
3780                                 /* If the child isn't dead then let's immediately remove the state
3781                                  * change from the queue, since there's no benefit in leaving it
3782                                  * queued. */
3783
3784                                 assert(s->child.options & (WSTOPPED|WCONTINUED));
3785                                 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3786                         }
3787
3788                         r = source_set_pending(s, true);
3789                         if (r < 0)
3790                                 return r;
3791                         if (r > 0) {
3792                                 something_new = true;
3793                                 min_priority = MIN(min_priority, s->priority);
3794                         }
3795                 }
3796         }
3797
3798         *ret_min_priority = min_priority;
3799         return something_new;
3800 }
3801
3802 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3803         assert(e);
3804         assert(s);
3805         assert(s->type == SOURCE_CHILD);
3806
3807         if (s->pending)
3808                 return 0;
3809
3810         if (event_source_is_offline(s))
3811                 return 0;
3812
3813         if (!EVENT_SOURCE_WATCH_PIDFD(s))
3814                 return 0;
3815
3816         zero(s->child.siginfo);
3817         if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3818                 return -errno;
3819
3820         if (s->child.siginfo.si_pid == 0)
3821                 return 0;
3822
3823         if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3824                 s->child.exited = true;
3825
3826         return source_set_pending(s, true);
3827 }
3828
3829 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3830         int r;
3831
3832         assert(e);
3833         assert(d);
3834         assert_return(events == EPOLLIN, -EIO);
3835         assert(min_priority);
3836
3837         /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3838          * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3839          * per priority, and if we dequeue one, and SIGCHLD might be enqueued later we wouldn't know,
3840          * but we might have higher priority children we care about hence we need to check that
3841          * explicitly. */
3842
3843         if (sigismember(&d->sigset, SIGCHLD))
3844                 e->need_process_child = true;
3845
3846         /* If there's already an event source pending for this priority we don't read another */
3847         if (d->current)
3848                 return 0;
3849
3850         for (;;) {
3851                 struct signalfd_siginfo si;
3852                 ssize_t n;
3853                 sd_event_source *s = NULL;
3854
3855                 n = read(d->fd, &si, sizeof(si));
3856                 if (n < 0) {
3857                         if (ERRNO_IS_TRANSIENT(errno))
3858                                 return 0;
3859
3860                         return -errno;
3861                 }
3862
3863                 if (_unlikely_(n != sizeof(si)))
3864                         return -EIO;
3865
3866                 assert(SIGNAL_VALID(si.ssi_signo));
3867
3868                 if (e->signal_sources)
3869                         s = e->signal_sources[si.ssi_signo];
3870                 if (!s)
3871                         continue;
3872                 if (s->pending)
3873                         continue;
3874
3875                 s->signal.siginfo = si;
3876                 d->current = s;
3877
3878                 r = source_set_pending(s, true);
3879                 if (r < 0)
3880                         return r;
3881                 if (r > 0 && *min_priority >= s->priority) {
3882                         *min_priority = s->priority;
3883                         return 1; /* an event source with smaller priority is queued. */
3884                 }
3885
3886                 return 0;
3887         }
3888 }
3889
3890 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3891         ssize_t n;
3892
3893         assert(e);
3894         assert(d);
3895
3896         assert_return(revents == EPOLLIN, -EIO);
3897
3898         /* If there's already an event source pending for this priority, don't read another */
3899         if (d->n_pending > 0)
3900                 return 0;
3901
3902         /* Is the read buffer non-empty? If so, let's not read more */
3903         if (d->buffer_filled > 0)
3904                 return 0;
3905
3906         if (d->priority > threshold)
3907                 return 0;
3908
3909         n = read(d->fd, &d->buffer, sizeof(d->buffer));
3910         if (n < 0) {
3911                 if (ERRNO_IS_TRANSIENT(errno))
3912                         return 0;
3913
3914                 return -errno;
3915         }
3916
3917         assert(n > 0);
3918         d->buffer_filled = (size_t) n;
3919         LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
3920
3921         return 1;
3922 }
3923
3924 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3925         assert(e);
3926         assert(d);
3927         assert(sz <= d->buffer_filled);
3928
3929         if (sz == 0)
3930                 return;
3931
3932         /* Move the rest to the buffer to the front, in order to get things properly aligned again */
3933         memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3934         d->buffer_filled -= sz;
3935
3936         if (d->buffer_filled == 0)
3937                 LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
3938 }
3939
3940 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3941         int r;
3942
3943         assert(e);
3944         assert(d);
3945
3946         /* If there's already an event source pending for this priority, don't read another */
3947         if (d->n_pending > 0)
3948                 return 0;
3949
3950         while (d->buffer_filled > 0) {
3951                 size_t sz;
3952
3953                 /* Let's validate that the event structures are complete */
3954                 if (d->buffer_filled < offsetof(struct inotify_event, name))
3955                         return -EIO;
3956
3957                 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3958                 if (d->buffer_filled < sz)
3959                         return -EIO;
3960
3961                 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3962                         struct inode_data *inode_data;
3963
3964                         /* The queue overran, let's pass this event to all event sources connected to this inotify
3965                          * object */
3966
3967                         HASHMAP_FOREACH(inode_data, d->inodes)
3968                                 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3969
3970                                         if (event_source_is_offline(s))
3971                                                 continue;
3972
3973                                         r = source_set_pending(s, true);
3974                                         if (r < 0)
3975                                                 return r;
3976                                 }
3977                 } else {
3978                         struct inode_data *inode_data;
3979
3980                         /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3981                          * our watch descriptor table. */
3982                         if (d->buffer.ev.mask & IN_IGNORED) {
3983
3984                                 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3985                                 if (!inode_data) {
3986                                         event_inotify_data_drop(e, d, sz);
3987                                         continue;
3988                                 }
3989
3990                                 /* The watch descriptor was removed by the kernel, let's drop it here too */
3991                                 inode_data->wd = -1;
3992                         } else {
3993                                 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3994                                 if (!inode_data) {
3995                                         event_inotify_data_drop(e, d, sz);
3996                                         continue;
3997                                 }
3998                         }
3999
4000                         /* Trigger all event sources that are interested in these events. Also trigger all event
4001                          * sources if IN_IGNORED or IN_UNMOUNT is set. */
4002                         LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
4003
4004                                 if (event_source_is_offline(s))
4005                                         continue;
4006
4007                                 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
4008                                     (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
4009                                         continue;
4010
4011                                 r = source_set_pending(s, true);
4012                                 if (r < 0)
4013                                         return r;
4014                         }
4015                 }
4016
4017                 /* Something pending now? If so, let's finish, otherwise let's read more. */
4018                 if (d->n_pending > 0)
4019                         return 1;
4020         }
4021
4022         return 0;
4023 }
4024
4025 static int process_inotify(sd_event *e) {
4026         int r, done = 0;
4027
4028         assert(e);
4029
4030         LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
4031                 r = event_inotify_data_process(e, d);
4032                 if (r < 0)
4033                         return r;
4034                 if (r > 0)
4035                         done++;
4036         }
4037
4038         return done;
4039 }
4040
4041 static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
4042         assert(s);
4043         assert(s->type == SOURCE_MEMORY_PRESSURE);
4044
4045         if (s->pending)
4046                 s->memory_pressure.revents |= revents;
4047         else
4048                 s->memory_pressure.revents = revents;
4049
4050         return source_set_pending(s, true);
4051 }
4052
4053 static int source_memory_pressure_write(sd_event_source *s) {
4054         ssize_t n;
4055         int r;
4056
4057         assert(s);
4058         assert(s->type == SOURCE_MEMORY_PRESSURE);
4059
4060         /* once we start writing, the buffer is locked, we allow no further changes. */
4061         s->memory_pressure.locked = true;
4062
4063         if (s->memory_pressure.write_buffer_size > 0) {
4064                 n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size);
4065                 if (n < 0) {
4066                         if (!ERRNO_IS_TRANSIENT(errno)) {
4067                                 /* If kernel is built with CONFIG_PSI_DEFAULT_DISABLED it will expose PSI
4068                                  * files, but then generates EOPNOSUPP on read() and write() (instead of on
4069                                  * open()!). This sucks hard, since we can only detect this kind of failure
4070                                  * so late. Let's make the best of it, and turn off the event source like we
4071                                  * do for failed event source handlers. */
4072
4073                                 log_debug_errno(errno, "Writing memory pressure settings to kernel failed, disabling memory pressure event source: %m");
4074                                 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4075                                 return 0;
4076                         }
4077
4078                         n = 0;
4079                 }
4080         } else
4081                 n = 0;
4082
4083         assert(n >= 0);
4084
4085         if ((size_t) n == s->memory_pressure.write_buffer_size) {
4086                 s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
4087
4088                 if (n > 0) {
4089                         s->memory_pressure.write_buffer_size = 0;
4090
4091                         /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */
4092                         r = source_memory_pressure_register(s, s->enabled);
4093                         if (r < 0)
4094                                 return r;
4095                 }
4096         } else if (n > 0) {
4097                 _cleanup_free_ void *c = NULL;
4098
4099                 assert((size_t) n < s->memory_pressure.write_buffer_size);
4100
4101                 c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n);
4102                 if (!c)
4103                         return -ENOMEM;
4104
4105                 free_and_replace(s->memory_pressure.write_buffer, c);
4106                 s->memory_pressure.write_buffer_size -= n;
4107                 return 1;
4108         }
4109
4110         return 0;
4111 }
4112
4113 static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
4114         int r;
4115
4116         assert(s);
4117         assert(s->type == SOURCE_MEMORY_PRESSURE);
4118
4119         r = source_memory_pressure_write(s);
4120         if (r < 0)
4121                 return r;
4122         if (r > 0)
4123                 return 1; /* if we wrote something, then don't continue with dispatching user dispatch
4124                            * function. Instead, shortcut it so that we wait for next EPOLLOUT immediately. */
4125
4126         /* No pending incoming IO? Then let's not continue further */
4127         if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) {
4128
4129                 /* Treat IO errors on the notifier the same ways errors returned from a callback */
4130                 if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
4131                         return -EIO;
4132
4133                 return 1; /* leave dispatch, we already processed everything */
4134         }
4135
4136         if (s->memory_pressure.revents & EPOLLIN) {
4137                 uint8_t pipe_buf[PIPE_BUF];
4138                 ssize_t n;
4139
4140                 /* If the fd is readable, then flush out anything that might be queued */
4141
4142                 n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
4143                 if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
4144                         return -errno;
4145         }
4146
4147         return 0; /* go on, dispatch to user callback */
4148 }
4149
4150 static int source_dispatch(sd_event_source *s) {
4151         EventSourceType saved_type;
4152         sd_event *saved_event;
4153         int r = 0;
4154
4155         assert(s);
4156         assert(s->pending || s->type == SOURCE_EXIT);
4157
4158         /* Save the event source type, here, so that we still know it after the event callback which might
4159          * invalidate the event. */
4160         saved_type = s->type;
4161
4162         /* Similarly, store a reference to the event loop object, so that we can still access it after the
4163          * callback might have invalidated/disconnected the event source. */
4164         saved_event = s->event;
4165         PROTECT_EVENT(saved_event);
4166
4167         /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
4168         assert(!s->ratelimited);
4169         if (!ratelimit_below(&s->rate_limit)) {
4170                 r = event_source_enter_ratelimited(s);
4171                 if (r < 0)
4172                         return r;
4173
4174                 return 1;
4175         }
4176
4177         if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
4178                 r = source_set_pending(s, false);
4179                 if (r < 0)
4180                         return r;
4181         }
4182
4183         if (s->type != SOURCE_POST) {
4184                 sd_event_source *z;
4185
4186                 /* If we execute a non-post source, let's mark all post sources as pending. */
4187
4188                 SET_FOREACH(z, s->event->post_sources) {
4189                         if (event_source_is_offline(z))
4190                                 continue;
4191
4192                         r = source_set_pending(z, true);
4193                         if (r < 0)
4194                                 return r;
4195                 }
4196         }
4197
4198         if (s->type == SOURCE_MEMORY_PRESSURE) {
4199                 r = source_memory_pressure_initiate_dispatch(s);
4200                 if (r == -EIO) /* handle EIO errors similar to callback errors */
4201                         goto finish;
4202                 if (r < 0)
4203                         return r;
4204                 if (r > 0) /* already handled */
4205                         return 1;
4206         }
4207
4208         if (s->enabled == SD_EVENT_ONESHOT) {
4209                 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
4210                 if (r < 0)
4211                         return r;
4212         }
4213
4214         s->dispatching = true;
4215
4216         switch (s->type) {
4217
4218         case SOURCE_IO:
4219                 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
4220                 break;
4221
4222         case SOURCE_TIME_REALTIME:
4223         case SOURCE_TIME_BOOTTIME:
4224         case SOURCE_TIME_MONOTONIC:
4225         case SOURCE_TIME_REALTIME_ALARM:
4226         case SOURCE_TIME_BOOTTIME_ALARM:
4227                 r = s->time.callback(s, s->time.next, s->userdata);
4228                 break;
4229
4230         case SOURCE_SIGNAL:
4231                 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
4232                 break;
4233
4234         case SOURCE_CHILD: {
4235                 bool zombie;
4236
4237                 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
4238
4239                 r = s->child.callback(s, &s->child.siginfo, s->userdata);
4240
4241                 /* Now, reap the PID for good. */
4242                 if (zombie) {
4243                         (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
4244                         s->child.waited = true;
4245                 }
4246
4247                 break;
4248         }
4249
4250         case SOURCE_DEFER:
4251                 r = s->defer.callback(s, s->userdata);
4252                 break;
4253
4254         case SOURCE_POST:
4255                 r = s->post.callback(s, s->userdata);
4256                 break;
4257
4258         case SOURCE_EXIT:
4259                 r = s->exit.callback(s, s->userdata);
4260                 break;
4261
4262         case SOURCE_INOTIFY: {
4263                 struct sd_event *e = s->event;
4264                 struct inotify_data *d;
4265                 size_t sz;
4266
4267                 assert(s->inotify.inode_data);
4268                 assert_se(d = s->inotify.inode_data->inotify_data);
4269
4270                 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
4271                 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
4272                 assert(d->buffer_filled >= sz);
4273
4274                 /* If the inotify callback destroys the event source then this likely means we don't need to
4275                  * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
4276                  * free it immediately, then we couldn't drop the event from the inotify event queue without
4277                  * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
4278                  * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
4279                  * explicitly GC it after we are done dropping the inotify event from the buffer. */
4280                 d->n_busy++;
4281                 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
4282                 d->n_busy--;
4283
4284                 /* When no event is pending anymore on this inotify object, then let's drop the event from
4285                  * the inotify event queue buffer. */
4286                 if (d->n_pending == 0)
4287                         event_inotify_data_drop(e, d, sz);
4288
4289                 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
4290                 event_gc_inotify_data(e, d);
4291                 break;
4292         }
4293
4294         case SOURCE_MEMORY_PRESSURE:
4295                 r = s->memory_pressure.callback(s, s->userdata);
4296                 break;
4297
4298         case SOURCE_WATCHDOG:
4299         case _SOURCE_EVENT_SOURCE_TYPE_MAX:
4300         case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
4301                 assert_not_reached();
4302         }
4303
4304         s->dispatching = false;
4305
4306 finish:
4307         if (r < 0) {
4308                 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
4309                                 strna(s->description),
4310                                 event_source_type_to_string(saved_type),
4311                                 s->exit_on_failure ? "exiting" : "disabling");
4312
4313                 if (s->exit_on_failure)
4314                         (void) sd_event_exit(saved_event, r);
4315         }
4316
4317         if (s->n_ref == 0)
4318                 source_free(s);
4319         else if (r < 0)
4320                 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4321
4322         return 1;
4323 }
4324
4325 static int event_prepare(sd_event *e) {
4326         int r;
4327
4328         assert(e);
4329
4330         for (;;) {
4331                 sd_event_source *s;
4332
4333                 s = prioq_peek(e->prepare);
4334                 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
4335                         break;
4336
4337                 s->prepare_iteration = e->iteration;
4338                 prioq_reshuffle(e->prepare, s, &s->prepare_index);
4339
4340                 assert(s->prepare);
4341                 s->dispatching = true;
4342                 r = s->prepare(s, s->userdata);
4343                 s->dispatching = false;
4344
4345                 if (r < 0) {
4346                         log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
4347                                         strna(s->description),
4348                                         event_source_type_to_string(s->type),
4349                                         s->exit_on_failure ? "exiting" : "disabling");
4350
4351                         if (s->exit_on_failure)
4352                                 (void) sd_event_exit(e, r);
4353                 }
4354
4355                 if (s->n_ref == 0)
4356                         source_free(s);
4357                 else if (r < 0)
4358                         assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
4359         }
4360
4361         return 0;
4362 }
4363
4364 static int dispatch_exit(sd_event *e) {
4365         sd_event_source *p;
4366         int r;
4367
4368         assert(e);
4369
4370         p = prioq_peek(e->exit);
4371         assert(!p || p->type == SOURCE_EXIT);
4372
4373         if (!p || event_source_is_offline(p)) {
4374                 e->state = SD_EVENT_FINISHED;
4375                 return 0;
4376         }
4377
4378         PROTECT_EVENT(e);
4379         e->iteration++;
4380         e->state = SD_EVENT_EXITING;
4381         r = source_dispatch(p);
4382         e->state = SD_EVENT_INITIAL;
4383         return r;
4384 }
4385
4386 static sd_event_source* event_next_pending(sd_event *e) {
4387         sd_event_source *p;
4388
4389         assert(e);
4390
4391         p = prioq_peek(e->pending);
4392         if (!p)
4393                 return NULL;
4394
4395         if (event_source_is_offline(p))
4396                 return NULL;
4397
4398         return p;
4399 }
4400
4401 static int arm_watchdog(sd_event *e) {
4402         struct itimerspec its = {};
4403         usec_t t;
4404
4405         assert(e);
4406         assert(e->watchdog_fd >= 0);
4407
4408         t = sleep_between(e,
4409                           usec_add(e->watchdog_last, (e->watchdog_period / 2)),
4410                           usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
4411
4412         timespec_store(&its.it_value, t);
4413
4414         /* Make sure we never set the watchdog to 0, which tells the
4415          * kernel to disable it. */
4416         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
4417                 its.it_value.tv_nsec = 1;
4418
4419         return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
4420 }
4421
4422 static int process_watchdog(sd_event *e) {
4423         assert(e);
4424
4425         if (!e->watchdog)
4426                 return 0;
4427
4428         /* Don't notify watchdog too often */
4429         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
4430                 return 0;
4431
4432         sd_notify(false, "WATCHDOG=1");
4433         e->watchdog_last = e->timestamp.monotonic;
4434
4435         return arm_watchdog(e);
4436 }
4437
4438 static void event_close_inode_data_fds(sd_event *e) {
4439         struct inode_data *d;
4440
4441         assert(e);
4442
4443         /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
4444          * filesystems. But we can't close them right-away as we need them as long as the user still wants to make
4445          * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
4446          * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
4447          * compromise. */
4448
4449         while ((d = e->inode_data_to_close_list)) {
4450                 assert(d->fd >= 0);
4451                 d->fd = safe_close(d->fd);
4452
4453                 LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
4454         }
4455 }
4456
4457 static int event_memory_pressure_write_list(sd_event *e) {
4458         int r;
4459
4460         assert(e);
4461
4462         for (;;) {
4463                 sd_event_source *s;
4464
4465                 s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list);
4466                 if (!s)
4467                         break;
4468
4469                 assert(s->type == SOURCE_MEMORY_PRESSURE);
4470                 assert(s->memory_pressure.write_buffer_size > 0);
4471                 s->memory_pressure.in_write_list = false;
4472
4473                 r = source_memory_pressure_write(s);
4474                 if (r < 0)
4475                         return r;
4476         }
4477
4478         return 0;
4479 }
4480
4481 _public_ int sd_event_prepare(sd_event *e) {
4482         int r;
4483
4484         assert_return(e, -EINVAL);
4485         assert_return(e = event_resolve(e), -ENOPKG);
4486         assert_return(!event_origin_changed(e), -ECHILD);
4487         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4488         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4489
4490         /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
4491          * this check here once, since gettid() is typically not cached, and thus want to minimize
4492          * syscalls */
4493         assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
4494
4495         /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4496         PROTECT_EVENT(e);
4497
4498         if (e->exit_requested)
4499                 goto pending;
4500
4501         e->iteration++;
4502
4503         e->state = SD_EVENT_PREPARING;
4504         r = event_prepare(e);
4505         e->state = SD_EVENT_INITIAL;
4506         if (r < 0)
4507                 return r;
4508
4509         r = event_memory_pressure_write_list(e);
4510         if (r < 0)
4511                 return r;
4512
4513         r = event_arm_timer(e, &e->realtime);
4514         if (r < 0)
4515                 return r;
4516
4517         r = event_arm_timer(e, &e->boottime);
4518         if (r < 0)
4519                 return r;
4520
4521         r = event_arm_timer(e, &e->monotonic);
4522         if (r < 0)
4523                 return r;
4524
4525         r = event_arm_timer(e, &e->realtime_alarm);
4526         if (r < 0)
4527                 return r;
4528
4529         r = event_arm_timer(e, &e->boottime_alarm);
4530         if (r < 0)
4531                 return r;
4532
4533         event_close_inode_data_fds(e);
4534
4535         if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
4536                 goto pending;
4537
4538         e->state = SD_EVENT_ARMED;
4539
4540         return 0;
4541
4542 pending:
4543         e->state = SD_EVENT_ARMED;
4544         r = sd_event_wait(e, 0);
4545         if (r == 0)
4546                 e->state = SD_EVENT_ARMED;
4547
4548         return r;
4549 }
4550
4551 static int epoll_wait_usec(
4552                 int fd,
4553                 struct epoll_event *events,
4554                 int maxevents,
4555                 usec_t timeout) {
4556
4557         int msec;
4558         /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not. */
4559
4560 #if HAVE_EPOLL_PWAIT2
4561         static bool epoll_pwait2_absent = false;
4562         int r;
4563
4564         /* epoll_pwait2() was added to Linux 5.11 (2021-02-14) and to glibc in 2.35 (2022-02-03). In contrast
4565          * to other syscalls we don't bother with our own fallback syscall wrappers on old libcs, since this
4566          * is not that obvious to implement given the libc and kernel definitions differ in the last
4567          * argument. Moreover, the only reason to use it is the more accurate time-outs (which is not a
4568          * biggie), let's hence rely on glibc's definitions, and fallback to epoll_pwait() when that's
4569          * missing. */
4570
4571         if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
4572                 r = epoll_pwait2(fd,
4573                                  events,
4574                                  maxevents,
4575                                  TIMESPEC_STORE(timeout),
4576                                  NULL);
4577                 if (r >= 0)
4578                         return r;
4579                 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
4580                         return -errno; /* Only fallback to old epoll_wait() if the syscall is masked or not
4581                                         * supported. */
4582
4583                 epoll_pwait2_absent = true;
4584         }
4585 #endif
4586
4587         if (timeout == USEC_INFINITY)
4588                 msec = -1;
4589         else {
4590                 usec_t k;
4591
4592                 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
4593                 if (k >= INT_MAX)
4594                         msec = INT_MAX; /* Saturate */
4595                 else
4596                         msec = (int) k;
4597         }
4598
4599         return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
4600 }
4601
4602 static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
4603         size_t n_event_queue, m, n_event_max;
4604         int64_t min_priority = threshold;
4605         bool something_new = false;
4606         int r;
4607
4608         assert(e);
4609         assert(ret_min_priority);
4610
4611         n_event_queue = MAX(e->n_sources, 1u);
4612         if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
4613                 return -ENOMEM;
4614
4615         n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4616
4617         /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
4618         if (e->buffered_inotify_data_list)
4619                 timeout = 0;
4620
4621         for (;;) {
4622                 r = epoll_wait_usec(
4623                                 e->epoll_fd,
4624                                 e->event_queue,
4625                                 n_event_max,
4626                                 timeout);
4627                 if (r < 0)
4628                         return r;
4629
4630                 m = (size_t) r;
4631
4632                 if (m < n_event_max)
4633                         break;
4634
4635                 if (n_event_max >= n_event_queue * 10)
4636                         break;
4637
4638                 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
4639                         return -ENOMEM;
4640
4641                 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
4642                 timeout = 0;
4643         }
4644
4645         /* Set timestamp only when this is called first time. */
4646         if (threshold == INT64_MAX)
4647                 triple_timestamp_now(&e->timestamp);
4648
4649         for (size_t i = 0; i < m; i++) {
4650
4651                 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
4652                         r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
4653                 else {
4654                         WakeupType *t = e->event_queue[i].data.ptr;
4655
4656                         switch (*t) {
4657
4658                         case WAKEUP_EVENT_SOURCE: {
4659                                 sd_event_source *s = e->event_queue[i].data.ptr;
4660
4661                                 assert(s);
4662
4663                                 if (s->priority > threshold)
4664                                         continue;
4665
4666                                 min_priority = MIN(min_priority, s->priority);
4667
4668                                 switch (s->type) {
4669
4670                                 case SOURCE_IO:
4671                                         r = process_io(e, s, e->event_queue[i].events);
4672                                         break;
4673
4674                                 case SOURCE_CHILD:
4675                                         r = process_pidfd(e, s, e->event_queue[i].events);
4676                                         break;
4677
4678                                 case SOURCE_MEMORY_PRESSURE:
4679                                         r = process_memory_pressure(s, e->event_queue[i].events);
4680                                         break;
4681
4682                                 default:
4683                                         assert_not_reached();
4684                                 }
4685
4686                                 break;
4687                         }
4688
4689                         case WAKEUP_CLOCK_DATA: {
4690                                 struct clock_data *d = e->event_queue[i].data.ptr;
4691
4692                                 assert(d);
4693
4694                                 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4695                                 break;
4696                         }
4697
4698                         case WAKEUP_SIGNAL_DATA:
4699                                 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4700                                 break;
4701
4702                         case WAKEUP_INOTIFY_DATA:
4703                                 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4704                                 break;
4705
4706                         default:
4707                                 assert_not_reached();
4708                         }
4709                 }
4710                 if (r < 0)
4711                         return r;
4712                 if (r > 0)
4713                         something_new = true;
4714         }
4715
4716         *ret_min_priority = min_priority;
4717         return something_new;
4718 }
4719
4720 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4721         int r;
4722
4723         assert_return(e, -EINVAL);
4724         assert_return(e = event_resolve(e), -ENOPKG);
4725         assert_return(!event_origin_changed(e), -ECHILD);
4726         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4727         assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4728
4729         if (e->exit_requested) {
4730                 e->state = SD_EVENT_PENDING;
4731                 return 1;
4732         }
4733
4734         for (int64_t threshold = INT64_MAX; ; threshold--) {
4735                 int64_t epoll_min_priority, child_min_priority;
4736
4737                 /* There may be a possibility that new epoll (especially IO) and child events are
4738                  * triggered just after process_epoll() call but before process_child(), and the new IO
4739                  * events may have higher priority than the child events. To salvage these events,
4740                  * let's call epoll_wait() again, but accepts only events with higher priority than the
4741                  * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
4742                  * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4743                  * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4744
4745                 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4746                 if (r == -EINTR) {
4747                         e->state = SD_EVENT_PENDING;
4748                         return 1;
4749                 }
4750                 if (r < 0)
4751                         goto finish;
4752                 if (r == 0 && threshold < INT64_MAX)
4753                         /* No new epoll event. */
4754                         break;
4755
4756                 r = process_child(e, threshold, &child_min_priority);
4757                 if (r < 0)
4758                         goto finish;
4759                 if (r == 0)
4760                         /* No new child event. */
4761                         break;
4762
4763                 threshold = MIN(epoll_min_priority, child_min_priority);
4764                 if (threshold == INT64_MIN)
4765                         break;
4766
4767                 timeout = 0;
4768         }
4769
4770         r = process_watchdog(e);
4771         if (r < 0)
4772                 goto finish;
4773
4774         r = process_inotify(e);
4775         if (r < 0)
4776                 goto finish;
4777
4778         r = process_timer(e, e->timestamp.realtime, &e->realtime);
4779         if (r < 0)
4780                 goto finish;
4781
4782         r = process_timer(e, e->timestamp.boottime, &e->boottime);
4783         if (r < 0)
4784                 goto finish;
4785
4786         r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4787         if (r < 0)
4788                 goto finish;
4789
4790         r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4791         if (r < 0)
4792                 goto finish;
4793
4794         r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4795         if (r < 0)
4796                 goto finish;
4797         else if (r == 1) {
4798                 /* Ratelimit expiry callback was called. Let's postpone processing pending sources and
4799                  * put loop in the initial state in order to evaluate (in the next iteration) also sources
4800                  * there were potentially re-enabled by the callback.
4801                  *
4802                  * Wondering why we treat only this invocation of process_timer() differently? Once event
4803                  * source is ratelimited we essentially transform it into CLOCK_MONOTONIC timer hence
4804                  * ratelimit expiry callback is never called for any other timer type. */
4805                 r = 0;
4806                 goto finish;
4807         }
4808
4809         if (event_next_pending(e)) {
4810                 e->state = SD_EVENT_PENDING;
4811                 return 1;
4812         }
4813
4814         r = 0;
4815
4816 finish:
4817         e->state = SD_EVENT_INITIAL;
4818
4819         return r;
4820 }
4821
4822 _public_ int sd_event_dispatch(sd_event *e) {
4823         sd_event_source *p;
4824         int r;
4825
4826         assert_return(e, -EINVAL);
4827         assert_return(e = event_resolve(e), -ENOPKG);
4828         assert_return(!event_origin_changed(e), -ECHILD);
4829         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4830         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4831
4832         if (e->exit_requested)
4833                 return dispatch_exit(e);
4834
4835         p = event_next_pending(e);
4836         if (p) {
4837                 PROTECT_EVENT(e);
4838
4839                 e->state = SD_EVENT_RUNNING;
4840                 r = source_dispatch(p);
4841                 e->state = SD_EVENT_INITIAL;
4842                 return r;
4843         }
4844
4845         e->state = SD_EVENT_INITIAL;
4846
4847         return 1;
4848 }
4849
4850 static void event_log_delays(sd_event *e) {
4851         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4852         size_t l, i;
4853
4854         p = b;
4855         l = sizeof(b);
4856         for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4857                 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4858                 e->delays[i] = 0;
4859         }
4860         log_debug("Event loop iterations: %s", b);
4861 }
4862
4863 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4864         int r;
4865
4866         assert_return(e, -EINVAL);
4867         assert_return(e = event_resolve(e), -ENOPKG);
4868         assert_return(!event_origin_changed(e), -ECHILD);
4869         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4870         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4871
4872         if (e->profile_delays && e->last_run_usec != 0) {
4873                 usec_t this_run;
4874                 unsigned l;
4875
4876                 this_run = now(CLOCK_MONOTONIC);
4877
4878                 l = log2u64(this_run - e->last_run_usec);
4879                 assert(l < ELEMENTSOF(e->delays));
4880                 e->delays[l]++;
4881
4882                 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4883                         event_log_delays(e);
4884                         e->last_log_usec = this_run;
4885                 }
4886         }
4887
4888         /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4889         PROTECT_EVENT(e);
4890
4891         r = sd_event_prepare(e);
4892         if (r == 0)
4893                 /* There was nothing? Then wait... */
4894                 r = sd_event_wait(e, timeout);
4895
4896         if (e->profile_delays)
4897                 e->last_run_usec = now(CLOCK_MONOTONIC);
4898
4899         if (r > 0) {
4900                 /* There's something now, then let's dispatch it */
4901                 r = sd_event_dispatch(e);
4902                 if (r < 0)
4903                         return r;
4904
4905                 return 1;
4906         }
4907
4908         return r;
4909 }
4910
4911 _public_ int sd_event_loop(sd_event *e) {
4912         int r;
4913
4914         assert_return(e, -EINVAL);
4915         assert_return(e = event_resolve(e), -ENOPKG);
4916         assert_return(!event_origin_changed(e), -ECHILD);
4917         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4918
4919
4920         PROTECT_EVENT(e);
4921
4922         while (e->state != SD_EVENT_FINISHED) {
4923                 r = sd_event_run(e, UINT64_MAX);
4924                 if (r < 0)
4925                         return r;
4926         }
4927
4928         return e->exit_code;
4929 }
4930
4931 _public_ int sd_event_get_fd(sd_event *e) {
4932         assert_return(e, -EINVAL);
4933         assert_return(e = event_resolve(e), -ENOPKG);
4934         assert_return(!event_origin_changed(e), -ECHILD);
4935
4936         return e->epoll_fd;
4937 }
4938
4939 _public_ int sd_event_get_state(sd_event *e) {
4940         assert_return(e, -EINVAL);
4941         assert_return(e = event_resolve(e), -ENOPKG);
4942         assert_return(!event_origin_changed(e), -ECHILD);
4943
4944         return e->state;
4945 }
4946
4947 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4948         assert_return(e, -EINVAL);
4949         assert_return(e = event_resolve(e), -ENOPKG);
4950         assert_return(!event_origin_changed(e), -ECHILD);
4951
4952         if (!e->exit_requested)
4953                 return -ENODATA;
4954
4955         if (code)
4956                 *code = e->exit_code;
4957         return 0;
4958 }
4959
4960 _public_ int sd_event_exit(sd_event *e, int code) {
4961         assert_return(e, -EINVAL);
4962         assert_return(e = event_resolve(e), -ENOPKG);
4963         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4964         assert_return(!event_origin_changed(e), -ECHILD);
4965
4966         e->exit_requested = true;
4967         e->exit_code = code;
4968
4969         return 0;
4970 }
4971
4972 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4973         assert_return(e, -EINVAL);
4974         assert_return(e = event_resolve(e), -ENOPKG);
4975         assert_return(usec, -EINVAL);
4976         assert_return(!event_origin_changed(e), -ECHILD);
4977
4978         if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4979                 return -EOPNOTSUPP;
4980
4981         if (!triple_timestamp_is_set(&e->timestamp)) {
4982                 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4983                 *usec = now(clock);
4984                 return 1;
4985         }
4986
4987         *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4988         return 0;
4989 }
4990
4991 _public_ int sd_event_default(sd_event **ret) {
4992         sd_event *e = NULL;
4993         int r;
4994
4995         if (!ret)
4996                 return !!default_event;
4997
4998         if (default_event) {
4999                 *ret = sd_event_ref(default_event);
5000                 return 0;
5001         }
5002
5003         r = sd_event_new(&e);
5004         if (r < 0)
5005                 return r;
5006
5007         e->default_event_ptr = &default_event;
5008         e->tid = gettid();
5009         default_event = e;
5010
5011         *ret = e;
5012         return 1;
5013 }
5014
5015 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
5016         assert_return(e, -EINVAL);
5017         assert_return(e = event_resolve(e), -ENOPKG);
5018         assert_return(tid, -EINVAL);
5019         assert_return(!event_origin_changed(e), -ECHILD);
5020
5021         if (e->tid != 0) {
5022                 *tid = e->tid;
5023                 return 0;
5024         }
5025
5026         return -ENXIO;
5027 }
5028
5029 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
5030         int r;
5031
5032         assert_return(e, -EINVAL);
5033         assert_return(e = event_resolve(e), -ENOPKG);
5034         assert_return(!event_origin_changed(e), -ECHILD);
5035
5036         if (e->watchdog == !!b)
5037                 return e->watchdog;
5038
5039         if (b) {
5040                 r = sd_watchdog_enabled(false, &e->watchdog_period);
5041                 if (r <= 0)
5042                         return r;
5043
5044                 /* Issue first ping immediately */
5045                 sd_notify(false, "WATCHDOG=1");
5046                 e->watchdog_last = now(CLOCK_MONOTONIC);
5047
5048                 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
5049                 if (e->watchdog_fd < 0)
5050                         return -errno;
5051
5052                 r = arm_watchdog(e);
5053                 if (r < 0)
5054                         goto fail;
5055
5056                 struct epoll_event ev = {
5057                         .events = EPOLLIN,
5058                         .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
5059                 };
5060
5061                 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
5062                         r = -errno;
5063                         goto fail;
5064                 }
5065
5066         } else {
5067                 if (e->watchdog_fd >= 0) {
5068                         (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
5069                         e->watchdog_fd = safe_close(e->watchdog_fd);
5070                 }
5071         }
5072
5073         e->watchdog = b;
5074         return e->watchdog;
5075
5076 fail:
5077         e->watchdog_fd = safe_close(e->watchdog_fd);
5078         return r;
5079 }
5080
5081 _public_ int sd_event_get_watchdog(sd_event *e) {
5082         assert_return(e, -EINVAL);
5083         assert_return(e = event_resolve(e), -ENOPKG);
5084         assert_return(!event_origin_changed(e), -ECHILD);
5085
5086         return e->watchdog;
5087 }
5088
5089 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
5090         assert_return(e, -EINVAL);
5091         assert_return(e = event_resolve(e), -ENOPKG);
5092         assert_return(!event_origin_changed(e), -ECHILD);
5093
5094         *ret = e->iteration;
5095         return 0;
5096 }
5097
5098 _public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
5099         assert_return(s, -EINVAL);
5100         assert_return(s->event, -EINVAL);
5101         assert_return(!event_origin_changed(s->event), -ECHILD);
5102
5103         s->destroy_callback = callback;
5104         return 0;
5105 }
5106
5107 _public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
5108         assert_return(s, -EINVAL);
5109         assert_return(!event_origin_changed(s->event), -ECHILD);
5110
5111         if (ret)
5112                 *ret = s->destroy_callback;
5113
5114         return !!s->destroy_callback;
5115 }
5116
5117 _public_ int sd_event_source_get_floating(sd_event_source *s) {
5118         assert_return(s, -EINVAL);
5119         assert_return(!event_origin_changed(s->event), -ECHILD);
5120
5121         return s->floating;
5122 }
5123
5124 _public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
5125         assert_return(s, -EINVAL);
5126         assert_return(!event_origin_changed(s->event), -ECHILD);
5127
5128         if (s->floating == !!b)
5129                 return 0;
5130
5131         if (!s->event) /* Already disconnected */
5132                 return -ESTALE;
5133
5134         s->floating = b;
5135
5136         if (b) {
5137                 sd_event_source_ref(s);
5138                 sd_event_unref(s->event);
5139         } else {
5140                 sd_event_ref(s->event);
5141                 sd_event_source_unref(s);
5142         }
5143
5144         return 1;
5145 }
5146
5147 _public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
5148         assert_return(s, -EINVAL);
5149         assert_return(s->type != SOURCE_EXIT, -EDOM);
5150         assert_return(!event_origin_changed(s->event), -ECHILD);
5151
5152         return s->exit_on_failure;
5153 }
5154
5155 _public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
5156         assert_return(s, -EINVAL);
5157         assert_return(s->type != SOURCE_EXIT, -EDOM);
5158         assert_return(!event_origin_changed(s->event), -ECHILD);
5159
5160         if (s->exit_on_failure == !!b)
5161                 return 0;
5162
5163         s->exit_on_failure = b;
5164         return 1;
5165 }
5166
5167 _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
5168         int r;
5169
5170         assert_return(s, -EINVAL);
5171         assert_return(!event_origin_changed(s->event), -ECHILD);
5172
5173         /* Turning on ratelimiting on event source types that don't support it, is a loggable offense. Doing
5174          * so is a programming error. */
5175         assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
5176
5177         /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
5178          * non-ratelimited. */
5179         r = event_source_leave_ratelimit(s, /* run_callback */ false);
5180         if (r < 0)
5181                 return r;
5182
5183         s->rate_limit = (RateLimit) { interval, burst };
5184         return 0;
5185 }
5186
5187 _public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
5188         assert_return(s, -EINVAL);
5189         assert_return(!event_origin_changed(s->event), -ECHILD);
5190
5191         s->ratelimit_expire_callback = callback;
5192         return 0;
5193 }
5194
5195 _public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
5196         assert_return(s, -EINVAL);
5197         assert_return(!event_origin_changed(s->event), -ECHILD);
5198
5199         /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
5200          * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error. */
5201         if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5202                 return -EDOM;
5203
5204         if (!ratelimit_configured(&s->rate_limit))
5205                 return -ENOEXEC;
5206
5207         if (ret_interval)
5208                 *ret_interval = s->rate_limit.interval;
5209         if (ret_burst)
5210                 *ret_burst = s->rate_limit.burst;
5211
5212         return 0;
5213 }
5214
5215 _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
5216         assert_return(s, -EINVAL);
5217         assert_return(!event_origin_changed(s->event), -ECHILD);
5218
5219         if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5220                 return false;
5221
5222         if (!ratelimit_configured(&s->rate_limit))
5223                 return false;
5224
5225         return s->ratelimited;
5226 }
5227
5228 _public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
5229         int r;
5230
5231         assert_return(s, -EINVAL);
5232
5233         if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
5234                 return 0;
5235
5236         if (!ratelimit_configured(&s->rate_limit))
5237                 return 0;
5238
5239         if (!s->ratelimited)
5240                 return 0;
5241
5242         r = event_source_leave_ratelimit(s, /* run_callback */ false);
5243         if (r < 0)
5244                 return r;
5245
5246         return 1; /* tell caller that we indeed just left the ratelimit state */
5247 }
5248
5249 _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
5250         bool change = false;
5251         int r;
5252
5253         assert_return(e, -EINVAL);
5254
5255         if (b) {
5256                 /* We want to maintain pointers to these event sources, so that we can destroy them when told
5257                  * so. But we also don't want them to pin the event loop itself. Hence we mark them as
5258                  * floating after creation (and undo this before deleting them again). */
5259
5260                 if (!e->sigint_event_source) {
5261                         r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5262                         if (r < 0)
5263                                 return r;
5264
5265                         assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
5266                         change = true;
5267                 }
5268
5269                 if (!e->sigterm_event_source) {
5270                         r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
5271                         if (r < 0) {
5272                                 if (change) {
5273                                         assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5274                                         e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5275                                 }
5276
5277                                 return r;
5278                         }
5279
5280                         assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
5281                         change = true;
5282                 }
5283
5284         } else {
5285                 if (e->sigint_event_source) {
5286                         assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
5287                         e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
5288                         change = true;
5289                 }
5290
5291                 if (e->sigterm_event_source) {
5292                         assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
5293                         e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
5294                         change = true;
5295                 }
5296         }
5297
5298         return change;
5299 }
5300
5301 _public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
5302         _cleanup_free_ char *b = NULL;
5303         _cleanup_free_ void *w = NULL;
5304
5305         assert_return(s, -EINVAL);
5306         assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5307         assert_return(ty, -EINVAL);
5308         assert_return(!event_origin_changed(s->event), -ECHILD);
5309
5310         if (!STR_IN_SET(ty, "some", "full"))
5311                 return -EINVAL;
5312
5313         if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5314                 return -EBUSY;
5315
5316         char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5317         if (!space)
5318                 return -EINVAL;
5319
5320         size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5321         b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5322         if (!b)
5323                 return -ENOMEM;
5324         if (!STR_IN_SET(b, "some", "full"))
5325                 return -EINVAL;
5326
5327         if (streq(b, ty))
5328                 return 0;
5329
5330         size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l);
5331         w = new(char, nl);
5332         if (!w)
5333                 return -ENOMEM;
5334
5335         memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l));
5336
5337         free_and_replace(s->memory_pressure.write_buffer, w);
5338         s->memory_pressure.write_buffer_size = nl;
5339         s->memory_pressure.locked = false;
5340
5341         return 1;
5342 }
5343
5344 _public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
5345         _cleanup_free_ char *b = NULL;
5346         _cleanup_free_ void *w = NULL;
5347
5348         assert_return(s, -EINVAL);
5349         assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
5350         assert_return(!event_origin_changed(s->event), -ECHILD);
5351
5352         if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
5353                 return -ERANGE;
5354         if (window_usec <= 0 || window_usec >= UINT64_MAX)
5355                 return -ERANGE;
5356         if (threshold_usec > window_usec)
5357                 return -EINVAL;
5358
5359         if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
5360                 return -EBUSY;
5361
5362         char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
5363         if (!space)
5364                 return -EINVAL;
5365
5366         size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
5367         b = memdup_suffix0(s->memory_pressure.write_buffer, l);
5368         if (!b)
5369                 return -ENOMEM;
5370         if (!STR_IN_SET(b, "some", "full"))
5371                 return -EINVAL;
5372
5373         if (asprintf((char**) &w,
5374                      "%s " USEC_FMT " " USEC_FMT "",
5375                      b,
5376                      threshold_usec,
5377                      window_usec) < 0)
5378                 return -EINVAL;
5379
5380         l = strlen(w) + 1;
5381         if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
5382                 return 0;
5383
5384         free_and_replace(s->memory_pressure.write_buffer, w);
5385         s->memory_pressure.write_buffer_size = l;
5386         s->memory_pressure.locked = false;
5387
5388         return 1;
5389 }