src/libsystemd/sd-event/sd-event.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <sys/epoll.h>
   4 #include <sys/timerfd.h>
   5 #include <sys/wait.h>
   6
   7 #include "sd-daemon.h"
   8 #include "sd-event.h"
   9 #include "sd-id128.h"
  10
  11 #include "alloc-util.h"
  12 #include "env-util.h"
  13 #include "event-source.h"
  14 #include "fd-util.h"
  15 #include "fs-util.h"
  16 #include "hashmap.h"
  17 #include "list.h"
  18 #include "macro.h"
  19 #include "memory-util.h"
  20 #include "missing_syscall.h"
  21 #include "prioq.h"
  22 #include "process-util.h"
  23 #include "set.h"
  24 #include "signal-util.h"
  25 #include "string-table.h"
  26 #include "string-util.h"
  27 #include "strxcpyx.h"
  28 #include "time-util.h"
  29
  30 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
  31
  32 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
  33         /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
  34         return s &&
  35                 s->type == SOURCE_CHILD &&
  36                 s->child.pidfd >= 0 &&
  37                 s->child.options == WEXITED;
  38 }
  39
  40 static bool event_source_is_online(sd_event_source *s) {
  41         assert(s);
  42         return s->enabled != SD_EVENT_OFF && !s->ratelimited;
  43 }
  44
  45 static bool event_source_is_offline(sd_event_source *s) {
  46         assert(s);
  47         return s->enabled == SD_EVENT_OFF || s->ratelimited;
  48 }
  49
  50 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
  51         [SOURCE_IO] = "io",
  52         [SOURCE_TIME_REALTIME] = "realtime",
  53         [SOURCE_TIME_BOOTTIME] = "bootime",
  54         [SOURCE_TIME_MONOTONIC] = "monotonic",
  55         [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
  56         [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
  57         [SOURCE_SIGNAL] = "signal",
  58         [SOURCE_CHILD] = "child",
  59         [SOURCE_DEFER] = "defer",
  60         [SOURCE_POST] = "post",
  61         [SOURCE_EXIT] = "exit",
  62         [SOURCE_WATCHDOG] = "watchdog",
  63         [SOURCE_INOTIFY] = "inotify",
  64 };
  65
  66 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
  67
  68 #define EVENT_SOURCE_IS_TIME(t)                 \
  69         IN_SET((t),                             \
  70                SOURCE_TIME_REALTIME,            \
  71                SOURCE_TIME_BOOTTIME,            \
  72                SOURCE_TIME_MONOTONIC,           \
  73                SOURCE_TIME_REALTIME_ALARM,      \
  74                SOURCE_TIME_BOOTTIME_ALARM)
  75
  76 #define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
  77         IN_SET((t),                             \
  78                SOURCE_IO,                       \
  79                SOURCE_TIME_REALTIME,            \
  80                SOURCE_TIME_BOOTTIME,            \
  81                SOURCE_TIME_MONOTONIC,           \
  82                SOURCE_TIME_REALTIME_ALARM,      \
  83                SOURCE_TIME_BOOTTIME_ALARM,      \
  84                SOURCE_SIGNAL,                   \
  85                SOURCE_DEFER,                    \
  86                SOURCE_INOTIFY)
  87
  88 struct sd_event {
  89         unsigned n_ref;
  90
  91         int epoll_fd;
  92         int watchdog_fd;
  93
  94         Prioq *pending;
  95         Prioq *prepare;
  96
  97         /* timerfd_create() only supports these five clocks so far. We
  98          * can add support for more clocks when the kernel learns to
  99          * deal with them, too. */
 100         struct clock_data realtime;
 101         struct clock_data boottime;
 102         struct clock_data monotonic;
 103         struct clock_data realtime_alarm;
 104         struct clock_data boottime_alarm;
 105
 106         usec_t perturb;
 107
 108         sd_event_source **signal_sources; /* indexed by signal number */
 109         Hashmap *signal_data; /* indexed by priority */
 110
 111         Hashmap *child_sources;
 112         unsigned n_online_child_sources;
 113
 114         Set *post_sources;
 115
 116         Prioq *exit;
 117
 118         Hashmap *inotify_data; /* indexed by priority */
 119
 120         /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
 121         LIST_HEAD(struct inode_data, inode_data_to_close);
 122
 123         /* A list of inotify objects that already have events buffered which aren't processed yet */
 124         LIST_HEAD(struct inotify_data, inotify_data_buffered);
 125
 126         pid_t original_pid;
 127
 128         uint64_t iteration;
 129         triple_timestamp timestamp;
 130         int state;
 131
 132         bool exit_requested:1;
 133         bool need_process_child:1;
 134         bool watchdog:1;
 135         bool profile_delays:1;
 136
 137         int exit_code;
 138
 139         pid_t tid;
 140         sd_event **default_event_ptr;
 141
 142         usec_t watchdog_last, watchdog_period;
 143
 144         unsigned n_sources;
 145
 146         struct epoll_event *event_queue;
 147         size_t event_queue_allocated;
 148
 149         LIST_HEAD(sd_event_source, sources);
 150
 151         usec_t last_run_usec, last_log_usec;
 152         unsigned delays[sizeof(usec_t) * 8];
 153 };
 154
 155 static thread_local sd_event *default_event = NULL;
 156
 157 static void source_disconnect(sd_event_source *s);
 158 static void event_gc_inode_data(sd_event *e, struct inode_data *d);
 159
 160 static sd_event *event_resolve(sd_event *e) {
 161         return e == SD_EVENT_DEFAULT ? default_event : e;
 162 }
 163
 164 static int pending_prioq_compare(const void *a, const void *b) {
 165         const sd_event_source *x = a, *y = b;
 166         int r;
 167
 168         assert(x->pending);
 169         assert(y->pending);
 170
 171         /* Enabled ones first */
 172         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
 173                 return -1;
 174         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
 175                 return 1;
 176
 177         /* Non rate-limited ones first. */
 178         r = CMP(!!x->ratelimited, !!y->ratelimited);
 179         if (r != 0)
 180                 return r;
 181
 182         /* Lower priority values first */
 183         r = CMP(x->priority, y->priority);
 184         if (r != 0)
 185                 return r;
 186
 187         /* Older entries first */
 188         return CMP(x->pending_iteration, y->pending_iteration);
 189 }
 190
 191 static int prepare_prioq_compare(const void *a, const void *b) {
 192         const sd_event_source *x = a, *y = b;
 193         int r;
 194
 195         assert(x->prepare);
 196         assert(y->prepare);
 197
 198         /* Enabled ones first */
 199         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
 200                 return -1;
 201         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
 202                 return 1;
 203
 204         /* Non rate-limited ones first. */
 205         r = CMP(!!x->ratelimited, !!y->ratelimited);
 206         if (r != 0)
 207                 return r;
 208
 209         /* Move most recently prepared ones last, so that we can stop
 210          * preparing as soon as we hit one that has already been
 211          * prepared in the current iteration */
 212         r = CMP(x->prepare_iteration, y->prepare_iteration);
 213         if (r != 0)
 214                 return r;
 215
 216         /* Lower priority values first */
 217         return CMP(x->priority, y->priority);
 218 }
 219
 220 static usec_t time_event_source_next(const sd_event_source *s) {
 221         assert(s);
 222
 223         /* We have two kinds of event sources that have elapsation times associated with them: the actual
 224          * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
 225          * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
 226          * looking at here. */
 227
 228         if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
 229                 assert(s->rate_limit.begin != 0);
 230                 assert(s->rate_limit.interval != 0);
 231                 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
 232         }
 233
 234         /* Otherwise this must be a time event source, if not ratelimited */
 235         if (EVENT_SOURCE_IS_TIME(s->type))
 236                 return s->time.next;
 237
 238         return USEC_INFINITY;
 239 }
 240
 241 static int earliest_time_prioq_compare(const void *a, const void *b) {
 242         const sd_event_source *x = a, *y = b;
 243
 244         /* Enabled ones first */
 245         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
 246                 return -1;
 247         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
 248                 return 1;
 249
 250         /* Move the pending ones to the end */
 251         if (!x->pending && y->pending)
 252                 return -1;
 253         if (x->pending && !y->pending)
 254                 return 1;
 255
 256         /* Order by time */
 257         return CMP(time_event_source_next(x), time_event_source_next(y));
 258 }
 259
 260 static usec_t time_event_source_latest(const sd_event_source *s) {
 261         assert(s);
 262
 263         if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
 264                                * same, as we should avoid adding additional inaccuracy on an inaccuracy time
 265                                * window */
 266                 assert(s->rate_limit.begin != 0);
 267                 assert(s->rate_limit.interval != 0);
 268                 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
 269         }
 270
 271         /* Must be a time event source, if not ratelimited */
 272         if (EVENT_SOURCE_IS_TIME(s->type))
 273                 return usec_add(s->time.next, s->time.accuracy);
 274
 275         return USEC_INFINITY;
 276 }
 277
 278 static int latest_time_prioq_compare(const void *a, const void *b) {
 279         const sd_event_source *x = a, *y = b;
 280
 281         /* Enabled ones first */
 282         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
 283                 return -1;
 284         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
 285                 return 1;
 286
 287         /* Move the pending ones to the end */
 288         if (!x->pending && y->pending)
 289                 return -1;
 290         if (x->pending && !y->pending)
 291                 return 1;
 292
 293         /* Order by time */
 294         return CMP(time_event_source_latest(x), time_event_source_latest(y));
 295 }
 296
 297 static int exit_prioq_compare(const void *a, const void *b) {
 298         const sd_event_source *x = a, *y = b;
 299
 300         assert(x->type == SOURCE_EXIT);
 301         assert(y->type == SOURCE_EXIT);
 302
 303         /* Enabled ones first */
 304         if (x->enabled != SD_EVENT_OFF && y->enabled == SD_EVENT_OFF)
 305                 return -1;
 306         if (x->enabled == SD_EVENT_OFF && y->enabled != SD_EVENT_OFF)
 307                 return 1;
 308
 309         /* Lower priority values first */
 310         return CMP(x->priority, y->priority);
 311 }
 312
 313 static void free_clock_data(struct clock_data *d) {
 314         assert(d);
 315         assert(d->wakeup == WAKEUP_CLOCK_DATA);
 316
 317         safe_close(d->fd);
 318         prioq_free(d->earliest);
 319         prioq_free(d->latest);
 320 }
 321
 322 static sd_event *event_free(sd_event *e) {
 323         sd_event_source *s;
 324
 325         assert(e);
 326
 327         while ((s = e->sources)) {
 328                 assert(s->floating);
 329                 source_disconnect(s);
 330                 sd_event_source_unref(s);
 331         }
 332
 333         assert(e->n_sources == 0);
 334
 335         if (e->default_event_ptr)
 336                 *(e->default_event_ptr) = NULL;
 337
 338         safe_close(e->epoll_fd);
 339         safe_close(e->watchdog_fd);
 340
 341         free_clock_data(&e->realtime);
 342         free_clock_data(&e->boottime);
 343         free_clock_data(&e->monotonic);
 344         free_clock_data(&e->realtime_alarm);
 345         free_clock_data(&e->boottime_alarm);
 346
 347         prioq_free(e->pending);
 348         prioq_free(e->prepare);
 349         prioq_free(e->exit);
 350
 351         free(e->signal_sources);
 352         hashmap_free(e->signal_data);
 353
 354         hashmap_free(e->inotify_data);
 355
 356         hashmap_free(e->child_sources);
 357         set_free(e->post_sources);
 358
 359         free(e->event_queue);
 360
 361         return mfree(e);
 362 }
 363
 364 _public_ int sd_event_new(sd_event** ret) {
 365         sd_event *e;
 366         int r;
 367
 368         assert_return(ret, -EINVAL);
 369
 370         e = new(sd_event, 1);
 371         if (!e)
 372                 return -ENOMEM;
 373
 374         *e = (sd_event) {
 375                 .n_ref = 1,
 376                 .epoll_fd = -1,
 377                 .watchdog_fd = -1,
 378                 .realtime.wakeup = WAKEUP_CLOCK_DATA,
 379                 .realtime.fd = -1,
 380                 .realtime.next = USEC_INFINITY,
 381                 .boottime.wakeup = WAKEUP_CLOCK_DATA,
 382                 .boottime.fd = -1,
 383                 .boottime.next = USEC_INFINITY,
 384                 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
 385                 .monotonic.fd = -1,
 386                 .monotonic.next = USEC_INFINITY,
 387                 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
 388                 .realtime_alarm.fd = -1,
 389                 .realtime_alarm.next = USEC_INFINITY,
 390                 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
 391                 .boottime_alarm.fd = -1,
 392                 .boottime_alarm.next = USEC_INFINITY,
 393                 .perturb = USEC_INFINITY,
 394                 .original_pid = getpid_cached(),
 395         };
 396
 397         r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
 398         if (r < 0)
 399                 goto fail;
 400
 401         e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
 402         if (e->epoll_fd < 0) {
 403                 r = -errno;
 404                 goto fail;
 405         }
 406
 407         e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
 408
 409         if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
 410                 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 … 2^63 us will be logged every 5s.");
 411                 e->profile_delays = true;
 412         }
 413
 414         *ret = e;
 415         return 0;
 416
 417 fail:
 418         event_free(e);
 419         return r;
 420 }
 421
/* NOTE(review): presumably expands to the public sd_event_ref()/sd_event_unref() pair operating on
 * sd_event.n_ref, with event_free() invoked when the last reference is dropped — confirm against the
 * macro definition. */
DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event, sd_event, event_free);
 423
 424 _public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
 425         if (s)
 426                 (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
 427         return sd_event_source_unref(s);
 428 }
 429
 430 static bool event_pid_changed(sd_event *e) {
 431         assert(e);
 432
 433         /* We don't support people creating an event loop and keeping
 434          * it around over a fork(). Let's complain. */
 435
 436         return e->original_pid != getpid_cached();
 437 }
 438
 439 static void source_io_unregister(sd_event_source *s) {
 440         assert(s);
 441         assert(s->type == SOURCE_IO);
 442
 443         if (event_pid_changed(s->event))
 444                 return;
 445
 446         if (!s->io.registered)
 447                 return;
 448
 449         if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
 450                 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
 451                                 strna(s->description), event_source_type_to_string(s->type));
 452
 453         s->io.registered = false;
 454 }
 455
 456 static int source_io_register(
 457                 sd_event_source *s,
 458                 int enabled,
 459                 uint32_t events) {
 460
 461         assert(s);
 462         assert(s->type == SOURCE_IO);
 463         assert(enabled != SD_EVENT_OFF);
 464
 465         struct epoll_event ev = {
 466                 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
 467                 .data.ptr = s,
 468         };
 469
 470         if (epoll_ctl(s->event->epoll_fd,
 471                       s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
 472                       s->io.fd, &ev) < 0)
 473                 return -errno;
 474
 475         s->io.registered = true;
 476
 477         return 0;
 478 }
 479
 480 static void source_child_pidfd_unregister(sd_event_source *s) {
 481         assert(s);
 482         assert(s->type == SOURCE_CHILD);
 483
 484         if (event_pid_changed(s->event))
 485                 return;
 486
 487         if (!s->child.registered)
 488                 return;
 489
 490         if (EVENT_SOURCE_WATCH_PIDFD(s))
 491                 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
 492                         log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
 493                                         strna(s->description), event_source_type_to_string(s->type));
 494
 495         s->child.registered = false;
 496 }
 497
 498 static int source_child_pidfd_register(sd_event_source *s, int enabled) {
 499         assert(s);
 500         assert(s->type == SOURCE_CHILD);
 501         assert(enabled != SD_EVENT_OFF);
 502
 503         if (EVENT_SOURCE_WATCH_PIDFD(s)) {
 504                 struct epoll_event ev = {
 505                         .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
 506                         .data.ptr = s,
 507                 };
 508
 509                 if (epoll_ctl(s->event->epoll_fd,
 510                               s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
 511                               s->child.pidfd, &ev) < 0)
 512                         return -errno;
 513         }
 514
 515         s->child.registered = true;
 516         return 0;
 517 }
 518
 519 static clockid_t event_source_type_to_clock(EventSourceType t) {
 520
 521         switch (t) {
 522
 523         case SOURCE_TIME_REALTIME:
 524                 return CLOCK_REALTIME;
 525
 526         case SOURCE_TIME_BOOTTIME:
 527                 return CLOCK_BOOTTIME;
 528
 529         case SOURCE_TIME_MONOTONIC:
 530                 return CLOCK_MONOTONIC;
 531
 532         case SOURCE_TIME_REALTIME_ALARM:
 533                 return CLOCK_REALTIME_ALARM;
 534
 535         case SOURCE_TIME_BOOTTIME_ALARM:
 536                 return CLOCK_BOOTTIME_ALARM;
 537
 538         default:
 539                 return (clockid_t) -1;
 540         }
 541 }
 542
 543 static EventSourceType clock_to_event_source_type(clockid_t clock) {
 544
 545         switch (clock) {
 546
 547         case CLOCK_REALTIME:
 548                 return SOURCE_TIME_REALTIME;
 549
 550         case CLOCK_BOOTTIME:
 551                 return SOURCE_TIME_BOOTTIME;
 552
 553         case CLOCK_MONOTONIC:
 554                 return SOURCE_TIME_MONOTONIC;
 555
 556         case CLOCK_REALTIME_ALARM:
 557                 return SOURCE_TIME_REALTIME_ALARM;
 558
 559         case CLOCK_BOOTTIME_ALARM:
 560                 return SOURCE_TIME_BOOTTIME_ALARM;
 561
 562         default:
 563                 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
 564         }
 565 }
 566
 567 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
 568         assert(e);
 569
 570         switch (t) {
 571
 572         case SOURCE_TIME_REALTIME:
 573                 return &e->realtime;
 574
 575         case SOURCE_TIME_BOOTTIME:
 576                 return &e->boottime;
 577
 578         case SOURCE_TIME_MONOTONIC:
 579                 return &e->monotonic;
 580
 581         case SOURCE_TIME_REALTIME_ALARM:
 582                 return &e->realtime_alarm;
 583
 584         case SOURCE_TIME_BOOTTIME_ALARM:
 585                 return &e->boottime_alarm;
 586
 587         default:
 588                 return NULL;
 589         }
 590 }
 591
 592 static void event_free_signal_data(sd_event *e, struct signal_data *d) {
 593         assert(e);
 594
 595         if (!d)
 596                 return;
 597
 598         hashmap_remove(e->signal_data, &d->priority);
 599         safe_close(d->fd);
 600         free(d);
 601 }
 602
 603 static int event_make_signal_data(
 604                 sd_event *e,
 605                 int sig,
 606                 struct signal_data **ret) {
 607
 608         struct signal_data *d;
 609         bool added = false;
 610         sigset_t ss_copy;
 611         int64_t priority;
 612         int r;
 613
 614         assert(e);
 615
 616         if (event_pid_changed(e))
 617                 return -ECHILD;
 618
 619         if (e->signal_sources && e->signal_sources[sig])
 620                 priority = e->signal_sources[sig]->priority;
 621         else
 622                 priority = SD_EVENT_PRIORITY_NORMAL;
 623
 624         d = hashmap_get(e->signal_data, &priority);
 625         if (d) {
 626                 if (sigismember(&d->sigset, sig) > 0) {
 627                         if (ret)
 628                                 *ret = d;
 629                         return 0;
 630                 }
 631         } else {
 632                 d = new(struct signal_data, 1);
 633                 if (!d)
 634                         return -ENOMEM;
 635
 636                 *d = (struct signal_data) {
 637                         .wakeup = WAKEUP_SIGNAL_DATA,
 638                         .fd = -1,
 639                         .priority = priority,
 640                 };
 641
 642                 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
 643                 if (r < 0) {
 644                         free(d);
 645                         return r;
 646                 }
 647
 648                 added = true;
 649         }
 650
 651         ss_copy = d->sigset;
 652         assert_se(sigaddset(&ss_copy, sig) >= 0);
 653
 654         r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
 655         if (r < 0) {
 656                 r = -errno;
 657                 goto fail;
 658         }
 659
 660         d->sigset = ss_copy;
 661
 662         if (d->fd >= 0) {
 663                 if (ret)
 664                         *ret = d;
 665                 return 0;
 666         }
 667
 668         d->fd = fd_move_above_stdio(r);
 669
 670         struct epoll_event ev = {
 671                 .events = EPOLLIN,
 672                 .data.ptr = d,
 673         };
 674
 675         if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
 676                 r = -errno;
 677                 goto fail;
 678         }
 679
 680         if (ret)
 681                 *ret = d;
 682
 683         return 0;
 684
 685 fail:
 686         if (added)
 687                 event_free_signal_data(e, d);
 688
 689         return r;
 690 }
 691
 692 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
 693         assert(e);
 694         assert(d);
 695
 696         /* Turns off the specified signal in the signal data
 697          * object. If the signal mask of the object becomes empty that
 698          * way removes it. */
 699
 700         if (sigismember(&d->sigset, sig) == 0)
 701                 return;
 702
 703         assert_se(sigdelset(&d->sigset, sig) >= 0);
 704
 705         if (sigisemptyset(&d->sigset)) {
 706                 /* If all the mask is all-zero we can get rid of the structure */
 707                 event_free_signal_data(e, d);
 708                 return;
 709         }
 710
 711         assert(d->fd >= 0);
 712
 713         if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
 714                 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
 715 }
 716
 717 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
 718         struct signal_data *d;
 719         static const int64_t zero_priority = 0;
 720
 721         assert(e);
 722
 723         /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
 724          * and possibly drop the signalfd for it. */
 725
 726         if (sig == SIGCHLD &&
 727             e->n_online_child_sources > 0)
 728                 return;
 729
 730         if (e->signal_sources &&
 731             e->signal_sources[sig] &&
 732             event_source_is_online(e->signal_sources[sig]))
 733                 return;
 734
 735         /*
 736          * The specified signal might be enabled in three different queues:
 737          *
 738          * 1) the one that belongs to the priority passed (if it is non-NULL)
 739          * 2) the one that belongs to the priority of the event source of the signal (if there is one)
 740          * 3) the 0 priority (to cover the SIGCHLD case)
 741          *
 742          * Hence, let's remove it from all three here.
 743          */
 744
 745         if (priority) {
 746                 d = hashmap_get(e->signal_data, priority);
 747                 if (d)
 748                         event_unmask_signal_data(e, d, sig);
 749         }
 750
 751         if (e->signal_sources && e->signal_sources[sig]) {
 752                 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
 753                 if (d)
 754                         event_unmask_signal_data(e, d, sig);
 755         }
 756
 757         d = hashmap_get(e->signal_data, &zero_priority);
 758         if (d)
 759                 event_unmask_signal_data(e, d, sig);
 760 }
 761
 762 static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
 763         assert(s);
 764
 765         /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
 766          * they are enabled/disabled or marked pending and such. */
 767
 768         if (s->pending)
 769                 prioq_reshuffle(s->event->pending, s, &s->pending_index);
 770
 771         if (s->prepare)
 772                 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
 773 }
 774
 775 static void event_source_time_prioq_reshuffle(sd_event_source *s) {
 776         struct clock_data *d;
 777
 778         assert(s);
 779
 780         /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
 781          * pending, enable state. Makes sure the two prioq's are ordered properly again. */
 782
 783         if (s->ratelimited)
 784                 d = &s->event->monotonic;
 785         else {
 786                 assert(EVENT_SOURCE_IS_TIME(s->type));
 787                 assert_se(d = event_get_clock_data(s->event, s->type));
 788         }
 789
 790         prioq_reshuffle(d->earliest, s, &s->earliest_index);
 791         prioq_reshuffle(d->latest, s, &s->latest_index);
 792         d->needs_rearm = true;
 793 }
 794
 795 static void event_source_time_prioq_remove(
 796                 sd_event_source *s,
 797                 struct clock_data *d) {
 798
 799         assert(s);
 800         assert(d);
 801
 802         prioq_remove(d->earliest, s, &s->earliest_index);
 803         prioq_remove(d->latest, s, &s->latest_index);
 804         s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
 805         d->needs_rearm = true;
 806 }
 807
 808 static void source_disconnect(sd_event_source *s) {
 809         sd_event *event;
 810
 811         assert(s);
 812
 813         if (!s->event)
 814                 return;
 815
 816         assert(s->event->n_sources > 0);
 817
 818         switch (s->type) {
 819
 820         case SOURCE_IO:
 821                 if (s->io.fd >= 0)
 822                         source_io_unregister(s);
 823
 824                 break;
 825
 826         case SOURCE_TIME_REALTIME:
 827         case SOURCE_TIME_BOOTTIME:
 828         case SOURCE_TIME_MONOTONIC:
 829         case SOURCE_TIME_REALTIME_ALARM:
 830         case SOURCE_TIME_BOOTTIME_ALARM:
 831                 /* Only remove this event source from the time event source here if it is not ratelimited. If
 832                  * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
 833                  * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
 834
 835                 if (!s->ratelimited) {
 836                         struct clock_data *d;
 837                         assert_se(d = event_get_clock_data(s->event, s->type));
 838                         event_source_time_prioq_remove(s, d);
 839                 }
 840
 841                 break;
 842
 843         case SOURCE_SIGNAL:
 844                 if (s->signal.sig > 0) {
 845
 846                         if (s->event->signal_sources)
 847                                 s->event->signal_sources[s->signal.sig] = NULL;
 848
 849                         event_gc_signal_data(s->event, &s->priority, s->signal.sig);
 850                 }
 851
 852                 break;
 853
 854         case SOURCE_CHILD:
 855                 if (s->child.pid > 0) {
 856                         if (event_source_is_online(s)) {
 857                                 assert(s->event->n_online_child_sources > 0);
 858                                 s->event->n_online_child_sources--;
 859                         }
 860
 861                         (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
 862                 }
 863
 864                 if (EVENT_SOURCE_WATCH_PIDFD(s))
 865                         source_child_pidfd_unregister(s);
 866                 else
 867                         event_gc_signal_data(s->event, &s->priority, SIGCHLD);
 868
 869                 break;
 870
 871         case SOURCE_DEFER:
 872                 /* nothing */
 873                 break;
 874
 875         case SOURCE_POST:
 876                 set_remove(s->event->post_sources, s);
 877                 break;
 878
 879         case SOURCE_EXIT:
 880                 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
 881                 break;
 882
 883         case SOURCE_INOTIFY: {
 884                 struct inode_data *inode_data;
 885
 886                 inode_data = s->inotify.inode_data;
 887                 if (inode_data) {
 888                         struct inotify_data *inotify_data;
 889                         assert_se(inotify_data = inode_data->inotify_data);
 890
 891                         /* Detach this event source from the inode object */
 892                         LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
 893                         s->inotify.inode_data = NULL;
 894
 895                         if (s->pending) {
 896                                 assert(inotify_data->n_pending > 0);
 897                                 inotify_data->n_pending--;
 898                         }
 899
 900                         /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is
 901                          * continued to being watched. That's because inotify doesn't really have an API for that: we
 902                          * can only change watch masks with access to the original inode either by fd or by path. But
 903                          * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
 904                          * continuously and keeping the mount busy which we can't really do. We could reconstruct the
 905                          * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
 906                          * there), but given the need for open_by_handle_at() which is privileged and not universally
 907                          * available this would be quite an incomplete solution. Hence we go the other way, leave the
 908                          * mask set, even if it is not minimized now, and ignore all events we aren't interested in
 909                          * anymore after reception. Yes, this sucks, but … Linux … */
 910
 911                         /* Maybe release the inode data (and its inotify) */
 912                         event_gc_inode_data(s->event, inode_data);
 913                 }
 914
 915                 break;
 916         }
 917
 918         default:
 919                 assert_not_reached("Wut? I shouldn't exist.");
 920         }
 921
 922         if (s->pending)
 923                 prioq_remove(s->event->pending, s, &s->pending_index);
 924
 925         if (s->prepare)
 926                 prioq_remove(s->event->prepare, s, &s->prepare_index);
 927
 928         if (s->ratelimited)
 929                 event_source_time_prioq_remove(s, &s->event->monotonic);
 930
 931         event = TAKE_PTR(s->event);
 932         LIST_REMOVE(sources, event->sources, s);
 933         event->n_sources--;
 934
 935         /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
 936          * pidfd associated with this event source, which we'll do only on source_free(). */
 937
 938         if (!s->floating)
 939                 sd_event_unref(event);
 940 }
 941
 942 static sd_event_source* source_free(sd_event_source *s) {
 943         assert(s);
 944
 945         source_disconnect(s);
 946
 947         if (s->type == SOURCE_IO && s->io.owned)
 948                 s->io.fd = safe_close(s->io.fd);
 949
 950         if (s->type == SOURCE_CHILD) {
 951                 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
 952
 953                 if (s->child.process_owned) {
 954
 955                         if (!s->child.exited) {
 956                                 bool sent = false;
 957
 958                                 if (s->child.pidfd >= 0) {
 959                                         if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
 960                                                 if (errno == ESRCH) /* Already dead */
 961                                                         sent = true;
 962                                                 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
 963                                                         log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
 964                                                                         s->child.pid);
 965                                         } else
 966                                                 sent = true;
 967                                 }
 968
 969                                 if (!sent)
 970                                         if (kill(s->child.pid, SIGKILL) < 0)
 971                                                 if (errno != ESRCH) /* Already dead */
 972                                                         log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
 973                                                                         s->child.pid);
 974                         }
 975
 976                         if (!s->child.waited) {
 977                                 siginfo_t si = {};
 978
 979                                 /* Reap the child if we can */
 980                                 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
 981                         }
 982                 }
 983
 984                 if (s->child.pidfd_owned)
 985                         s->child.pidfd = safe_close(s->child.pidfd);
 986         }
 987
 988         if (s->destroy_callback)
 989                 s->destroy_callback(s->userdata);
 990
 991         free(s->description);
 992         return mfree(s);
 993 }
 994 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
 995
 996 static int source_set_pending(sd_event_source *s, bool b) {
 997         int r;
 998
 999         assert(s);
1000         assert(s->type != SOURCE_EXIT);
1001
1002         if (s->pending == b)
1003                 return 0;
1004
1005         s->pending = b;
1006
1007         if (b) {
1008                 s->pending_iteration = s->event->iteration;
1009
1010                 r = prioq_put(s->event->pending, s, &s->pending_index);
1011                 if (r < 0) {
1012                         s->pending = false;
1013                         return r;
1014                 }
1015         } else
1016                 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1017
1018         if (EVENT_SOURCE_IS_TIME(s->type))
1019                 event_source_time_prioq_reshuffle(s);
1020
1021         if (s->type == SOURCE_SIGNAL && !b) {
1022                 struct signal_data *d;
1023
1024                 d = hashmap_get(s->event->signal_data, &s->priority);
1025                 if (d && d->current == s)
1026                         d->current = NULL;
1027         }
1028
1029         if (s->type == SOURCE_INOTIFY) {
1030
1031                 assert(s->inotify.inode_data);
1032                 assert(s->inotify.inode_data->inotify_data);
1033
1034                 if (b)
1035                         s->inotify.inode_data->inotify_data->n_pending ++;
1036                 else {
1037                         assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1038                         s->inotify.inode_data->inotify_data->n_pending --;
1039                 }
1040         }
1041
1042         return 1;
1043 }
1044
1045 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1046         sd_event_source *s;
1047
1048         assert(e);
1049
1050         s = new(sd_event_source, 1);
1051         if (!s)
1052                 return NULL;
1053
1054         *s = (struct sd_event_source) {
1055                 .n_ref = 1,
1056                 .event = e,
1057                 .floating = floating,
1058                 .type = type,
1059                 .pending_index = PRIOQ_IDX_NULL,
1060                 .prepare_index = PRIOQ_IDX_NULL,
1061         };
1062
1063         if (!floating)
1064                 sd_event_ref(e);
1065
1066         LIST_PREPEND(sources, e->sources, s);
1067         e->n_sources++;
1068
1069         return s;
1070 }
1071
1072 static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1073         assert(s);
1074
1075         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1076 }
1077
1078 _public_ int sd_event_add_io(
1079                 sd_event *e,
1080                 sd_event_source **ret,
1081                 int fd,
1082                 uint32_t events,
1083                 sd_event_io_handler_t callback,
1084                 void *userdata) {
1085
1086         _cleanup_(source_freep) sd_event_source *s = NULL;
1087         int r;
1088
1089         assert_return(e, -EINVAL);
1090         assert_return(e = event_resolve(e), -ENOPKG);
1091         assert_return(fd >= 0, -EBADF);
1092         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1093         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1094         assert_return(!event_pid_changed(e), -ECHILD);
1095
1096         if (!callback)
1097                 callback = io_exit_callback;
1098
1099         s = source_new(e, !ret, SOURCE_IO);
1100         if (!s)
1101                 return -ENOMEM;
1102
1103         s->wakeup = WAKEUP_EVENT_SOURCE;
1104         s->io.fd = fd;
1105         s->io.events = events;
1106         s->io.callback = callback;
1107         s->userdata = userdata;
1108         s->enabled = SD_EVENT_ON;
1109
1110         r = source_io_register(s, s->enabled, events);
1111         if (r < 0)
1112                 return r;
1113
1114         if (ret)
1115                 *ret = s;
1116         TAKE_PTR(s);
1117
1118         return 0;
1119 }
1120
1121 static void initialize_perturb(sd_event *e) {
1122         sd_id128_t bootid = {};
1123
1124         /* When we sleep for longer, we try to realign the wakeup to
1125            the same time within each minute/second/250ms, so that
1126            events all across the system can be coalesced into a single
1127            CPU wakeup. However, let's take some system-specific
1128            randomness for this value, so that in a network of systems
1129            with synced clocks timer events are distributed a
1130            bit. Here, we calculate a perturbation usec offset from the
1131            boot ID. */
1132
1133         if (_likely_(e->perturb != USEC_INFINITY))
1134                 return;
1135
1136         if (sd_id128_get_boot(&bootid) >= 0)
1137                 e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
1138 }
1139
1140 static int event_setup_timer_fd(
1141                 sd_event *e,
1142                 struct clock_data *d,
1143                 clockid_t clock) {
1144
1145         assert(e);
1146         assert(d);
1147
1148         if (_likely_(d->fd >= 0))
1149                 return 0;
1150
1151         _cleanup_close_ int fd = -1;
1152
1153         fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1154         if (fd < 0)
1155                 return -errno;
1156
1157         fd = fd_move_above_stdio(fd);
1158
1159         struct epoll_event ev = {
1160                 .events = EPOLLIN,
1161                 .data.ptr = d,
1162         };
1163
1164         if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1165                 return -errno;
1166
1167         d->fd = TAKE_FD(fd);
1168         return 0;
1169 }
1170
1171 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1172         assert(s);
1173
1174         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1175 }
1176
1177 static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1178         int r;
1179
1180         assert(d);
1181
1182         if (d->fd < 0) {
1183                 r = event_setup_timer_fd(e, d, clock);
1184                 if (r < 0)
1185                         return r;
1186         }
1187
1188         r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1189         if (r < 0)
1190                 return r;
1191
1192         r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1193         if (r < 0)
1194                 return r;
1195
1196         return 0;
1197 }
1198
1199 static int event_source_time_prioq_put(
1200                 sd_event_source *s,
1201                 struct clock_data *d) {
1202
1203         int r;
1204
1205         assert(s);
1206         assert(d);
1207
1208         r = prioq_put(d->earliest, s, &s->earliest_index);
1209         if (r < 0)
1210                 return r;
1211
1212         r = prioq_put(d->latest, s, &s->latest_index);
1213         if (r < 0) {
1214                 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1215                 s->earliest_index = PRIOQ_IDX_NULL;
1216                 return r;
1217         }
1218
1219         d->needs_rearm = true;
1220         return 0;
1221 }
1222
1223 _public_ int sd_event_add_time(
1224                 sd_event *e,
1225                 sd_event_source **ret,
1226                 clockid_t clock,
1227                 uint64_t usec,
1228                 uint64_t accuracy,
1229                 sd_event_time_handler_t callback,
1230                 void *userdata) {
1231
1232         EventSourceType type;
1233         _cleanup_(source_freep) sd_event_source *s = NULL;
1234         struct clock_data *d;
1235         int r;
1236
1237         assert_return(e, -EINVAL);
1238         assert_return(e = event_resolve(e), -ENOPKG);
1239         assert_return(accuracy != UINT64_MAX, -EINVAL);
1240         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1241         assert_return(!event_pid_changed(e), -ECHILD);
1242
1243         if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1244                 return -EOPNOTSUPP;
1245
1246         type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1247         if (type < 0)
1248                 return -EOPNOTSUPP;
1249
1250         if (!callback)
1251                 callback = time_exit_callback;
1252
1253         assert_se(d = event_get_clock_data(e, type));
1254
1255         r = setup_clock_data(e, d, clock);
1256         if (r < 0)
1257                 return r;
1258
1259         s = source_new(e, !ret, type);
1260         if (!s)
1261                 return -ENOMEM;
1262
1263         s->time.next = usec;
1264         s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1265         s->time.callback = callback;
1266         s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1267         s->userdata = userdata;
1268         s->enabled = SD_EVENT_ONESHOT;
1269
1270         r = event_source_time_prioq_put(s, d);
1271         if (r < 0)
1272                 return r;
1273
1274         if (ret)
1275                 *ret = s;
1276         TAKE_PTR(s);
1277
1278         return 0;
1279 }
1280
1281 _public_ int sd_event_add_time_relative(
1282                 sd_event *e,
1283                 sd_event_source **ret,
1284                 clockid_t clock,
1285                 uint64_t usec,
1286                 uint64_t accuracy,
1287                 sd_event_time_handler_t callback,
1288                 void *userdata) {
1289
1290         usec_t t;
1291         int r;
1292
1293         /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1294          * checks for overflow. */
1295
1296         r = sd_event_now(e, clock, &t);
1297         if (r < 0)
1298                 return r;
1299
1300         if (usec >= USEC_INFINITY - t)
1301                 return -EOVERFLOW;
1302
1303         return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1304 }
1305
1306 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1307         assert(s);
1308
1309         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1310 }
1311
1312 _public_ int sd_event_add_signal(
1313                 sd_event *e,
1314                 sd_event_source **ret,
1315                 int sig,
1316                 sd_event_signal_handler_t callback,
1317                 void *userdata) {
1318
1319         _cleanup_(source_freep) sd_event_source *s = NULL;
1320         struct signal_data *d;
1321         int r;
1322
1323         assert_return(e, -EINVAL);
1324         assert_return(e = event_resolve(e), -ENOPKG);
1325         assert_return(SIGNAL_VALID(sig), -EINVAL);
1326         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1327         assert_return(!event_pid_changed(e), -ECHILD);
1328
1329         if (!callback)
1330                 callback = signal_exit_callback;
1331
1332         r = signal_is_blocked(sig);
1333         if (r < 0)
1334                 return r;
1335         if (r == 0)
1336                 return -EBUSY;
1337
1338         if (!e->signal_sources) {
1339                 e->signal_sources = new0(sd_event_source*, _NSIG);
1340                 if (!e->signal_sources)
1341                         return -ENOMEM;
1342         } else if (e->signal_sources[sig])
1343                 return -EBUSY;
1344
1345         s = source_new(e, !ret, SOURCE_SIGNAL);
1346         if (!s)
1347                 return -ENOMEM;
1348
1349         s->signal.sig = sig;
1350         s->signal.callback = callback;
1351         s->userdata = userdata;
1352         s->enabled = SD_EVENT_ON;
1353
1354         e->signal_sources[sig] = s;
1355
1356         r = event_make_signal_data(e, sig, &d);
1357         if (r < 0)
1358                 return r;
1359
1360         /* Use the signal name as description for the event source by default */
1361         (void) sd_event_source_set_description(s, signal_to_string(sig));
1362
1363         if (ret)
1364                 *ret = s;
1365         TAKE_PTR(s);
1366
1367         return 0;
1368 }
1369
1370 static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1371         assert(s);
1372
1373         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1374 }
1375
/* Tells whether child event sources shall try to make use of pidfds, as controlled by the
 * $SYSTEMD_PIDFD environment variable. Mostly relevant for debugging, i.e. this is used in test-event.c
 * to test the event loop once with and once without pidfd. */
static bool shall_use_pidfd(void) {
        int setting;

        setting = getenv_bool_secure("SYSTEMD_PIDFD");

        /* Only a result of exactly 0 turns pidfd usage off; every other result (presumably including
         * the error returned for an unset variable — confirm against getenv_bool_secure()) keeps it on. */
        return setting != 0;
}
1380
1381 _public_ int sd_event_add_child(
1382                 sd_event *e,
1383                 sd_event_source **ret,
1384                 pid_t pid,
1385                 int options,
1386                 sd_event_child_handler_t callback,
1387                 void *userdata) {
1388
1389         _cleanup_(source_freep) sd_event_source *s = NULL;
1390         int r;
1391
1392         assert_return(e, -EINVAL);
1393         assert_return(e = event_resolve(e), -ENOPKG);
1394         assert_return(pid > 1, -EINVAL);
1395         assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1396         assert_return(options != 0, -EINVAL);
1397         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1398         assert_return(!event_pid_changed(e), -ECHILD);
1399
1400         if (!callback)
1401                 callback = child_exit_callback;
1402
1403         if (e->n_online_child_sources == 0) {
1404                 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1405                  * for compatibility with pre-pidfd and because we don't want the reap the child processes
1406                  * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1407                  * take effect.
1408                  *
1409                  * (As an optimization we only do this check on the first child event source created.) */
1410                 r = signal_is_blocked(SIGCHLD);
1411                 if (r < 0)
1412                         return r;
1413                 if (r == 0)
1414                         return -EBUSY;
1415         }
1416
1417         r = hashmap_ensure_allocated(&e->child_sources, NULL);
1418         if (r < 0)
1419                 return r;
1420
1421         if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1422                 return -EBUSY;
1423
1424         s = source_new(e, !ret, SOURCE_CHILD);
1425         if (!s)
1426                 return -ENOMEM;
1427
1428         s->wakeup = WAKEUP_EVENT_SOURCE;
1429         s->child.pid = pid;
1430         s->child.options = options;
1431         s->child.callback = callback;
1432         s->userdata = userdata;
1433         s->enabled = SD_EVENT_ONESHOT;
1434
1435         /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
1436          * pin the PID, and make regular waitid() handling race-free. */
1437
1438         if (shall_use_pidfd()) {
1439                 s->child.pidfd = pidfd_open(s->child.pid, 0);
1440                 if (s->child.pidfd < 0) {
1441                         /* Propagate errors unless the syscall is not supported or blocked */
1442                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
1443                                 return -errno;
1444                 } else
1445                         s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
1446         } else
1447                 s->child.pidfd = -1;
1448
1449         r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1450         if (r < 0)
1451                 return r;
1452
1453         if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1454                 /* We have a pidfd and we only want to watch for exit */
1455                 r = source_child_pidfd_register(s, s->enabled);
1456                 if (r < 0)
1457                         return r;
1458
1459         } else {
1460                 /* We have no pidfd or we shall wait for some other event than WEXITED */
1461                 r = event_make_signal_data(e, SIGCHLD, NULL);
1462                 if (r < 0)
1463                         return r;
1464
1465                 e->need_process_child = true;
1466         }
1467
1468         e->n_online_child_sources++;
1469
1470         if (ret)
1471                 *ret = s;
1472         TAKE_PTR(s);
1473         return 0;
1474 }
1475
1476 _public_ int sd_event_add_child_pidfd(
1477                 sd_event *e,
1478                 sd_event_source **ret,
1479                 int pidfd,
1480                 int options,
1481                 sd_event_child_handler_t callback,
1482                 void *userdata) {
1483
1484
1485         _cleanup_(source_freep) sd_event_source *s = NULL;
1486         pid_t pid;
1487         int r;
1488
1489         assert_return(e, -EINVAL);
1490         assert_return(e = event_resolve(e), -ENOPKG);
1491         assert_return(pidfd >= 0, -EBADF);
1492         assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1493         assert_return(options != 0, -EINVAL);
1494         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1495         assert_return(!event_pid_changed(e), -ECHILD);
1496
1497         if (!callback)
1498                 callback = child_exit_callback;
1499
1500         if (e->n_online_child_sources == 0) {
1501                 r = signal_is_blocked(SIGCHLD);
1502                 if (r < 0)
1503                         return r;
1504                 if (r == 0)
1505                         return -EBUSY;
1506         }
1507
1508         r = hashmap_ensure_allocated(&e->child_sources, NULL);
1509         if (r < 0)
1510                 return r;
1511
1512         r = pidfd_get_pid(pidfd, &pid);
1513         if (r < 0)
1514                 return r;
1515
1516         if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1517                 return -EBUSY;
1518
1519         s = source_new(e, !ret, SOURCE_CHILD);
1520         if (!s)
1521                 return -ENOMEM;
1522
1523         s->wakeup = WAKEUP_EVENT_SOURCE;
1524         s->child.pidfd = pidfd;
1525         s->child.pid = pid;
1526         s->child.options = options;
1527         s->child.callback = callback;
1528         s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1529         s->userdata = userdata;
1530         s->enabled = SD_EVENT_ONESHOT;
1531
1532         r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1533         if (r < 0)
1534                 return r;
1535
1536         if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1537                 /* We only want to watch for WEXITED */
1538                 r = source_child_pidfd_register(s, s->enabled);
1539                 if (r < 0)
1540                         return r;
1541         } else {
1542                 /* We shall wait for some other event than WEXITED */
1543                 r = event_make_signal_data(e, SIGCHLD, NULL);
1544                 if (r < 0)
1545                         return r;
1546
1547                 e->need_process_child = true;
1548         }
1549
1550         e->n_online_child_sources++;
1551
1552         if (ret)
1553                 *ret = s;
1554         TAKE_PTR(s);
1555         return 0;
1556 }
1557
1558 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1559         assert(s);
1560
1561         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1562 }
1563
1564 _public_ int sd_event_add_defer(
1565                 sd_event *e,
1566                 sd_event_source **ret,
1567                 sd_event_handler_t callback,
1568                 void *userdata) {
1569
1570         _cleanup_(source_freep) sd_event_source *s = NULL;
1571         int r;
1572
1573         assert_return(e, -EINVAL);
1574         assert_return(e = event_resolve(e), -ENOPKG);
1575         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1576         assert_return(!event_pid_changed(e), -ECHILD);
1577
1578         if (!callback)
1579                 callback = generic_exit_callback;
1580
1581         s = source_new(e, !ret, SOURCE_DEFER);
1582         if (!s)
1583                 return -ENOMEM;
1584
1585         s->defer.callback = callback;
1586         s->userdata = userdata;
1587         s->enabled = SD_EVENT_ONESHOT;
1588
1589         r = source_set_pending(s, true);
1590         if (r < 0)
1591                 return r;
1592
1593         if (ret)
1594                 *ret = s;
1595         TAKE_PTR(s);
1596
1597         return 0;
1598 }
1599
1600 _public_ int sd_event_add_post(
1601                 sd_event *e,
1602                 sd_event_source **ret,
1603                 sd_event_handler_t callback,
1604                 void *userdata) {
1605
1606         _cleanup_(source_freep) sd_event_source *s = NULL;
1607         int r;
1608
1609         assert_return(e, -EINVAL);
1610         assert_return(e = event_resolve(e), -ENOPKG);
1611         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1612         assert_return(!event_pid_changed(e), -ECHILD);
1613
1614         if (!callback)
1615                 callback = generic_exit_callback;
1616
1617         s = source_new(e, !ret, SOURCE_POST);
1618         if (!s)
1619                 return -ENOMEM;
1620
1621         s->post.callback = callback;
1622         s->userdata = userdata;
1623         s->enabled = SD_EVENT_ON;
1624
1625         r = set_ensure_put(&e->post_sources, NULL, s);
1626         if (r < 0)
1627                 return r;
1628         assert(r > 0);
1629
1630         if (ret)
1631                 *ret = s;
1632         TAKE_PTR(s);
1633
1634         return 0;
1635 }
1636
1637 _public_ int sd_event_add_exit(
1638                 sd_event *e,
1639                 sd_event_source **ret,
1640                 sd_event_handler_t callback,
1641                 void *userdata) {
1642
1643         _cleanup_(source_freep) sd_event_source *s = NULL;
1644         int r;
1645
1646         assert_return(e, -EINVAL);
1647         assert_return(e = event_resolve(e), -ENOPKG);
1648         assert_return(callback, -EINVAL);
1649         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1650         assert_return(!event_pid_changed(e), -ECHILD);
1651
1652         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1653         if (r < 0)
1654                 return r;
1655
1656         s = source_new(e, !ret, SOURCE_EXIT);
1657         if (!s)
1658                 return -ENOMEM;
1659
1660         s->exit.callback = callback;
1661         s->userdata = userdata;
1662         s->exit.prioq_index = PRIOQ_IDX_NULL;
1663         s->enabled = SD_EVENT_ONESHOT;
1664
1665         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1666         if (r < 0)
1667                 return r;
1668
1669         if (ret)
1670                 *ret = s;
1671         TAKE_PTR(s);
1672
1673         return 0;
1674 }
1675
1676 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
1677         assert(e);
1678
1679         if (!d)
1680                 return;
1681
1682         assert(hashmap_isempty(d->inodes));
1683         assert(hashmap_isempty(d->wd));
1684
1685         if (d->buffer_filled > 0)
1686                 LIST_REMOVE(buffered, e->inotify_data_buffered, d);
1687
1688         hashmap_free(d->inodes);
1689         hashmap_free(d->wd);
1690
1691         assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
1692
1693         if (d->fd >= 0) {
1694                 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
1695                         log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
1696
1697                 safe_close(d->fd);
1698         }
1699         free(d);
1700 }
1701
1702 static int event_make_inotify_data(
1703                 sd_event *e,
1704                 int64_t priority,
1705                 struct inotify_data **ret) {
1706
1707         _cleanup_close_ int fd = -1;
1708         struct inotify_data *d;
1709         int r;
1710
1711         assert(e);
1712
1713         d = hashmap_get(e->inotify_data, &priority);
1714         if (d) {
1715                 if (ret)
1716                         *ret = d;
1717                 return 0;
1718         }
1719
1720         fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC);
1721         if (fd < 0)
1722                 return -errno;
1723
1724         fd = fd_move_above_stdio(fd);
1725
1726         d = new(struct inotify_data, 1);
1727         if (!d)
1728                 return -ENOMEM;
1729
1730         *d = (struct inotify_data) {
1731                 .wakeup = WAKEUP_INOTIFY_DATA,
1732                 .fd = TAKE_FD(fd),
1733                 .priority = priority,
1734         };
1735
1736         r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
1737         if (r < 0) {
1738                 d->fd = safe_close(d->fd);
1739                 free(d);
1740                 return r;
1741         }
1742
1743         struct epoll_event ev = {
1744                 .events = EPOLLIN,
1745                 .data.ptr = d,
1746         };
1747
1748         if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
1749                 r = -errno;
1750                 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
1751                                             * remove the fd from the epoll first, which we don't want as we couldn't
1752                                             * add it in the first place. */
1753                 event_free_inotify_data(e, d);
1754                 return r;
1755         }
1756
1757         if (ret)
1758                 *ret = d;
1759
1760         return 1;
1761 }
1762
1763 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
1764         int r;
1765
1766         assert(x);
1767         assert(y);
1768
1769         r = CMP(x->dev, y->dev);
1770         if (r != 0)
1771                 return r;
1772
1773         return CMP(x->ino, y->ino);
1774 }
1775
1776 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
1777         assert(d);
1778
1779         siphash24_compress(&d->dev, sizeof(d->dev), state);
1780         siphash24_compress(&d->ino, sizeof(d->ino), state);
1781 }
1782
1783 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
1784
1785 static void event_free_inode_data(
1786                 sd_event *e,
1787                 struct inode_data *d) {
1788
1789         assert(e);
1790
1791         if (!d)
1792                 return;
1793
1794         assert(!d->event_sources);
1795
1796         if (d->fd >= 0) {
1797                 LIST_REMOVE(to_close, e->inode_data_to_close, d);
1798                 safe_close(d->fd);
1799         }
1800
1801         if (d->inotify_data) {
1802
1803                 if (d->wd >= 0) {
1804                         if (d->inotify_data->fd >= 0) {
1805                                 /* So here's a problem. At the time this runs the watch descriptor might already be
1806                                  * invalidated, because an IN_IGNORED event might be queued right the moment we enter
1807                                  * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
1808                                  * likely case to happen. */
1809
1810                                 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
1811                                         log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
1812                         }
1813
1814                         assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
1815                 }
1816
1817                 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
1818         }
1819
1820         free(d);
1821 }
1822
1823 static void event_gc_inode_data(
1824                 sd_event *e,
1825                 struct inode_data *d) {
1826
1827         struct inotify_data *inotify_data;
1828
1829         assert(e);
1830
1831         if (!d)
1832                 return;
1833
1834         if (d->event_sources)
1835                 return;
1836
1837         inotify_data = d->inotify_data;
1838         event_free_inode_data(e, d);
1839
1840         if (inotify_data && hashmap_isempty(inotify_data->inodes))
1841                 event_free_inotify_data(e, inotify_data);
1842 }
1843
1844 static int event_make_inode_data(
1845                 sd_event *e,
1846                 struct inotify_data *inotify_data,
1847                 dev_t dev,
1848                 ino_t ino,
1849                 struct inode_data **ret) {
1850
1851         struct inode_data *d, key;
1852         int r;
1853
1854         assert(e);
1855         assert(inotify_data);
1856
1857         key = (struct inode_data) {
1858                 .ino = ino,
1859                 .dev = dev,
1860         };
1861
1862         d = hashmap_get(inotify_data->inodes, &key);
1863         if (d) {
1864                 if (ret)
1865                         *ret = d;
1866
1867                 return 0;
1868         }
1869
1870         r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
1871         if (r < 0)
1872                 return r;
1873
1874         d = new(struct inode_data, 1);
1875         if (!d)
1876                 return -ENOMEM;
1877
1878         *d = (struct inode_data) {
1879                 .dev = dev,
1880                 .ino = ino,
1881                 .wd = -1,
1882                 .fd = -1,
1883                 .inotify_data = inotify_data,
1884         };
1885
1886         r = hashmap_put(inotify_data->inodes, d, d);
1887         if (r < 0) {
1888                 free(d);
1889                 return r;
1890         }
1891
1892         if (ret)
1893                 *ret = d;
1894
1895         return 1;
1896 }
1897
1898 static uint32_t inode_data_determine_mask(struct inode_data *d) {
1899         bool excl_unlink = true;
1900         uint32_t combined = 0;
1901         sd_event_source *s;
1902
1903         assert(d);
1904
1905         /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
1906          * the IN_EXCL_UNLINK flag is ANDed instead.
1907          *
1908          * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
1909          * because we cannot change the mask anymore after the event source was created once, since the kernel has no
1910          * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
1911          * events we don't care for client-side. */
1912
1913         LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
1914
1915                 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
1916                         excl_unlink = false;
1917
1918                 combined |= s->inotify.mask;
1919         }
1920
1921         return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
1922 }
1923
1924 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
1925         uint32_t combined_mask;
1926         int wd, r;
1927
1928         assert(d);
1929         assert(d->fd >= 0);
1930
1931         combined_mask = inode_data_determine_mask(d);
1932
1933         if (d->wd >= 0 && combined_mask == d->combined_mask)
1934                 return 0;
1935
1936         r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
1937         if (r < 0)
1938                 return r;
1939
1940         wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
1941         if (wd < 0)
1942                 return -errno;
1943
1944         if (d->wd < 0) {
1945                 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
1946                 if (r < 0) {
1947                         (void) inotify_rm_watch(d->inotify_data->fd, wd);
1948                         return r;
1949                 }
1950
1951                 d->wd = wd;
1952
1953         } else if (d->wd != wd) {
1954
1955                 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
1956                 (void) inotify_rm_watch(d->fd, wd);
1957                 return -EINVAL;
1958         }
1959
1960         d->combined_mask = combined_mask;
1961         return 1;
1962 }
1963
1964 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
1965         assert(s);
1966
1967         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1968 }
1969
1970 _public_ int sd_event_add_inotify(
1971                 sd_event *e,
1972                 sd_event_source **ret,
1973                 const char *path,
1974                 uint32_t mask,
1975                 sd_event_inotify_handler_t callback,
1976                 void *userdata) {
1977
1978         struct inotify_data *inotify_data = NULL;
1979         struct inode_data *inode_data = NULL;
1980         _cleanup_close_ int fd = -1;
1981         _cleanup_(source_freep) sd_event_source *s = NULL;
1982         struct stat st;
1983         int r;
1984
1985         assert_return(e, -EINVAL);
1986         assert_return(e = event_resolve(e), -ENOPKG);
1987         assert_return(path, -EINVAL);
1988         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1989         assert_return(!event_pid_changed(e), -ECHILD);
1990
1991         if (!callback)
1992                 callback = inotify_exit_callback;
1993
1994         /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
1995          * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
1996          * the user can't use them for us. */
1997         if (mask & IN_MASK_ADD)
1998                 return -EINVAL;
1999
2000         fd = open(path, O_PATH|O_CLOEXEC|
2001                   (mask & IN_ONLYDIR ? O_DIRECTORY : 0)|
2002                   (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2003         if (fd < 0)
2004                 return -errno;
2005
2006         if (fstat(fd, &st) < 0)
2007                 return -errno;
2008
2009         s = source_new(e, !ret, SOURCE_INOTIFY);
2010         if (!s)
2011                 return -ENOMEM;
2012
2013         s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2014         s->inotify.mask = mask;
2015         s->inotify.callback = callback;
2016         s->userdata = userdata;
2017
2018         /* Allocate an inotify object for this priority, and an inode object within it */
2019         r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2020         if (r < 0)
2021                 return r;
2022
2023         r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2024         if (r < 0) {
2025                 event_free_inotify_data(e, inotify_data);
2026                 return r;
2027         }
2028
2029         /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
2030          * the event source, until then, for which we need the original inode. */
2031         if (inode_data->fd < 0) {
2032                 inode_data->fd = TAKE_FD(fd);
2033                 LIST_PREPEND(to_close, e->inode_data_to_close, inode_data);
2034         }
2035
2036         /* Link our event source to the inode data object */
2037         LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2038         s->inotify.inode_data = inode_data;
2039
2040         /* Actually realize the watch now */
2041         r = inode_data_realize_watch(e, inode_data);
2042         if (r < 0)
2043                 return r;
2044
2045         (void) sd_event_source_set_description(s, path);
2046
2047         if (ret)
2048                 *ret = s;
2049         TAKE_PTR(s);
2050
2051         return 0;
2052 }
2053
2054 static sd_event_source* event_source_free(sd_event_source *s) {
2055         if (!s)
2056                 return NULL;
2057
2058         /* Here's a special hack: when we are called from a
2059          * dispatch handler we won't free the event source
2060          * immediately, but we will detach the fd from the
2061          * epoll. This way it is safe for the caller to unref
2062          * the event source and immediately close the fd, but
2063          * we still retain a valid event source object after
2064          * the callback. */
2065
2066         if (s->dispatching) {
2067                 if (s->type == SOURCE_IO)
2068                         source_io_unregister(s);
2069
2070                 source_disconnect(s);
2071         } else
2072                 source_free(s);
2073
2074         return NULL;
2075 }
2076
2077 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2078
2079 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2080         assert_return(s, -EINVAL);
2081         assert_return(!event_pid_changed(s->event), -ECHILD);
2082
2083         return free_and_strdup(&s->description, description);
2084 }
2085
2086 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2087         assert_return(s, -EINVAL);
2088         assert_return(description, -EINVAL);
2089         assert_return(!event_pid_changed(s->event), -ECHILD);
2090
2091         if (!s->description)
2092                 return -ENXIO;
2093
2094         *description = s->description;
2095         return 0;
2096 }
2097
2098 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2099         assert_return(s, NULL);
2100
2101         return s->event;
2102 }
2103
2104 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2105         assert_return(s, -EINVAL);
2106         assert_return(s->type != SOURCE_EXIT, -EDOM);
2107         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2108         assert_return(!event_pid_changed(s->event), -ECHILD);
2109
2110         return s->pending;
2111 }
2112
2113 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2114         assert_return(s, -EINVAL);
2115         assert_return(s->type == SOURCE_IO, -EDOM);
2116         assert_return(!event_pid_changed(s->event), -ECHILD);
2117
2118         return s->io.fd;
2119 }
2120
2121 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2122         int r;
2123
2124         assert_return(s, -EINVAL);
2125         assert_return(fd >= 0, -EBADF);
2126         assert_return(s->type == SOURCE_IO, -EDOM);
2127         assert_return(!event_pid_changed(s->event), -ECHILD);
2128
2129         if (s->io.fd == fd)
2130                 return 0;
2131
2132         if (event_source_is_offline(s)) {
2133                 s->io.fd = fd;
2134                 s->io.registered = false;
2135         } else {
2136                 int saved_fd;
2137
2138                 saved_fd = s->io.fd;
2139                 assert(s->io.registered);
2140
2141                 s->io.fd = fd;
2142                 s->io.registered = false;
2143
2144                 r = source_io_register(s, s->enabled, s->io.events);
2145                 if (r < 0) {
2146                         s->io.fd = saved_fd;
2147                         s->io.registered = true;
2148                         return r;
2149                 }
2150
2151                 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2152         }
2153
2154         return 0;
2155 }
2156
2157 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2158         assert_return(s, -EINVAL);
2159         assert_return(s->type == SOURCE_IO, -EDOM);
2160
2161         return s->io.owned;
2162 }
2163
2164 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2165         assert_return(s, -EINVAL);
2166         assert_return(s->type == SOURCE_IO, -EDOM);
2167
2168         s->io.owned = own;
2169         return 0;
2170 }
2171
2172 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2173         assert_return(s, -EINVAL);
2174         assert_return(events, -EINVAL);
2175         assert_return(s->type == SOURCE_IO, -EDOM);
2176         assert_return(!event_pid_changed(s->event), -ECHILD);
2177
2178         *events = s->io.events;
2179         return 0;
2180 }
2181
2182 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2183         int r;
2184
2185         assert_return(s, -EINVAL);
2186         assert_return(s->type == SOURCE_IO, -EDOM);
2187         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2188         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2189         assert_return(!event_pid_changed(s->event), -ECHILD);
2190
2191         /* edge-triggered updates are never skipped, so we can reset edges */
2192         if (s->io.events == events && !(events & EPOLLET))
2193                 return 0;
2194
2195         r = source_set_pending(s, false);
2196         if (r < 0)
2197                 return r;
2198
2199         if (event_source_is_online(s)) {
2200                 r = source_io_register(s, s->enabled, events);
2201                 if (r < 0)
2202                         return r;
2203         }
2204
2205         s->io.events = events;
2206
2207         return 0;
2208 }
2209
2210 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2211         assert_return(s, -EINVAL);
2212         assert_return(revents, -EINVAL);
2213         assert_return(s->type == SOURCE_IO, -EDOM);
2214         assert_return(s->pending, -ENODATA);
2215         assert_return(!event_pid_changed(s->event), -ECHILD);
2216
2217         *revents = s->io.revents;
2218         return 0;
2219 }
2220
2221 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2222         assert_return(s, -EINVAL);
2223         assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2224         assert_return(!event_pid_changed(s->event), -ECHILD);
2225
2226         return s->signal.sig;
2227 }
2228
2229 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2230         assert_return(s, -EINVAL);
2231         assert_return(!event_pid_changed(s->event), -ECHILD);
2232
2233         *priority = s->priority;
2234         return 0;
2235 }
2236
/* Changes the priority of an event source.
 *
 * For inotify sources the source is moved over to the inotify/inode objects of the new priority, which
 * are allocated on demand. This requires the fd of the inode object to still be open, otherwise
 * -EOPNOTSUPP is returned. For online signal sources the signal is moved over to the signal data of the
 * new priority. For all other sources only the priority field is updated.
 *
 * Returns 0 on success (also if the priority is unchanged), negative errno-style error on failure. */
_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
        /* Track whether the new inotify/inode objects were allocated by us, so that the failure path only
         * frees what this call created */
        bool rm_inotify = false, rm_inode = false;
        struct inotify_data *new_inotify_data = NULL;
        struct inode_data *new_inode_data = NULL;
        int r;

        assert_return(s, -EINVAL);
        assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(s->event), -ECHILD);

        if (s->priority == priority)
                return 0;

        if (s->type == SOURCE_INOTIFY) {
                struct inode_data *old_inode_data;

                assert(s->inotify.inode_data);
                old_inode_data = s->inotify.inode_data;

                /* We need the original fd to change the priority. If we don't have it we can't change the priority,
                 * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
                 * events we allow priority changes only until the first following iteration. */
                if (old_inode_data->fd < 0)
                        return -EOPNOTSUPP;

                r = event_make_inotify_data(s->event, priority, &new_inotify_data);
                if (r < 0)
                        return r;
                rm_inotify = r > 0;

                r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
                if (r < 0)
                        goto fail;
                rm_inode = r > 0;

                if (new_inode_data->fd < 0) {
                        /* Duplicate the fd for the new inode object if we don't have any yet */
                        new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
                        if (new_inode_data->fd < 0) {
                                r = -errno;
                                goto fail;
                        }

                        LIST_PREPEND(to_close, s->event->inode_data_to_close, new_inode_data);
                }

                /* Move the event source to the new inode data structure */
                LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
                LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
                s->inotify.inode_data = new_inode_data;

                /* Now create the new watch */
                r = inode_data_realize_watch(s->event, new_inode_data);
                if (r < 0) {
                        /* Move it back */
                        LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
                        LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
                        s->inotify.inode_data = old_inode_data;
                        goto fail;
                }

                s->priority = priority;

                /* The old inode object (and possibly its inotify object) might be unused now, release it
                 * if so */
                event_gc_inode_data(s->event, old_inode_data);

        } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
                struct signal_data *old, *d;

                /* Move us from the signalfd belonging to the old
                 * priority to the signalfd of the new priority */

                assert_se(old = hashmap_get(s->event->signal_data, &s->priority));

                /* The priority is updated before event_make_signal_data() is called, and reverted again if
                 * that call fails */
                s->priority = priority;

                r = event_make_signal_data(s->event, s->signal.sig, &d);
                if (r < 0) {
                        s->priority = old->priority;
                        return r;
                }

                event_unmask_signal_data(s->event, old, s->signal.sig);
        } else
                s->priority = priority;

        /* The source's position in the priority queues depends on its priority, hence reshuffle */
        event_source_pp_prioq_reshuffle(s);

        if (s->type == SOURCE_EXIT)
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);

        return 0;

fail:
        /* Only reached from the inotify branch: release whatever we allocated above */
        if (rm_inode)
                event_free_inode_data(s->event, new_inode_data);

        if (rm_inotify)
                event_free_inotify_data(s->event, new_inotify_data);

        return r;
}
2338
2339 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2340         assert_return(s, -EINVAL);
2341         assert_return(!event_pid_changed(s->event), -ECHILD);
2342
2343         if (ret)
2344                 *ret = s->enabled;
2345
2346         return s->enabled != SD_EVENT_OFF;
2347 }
2348
/* Switches an event source into an offline state, i.e. one where it is turned off, ratelimited or both
 * (which the second assert below enforces), and does the per-type teardown this requires.
 *
 * Returns 1 on success, and a negative errno-style error if resetting the pending flag failed, in which
 * case the state of the source is left unmodified. */
static int event_source_offline(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        bool was_offline;
        int r;

        assert(s);
        assert(enabled == SD_EVENT_OFF || ratelimited);

        /* Unset the pending flag when this event source is disabled */
        if (s->enabled != SD_EVENT_OFF &&
            enabled == SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        /* Sample the previous state before overwriting it, the child source accounting below needs it */
        was_offline = event_source_is_offline(s);
        s->enabled = enabled;
        s->ratelimited = ratelimited;

        switch (s->type) {

        case SOURCE_IO:
                source_io_unregister(s);
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                event_source_time_prioq_reshuffle(s);
                break;

        case SOURCE_SIGNAL:
                event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                break;

        case SOURCE_CHILD:
                /* Only decrease the counter of online child sources if we actually were online before */
                if (!was_offline) {
                        assert(s->event->n_online_child_sources > 0);
                        s->event->n_online_child_sources--;
                }

                /* Child sources are either watched via their pidfd or via SIGCHLD, undo whichever applies */
                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                break;

        case SOURCE_EXIT:
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_DEFER:
        case SOURCE_POST:
        case SOURCE_INOTIFY:
                /* Nothing to tear down for these types */
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        return 1;
}
2418
/* Applies a new enabled/ratelimited state to an event source and, if that state is an online one (i.e.
 * turned on and not ratelimited), does the per-type registration this requires.
 *
 * Returns 1 if the source was brought online, 0 if only the state was recorded because the source remains
 * offline, and a negative errno-style error on failure. If registration fails the enabled/ratelimited
 * fields are left unmodified. */
static int event_source_online(
                sd_event_source *s,
                int enabled,
                bool ratelimited) {

        bool was_online;
        int r;

        assert(s);
        assert(enabled != SD_EVENT_OFF || !ratelimited);

        /* Unset the pending flag when this event source is enabled */
        if (s->enabled == SD_EVENT_OFF &&
            enabled != SD_EVENT_OFF &&
            !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
                r = source_set_pending(s, false);
                if (r < 0)
                        return r;
        }

        /* Are we really ready for onlining? */
        if (enabled == SD_EVENT_OFF || ratelimited) {
                /* Nope, we are not ready for onlining, then just update the precise state and exit */
                s->enabled = enabled;
                s->ratelimited = ratelimited;
                return 0;
        }

        /* Sample the previous state, the child source accounting below needs it */
        was_online = event_source_is_online(s);

        /* First the operations that may fail, so that we can bail out before the state is changed */
        switch (s->type) {
        case SOURCE_IO:
                r = source_io_register(s, enabled, s->io.events);
                if (r < 0)
                        return r;
                break;

        case SOURCE_SIGNAL:
                r = event_make_signal_data(s->event, s->signal.sig, NULL);
                if (r < 0) {
                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                        return r;
                }

                break;

        case SOURCE_CHILD:
                if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                        /* yes, we have pidfd */

                        r = source_child_pidfd_register(s, enabled);
                        if (r < 0)
                                return r;
                } else {
                        /* no pidfd, or something other to watch for than WEXITED */

                        r = event_make_signal_data(s->event, SIGCHLD, NULL);
                        if (r < 0) {
                                event_gc_signal_data(s->event, &s->priority, SIGCHLD);
                                return r;
                        }
                }

                /* Only increase the counter of online child sources if we weren't online before already */
                if (!was_online)
                        s->event->n_online_child_sources++;
                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
        case SOURCE_EXIT:
        case SOURCE_DEFER:
        case SOURCE_POST:
        case SOURCE_INOTIFY:
                /* Nothing that could fail for these types */
                break;

        default:
                assert_not_reached("Wut? I shouldn't exist.");
        }

        s->enabled = enabled;
        s->ratelimited = ratelimited;

        /* Non-failing operations below */
        switch (s->type) {
        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                event_source_time_prioq_reshuffle(s);
                break;

        case SOURCE_EXIT:
                prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
                break;

        default:
                break;
        }

        return 1;
}
2524
2525 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
2526         int r;
2527
2528         assert_return(s, -EINVAL);
2529         assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
2530         assert_return(!event_pid_changed(s->event), -ECHILD);
2531
2532         /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
2533         if (s->event->state == SD_EVENT_FINISHED)
2534                 return m == SD_EVENT_OFF ? 0 : -ESTALE;
2535
2536         if (s->enabled == m) /* No change? */
2537                 return 0;
2538
2539         if (m == SD_EVENT_OFF)
2540                 r = event_source_offline(s, m, s->ratelimited);
2541         else {
2542                 if (s->enabled != SD_EVENT_OFF) {
2543                         /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
2544                          * event source is already enabled after all. */
2545                         s->enabled = m;
2546                         return 0;
2547                 }
2548
2549                 r = event_source_online(s, m, s->ratelimited);
2550         }
2551         if (r < 0)
2552                 return r;
2553
2554         event_source_pp_prioq_reshuffle(s);
2555         return 0;
2556 }
2557
2558 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
2559         assert_return(s, -EINVAL);
2560         assert_return(usec, -EINVAL);
2561         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2562         assert_return(!event_pid_changed(s->event), -ECHILD);
2563
2564         *usec = s->time.next;
2565         return 0;
2566 }
2567
2568 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
2569         int r;
2570
2571         assert_return(s, -EINVAL);
2572         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2573         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2574         assert_return(!event_pid_changed(s->event), -ECHILD);
2575
2576         r = source_set_pending(s, false);
2577         if (r < 0)
2578                 return r;
2579
2580         s->time.next = usec;
2581
2582         event_source_time_prioq_reshuffle(s);
2583         return 0;
2584 }
2585
2586 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
2587         usec_t t;
2588         int r;
2589
2590         assert_return(s, -EINVAL);
2591         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2592
2593         r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
2594         if (r < 0)
2595                 return r;
2596
2597         usec = usec_add(t, usec);
2598         if (usec == USEC_INFINITY)
2599                 return -EOVERFLOW;
2600
2601         return sd_event_source_set_time(s, usec);
2602 }
2603
2604 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
2605         assert_return(s, -EINVAL);
2606         assert_return(usec, -EINVAL);
2607         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2608         assert_return(!event_pid_changed(s->event), -ECHILD);
2609
2610         *usec = s->time.accuracy;
2611         return 0;
2612 }
2613
2614 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
2615         int r;
2616
2617         assert_return(s, -EINVAL);
2618         assert_return(usec != UINT64_MAX, -EINVAL);
2619         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2620         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2621         assert_return(!event_pid_changed(s->event), -ECHILD);
2622
2623         r = source_set_pending(s, false);
2624         if (r < 0)
2625                 return r;
2626
2627         if (usec == 0)
2628                 usec = DEFAULT_ACCURACY_USEC;
2629
2630         s->time.accuracy = usec;
2631
2632         event_source_time_prioq_reshuffle(s);
2633         return 0;
2634 }
2635
2636 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
2637         assert_return(s, -EINVAL);
2638         assert_return(clock, -EINVAL);
2639         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2640         assert_return(!event_pid_changed(s->event), -ECHILD);
2641
2642         *clock = event_source_type_to_clock(s->type);
2643         return 0;
2644 }
2645
2646 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
2647         assert_return(s, -EINVAL);
2648         assert_return(pid, -EINVAL);
2649         assert_return(s->type == SOURCE_CHILD, -EDOM);
2650         assert_return(!event_pid_changed(s->event), -ECHILD);
2651
2652         *pid = s->child.pid;
2653         return 0;
2654 }
2655
2656 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
2657         assert_return(s, -EINVAL);
2658         assert_return(s->type == SOURCE_CHILD, -EDOM);
2659         assert_return(!event_pid_changed(s->event), -ECHILD);
2660
2661         if (s->child.pidfd < 0)
2662                 return -EOPNOTSUPP;
2663
2664         return s->child.pidfd;
2665 }
2666
2667 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
2668         assert_return(s, -EINVAL);
2669         assert_return(s->type == SOURCE_CHILD, -EDOM);
2670         assert_return(!event_pid_changed(s->event), -ECHILD);
2671         assert_return(SIGNAL_VALID(sig), -EINVAL);
2672
2673         /* If we already have seen indication the process exited refuse sending a signal early. This way we
2674          * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
2675          * available. */
2676         if (s->child.exited)
2677                 return -ESRCH;
2678
2679         if (s->child.pidfd >= 0) {
2680                 siginfo_t copy;
2681
2682                 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
2683                  * structure here */
2684                 if (si)
2685                         copy = *si;
2686
2687                 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
2688                         /* Let's propagate the error only if the system call is not implemented or prohibited */
2689                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
2690                                 return -errno;
2691                 } else
2692                         return 0;
2693         }
2694
2695         /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
2696          * this here. */
2697         if (flags != 0)
2698                 return -EOPNOTSUPP;
2699
2700         if (si) {
2701                 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
2702                 siginfo_t copy = *si;
2703
2704                 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
2705                         return -errno;
2706         } else if (kill(s->child.pid, sig) < 0)
2707                 return -errno;
2708
2709         return 0;
2710 }
2711
2712 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
2713         assert_return(s, -EINVAL);
2714         assert_return(s->type == SOURCE_CHILD, -EDOM);
2715
2716         if (s->child.pidfd < 0)
2717                 return -EOPNOTSUPP;
2718
2719         return s->child.pidfd_owned;
2720 }
2721
2722 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
2723         assert_return(s, -EINVAL);
2724         assert_return(s->type == SOURCE_CHILD, -EDOM);
2725
2726         if (s->child.pidfd < 0)
2727                 return -EOPNOTSUPP;
2728
2729         s->child.pidfd_owned = own;
2730         return 0;
2731 }
2732
2733 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
2734         assert_return(s, -EINVAL);
2735         assert_return(s->type == SOURCE_CHILD, -EDOM);
2736
2737         return s->child.process_owned;
2738 }
2739
2740 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
2741         assert_return(s, -EINVAL);
2742         assert_return(s->type == SOURCE_CHILD, -EDOM);
2743
2744         s->child.process_owned = own;
2745         return 0;
2746 }
2747
2748 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
2749         assert_return(s, -EINVAL);
2750         assert_return(mask, -EINVAL);
2751         assert_return(s->type == SOURCE_INOTIFY, -EDOM);
2752         assert_return(!event_pid_changed(s->event), -ECHILD);
2753
2754         *mask = s->inotify.mask;
2755         return 0;
2756 }
2757
2758 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
2759         int r;
2760
2761         assert_return(s, -EINVAL);
2762         assert_return(s->type != SOURCE_EXIT, -EDOM);
2763         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2764         assert_return(!event_pid_changed(s->event), -ECHILD);
2765
2766         if (s->prepare == callback)
2767                 return 0;
2768
2769         if (callback && s->prepare) {
2770                 s->prepare = callback;
2771                 return 0;
2772         }
2773
2774         r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
2775         if (r < 0)
2776                 return r;
2777
2778         s->prepare = callback;
2779
2780         if (callback) {
2781                 r = prioq_put(s->event->prepare, s, &s->prepare_index);
2782                 if (r < 0)
2783                         return r;
2784         } else
2785                 prioq_remove(s->event->prepare, s, &s->prepare_index);
2786
2787         return 0;
2788 }
2789
2790 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
2791         assert_return(s, NULL);
2792
2793         return s->userdata;
2794 }
2795
2796 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
2797         void *ret;
2798
2799         assert_return(s, NULL);
2800
2801         ret = s->userdata;
2802         s->userdata = userdata;
2803
2804         return ret;
2805 }
2806
2807 static int event_source_enter_ratelimited(sd_event_source *s) {
2808         int r;
2809
2810         assert(s);
2811
2812         /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
2813          * the end of the rate limit time window, much as if it was a timer event source. */
2814
2815         if (s->ratelimited)
2816                 return 0; /* Already ratelimited, this is a NOP hence */
2817
2818         /* Make sure we can install a CLOCK_MONOTONIC event further down. */
2819         r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
2820         if (r < 0)
2821                 return r;
2822
2823         /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
2824          * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
2825          * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
2826         if (EVENT_SOURCE_IS_TIME(s->type))
2827                 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
2828
2829         /* Now, let's add the event source to the monotonic clock instead */
2830         r = event_source_time_prioq_put(s, &s->event->monotonic);
2831         if (r < 0)
2832                 goto fail;
2833
2834         /* And let's take the event source officially offline */
2835         r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
2836         if (r < 0) {
2837                 event_source_time_prioq_remove(s, &s->event->monotonic);
2838                 goto fail;
2839         }
2840
2841         event_source_pp_prioq_reshuffle(s);
2842
2843         log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
2844         return 0;
2845
2846 fail:
2847         /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
2848          * space for it should already be allocated. */
2849         if (EVENT_SOURCE_IS_TIME(s->type))
2850                 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
2851
2852         return r;
2853 }
2854
2855 static int event_source_leave_ratelimit(sd_event_source *s) {
2856         int r;
2857
2858         assert(s);
2859
2860         if (!s->ratelimited)
2861                 return 0;
2862
2863         /* Let's take the event source out of the monotonic prioq first. */
2864         event_source_time_prioq_remove(s, &s->event->monotonic);
2865
2866         /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
2867         if (EVENT_SOURCE_IS_TIME(s->type)) {
2868                 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
2869                 if (r < 0)
2870                         goto fail;
2871         }
2872
2873         /* Let's try to take it online again.  */
2874         r = event_source_online(s, s->enabled, /* ratelimited= */ false);
2875         if (r < 0) {
2876                 /* Do something roughly sensible when this failed: undo the two prioq ops above */
2877                 if (EVENT_SOURCE_IS_TIME(s->type))
2878                         event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
2879
2880                 goto fail;
2881         }
2882
2883         event_source_pp_prioq_reshuffle(s);
2884         ratelimit_reset(&s->rate_limit);
2885
2886         log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
2887         return 0;
2888
2889 fail:
2890         /* Do something somewhat reasonable when we cannot move an event sources out of ratelimited mode:
2891          * simply put it back in it, maybe we can then process it more successfully next iteration. */
2892         assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
2893
2894         return r;
2895 }
2896
2897 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
2898         usec_t c;
2899         assert(e);
2900         assert(a <= b);
2901
2902         if (a <= 0)
2903                 return 0;
2904         if (a >= USEC_INFINITY)
2905                 return USEC_INFINITY;
2906
2907         if (b <= a + 1)
2908                 return a;
2909
2910         initialize_perturb(e);
2911
2912         /*
2913           Find a good time to wake up again between times a and b. We
2914           have two goals here:
2915
2916           a) We want to wake up as seldom as possible, hence prefer
2917              later times over earlier times.
2918
2919           b) But if we have to wake up, then let's make sure to
2920              dispatch as much as possible on the entire system.
2921
2922           We implement this by waking up everywhere at the same time
2923           within any given minute if we can, synchronised via the
2924           perturbation value determined from the boot ID. If we can't,
2925           then we try to find the same spot in every 10s, then 1s and
2926           then 250ms step. Otherwise, we pick the last possible time
2927           to wake up.
2928         */
2929
2930         c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
2931         if (c >= b) {
2932                 if (_unlikely_(c < USEC_PER_MINUTE))
2933                         return b;
2934
2935                 c -= USEC_PER_MINUTE;
2936         }
2937
2938         if (c >= a)
2939                 return c;
2940
2941         c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
2942         if (c >= b) {
2943                 if (_unlikely_(c < USEC_PER_SEC*10))
2944                         return b;
2945
2946                 c -= USEC_PER_SEC*10;
2947         }
2948
2949         if (c >= a)
2950                 return c;
2951
2952         c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
2953         if (c >= b) {
2954                 if (_unlikely_(c < USEC_PER_SEC))
2955                         return b;
2956
2957                 c -= USEC_PER_SEC;
2958         }
2959
2960         if (c >= a)
2961                 return c;
2962
2963         c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
2964         if (c >= b) {
2965                 if (_unlikely_(c < USEC_PER_MSEC*250))
2966                         return b;
2967
2968                 c -= USEC_PER_MSEC*250;
2969         }
2970
2971         if (c >= a)
2972                 return c;
2973
2974         return b;
2975 }
2976
2977 static int event_arm_timer(
2978                 sd_event *e,
2979                 struct clock_data *d) {
2980
2981         struct itimerspec its = {};
2982         sd_event_source *a, *b;
2983         usec_t t;
2984
2985         assert(e);
2986         assert(d);
2987
2988         if (!d->needs_rearm)
2989                 return 0;
2990         else
2991                 d->needs_rearm = false;
2992
2993         a = prioq_peek(d->earliest);
2994         if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
2995
2996                 if (d->fd < 0)
2997                         return 0;
2998
2999                 if (d->next == USEC_INFINITY)
3000                         return 0;
3001
3002                 /* disarm */
3003                 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3004                         return -errno;
3005
3006                 d->next = USEC_INFINITY;
3007                 return 0;
3008         }
3009
3010         b = prioq_peek(d->latest);
3011         assert_se(b && b->enabled != SD_EVENT_OFF);
3012
3013         t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3014         if (d->next == t)
3015                 return 0;
3016
3017         assert_se(d->fd >= 0);
3018
3019         if (t == 0) {
3020                 /* We don' want to disarm here, just mean some time looooong ago. */
3021                 its.it_value.tv_sec = 0;
3022                 its.it_value.tv_nsec = 1;
3023         } else
3024                 timespec_store(&its.it_value, t);
3025
3026         if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3027                 return -errno;
3028
3029         d->next = t;
3030         return 0;
3031 }
3032
3033 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3034         assert(e);
3035         assert(s);
3036         assert(s->type == SOURCE_IO);
3037
3038         /* If the event source was already pending, we just OR in the
3039          * new revents, otherwise we reset the value. The ORing is
3040          * necessary to handle EPOLLONESHOT events properly where
3041          * readability might happen independently of writability, and
3042          * we need to keep track of both */
3043
3044         if (s->pending)
3045                 s->io.revents |= revents;
3046         else
3047                 s->io.revents = revents;
3048
3049         return source_set_pending(s, true);
3050 }
3051
3052 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3053         uint64_t x;
3054         ssize_t ss;
3055
3056         assert(e);
3057         assert(fd >= 0);
3058
3059         assert_return(events == EPOLLIN, -EIO);
3060
3061         ss = read(fd, &x, sizeof(x));
3062         if (ss < 0) {
3063                 if (IN_SET(errno, EAGAIN, EINTR))
3064                         return 0;
3065
3066                 return -errno;
3067         }
3068
3069         if (_unlikely_(ss != sizeof(x)))
3070                 return -EIO;
3071
3072         if (next)
3073                 *next = USEC_INFINITY;
3074
3075         return 0;
3076 }
3077
3078 static int process_timer(
3079                 sd_event *e,
3080                 usec_t n,
3081                 struct clock_data *d) {
3082
3083         sd_event_source *s;
3084         int r;
3085
3086         assert(e);
3087         assert(d);
3088
3089         for (;;) {
3090                 s = prioq_peek(d->earliest);
3091                 if (!s || time_event_source_next(s) > n)
3092                         break;
3093
3094                 if (s->ratelimited) {
3095                         /* This is an event sources whose ratelimit window has ended. Let's turn it on
3096                          * again. */
3097                         assert(s->ratelimited);
3098
3099                         r = event_source_leave_ratelimit(s);
3100                         if (r < 0)
3101                                 return r;
3102
3103                         continue;
3104                 }
3105
3106                 if (s->enabled == SD_EVENT_OFF || s->pending)
3107                         break;
3108
3109                 r = source_set_pending(s, true);
3110                 if (r < 0)
3111                         return r;
3112
3113                 event_source_time_prioq_reshuffle(s);
3114         }
3115
3116         return 0;
3117 }
3118
3119 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3120         int64_t min_priority = threshold;
3121         bool something_new = false;
3122         sd_event_source *s;
3123         int r;
3124
3125         assert(e);
3126         assert(ret_min_priority);
3127
3128         if (!e->need_process_child) {
3129                 *ret_min_priority = min_priority;
3130                 return 0;
3131         }
3132
3133         e->need_process_child = false;
3134
3135         /*
3136            So, this is ugly. We iteratively invoke waitid() with P_PID
3137            + WNOHANG for each PID we wait for, instead of using
3138            P_ALL. This is because we only want to get child
3139            information of very specific child processes, and not all
3140            of them. We might not have processed the SIGCHLD even of a
3141            previous invocation and we don't want to maintain a
3142            unbounded *per-child* event queue, hence we really don't
3143            want anything flushed out of the kernel's queue that we
3144            don't care about. Since this is O(n) this means that if you
3145            have a lot of processes you probably want to handle SIGCHLD
3146            yourself.
3147
3148            We do not reap the children here (by using WNOWAIT), this
3149            is only done after the event source is dispatched so that
3150            the callback still sees the process as a zombie.
3151         */
3152
3153         HASHMAP_FOREACH(s, e->child_sources) {
3154                 assert(s->type == SOURCE_CHILD);
3155
3156                 if (s->priority > threshold)
3157                         continue;
3158
3159                 if (s->pending)
3160                         continue;
3161
3162                 if (event_source_is_offline(s))
3163                         continue;
3164
3165                 if (s->child.exited)
3166                         continue;
3167
3168                 if (EVENT_SOURCE_WATCH_PIDFD(s)) /* There's a usable pidfd known for this event source? then don't waitid() for it here */
3169                         continue;
3170
3171                 zero(s->child.siginfo);
3172                 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3173                            WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3174                         return negative_errno();
3175
3176                 if (s->child.siginfo.si_pid != 0) {
3177                         bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3178
3179                         if (zombie)
3180                                 s->child.exited = true;
3181
3182                         if (!zombie && (s->child.options & WEXITED)) {
3183                                 /* If the child isn't dead then let's
3184                                  * immediately remove the state change
3185                                  * from the queue, since there's no
3186                                  * benefit in leaving it queued */
3187
3188                                 assert(s->child.options & (WSTOPPED|WCONTINUED));
3189                                 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3190                         }
3191
3192                         r = source_set_pending(s, true);
3193                         if (r < 0)
3194                                 return r;
3195                         if (r > 0) {
3196                                 something_new = true;
3197                                 min_priority = MIN(min_priority, s->priority);
3198                         }
3199                 }
3200         }
3201
3202         *ret_min_priority = min_priority;
3203         return something_new;
3204 }
3205
3206 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3207         assert(e);
3208         assert(s);
3209         assert(s->type == SOURCE_CHILD);
3210
3211         if (s->pending)
3212                 return 0;
3213
3214         if (event_source_is_offline(s))
3215                 return 0;
3216
3217         if (!EVENT_SOURCE_WATCH_PIDFD(s))
3218                 return 0;
3219
3220         zero(s->child.siginfo);
3221         if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3222                 return -errno;
3223
3224         if (s->child.siginfo.si_pid == 0)
3225                 return 0;
3226
3227         if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3228                 s->child.exited = true;
3229
3230         return source_set_pending(s, true);
3231 }
3232
3233 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3234         int r;
3235
3236         assert(e);
3237         assert(d);
3238         assert_return(events == EPOLLIN, -EIO);
3239         assert(min_priority);
3240
3241         /* If there's a signal queued on this priority and SIGCHLD is
3242            on this priority too, then make sure to recheck the
3243            children we watch. This is because we only ever dequeue
3244            the first signal per priority, and if we dequeue one, and
3245            SIGCHLD might be enqueued later we wouldn't know, but we
3246            might have higher priority children we care about hence we
3247            need to check that explicitly. */
3248
3249         if (sigismember(&d->sigset, SIGCHLD))
3250                 e->need_process_child = true;
3251
3252         /* If there's already an event source pending for this
3253          * priority we don't read another */
3254         if (d->current)
3255                 return 0;
3256
3257         for (;;) {
3258                 struct signalfd_siginfo si;
3259                 ssize_t n;
3260                 sd_event_source *s = NULL;
3261
3262                 n = read(d->fd, &si, sizeof(si));
3263                 if (n < 0) {
3264                         if (IN_SET(errno, EAGAIN, EINTR))
3265                                 return 0;
3266
3267                         return -errno;
3268                 }
3269
3270                 if (_unlikely_(n != sizeof(si)))
3271                         return -EIO;
3272
3273                 assert(SIGNAL_VALID(si.ssi_signo));
3274
3275                 if (e->signal_sources)
3276                         s = e->signal_sources[si.ssi_signo];
3277                 if (!s)
3278                         continue;
3279                 if (s->pending)
3280                         continue;
3281
3282                 s->signal.siginfo = si;
3283                 d->current = s;
3284
3285                 r = source_set_pending(s, true);
3286                 if (r < 0)
3287                         return r;
3288                 if (r > 0 && *min_priority >= s->priority) {
3289                         *min_priority = s->priority;
3290                         return 1; /* an event source with smaller priority is queued. */
3291                 }
3292
3293                 return 0;
3294         }
3295 }
3296
3297 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3298         ssize_t n;
3299
3300         assert(e);
3301         assert(d);
3302
3303         assert_return(revents == EPOLLIN, -EIO);
3304
3305         /* If there's already an event source pending for this priority, don't read another */
3306         if (d->n_pending > 0)
3307                 return 0;
3308
3309         /* Is the read buffer non-empty? If so, let's not read more */
3310         if (d->buffer_filled > 0)
3311                 return 0;
3312
3313         if (d->priority > threshold)
3314                 return 0;
3315
3316         n = read(d->fd, &d->buffer, sizeof(d->buffer));
3317         if (n < 0) {
3318                 if (IN_SET(errno, EAGAIN, EINTR))
3319                         return 0;
3320
3321                 return -errno;
3322         }
3323
3324         assert(n > 0);
3325         d->buffer_filled = (size_t) n;
3326         LIST_PREPEND(buffered, e->inotify_data_buffered, d);
3327
3328         return 1;
3329 }
3330
3331 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3332         assert(e);
3333         assert(d);
3334         assert(sz <= d->buffer_filled);
3335
3336         if (sz == 0)
3337                 return;
3338
3339         /* Move the rest to the buffer to the front, in order to get things properly aligned again */
3340         memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3341         d->buffer_filled -= sz;
3342
3343         if (d->buffer_filled == 0)
3344                 LIST_REMOVE(buffered, e->inotify_data_buffered, d);
3345 }
3346
3347 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3348         int r;
3349
3350         assert(e);
3351         assert(d);
3352
3353         /* If there's already an event source pending for this priority, don't read another */
3354         if (d->n_pending > 0)
3355                 return 0;
3356
3357         while (d->buffer_filled > 0) {
3358                 size_t sz;
3359
3360                 /* Let's validate that the event structures are complete */
3361                 if (d->buffer_filled < offsetof(struct inotify_event, name))
3362                         return -EIO;
3363
3364                 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3365                 if (d->buffer_filled < sz)
3366                         return -EIO;
3367
3368                 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3369                         struct inode_data *inode_data;
3370
3371                         /* The queue overran, let's pass this event to all event sources connected to this inotify
3372                          * object */
3373
3374                         HASHMAP_FOREACH(inode_data, d->inodes) {
3375                                 sd_event_source *s;
3376
3377                                 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3378
3379                                         if (event_source_is_offline(s))
3380                                                 continue;
3381
3382                                         r = source_set_pending(s, true);
3383                                         if (r < 0)
3384                                                 return r;
3385                                 }
3386                         }
3387                 } else {
3388                         struct inode_data *inode_data;
3389                         sd_event_source *s;
3390
3391                         /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3392                          * our watch descriptor table. */
3393                         if (d->buffer.ev.mask & IN_IGNORED) {
3394
3395                                 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3396                                 if (!inode_data) {
3397                                         event_inotify_data_drop(e, d, sz);
3398                                         continue;
3399                                 }
3400
3401                                 /* The watch descriptor was removed by the kernel, let's drop it here too */
3402                                 inode_data->wd = -1;
3403                         } else {
3404                                 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3405                                 if (!inode_data) {
3406                                         event_inotify_data_drop(e, d, sz);
3407                                         continue;
3408                                 }
3409                         }
3410
3411                         /* Trigger all event sources that are interested in these events. Also trigger all event
3412                          * sources if IN_IGNORED or IN_UNMOUNT is set. */
3413                         LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3414
3415                                 if (event_source_is_offline(s))
3416                                         continue;
3417
3418                                 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3419                                     (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3420                                         continue;
3421
3422                                 r = source_set_pending(s, true);
3423                                 if (r < 0)
3424                                         return r;
3425                         }
3426                 }
3427
3428                 /* Something pending now? If so, let's finish, otherwise let's read more. */
3429                 if (d->n_pending > 0)
3430                         return 1;
3431         }
3432
3433         return 0;
3434 }
3435
3436 static int process_inotify(sd_event *e) {
3437         struct inotify_data *d;
3438         int r, done = 0;
3439
3440         assert(e);
3441
3442         LIST_FOREACH(buffered, d, e->inotify_data_buffered) {
3443                 r = event_inotify_data_process(e, d);
3444                 if (r < 0)
3445                         return r;
3446                 if (r > 0)
3447                         done ++;
3448         }
3449
3450         return done;
3451 }
3452
3453 static int source_dispatch(sd_event_source *s) {
3454         _cleanup_(sd_event_unrefp) sd_event *saved_event = NULL;
3455         EventSourceType saved_type;
3456         int r = 0;
3457
3458         assert(s);
3459         assert(s->pending || s->type == SOURCE_EXIT);
3460
3461         /* Save the event source type, here, so that we still know it after the event callback which might
3462          * invalidate the event. */
3463         saved_type = s->type;
3464
3465         /* Similar, store a reference to the event loop object, so that we can still access it after the
3466          * callback might have invalidated/disconnected the event source. */
3467         saved_event = sd_event_ref(s->event);
3468
3469         /* Check if we hit the ratelimit for this event source, if so, let's disable it. */
3470         assert(!s->ratelimited);
3471         if (!ratelimit_below(&s->rate_limit)) {
3472                 r = event_source_enter_ratelimited(s);
3473                 if (r < 0)
3474                         return r;
3475
3476                 return 1;
3477         }
3478
3479         if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
3480                 r = source_set_pending(s, false);
3481                 if (r < 0)
3482                         return r;
3483         }
3484
3485         if (s->type != SOURCE_POST) {
3486                 sd_event_source *z;
3487
3488                 /* If we execute a non-post source, let's mark all
3489                  * post sources as pending */
3490
3491                 SET_FOREACH(z, s->event->post_sources) {
3492                         if (event_source_is_offline(z))
3493                                 continue;
3494
3495                         r = source_set_pending(z, true);
3496                         if (r < 0)
3497                                 return r;
3498                 }
3499         }
3500
3501         if (s->enabled == SD_EVENT_ONESHOT) {
3502                 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
3503                 if (r < 0)
3504                         return r;
3505         }
3506
3507         s->dispatching = true;
3508
3509         switch (s->type) {
3510
3511         case SOURCE_IO:
3512                 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
3513                 break;
3514
3515         case SOURCE_TIME_REALTIME:
3516         case SOURCE_TIME_BOOTTIME:
3517         case SOURCE_TIME_MONOTONIC:
3518         case SOURCE_TIME_REALTIME_ALARM:
3519         case SOURCE_TIME_BOOTTIME_ALARM:
3520                 r = s->time.callback(s, s->time.next, s->userdata);
3521                 break;
3522
3523         case SOURCE_SIGNAL:
3524                 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
3525                 break;
3526
3527         case SOURCE_CHILD: {
3528                 bool zombie;
3529
3530                 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3531
3532                 r = s->child.callback(s, &s->child.siginfo, s->userdata);
3533
3534                 /* Now, reap the PID for good. */
3535                 if (zombie) {
3536                         (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
3537                         s->child.waited = true;
3538                 }
3539
3540                 break;
3541         }
3542
3543         case SOURCE_DEFER:
3544                 r = s->defer.callback(s, s->userdata);
3545                 break;
3546
3547         case SOURCE_POST:
3548                 r = s->post.callback(s, s->userdata);
3549                 break;
3550
3551         case SOURCE_EXIT:
3552                 r = s->exit.callback(s, s->userdata);
3553                 break;
3554
3555         case SOURCE_INOTIFY: {
3556                 struct sd_event *e = s->event;
3557                 struct inotify_data *d;
3558                 size_t sz;
3559
3560                 assert(s->inotify.inode_data);
3561                 assert_se(d = s->inotify.inode_data->inotify_data);
3562
3563                 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
3564                 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3565                 assert(d->buffer_filled >= sz);
3566
3567                 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
3568
3569                 /* When no event is pending anymore on this inotify object, then let's drop the event from the
3570                  * buffer. */
3571                 if (d->n_pending == 0)
3572                         event_inotify_data_drop(e, d, sz);
3573
3574                 break;
3575         }
3576
3577         case SOURCE_WATCHDOG:
3578         case _SOURCE_EVENT_SOURCE_TYPE_MAX:
3579         case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
3580                 assert_not_reached("Wut? I shouldn't exist.");
3581         }
3582
3583         s->dispatching = false;
3584
3585         if (r < 0) {
3586                 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
3587                                 strna(s->description),
3588                                 event_source_type_to_string(saved_type),
3589                                 s->exit_on_failure ? "exiting" : "disabling");
3590
3591                 if (s->exit_on_failure)
3592                         (void) sd_event_exit(saved_event, r);
3593         }
3594
3595         if (s->n_ref == 0)
3596                 source_free(s);
3597         else if (r < 0)
3598                 sd_event_source_set_enabled(s, SD_EVENT_OFF);
3599
3600         return 1;
3601 }
3602
3603 static int event_prepare(sd_event *e) {
3604         int r;
3605
3606         assert(e);
3607
3608         for (;;) {
3609                 sd_event_source *s;
3610
3611                 s = prioq_peek(e->prepare);
3612                 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
3613                         break;
3614
3615                 s->prepare_iteration = e->iteration;
3616                 r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
3617                 if (r < 0)
3618                         return r;
3619
3620                 assert(s->prepare);
3621
3622                 s->dispatching = true;
3623                 r = s->prepare(s, s->userdata);
3624                 s->dispatching = false;
3625
3626                 if (r < 0) {
3627                         log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
3628                                         strna(s->description),
3629                                         event_source_type_to_string(s->type),
3630                                         s->exit_on_failure ? "exiting" : "disabling");
3631
3632                         if (s->exit_on_failure)
3633                                 (void) sd_event_exit(e, r);
3634                 }
3635
3636                 if (s->n_ref == 0)
3637                         source_free(s);
3638                 else if (r < 0)
3639                         sd_event_source_set_enabled(s, SD_EVENT_OFF);
3640         }
3641
3642         return 0;
3643 }
3644
3645 static int dispatch_exit(sd_event *e) {
3646         sd_event_source *p;
3647         int r;
3648
3649         assert(e);
3650
3651         p = prioq_peek(e->exit);
3652         if (!p || event_source_is_offline(p)) {
3653                 e->state = SD_EVENT_FINISHED;
3654                 return 0;
3655         }
3656
3657         _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
3658         e->iteration++;
3659         e->state = SD_EVENT_EXITING;
3660         r = source_dispatch(p);
3661         e->state = SD_EVENT_INITIAL;
3662         return r;
3663 }
3664
3665 static sd_event_source* event_next_pending(sd_event *e) {
3666         sd_event_source *p;
3667
3668         assert(e);
3669
3670         p = prioq_peek(e->pending);
3671         if (!p)
3672                 return NULL;
3673
3674         if (event_source_is_offline(p))
3675                 return NULL;
3676
3677         return p;
3678 }
3679
3680 static int arm_watchdog(sd_event *e) {
3681         struct itimerspec its = {};
3682         usec_t t;
3683
3684         assert(e);
3685         assert(e->watchdog_fd >= 0);
3686
3687         t = sleep_between(e,
3688                           e->watchdog_last + (e->watchdog_period / 2),
3689                           e->watchdog_last + (e->watchdog_period * 3 / 4));
3690
3691         timespec_store(&its.it_value, t);
3692
3693         /* Make sure we never set the watchdog to 0, which tells the
3694          * kernel to disable it. */
3695         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
3696                 its.it_value.tv_nsec = 1;
3697
3698         if (timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3699                 return -errno;
3700
3701         return 0;
3702 }
3703
3704 static int process_watchdog(sd_event *e) {
3705         assert(e);
3706
3707         if (!e->watchdog)
3708                 return 0;
3709
3710         /* Don't notify watchdog too often */
3711         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
3712                 return 0;
3713
3714         sd_notify(false, "WATCHDOG=1");
3715         e->watchdog_last = e->timestamp.monotonic;
3716
3717         return arm_watchdog(e);
3718 }
3719
3720 static void event_close_inode_data_fds(sd_event *e) {
3721         struct inode_data *d;
3722
3723         assert(e);
3724
3725         /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
3726          * filesystems. But we can't close them right-away as we need them as long as the user still wants to make
3727          * adjustments to the even source, such as changing the priority (which requires us to remove and re-add a watch
3728          * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
3729          * compromise. */
3730
3731         while ((d = e->inode_data_to_close)) {
3732                 assert(d->fd >= 0);
3733                 d->fd = safe_close(d->fd);
3734
3735                 LIST_REMOVE(to_close, e->inode_data_to_close, d);
3736         }
3737 }
3738
/* Prepare phase of one event loop iteration. The loop must be in SD_EVENT_INITIAL state.
 *
 * Unless an exit was already requested, this bumps the iteration counter, runs the prepare callbacks, arms
 * the timers of all five clocks and closes the inode fds queued for closing.
 *
 * Returns 0 with the loop left in SD_EVENT_ARMED state if nothing is pending yet. If an exit was requested,
 * a source is already pending or child processing is needed, a zero-timeout sd_event_wait() is performed
 * instead and its result is returned. Returns a negative errno-style value on failure. */
_public_ int sd_event_prepare(sd_event *e) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);

        /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
         * this check here once, since gettid() is typically not cached, and thus want to minimize
         * syscalls */
        assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);

        /* Make sure that none of the preparation callbacks ends up freeing the event loop object under our
         * feet: hold an extra reference that is dropped automatically when we return. */
        _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);

        /* With an exit pending we skip preparation entirely and go straight to the wait step. */
        if (e->exit_requested)
                goto pending;

        e->iteration++;

        /* The state is SD_EVENT_PREPARING only while the prepare callbacks run. */
        e->state = SD_EVENT_PREPARING;
        r = event_prepare(e);
        e->state = SD_EVENT_INITIAL;
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->monotonic);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->realtime_alarm);
        if (r < 0)
                return r;

        r = event_arm_timer(e, &e->boottime_alarm);
        if (r < 0)
                return r;

        event_close_inode_data_fds(e);

        if (event_next_pending(e) || e->need_process_child)
                goto pending;

        e->state = SD_EVENT_ARMED;

        return 0;

pending:
        /* sd_event_wait() insists on SD_EVENT_ARMED state, hence enter it before calling. */
        e->state = SD_EVENT_ARMED;
        r = sd_event_wait(e, 0);
        /* When sd_event_wait() finds nothing it resets the state to SD_EVENT_INITIAL; since we report
         * "armed, nothing pending" (0) to our caller, restore the armed state. */
        if (r == 0)
                e->state = SD_EVENT_ARMED;

        return r;
}
3804
3805 static int epoll_wait_usec(
3806                 int fd,
3807                 struct epoll_event *events,
3808                 int maxevents,
3809                 usec_t timeout) {
3810
3811         int r, msec;
3812 #if 0
3813         static bool epoll_pwait2_absent = false;
3814
3815         /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not.
3816          *
3817          * FIXME: this is temporarily disabled until epoll_pwait2() becomes more widely available.
3818          * See https://github.com/systemd/systemd/pull/18973 and
3819          * https://github.com/systemd/systemd/issues/19052. */
3820
3821         if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
3822                 struct timespec ts;
3823
3824                 r = epoll_pwait2(fd,
3825                                  events,
3826                                  maxevents,
3827                                  timespec_store(&ts, timeout),
3828                                  NULL);
3829                 if (r >= 0)
3830                         return r;
3831                 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3832                         return -errno; /* Only fallback to old epoll_wait() if the syscall is masked or not
3833                                         * supported. */
3834
3835                 epoll_pwait2_absent = true;
3836         }
3837 #endif
3838
3839         if (timeout == USEC_INFINITY)
3840                 msec = -1;
3841         else {
3842                 usec_t k;
3843
3844                 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
3845                 if (k >= INT_MAX)
3846                         msec = INT_MAX; /* Saturate */
3847                 else
3848                         msec = (int) k;
3849         }
3850
3851         r = epoll_wait(fd,
3852                        events,
3853                        maxevents,
3854                        msec);
3855         if (r < 0)
3856                 return -errno;
3857
3858         return r;
3859 }
3860
/* Polls the epoll fd once (or a few times, see below) and hands every received wake-up to the matching
 * handler.
 *
 *   timeout          - how long to block in the first poll; forced to 0 if inotify data is still buffered
 *   threshold        - event sources with a priority value greater than this are skipped; INT64_MAX marks
 *                      the first call of a wait cycle and is what triggers refreshing e->timestamp
 *   ret_min_priority - on success receives the smallest priority value seen, starting out at threshold
 *
 * Returns > 0 if at least one handler reported something new, 0 if none did, and a negative errno-style
 * value on failure (in which case *ret_min_priority is not written). */
static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
        int64_t min_priority = threshold;
        bool something_new = false;
        size_t n_event_queue, m;
        int r;

        assert(e);
        assert(ret_min_priority);

        /* Make sure the receive buffer has room for at least one entry per event source. */
        n_event_queue = MAX(e->n_sources, 1u);
        if (!GREEDY_REALLOC(e->event_queue, e->event_queue_allocated, n_event_queue))
                return -ENOMEM;

        /* If we still have inotify data buffered, then query the other fds, but don't wait on it */
        if (e->inotify_data_buffered)
                timeout = 0;

        for (;;) {
                r = epoll_wait_usec(e->epoll_fd, e->event_queue, e->event_queue_allocated, timeout);
                if (r < 0)
                        return r;

                m = (size_t) r;

                /* The buffer was not filled completely, hence we got everything there is. */
                if (m < e->event_queue_allocated)
                        break;

                /* Don't grow the buffer beyond ten times its initial size. */
                if (e->event_queue_allocated >= n_event_queue * 10)
                        break;

                if (!GREEDY_REALLOC(e->event_queue, e->event_queue_allocated, e->event_queue_allocated + n_event_queue))
                        return -ENOMEM;

                /* We already have events, so the retry with the bigger buffer must not block. */
                timeout = 0;
        }

        /* Set timestamp only when this is called first time. */
        if (threshold == INT64_MAX)
                triple_timestamp_get(&e->timestamp);

        for (size_t i = 0; i < m; i++) {

                /* The watchdog timer is registered with a constant marker instead of a pointer to an
                 * object, hence check for it before dereferencing data.ptr. */
                if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
                        r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
                else {
                        /* The pointed-to object is read as a WakeupType first; its value selects how the
                         * very same pointer is interpreted below. */
                        WakeupType *t = e->event_queue[i].data.ptr;

                        switch (*t) {

                        case WAKEUP_EVENT_SOURCE: {
                                sd_event_source *s = e->event_queue[i].data.ptr;

                                assert(s);

                                /* Skip sources that are outside the priority range we currently accept. */
                                if (s->priority > threshold)
                                        continue;

                                min_priority = MIN(min_priority, s->priority);

                                switch (s->type) {

                                case SOURCE_IO:
                                        r = process_io(e, s, e->event_queue[i].events);
                                        break;

                                case SOURCE_CHILD:
                                        r = process_pidfd(e, s, e->event_queue[i].events);
                                        break;

                                default:
                                        assert_not_reached("Unexpected event source type");
                                }

                                break;
                        }

                        case WAKEUP_CLOCK_DATA: {
                                struct clock_data *d = e->event_queue[i].data.ptr;

                                assert(d);

                                r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
                                break;
                        }

                        case WAKEUP_SIGNAL_DATA:
                                r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
                                break;

                        case WAKEUP_INOTIFY_DATA:
                                r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
                                break;

                        default:
                                assert_not_reached("Invalid wake-up pointer");
                        }
                }
                /* Any handler failure aborts processing of the remaining wake-ups. */
                if (r < 0)
                        return r;
                if (r > 0)
                        something_new = true;
        }

        *ret_min_priority = min_priority;
        return something_new;
}
3967
/* Wait phase of one event loop iteration. The loop must be in SD_EVENT_ARMED state.
 *
 * Collects epoll and child events (blocking for at most 'timeout' in the first epoll round), then processes
 * the watchdog, the timers of all five clocks and buffered inotify data.
 *
 * Returns 1 with the loop in SD_EVENT_PENDING state if something is ready to be dispatched; this is also
 * the result when an exit was requested or the epoll wait was interrupted (-EINTR). Returns 0 with the loop
 * back in SD_EVENT_INITIAL state if nothing is pending, and a negative errno-style value (state
 * SD_EVENT_INITIAL as well) on failure. */
_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(e->state == SD_EVENT_ARMED, -EBUSY);

        /* A requested exit is reported as "something to dispatch" without polling at all. */
        if (e->exit_requested) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        for (int64_t threshold = INT64_MAX; ; threshold--) {
                int64_t epoll_min_priority, child_min_priority;

                /* There may be a possibility that new epoll (especially IO) and child events are
                 * triggered just after process_epoll() call but before process_child(), and the new IO
                 * events may have higher priority than the child events. To salvage these events,
                 * let's call epoll_wait() again, but accepts only events with higher priority than the
                 * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments
                 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
                 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */

                r = process_epoll(e, timeout, threshold, &epoll_min_priority);
                /* An interrupted wait is not treated as an error, but reported like a pending event. */
                if (r == -EINTR) {
                        e->state = SD_EVENT_PENDING;
                        return 1;
                }
                if (r < 0)
                        goto finish;
                /* In the first round (threshold still INT64_MAX) we go on to process_child() even if
                 * epoll delivered nothing new. */
                if (r == 0 && threshold < INT64_MAX)
                        /* No new epoll event. */
                        break;

                r = process_child(e, threshold, &child_min_priority);
                if (r < 0)
                        goto finish;
                if (r == 0)
                        /* No new child event. */
                        break;

                /* Combined with the threshold-- of the loop header, the next round accepts only
                 * priority values strictly below the smallest one seen so far. Stop at INT64_MIN, as
                 * the decrement would overflow. */
                threshold = MIN(epoll_min_priority, child_min_priority);
                if (threshold == INT64_MIN)
                        break;

                /* Only the first epoll round may block. */
                timeout = 0;
        }

        r = process_watchdog(e);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
        if (r < 0)
                goto finish;

        r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
        if (r < 0)
                goto finish;

        r = process_inotify(e);
        if (r < 0)
                goto finish;

        if (event_next_pending(e)) {
                e->state = SD_EVENT_PENDING;
                return 1;
        }

        r = 0;

finish:
        /* Reached both on error and when nothing is pending: either way the loop is no longer armed. */
        e->state = SD_EVENT_INITIAL;

        return r;
}
4058
4059 _public_ int sd_event_dispatch(sd_event *e) {
4060         sd_event_source *p;
4061         int r;
4062
4063         assert_return(e, -EINVAL);
4064         assert_return(e = event_resolve(e), -ENOPKG);
4065         assert_return(!event_pid_changed(e), -ECHILD);
4066         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4067         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4068
4069         if (e->exit_requested)
4070                 return dispatch_exit(e);
4071
4072         p = event_next_pending(e);
4073         if (p) {
4074                 _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
4075
4076                 e->state = SD_EVENT_RUNNING;
4077                 r = source_dispatch(p);
4078                 e->state = SD_EVENT_INITIAL;
4079                 return r;
4080         }
4081
4082         e->state = SD_EVENT_INITIAL;
4083
4084         return 1;
4085 }
4086
4087 static void event_log_delays(sd_event *e) {
4088         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4089         size_t l, i;
4090
4091         p = b;
4092         l = sizeof(b);
4093         for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4094                 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4095                 e->delays[i] = 0;
4096         }
4097         log_debug("Event loop iterations: %s", b);
4098 }
4099
4100 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4101         int r;
4102
4103         assert_return(e, -EINVAL);
4104         assert_return(e = event_resolve(e), -ENOPKG);
4105         assert_return(!event_pid_changed(e), -ECHILD);
4106         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4107         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4108
4109         if (e->profile_delays && e->last_run_usec != 0) {
4110                 usec_t this_run;
4111                 unsigned l;
4112
4113                 this_run = now(CLOCK_MONOTONIC);
4114
4115                 l = u64log2(this_run - e->last_run_usec);
4116                 assert(l < ELEMENTSOF(e->delays));
4117                 e->delays[l]++;
4118
4119                 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4120                         event_log_delays(e);
4121                         e->last_log_usec = this_run;
4122                 }
4123         }
4124
4125         /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4126         _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
4127
4128         r = sd_event_prepare(e);
4129         if (r == 0)
4130                 /* There was nothing? Then wait... */
4131                 r = sd_event_wait(e, timeout);
4132
4133         if (e->profile_delays)
4134                 e->last_run_usec = now(CLOCK_MONOTONIC);
4135
4136         if (r > 0) {
4137                 /* There's something now, then let's dispatch it */
4138                 r = sd_event_dispatch(e);
4139                 if (r < 0)
4140                         return r;
4141
4142                 return 1;
4143         }
4144
4145         return r;
4146 }
4147
4148 _public_ int sd_event_loop(sd_event *e) {
4149         int r;
4150
4151         assert_return(e, -EINVAL);
4152         assert_return(e = event_resolve(e), -ENOPKG);
4153         assert_return(!event_pid_changed(e), -ECHILD);
4154         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4155
4156         _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = NULL;
4157
4158         while (e->state != SD_EVENT_FINISHED) {
4159                 r = sd_event_run(e, UINT64_MAX);
4160                 if (r < 0)
4161                         return r;
4162         }
4163
4164         return e->exit_code;
4165 }
4166
/* Returns the epoll file descriptor of the event loop, or a negative errno-style value if the loop object
 * is missing, cannot be resolved, or event_pid_changed() reports a PID change. */
_public_ int sd_event_get_fd(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->epoll_fd;
}
4174
/* Returns the current state of the event loop (one of the SD_EVENT_* state values stored in e->state), or
 * a negative errno-style value if one of the argument checks fails. */
_public_ int sd_event_get_state(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->state;
}
4182
4183 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4184         assert_return(e, -EINVAL);
4185         assert_return(e = event_resolve(e), -ENOPKG);
4186         assert_return(code, -EINVAL);
4187         assert_return(!event_pid_changed(e), -ECHILD);
4188
4189         if (!e->exit_requested)
4190                 return -ENODATA;
4191
4192         *code = e->exit_code;
4193         return 0;
4194 }
4195
/* Requests termination of the event loop: marks the exit as requested and records 'code' as exit code.
 * The prepare, wait and dispatch steps check this flag and switch to exit handling. Returns 0 on success,
 * -ESTALE if the loop already finished, or another negative errno-style value if an argument check
 * fails. */
_public_ int sd_event_exit(sd_event *e, int code) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        e->exit_requested = true;
        e->exit_code = code;

        return 0;
}
4207
4208 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4209         assert_return(e, -EINVAL);
4210         assert_return(e = event_resolve(e), -ENOPKG);
4211         assert_return(usec, -EINVAL);
4212         assert_return(!event_pid_changed(e), -ECHILD);
4213
4214         if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4215                 return -EOPNOTSUPP;
4216
4217         /* Generate a clean error in case CLOCK_BOOTTIME is not available. Note that don't use clock_supported() here,
4218          * for a reason: there are systems where CLOCK_BOOTTIME is supported, but CLOCK_BOOTTIME_ALARM is not, but for
4219          * the purpose of getting the time this doesn't matter. */
4220         if (IN_SET(clock, CLOCK_BOOTTIME, CLOCK_BOOTTIME_ALARM) && !clock_boottime_supported())
4221                 return -EOPNOTSUPP;
4222
4223         if (!triple_timestamp_is_set(&e->timestamp)) {
4224                 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4225                 *usec = now(clock);
4226                 return 1;
4227         }
4228
4229         *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4230         return 0;
4231 }
4232
4233 _public_ int sd_event_default(sd_event **ret) {
4234         sd_event *e = NULL;
4235         int r;
4236
4237         if (!ret)
4238                 return !!default_event;
4239
4240         if (default_event) {
4241                 *ret = sd_event_ref(default_event);
4242                 return 0;
4243         }
4244
4245         r = sd_event_new(&e);
4246         if (r < 0)
4247                 return r;
4248
4249         e->default_event_ptr = &default_event;
4250         e->tid = gettid();
4251         default_event = e;
4252
4253         *ret = e;
4254         return 1;
4255 }
4256
4257 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
4258         assert_return(e, -EINVAL);
4259         assert_return(e = event_resolve(e), -ENOPKG);
4260         assert_return(tid, -EINVAL);
4261         assert_return(!event_pid_changed(e), -ECHILD);
4262
4263         if (e->tid != 0) {
4264                 *tid = e->tid;
4265                 return 0;
4266         }
4267
4268         return -ENXIO;
4269 }
4270
4271 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
4272         int r;
4273
4274         assert_return(e, -EINVAL);
4275         assert_return(e = event_resolve(e), -ENOPKG);
4276         assert_return(!event_pid_changed(e), -ECHILD);
4277
4278         if (e->watchdog == !!b)
4279                 return e->watchdog;
4280
4281         if (b) {
4282                 r = sd_watchdog_enabled(false, &e->watchdog_period);
4283                 if (r <= 0)
4284                         return r;
4285
4286                 /* Issue first ping immediately */
4287                 sd_notify(false, "WATCHDOG=1");
4288                 e->watchdog_last = now(CLOCK_MONOTONIC);
4289
4290                 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
4291                 if (e->watchdog_fd < 0)
4292                         return -errno;
4293
4294                 r = arm_watchdog(e);
4295                 if (r < 0)
4296                         goto fail;
4297
4298                 struct epoll_event ev = {
4299                         .events = EPOLLIN,
4300                         .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
4301                 };
4302
4303                 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
4304                         r = -errno;
4305                         goto fail;
4306                 }
4307
4308         } else {
4309                 if (e->watchdog_fd >= 0) {
4310                         (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
4311                         e->watchdog_fd = safe_close(e->watchdog_fd);
4312                 }
4313         }
4314
4315         e->watchdog = !!b;
4316         return e->watchdog;
4317
4318 fail:
4319         e->watchdog_fd = safe_close(e->watchdog_fd);
4320         return r;
4321 }
4322
/* Returns the watchdog flag of the event loop as set by sd_event_set_watchdog(), or a negative errno-style
 * value if one of the argument checks fails. */
_public_ int sd_event_get_watchdog(sd_event *e) {
        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(!event_pid_changed(e), -ECHILD);

        return e->watchdog;
}
4330
4331 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
4332         assert_return(e, -EINVAL);
4333         assert_return(e = event_resolve(e), -ENOPKG);
4334         assert_return(!event_pid_changed(e), -ECHILD);
4335
4336         *ret = e->iteration;
4337         return 0;
4338 }
4339
/* Stores 'callback' as the destroy callback of the event source (NULL is accepted and stored as is).
 * Returns 0 on success, -EINVAL if 's' is NULL. */
_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
        assert_return(s, -EINVAL);

        s->destroy_callback = callback;
        return 0;
}
4346
4347 _public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
4348         assert_return(s, -EINVAL);
4349
4350         if (ret)
4351                 *ret = s->destroy_callback;
4352
4353         return !!s->destroy_callback;
4354 }
4355
/* Returns the "floating" flag of the event source, or -EINVAL if 's' is NULL. */
_public_ int sd_event_source_get_floating(sd_event_source *s) {
        assert_return(s, -EINVAL);

        return s->floating;
}
4361
4362 _public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
4363         assert_return(s, -EINVAL);
4364
4365         if (s->floating == !!b)
4366                 return 0;
4367
4368         if (!s->event) /* Already disconnected */
4369                 return -ESTALE;
4370
4371         s->floating = b;
4372
4373         if (b) {
4374                 sd_event_source_ref(s);
4375                 sd_event_unref(s->event);
4376         } else {
4377                 sd_event_ref(s->event);
4378                 sd_event_source_unref(s);
4379         }
4380
4381         return 1;
4382 }
4383
/* Returns the "exit on failure" flag of the event source. Returns -EINVAL if 's' is NULL and -EDOM for
 * exit event sources, for which the flag is refused. */
_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);

        return s->exit_on_failure;
}
4390
4391 _public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
4392         assert_return(s, -EINVAL);
4393         assert_return(s->type != SOURCE_EXIT, -EDOM);
4394
4395         if (s->exit_on_failure == !!b)
4396                 return 0;
4397
4398         s->exit_on_failure = b;
4399         return 1;
4400 }
4401
4402 _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
4403         int r;
4404
4405         assert_return(s, -EINVAL);
4406
4407         /* Turning on ratelimiting on event source types that don't support it, is a loggable offense. Doing
4408          * so is a programming error. */
4409         assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
4410
4411         /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
4412          * non-ratelimited. */
4413         r = event_source_leave_ratelimit(s);
4414         if (r < 0)
4415                 return r;
4416
4417         s->rate_limit = (RateLimit) { interval, burst };
4418         return 0;
4419 }
4420
4421 _public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
4422         assert_return(s, -EINVAL);
4423
4424         /* Querying whether an event source has ratelimiting configured is not a loggable offsense, hence
4425          * don't use assert_return(). Unlike turning on ratelimiting it's not really a programming error */
4426         if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
4427                 return -EDOM;
4428
4429         if (!ratelimit_configured(&s->rate_limit))
4430                 return -ENOEXEC;
4431
4432         if (ret_interval)
4433                 *ret_interval = s->rate_limit.interval;
4434         if (ret_burst)
4435                 *ret_burst = s->rate_limit.burst;
4436
4437         return 0;
4438 }
4439
4440 _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
4441         assert_return(s, -EINVAL);
4442
4443         if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
4444                 return false;
4445
4446         if (!ratelimit_configured(&s->rate_limit))
4447                 return false;
4448
4449         return s->ratelimited;
4450 }