src/journal/journal-file.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2011 Lennart Poettering
   6 ***/
   7
   8 #include <errno.h>
   9 #include <fcntl.h>
  10 #include <linux/fs.h>
  11 #include <pthread.h>
  12 #include <stddef.h>
  13 #include <sys/mman.h>
  14 #include <sys/statvfs.h>
  15 #include <sys/uio.h>
  16 #include <unistd.h>
  17
  18 #include "alloc-util.h"
  19 #include "btrfs-util.h"
  20 #include "chattr-util.h"
  21 #include "compress.h"
  22 #include "fd-util.h"
  23 #include "fs-util.h"
  24 #include "journal-authenticate.h"
  25 #include "journal-def.h"
  26 #include "journal-file.h"
  27 #include "lookup3.h"
  28 #include "parse-util.h"
  29 #include "path-util.h"
  30 #include "random-util.h"
  31 #include "sd-event.h"
  32 #include "set.h"
  33 #include "stat-util.h"
  34 #include "string-util.h"
  35 #include "strv.h"
  36 #include "xattr-util.h"
  37
/* Default sizes, in bytes, of the data and field hash tables: the number of HashItem slots times the
 * size of a single slot. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Default and minimum compression thresholds.
 * NOTE(review): presumably payload sizes in bytes; the users of these macros are not in view here, confirm. */
#define DEFAULT_COMPRESS_THRESHOLD (512ULL)
#define MIN_COMPRESS_THRESHOLD (8ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL)                 /* 512 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file
 * system size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design, hence every valid header
 * is at least large enough to reach up to that field. */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header: we pick the value one above the last defined object type. */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX

/* When building with clang, silence its warning about taking the address of a packed struct member. */
#ifdef __clang__
#  pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
  87
  88 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
  89  * As a result we use atomic operations on f->offline_state for inter-thread communications with
  90  * journal_file_set_offline() and journal_file_set_online(). */
  91 static void journal_file_set_offline_internal(JournalFile *f) {
  92         assert(f);
  93         assert(f->fd >= 0);
  94         assert(f->header);
  95
  96         for (;;) {
  97                 switch (f->offline_state) {
  98                 case OFFLINE_CANCEL:
  99                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
 100                                 continue;
 101                         return;
 102
 103                 case OFFLINE_AGAIN_FROM_SYNCING:
 104                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
 105                                 continue;
 106                         break;
 107
 108                 case OFFLINE_AGAIN_FROM_OFFLINING:
 109                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
 110                                 continue;
 111                         break;
 112
 113                 case OFFLINE_SYNCING:
 114                         (void) fsync(f->fd);
 115
 116                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
 117                                 continue;
 118
 119                         f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
 120                         (void) fsync(f->fd);
 121                         break;
 122
 123                 case OFFLINE_OFFLINING:
 124                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
 125                                 continue;
 126                         _fallthrough_;
 127                 case OFFLINE_DONE:
 128                         return;
 129
 130                 case OFFLINE_JOINED:
 131                         log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
 132                         return;
 133                 }
 134         }
 135 }
 136
 137 static void * journal_file_set_offline_thread(void *arg) {
 138         JournalFile *f = arg;
 139
 140         (void) pthread_setname_np(pthread_self(), "journal-offline");
 141
 142         journal_file_set_offline_internal(f);
 143
 144         return NULL;
 145 }
 146
 147 static int journal_file_set_offline_thread_join(JournalFile *f) {
 148         int r;
 149
 150         assert(f);
 151
 152         if (f->offline_state == OFFLINE_JOINED)
 153                 return 0;
 154
 155         r = pthread_join(f->offline_thread, NULL);
 156         if (r)
 157                 return -r;
 158
 159         f->offline_state = OFFLINE_JOINED;
 160
 161         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
 162                 return -EIO;
 163
 164         return 0;
 165 }
 166
 167 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
 168 static bool journal_file_set_offline_try_restart(JournalFile *f) {
 169         for (;;) {
 170                 switch (f->offline_state) {
 171                 case OFFLINE_AGAIN_FROM_SYNCING:
 172                 case OFFLINE_AGAIN_FROM_OFFLINING:
 173                         return true;
 174
 175                 case OFFLINE_CANCEL:
 176                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
 177                                 continue;
 178                         return true;
 179
 180                 case OFFLINE_SYNCING:
 181                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
 182                                 continue;
 183                         return true;
 184
 185                 case OFFLINE_OFFLINING:
 186                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
 187                                 continue;
 188                         return true;
 189
 190                 default:
 191                         return false;
 192                 }
 193         }
 194 }
 195
 196 /* Sets a journal offline.
 197  *
 198  * If wait is false then an offline is dispatched in a separate thread for a
 199  * subsequent journal_file_set_offline() or journal_file_set_online() of the
 200  * same journal to synchronize with.
 201  *
 202  * If wait is true, then either an existing offline thread will be restarted
 203  * and joined, or if none exists the offline is simply performed in this
 204  * context without involving another thread.
 205  */
 206 int journal_file_set_offline(JournalFile *f, bool wait) {
 207         bool restarted;
 208         int r;
 209
 210         assert(f);
 211
 212         if (!f->writable)
 213                 return -EPERM;
 214
 215         if (!(f->fd >= 0 && f->header))
 216                 return -EINVAL;
 217
 218         /* An offlining journal is implicitly online and may modify f->header->state,
 219          * we must also join any potentially lingering offline thread when not online. */
 220         if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
 221                 return journal_file_set_offline_thread_join(f);
 222
 223         /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
 224         restarted = journal_file_set_offline_try_restart(f);
 225         if ((restarted && wait) || !restarted) {
 226                 r = journal_file_set_offline_thread_join(f);
 227                 if (r < 0)
 228                         return r;
 229         }
 230
 231         if (restarted)
 232                 return 0;
 233
 234         /* Initiate a new offline. */
 235         f->offline_state = OFFLINE_SYNCING;
 236
 237         if (wait) /* Without using a thread if waiting. */
 238                 journal_file_set_offline_internal(f);
 239         else {
 240                 sigset_t ss, saved_ss;
 241                 int k;
 242
 243                 if (sigfillset(&ss) < 0)
 244                         return -errno;
 245
 246                 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
 247                 if (r > 0)
 248                         return -r;
 249
 250                 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
 251
 252                 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
 253                 if (r > 0) {
 254                         f->offline_state = OFFLINE_JOINED;
 255                         return -r;
 256                 }
 257                 if (k > 0)
 258                         return -k;
 259         }
 260
 261         return 0;
 262 }
 263
 264 static int journal_file_set_online(JournalFile *f) {
 265         bool wait = true;
 266
 267         assert(f);
 268
 269         if (!f->writable)
 270                 return -EPERM;
 271
 272         if (!(f->fd >= 0 && f->header))
 273                 return -EINVAL;
 274
 275         while (wait) {
 276                 switch (f->offline_state) {
 277                 case OFFLINE_JOINED:
 278                         /* No offline thread, no need to wait. */
 279                         wait = false;
 280                         break;
 281
 282                 case OFFLINE_SYNCING:
 283                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
 284                                 continue;
 285                         /* Canceled syncing prior to offlining, no need to wait. */
 286                         wait = false;
 287                         break;
 288
 289                 case OFFLINE_AGAIN_FROM_SYNCING:
 290                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
 291                                 continue;
 292                         /* Canceled restart from syncing, no need to wait. */
 293                         wait = false;
 294                         break;
 295
 296                 case OFFLINE_AGAIN_FROM_OFFLINING:
 297                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
 298                                 continue;
 299                         /* Canceled restart from offlining, must wait for offlining to complete however. */
 300                         _fallthrough_;
 301                 default: {
 302                         int r;
 303
 304                         r = journal_file_set_offline_thread_join(f);
 305                         if (r < 0)
 306                                 return r;
 307
 308                         wait = false;
 309                         break;
 310                 }
 311                 }
 312         }
 313
 314         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
 315                 return -EIO;
 316
 317         switch (f->header->state) {
 318                 case STATE_ONLINE:
 319                         return 0;
 320
 321                 case STATE_OFFLINE:
 322                         f->header->state = STATE_ONLINE;
 323                         (void) fsync(f->fd);
 324                         return 0;
 325
 326                 default:
 327                         return -EINVAL;
 328         }
 329 }
 330
 331 bool journal_file_is_offlining(JournalFile *f) {
 332         assert(f);
 333
 334         __sync_synchronize();
 335
 336         if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
 337                 return false;
 338
 339         return true;
 340 }
 341
 342 JournalFile* journal_file_close(JournalFile *f) {
 343         assert(f);
 344
 345 #if HAVE_GCRYPT
 346         /* Write the final tag */
 347         if (f->seal && f->writable) {
 348                 int r;
 349
 350                 r = journal_file_append_tag(f);
 351                 if (r < 0)
 352                         log_error_errno(r, "Failed to append tag when closing journal: %m");
 353         }
 354 #endif
 355
 356         if (f->post_change_timer) {
 357                 int enabled;
 358
 359                 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
 360                         if (enabled == SD_EVENT_ONESHOT)
 361                                 journal_file_post_change(f);
 362
 363                 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
 364                 sd_event_source_unref(f->post_change_timer);
 365         }
 366
 367         journal_file_set_offline(f, true);
 368
 369         if (f->mmap && f->cache_fd)
 370                 mmap_cache_free_fd(f->mmap, f->cache_fd);
 371
 372         if (f->fd >= 0 && f->defrag_on_close) {
 373
 374                 /* Be friendly to btrfs: turn COW back on again now,
 375                  * and defragment the file. We won't write to the file
 376                  * ever again, hence remove all fragmentation, and
 377                  * reenable all the good bits COW usually provides
 378                  * (such as data checksumming). */
 379
 380                 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
 381                 (void) btrfs_defrag_fd(f->fd);
 382         }
 383
 384         if (f->close_fd)
 385                 safe_close(f->fd);
 386         free(f->path);
 387
 388         mmap_cache_unref(f->mmap);
 389
 390         ordered_hashmap_free_free(f->chain_cache);
 391
 392 #if HAVE_XZ || HAVE_LZ4
 393         free(f->compress_buffer);
 394 #endif
 395
 396 #if HAVE_GCRYPT
 397         if (f->fss_file)
 398                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
 399         else
 400                 free(f->fsprg_state);
 401
 402         free(f->fsprg_seed);
 403
 404         if (f->hmac)
 405                 gcry_md_close(f->hmac);
 406 #endif
 407
 408         return mfree(f);
 409 }
 410
 411 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
 412         Header h = {};
 413         ssize_t k;
 414         int r;
 415
 416         assert(f);
 417
 418         memcpy(h.signature, HEADER_SIGNATURE, 8);
 419         h.header_size = htole64(ALIGN64(sizeof(h)));
 420
 421         h.incompatible_flags |= htole32(
 422                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
 423                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
 424
 425         h.compatible_flags = htole32(
 426                 f->seal * HEADER_COMPATIBLE_SEALED);
 427
 428         r = sd_id128_randomize(&h.file_id);
 429         if (r < 0)
 430                 return r;
 431
 432         if (template) {
 433                 h.seqnum_id = template->header->seqnum_id;
 434                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
 435         } else
 436                 h.seqnum_id = h.file_id;
 437
 438         k = pwrite(f->fd, &h, sizeof(h), 0);
 439         if (k < 0)
 440                 return -errno;
 441
 442         if (k != sizeof(h))
 443                 return -EIO;
 444
 445         return 0;
 446 }
 447
 448 static int journal_file_refresh_header(JournalFile *f) {
 449         sd_id128_t boot_id;
 450         int r;
 451
 452         assert(f);
 453         assert(f->header);
 454
 455         r = sd_id128_get_machine(&f->header->machine_id);
 456         if (IN_SET(r, -ENOENT, -ENOMEDIUM))
 457                 /* We don't have a machine-id, let's continue without */
 458                 zero(f->header->machine_id);
 459         else if (r < 0)
 460                 return r;
 461
 462         r = sd_id128_get_boot(&boot_id);
 463         if (r < 0)
 464                 return r;
 465
 466         f->header->boot_id = boot_id;
 467
 468         r = journal_file_set_online(f);
 469
 470         /* Sync the online state to disk */
 471         (void) fsync(f->fd);
 472
 473         /* We likely just created a new file, also sync the directory this file is located in. */
 474         (void) fsync_directory_of_file(f->fd);
 475
 476         return r;
 477 }
 478
 479 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
 480         const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
 481                 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
 482         const char *type = compatible ? "compatible" : "incompatible";
 483         uint32_t flags;
 484
 485         flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
 486
 487         if (flags & ~supported) {
 488                 if (flags & ~any)
 489                         log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
 490                                   f->path, type, flags & ~any);
 491                 flags = (flags & any) & ~supported;
 492                 if (flags) {
 493                         const char* strv[3];
 494                         unsigned n = 0;
 495                         _cleanup_free_ char *t = NULL;
 496
 497                         if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
 498                                 strv[n++] = "sealed";
 499                         if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
 500                                 strv[n++] = "xz-compressed";
 501                         if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
 502                                 strv[n++] = "lz4-compressed";
 503                         strv[n] = NULL;
 504                         assert(n < ELEMENTSOF(strv));
 505
 506                         t = strv_join((char**) strv, ", ");
 507                         log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
 508                                   f->path, type, n > 1 ? "flags" : "flag", strnull(t));
 509                 }
 510                 return true;
 511         }
 512
 513         return false;
 514 }
 515
 516 static int journal_file_verify_header(JournalFile *f) {
 517         uint64_t arena_size, header_size;
 518
 519         assert(f);
 520         assert(f->header);
 521
 522         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
 523                 return -EBADMSG;
 524
 525         /* In both read and write mode we refuse to open files with incompatible
 526          * flags we don't know. */
 527         if (warn_wrong_flags(f, false))
 528                 return -EPROTONOSUPPORT;
 529
 530         /* When open for writing we refuse to open files with compatible flags, too. */
 531         if (f->writable && warn_wrong_flags(f, true))
 532                 return -EPROTONOSUPPORT;
 533
 534         if (f->header->state >= _STATE_MAX)
 535                 return -EBADMSG;
 536
 537         header_size = le64toh(f->header->header_size);
 538
 539         /* The first addition was n_data, so check that we are at least this large */
 540         if (header_size < HEADER_SIZE_MIN)
 541                 return -EBADMSG;
 542
 543         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
 544                 return -EBADMSG;
 545
 546         arena_size = le64toh(f->header->arena_size);
 547
 548         if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
 549                 return -ENODATA;
 550
 551         if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
 552                 return -ENODATA;
 553
 554         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
 555             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
 556             !VALID64(le64toh(f->header->tail_object_offset)) ||
 557             !VALID64(le64toh(f->header->entry_array_offset)))
 558                 return -ENODATA;
 559
 560         if (f->writable) {
 561                 sd_id128_t machine_id;
 562                 uint8_t state;
 563                 int r;
 564
 565                 r = sd_id128_get_machine(&machine_id);
 566                 if (r < 0)
 567                         return r;
 568
 569                 if (!sd_id128_equal(machine_id, f->header->machine_id))
 570                         return -EHOSTDOWN;
 571
 572                 state = f->header->state;
 573
 574                 if (state == STATE_ARCHIVED)
 575                         return -ESHUTDOWN; /* Already archived */
 576                 else if (state == STATE_ONLINE) {
 577                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
 578                         return -EBUSY;
 579                 } else if (state != STATE_OFFLINE) {
 580                         log_debug("Journal file %s has unknown state %i.", f->path, state);
 581                         return -EBUSY;
 582                 }
 583
 584                 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
 585                         return -EBADMSG;
 586
 587                 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
 588                  * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
 589                  * bisection. */
 590                 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
 591                         log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
 592                         return -ETXTBSY;
 593                 }
 594         }
 595
 596         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
 597         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
 598
 599         f->seal = JOURNAL_HEADER_SEALED(f->header);
 600
 601         return 0;
 602 }
 603
 604 static int journal_file_fstat(JournalFile *f) {
 605         int r;
 606
 607         assert(f);
 608         assert(f->fd >= 0);
 609
 610         if (fstat(f->fd, &f->last_stat) < 0)
 611                 return -errno;
 612
 613         f->last_stat_usec = now(CLOCK_MONOTONIC);
 614
 615         /* Refuse dealing with with files that aren't regular */
 616         r = stat_verify_regular(&f->last_stat);
 617         if (r < 0)
 618                 return r;
 619
 620         /* Refuse appending to files that are already deleted */
 621         if (f->last_stat.st_nlink <= 0)
 622                 return -EIDRM;
 623
 624         return 0;
 625 }
 626
 627 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
 628         uint64_t old_size, new_size;
 629         int r;
 630
 631         assert(f);
 632         assert(f->header);
 633
 634         /* We assume that this file is not sparse, and we know that
 635          * for sure, since we always call posix_fallocate()
 636          * ourselves */
 637
 638         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
 639                 return -EIO;
 640
 641         old_size =
 642                 le64toh(f->header->header_size) +
 643                 le64toh(f->header->arena_size);
 644
 645         new_size = PAGE_ALIGN(offset + size);
 646         if (new_size < le64toh(f->header->header_size))
 647                 new_size = le64toh(f->header->header_size);
 648
 649         if (new_size <= old_size) {
 650
 651                 /* We already pre-allocated enough space, but before
 652                  * we write to it, let's check with fstat() if the
 653                  * file got deleted, in order make sure we don't throw
 654                  * away the data immediately. Don't check fstat() for
 655                  * all writes though, but only once ever 10s. */
 656
 657                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
 658                         return 0;
 659
 660                 return journal_file_fstat(f);
 661         }
 662
 663         /* Allocate more space. */
 664
 665         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 666                 return -E2BIG;
 667
 668         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
 669                 struct statvfs svfs;
 670
 671                 if (fstatvfs(f->fd, &svfs) >= 0) {
 672                         uint64_t available;
 673
 674                         available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
 675
 676                         if (new_size - old_size > available)
 677                                 return -E2BIG;
 678                 }
 679         }
 680
 681         /* Increase by larger blocks at once */
 682         new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
 683         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 684                 new_size = f->metrics.max_size;
 685
 686         /* Note that the glibc fallocate() fallback is very
 687            inefficient, hence we try to minimize the allocation area
 688            as we can. */
 689         r = posix_fallocate(f->fd, old_size, new_size - old_size);
 690         if (r != 0)
 691                 return -r;
 692
 693         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
 694
 695         return journal_file_fstat(f);
 696 }
 697
 698 static unsigned type_to_context(ObjectType type) {
 699         /* One context for each type, plus one catch-all for the rest */
 700         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
 701         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
 702         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
 703 }
 704
 705 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
 706         int r;
 707
 708         assert(f);
 709         assert(ret);
 710
 711         if (size <= 0)
 712                 return -EINVAL;
 713
 714         /* Avoid SIGBUS on invalid accesses */
 715         if (offset + size > (uint64_t) f->last_stat.st_size) {
 716                 /* Hmm, out of range? Let's refresh the fstat() data
 717                  * first, before we trust that check. */
 718
 719                 r = journal_file_fstat(f);
 720                 if (r < 0)
 721                         return r;
 722
 723                 if (offset + size > (uint64_t) f->last_stat.st_size)
 724                         return -EADDRNOTAVAIL;
 725         }
 726
 727         return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
 728 }
 729
 730 static uint64_t minimum_header_size(Object *o) {
 731
 732         static const uint64_t table[] = {
 733                 [OBJECT_DATA] = sizeof(DataObject),
 734                 [OBJECT_FIELD] = sizeof(FieldObject),
 735                 [OBJECT_ENTRY] = sizeof(EntryObject),
 736                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
 737                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
 738                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
 739                 [OBJECT_TAG] = sizeof(TagObject),
 740         };
 741
 742         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
 743                 return sizeof(ObjectHeader);
 744
 745         return table[o->object.type];
 746 }
 747
 748 /* Lightweight object checks. We want this to be fast, so that we won't
 749  * slowdown every journal_file_move_to_object() call too much. */
 750 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
 751         assert(f);
 752         assert(o);
 753
 754         switch (o->object.type) {
 755
 756         case OBJECT_DATA: {
 757                 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
 758                         log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
 759                                         le64toh(o->data.n_entries), offset);
 760                         return -EBADMSG;
 761                 }
 762
 763                 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
 764                         log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
 765                               offsetof(DataObject, payload),
 766                               le64toh(o->object.size),
 767                               offset);
 768                         return -EBADMSG;
 769                 }
 770
 771                 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
 772                     !VALID64(le64toh(o->data.next_field_offset)) ||
 773                     !VALID64(le64toh(o->data.entry_offset)) ||
 774                     !VALID64(le64toh(o->data.entry_array_offset))) {
 775                         log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
 776                                 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
 777                               le64toh(o->data.next_hash_offset),
 778                               le64toh(o->data.next_field_offset),
 779                               le64toh(o->data.entry_offset),
 780                               le64toh(o->data.entry_array_offset),
 781                               offset);
 782                         return -EBADMSG;
 783                 }
 784
 785                 break;
 786         }
 787
 788         case OBJECT_FIELD:
 789                 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
 790                         log_debug(
 791                               "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
 792                               offsetof(FieldObject, payload),
 793                               le64toh(o->object.size),
 794                               offset);
 795                         return -EBADMSG;
 796                 }
 797
 798                 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
 799                     !VALID64(le64toh(o->field.head_data_offset))) {
 800                         log_debug(
 801                               "Invalid offset, next_hash_offset="OFSfmt
 802                               ", head_data_offset="OFSfmt": %"PRIu64,
 803                               le64toh(o->field.next_hash_offset),
 804                               le64toh(o->field.head_data_offset),
 805                               offset);
 806                         return -EBADMSG;
 807                 }
 808                 break;
 809
 810         case OBJECT_ENTRY:
 811                 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
 812                         log_debug(
 813                               "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
 814                               offsetof(EntryObject, items),
 815                               le64toh(o->object.size),
 816                               offset);
 817                         return -EBADMSG;
 818                 }
 819
 820                 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
 821                         log_debug(
 822                               "Invalid number items in entry: %"PRIu64": %"PRIu64,
 823                               (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
 824                               offset);
 825                         return -EBADMSG;
 826                 }
 827
 828                 if (le64toh(o->entry.seqnum) <= 0) {
 829                         log_debug(
 830                               "Invalid entry seqnum: %"PRIx64": %"PRIu64,
 831                               le64toh(o->entry.seqnum),
 832                               offset);
 833                         return -EBADMSG;
 834                 }
 835
 836                 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
 837                         log_debug(
 838                               "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
 839                               le64toh(o->entry.realtime),
 840                               offset);
 841                         return -EBADMSG;
 842                 }
 843
 844                 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
 845                         log_debug(
 846                               "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
 847                               le64toh(o->entry.monotonic),
 848                               offset);
 849                         return -EBADMSG;
 850                 }
 851
 852                 break;
 853
 854         case OBJECT_DATA_HASH_TABLE:
 855         case OBJECT_FIELD_HASH_TABLE:
 856                 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
 857                     (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
 858                         log_debug(
 859                               "Invalid %s hash table size: %"PRIu64": %"PRIu64,
 860                               o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
 861                               le64toh(o->object.size),
 862                               offset);
 863                         return -EBADMSG;
 864                 }
 865
 866                 break;
 867
 868         case OBJECT_ENTRY_ARRAY:
 869                 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
 870                     (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
 871                         log_debug(
 872                               "Invalid object entry array size: %"PRIu64": %"PRIu64,
 873                               le64toh(o->object.size),
 874                               offset);
 875                         return -EBADMSG;
 876                 }
 877
 878                 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
 879                         log_debug(
 880                               "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
 881                               le64toh(o->entry_array.next_entry_array_offset),
 882                               offset);
 883                         return -EBADMSG;
 884                 }
 885
 886                 break;
 887
 888         case OBJECT_TAG:
 889                 if (le64toh(o->object.size) != sizeof(TagObject)) {
 890                         log_debug(
 891                               "Invalid object tag size: %"PRIu64": %"PRIu64,
 892                               le64toh(o->object.size),
 893                               offset);
 894                         return -EBADMSG;
 895                 }
 896
 897                 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
 898                         log_debug(
 899                               "Invalid object tag epoch: %"PRIu64": %"PRIu64,
 900                               le64toh(o->tag.epoch),
 901                               offset);
 902                         return -EBADMSG;
 903                 }
 904
 905                 break;
 906         }
 907
 908         return 0;
 909 }
 910
 911 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
 912         int r;
 913         void *t;
 914         size_t tsize;
 915         Object *o;
 916         uint64_t s;
 917
 918         assert(f);
 919         assert(ret);
 920
 921         /* Objects may only be located at multiple of 64 bit */
 922         if (!VALID64(offset)) {
 923                 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
 924                 return -EBADMSG;
 925         }
 926
 927         /* Object may not be located in the file header */
 928         if (offset < le64toh(f->header->header_size)) {
 929                 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
 930                 return -EBADMSG;
 931         }
 932
 933         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
 934         if (r < 0)
 935                 return r;
 936
 937         o = (Object*) t;
 938         s = le64toh(o->object.size);
 939
 940         if (s == 0) {
 941                 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
 942                 return -EBADMSG;
 943         }
 944         if (s < sizeof(ObjectHeader)) {
 945                 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
 946                 return -EBADMSG;
 947         }
 948
 949         if (o->object.type <= OBJECT_UNUSED) {
 950                 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
 951                 return -EBADMSG;
 952         }
 953
 954         if (s < minimum_header_size(o)) {
 955                 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
 956                 return -EBADMSG;
 957         }
 958
 959         if (type > OBJECT_UNUSED && o->object.type != type) {
 960                 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
 961                 return -EBADMSG;
 962         }
 963
 964         if (s > tsize) {
 965                 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
 966                 if (r < 0)
 967                         return r;
 968
 969                 o = (Object*) t;
 970         }
 971
 972         r = journal_file_check_object(f, offset, o);
 973         if (r < 0)
 974                 return r;
 975
 976         *ret = o;
 977         return 0;
 978 }
 979
 980 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
 981         uint64_t r;
 982
 983         assert(f);
 984         assert(f->header);
 985
 986         r = le64toh(f->header->tail_entry_seqnum) + 1;
 987
 988         if (seqnum) {
 989                 /* If an external seqnum counter was passed, we update
 990                  * both the local and the external one, and set it to
 991                  * the maximum of both */
 992
 993                 if (*seqnum + 1 > r)
 994                         r = *seqnum + 1;
 995
 996                 *seqnum = r;
 997         }
 998
 999         f->header->tail_entry_seqnum = htole64(r);
1000
1001         if (f->header->head_entry_seqnum == 0)
1002                 f->header->head_entry_seqnum = htole64(r);
1003
1004         return r;
1005 }
1006
1007 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
1008         int r;
1009         uint64_t p;
1010         Object *tail, *o;
1011         void *t;
1012
1013         assert(f);
1014         assert(f->header);
1015         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
1016         assert(size >= sizeof(ObjectHeader));
1017         assert(offset);
1018         assert(ret);
1019
1020         r = journal_file_set_online(f);
1021         if (r < 0)
1022                 return r;
1023
1024         p = le64toh(f->header->tail_object_offset);
1025         if (p == 0)
1026                 p = le64toh(f->header->header_size);
1027         else {
1028                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
1029                 if (r < 0)
1030                         return r;
1031
1032                 p += ALIGN64(le64toh(tail->object.size));
1033         }
1034
1035         r = journal_file_allocate(f, p, size);
1036         if (r < 0)
1037                 return r;
1038
1039         r = journal_file_move_to(f, type, false, p, size, &t, NULL);
1040         if (r < 0)
1041                 return r;
1042
1043         o = (Object*) t;
1044
1045         zero(o->object);
1046         o->object.type = type;
1047         o->object.size = htole64(size);
1048
1049         f->header->tail_object_offset = htole64(p);
1050         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1051
1052         *ret = o;
1053         *offset = p;
1054
1055         return 0;
1056 }
1057
1058 static int journal_file_setup_data_hash_table(JournalFile *f) {
1059         uint64_t s, p;
1060         Object *o;
1061         int r;
1062
1063         assert(f);
1064         assert(f->header);
1065
1066         /* We estimate that we need 1 hash table entry per 768 bytes
1067            of journal file and we want to make sure we never get
1068            beyond 75% fill level. Calculate the hash table size for
1069            the maximum file size based on these metrics. */
1070
1071         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
1072         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1073                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1074
1075         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
1076
1077         r = journal_file_append_object(f,
1078                                        OBJECT_DATA_HASH_TABLE,
1079                                        offsetof(Object, hash_table.items) + s,
1080                                        &o, &p);
1081         if (r < 0)
1082                 return r;
1083
1084         memzero(o->hash_table.items, s);
1085
1086         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1087         f->header->data_hash_table_size = htole64(s);
1088
1089         return 0;
1090 }
1091
1092 static int journal_file_setup_field_hash_table(JournalFile *f) {
1093         uint64_t s, p;
1094         Object *o;
1095         int r;
1096
1097         assert(f);
1098         assert(f->header);
1099
1100         /* We use a fixed size hash table for the fields as this
1101          * number should grow very slowly only */
1102
1103         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1104         r = journal_file_append_object(f,
1105                                        OBJECT_FIELD_HASH_TABLE,
1106                                        offsetof(Object, hash_table.items) + s,
1107                                        &o, &p);
1108         if (r < 0)
1109                 return r;
1110
1111         memzero(o->hash_table.items, s);
1112
1113         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1114         f->header->field_hash_table_size = htole64(s);
1115
1116         return 0;
1117 }
1118
1119 int journal_file_map_data_hash_table(JournalFile *f) {
1120         uint64_t s, p;
1121         void *t;
1122         int r;
1123
1124         assert(f);
1125         assert(f->header);
1126
1127         if (f->data_hash_table)
1128                 return 0;
1129
1130         p = le64toh(f->header->data_hash_table_offset);
1131         s = le64toh(f->header->data_hash_table_size);
1132
1133         r = journal_file_move_to(f,
1134                                  OBJECT_DATA_HASH_TABLE,
1135                                  true,
1136                                  p, s,
1137                                  &t, NULL);
1138         if (r < 0)
1139                 return r;
1140
1141         f->data_hash_table = t;
1142         return 0;
1143 }
1144
1145 int journal_file_map_field_hash_table(JournalFile *f) {
1146         uint64_t s, p;
1147         void *t;
1148         int r;
1149
1150         assert(f);
1151         assert(f->header);
1152
1153         if (f->field_hash_table)
1154                 return 0;
1155
1156         p = le64toh(f->header->field_hash_table_offset);
1157         s = le64toh(f->header->field_hash_table_size);
1158
1159         r = journal_file_move_to(f,
1160                                  OBJECT_FIELD_HASH_TABLE,
1161                                  true,
1162                                  p, s,
1163                                  &t, NULL);
1164         if (r < 0)
1165                 return r;
1166
1167         f->field_hash_table = t;
1168         return 0;
1169 }
1170
1171 static int journal_file_link_field(
1172                 JournalFile *f,
1173                 Object *o,
1174                 uint64_t offset,
1175                 uint64_t hash) {
1176
1177         uint64_t p, h, m;
1178         int r;
1179
1180         assert(f);
1181         assert(f->header);
1182         assert(f->field_hash_table);
1183         assert(o);
1184         assert(offset > 0);
1185
1186         if (o->object.type != OBJECT_FIELD)
1187                 return -EINVAL;
1188
1189         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1190         if (m <= 0)
1191                 return -EBADMSG;
1192
1193         /* This might alter the window we are looking at */
1194         o->field.next_hash_offset = o->field.head_data_offset = 0;
1195
1196         h = hash % m;
1197         p = le64toh(f->field_hash_table[h].tail_hash_offset);
1198         if (p == 0)
1199                 f->field_hash_table[h].head_hash_offset = htole64(offset);
1200         else {
1201                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1202                 if (r < 0)
1203                         return r;
1204
1205                 o->field.next_hash_offset = htole64(offset);
1206         }
1207
1208         f->field_hash_table[h].tail_hash_offset = htole64(offset);
1209
1210         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1211                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1212
1213         return 0;
1214 }
1215
1216 static int journal_file_link_data(
1217                 JournalFile *f,
1218                 Object *o,
1219                 uint64_t offset,
1220                 uint64_t hash) {
1221
1222         uint64_t p, h, m;
1223         int r;
1224
1225         assert(f);
1226         assert(f->header);
1227         assert(f->data_hash_table);
1228         assert(o);
1229         assert(offset > 0);
1230
1231         if (o->object.type != OBJECT_DATA)
1232                 return -EINVAL;
1233
1234         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1235         if (m <= 0)
1236                 return -EBADMSG;
1237
1238         /* This might alter the window we are looking at */
1239         o->data.next_hash_offset = o->data.next_field_offset = 0;
1240         o->data.entry_offset = o->data.entry_array_offset = 0;
1241         o->data.n_entries = 0;
1242
1243         h = hash % m;
1244         p = le64toh(f->data_hash_table[h].tail_hash_offset);
1245         if (p == 0)
1246                 /* Only entry in the hash table is easy */
1247                 f->data_hash_table[h].head_hash_offset = htole64(offset);
1248         else {
1249                 /* Move back to the previous data object, to patch in
1250                  * pointer */
1251
1252                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1253                 if (r < 0)
1254                         return r;
1255
1256                 o->data.next_hash_offset = htole64(offset);
1257         }
1258
1259         f->data_hash_table[h].tail_hash_offset = htole64(offset);
1260
1261         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1262                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1263
1264         return 0;
1265 }
1266
1267 int journal_file_find_field_object_with_hash(
1268                 JournalFile *f,
1269                 const void *field, uint64_t size, uint64_t hash,
1270                 Object **ret, uint64_t *offset) {
1271
1272         uint64_t p, osize, h, m;
1273         int r;
1274
1275         assert(f);
1276         assert(f->header);
1277         assert(field && size > 0);
1278
1279         /* If the field hash table is empty, we can't find anything */
1280         if (le64toh(f->header->field_hash_table_size) <= 0)
1281                 return 0;
1282
1283         /* Map the field hash table, if it isn't mapped yet. */
1284         r = journal_file_map_field_hash_table(f);
1285         if (r < 0)
1286                 return r;
1287
1288         osize = offsetof(Object, field.payload) + size;
1289
1290         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1291         if (m <= 0)
1292                 return -EBADMSG;
1293
1294         h = hash % m;
1295         p = le64toh(f->field_hash_table[h].head_hash_offset);
1296
1297         while (p > 0) {
1298                 Object *o;
1299
1300                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1301                 if (r < 0)
1302                         return r;
1303
1304                 if (le64toh(o->field.hash) == hash &&
1305                     le64toh(o->object.size) == osize &&
1306                     memcmp(o->field.payload, field, size) == 0) {
1307
1308                         if (ret)
1309                                 *ret = o;
1310                         if (offset)
1311                                 *offset = p;
1312
1313                         return 1;
1314                 }
1315
1316                 p = le64toh(o->field.next_hash_offset);
1317         }
1318
1319         return 0;
1320 }
1321
1322 int journal_file_find_field_object(
1323                 JournalFile *f,
1324                 const void *field, uint64_t size,
1325                 Object **ret, uint64_t *offset) {
1326
1327         uint64_t hash;
1328
1329         assert(f);
1330         assert(field && size > 0);
1331
1332         hash = hash64(field, size);
1333
1334         return journal_file_find_field_object_with_hash(f,
1335                                                         field, size, hash,
1336                                                         ret, offset);
1337 }
1338
1339 int journal_file_find_data_object_with_hash(
1340                 JournalFile *f,
1341                 const void *data, uint64_t size, uint64_t hash,
1342                 Object **ret, uint64_t *offset) {
1343
1344         uint64_t p, osize, h, m;
1345         int r;
1346
1347         assert(f);
1348         assert(f->header);
1349         assert(data || size == 0);
1350
1351         /* If there's no data hash table, then there's no entry. */
1352         if (le64toh(f->header->data_hash_table_size) <= 0)
1353                 return 0;
1354
1355         /* Map the data hash table, if it isn't mapped yet. */
1356         r = journal_file_map_data_hash_table(f);
1357         if (r < 0)
1358                 return r;
1359
1360         osize = offsetof(Object, data.payload) + size;
1361
1362         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1363         if (m <= 0)
1364                 return -EBADMSG;
1365
1366         h = hash % m;
1367         p = le64toh(f->data_hash_table[h].head_hash_offset);
1368
1369         while (p > 0) {
1370                 Object *o;
1371
1372                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1373                 if (r < 0)
1374                         return r;
1375
1376                 if (le64toh(o->data.hash) != hash)
1377                         goto next;
1378
1379                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
1380 #if HAVE_XZ || HAVE_LZ4
1381                         uint64_t l;
1382                         size_t rsize = 0;
1383
1384                         l = le64toh(o->object.size);
1385                         if (l <= offsetof(Object, data.payload))
1386                                 return -EBADMSG;
1387
1388                         l -= offsetof(Object, data.payload);
1389
1390                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1391                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1392                         if (r < 0)
1393                                 return r;
1394
1395                         if (rsize == size &&
1396                             memcmp(f->compress_buffer, data, size) == 0) {
1397
1398                                 if (ret)
1399                                         *ret = o;
1400
1401                                 if (offset)
1402                                         *offset = p;
1403
1404                                 return 1;
1405                         }
1406 #else
1407                         return -EPROTONOSUPPORT;
1408 #endif
1409                 } else if (le64toh(o->object.size) == osize &&
1410                            memcmp(o->data.payload, data, size) == 0) {
1411
1412                         if (ret)
1413                                 *ret = o;
1414
1415                         if (offset)
1416                                 *offset = p;
1417
1418                         return 1;
1419                 }
1420
1421         next:
1422                 p = le64toh(o->data.next_hash_offset);
1423         }
1424
1425         return 0;
1426 }
1427
1428 int journal_file_find_data_object(
1429                 JournalFile *f,
1430                 const void *data, uint64_t size,
1431                 Object **ret, uint64_t *offset) {
1432
1433         uint64_t hash;
1434
1435         assert(f);
1436         assert(data || size == 0);
1437
1438         hash = hash64(data, size);
1439
1440         return journal_file_find_data_object_with_hash(f,
1441                                                        data, size, hash,
1442                                                        ret, offset);
1443 }
1444
1445 static int journal_file_append_field(
1446                 JournalFile *f,
1447                 const void *field, uint64_t size,
1448                 Object **ret, uint64_t *offset) {
1449
1450         uint64_t hash, p;
1451         uint64_t osize;
1452         Object *o;
1453         int r;
1454
1455         assert(f);
1456         assert(field && size > 0);
1457
1458         hash = hash64(field, size);
1459
1460         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1461         if (r < 0)
1462                 return r;
1463         else if (r > 0) {
1464
1465                 if (ret)
1466                         *ret = o;
1467
1468                 if (offset)
1469                         *offset = p;
1470
1471                 return 0;
1472         }
1473
1474         osize = offsetof(Object, field.payload) + size;
1475         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1476         if (r < 0)
1477                 return r;
1478
1479         o->field.hash = htole64(hash);
1480         memcpy(o->field.payload, field, size);
1481
1482         r = journal_file_link_field(f, o, p, hash);
1483         if (r < 0)
1484                 return r;
1485
1486         /* The linking might have altered the window, so let's
1487          * refresh our pointer */
1488         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1489         if (r < 0)
1490                 return r;
1491
1492 #if HAVE_GCRYPT
1493         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1494         if (r < 0)
1495                 return r;
1496 #endif
1497
1498         if (ret)
1499                 *ret = o;
1500
1501         if (offset)
1502                 *offset = p;
1503
1504         return 0;
1505 }
1506
1507 static int journal_file_append_data(
1508                 JournalFile *f,
1509                 const void *data, uint64_t size,
1510                 Object **ret, uint64_t *offset) {
1511
1512         uint64_t hash, p;
1513         uint64_t osize;
1514         Object *o;
1515         int r, compression = 0;
1516         const void *eq;
1517
1518         assert(f);
1519         assert(data || size == 0);
1520
1521         hash = hash64(data, size);
1522
1523         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1524         if (r < 0)
1525                 return r;
1526         if (r > 0) {
1527
1528                 if (ret)
1529                         *ret = o;
1530
1531                 if (offset)
1532                         *offset = p;
1533
1534                 return 0;
1535         }
1536
1537         osize = offsetof(Object, data.payload) + size;
1538         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1539         if (r < 0)
1540                 return r;
1541
1542         o->data.hash = htole64(hash);
1543
1544 #if HAVE_XZ || HAVE_LZ4
1545         if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
1546                 size_t rsize = 0;
1547
1548                 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1549
1550                 if (compression >= 0) {
1551                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1552                         o->object.flags |= compression;
1553
1554                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1555                                   size, rsize, object_compressed_to_string(compression));
1556                 } else
1557                         /* Compression didn't work, we don't really care why, let's continue without compression */
1558                         compression = 0;
1559         }
1560 #endif
1561
1562         if (compression == 0)
1563                 memcpy_safe(o->data.payload, data, size);
1564
1565         r = journal_file_link_data(f, o, p, hash);
1566         if (r < 0)
1567                 return r;
1568
1569 #if HAVE_GCRYPT
1570         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1571         if (r < 0)
1572                 return r;
1573 #endif
1574
1575         /* The linking might have altered the window, so let's
1576          * refresh our pointer */
1577         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1578         if (r < 0)
1579                 return r;
1580
1581         if (!data)
1582                 eq = NULL;
1583         else
1584                 eq = memchr(data, '=', size);
1585         if (eq && eq > data) {
1586                 Object *fo = NULL;
1587                 uint64_t fp;
1588
1589                 /* Create field object ... */
1590                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1591                 if (r < 0)
1592                         return r;
1593
1594                 /* ... and link it in. */
1595                 o->data.next_field_offset = fo->field.head_data_offset;
1596                 fo->field.head_data_offset = le64toh(p);
1597         }
1598
1599         if (ret)
1600                 *ret = o;
1601
1602         if (offset)
1603                 *offset = p;
1604
1605         return 0;
1606 }
1607
1608 uint64_t journal_file_entry_n_items(Object *o) {
1609         assert(o);
1610
1611         if (o->object.type != OBJECT_ENTRY)
1612                 return 0;
1613
1614         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1615 }
1616
1617 uint64_t journal_file_entry_array_n_items(Object *o) {
1618         assert(o);
1619
1620         if (o->object.type != OBJECT_ENTRY_ARRAY)
1621                 return 0;
1622
1623         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1624 }
1625
1626 uint64_t journal_file_hash_table_n_items(Object *o) {
1627         assert(o);
1628
1629         if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1630                 return 0;
1631
1632         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1633 }
1634
1635 static int link_entry_into_array(JournalFile *f,
1636                                  le64_t *first,
1637                                  le64_t *idx,
1638                                  uint64_t p) {
1639         int r;
1640         uint64_t n = 0, ap = 0, q, i, a, hidx;
1641         Object *o;
1642
1643         assert(f);
1644         assert(f->header);
1645         assert(first);
1646         assert(idx);
1647         assert(p > 0);
1648
1649         a = le64toh(*first);
1650         i = hidx = le64toh(*idx);
1651         while (a > 0) {
1652
1653                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1654                 if (r < 0)
1655                         return r;
1656
1657                 n = journal_file_entry_array_n_items(o);
1658                 if (i < n) {
1659                         o->entry_array.items[i] = htole64(p);
1660                         *idx = htole64(hidx + 1);
1661                         return 0;
1662                 }
1663
1664                 i -= n;
1665                 ap = a;
1666                 a = le64toh(o->entry_array.next_entry_array_offset);
1667         }
1668
1669         if (hidx > n)
1670                 n = (hidx+1) * 2;
1671         else
1672                 n = n * 2;
1673
1674         if (n < 4)
1675                 n = 4;
1676
1677         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1678                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1679                                        &o, &q);
1680         if (r < 0)
1681                 return r;
1682
1683 #if HAVE_GCRYPT
1684         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1685         if (r < 0)
1686                 return r;
1687 #endif
1688
1689         o->entry_array.items[i] = htole64(p);
1690
1691         if (ap == 0)
1692                 *first = htole64(q);
1693         else {
1694                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1695                 if (r < 0)
1696                         return r;
1697
1698                 o->entry_array.next_entry_array_offset = htole64(q);
1699         }
1700
1701         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1702                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1703
1704         *idx = htole64(hidx + 1);
1705
1706         return 0;
1707 }
1708
1709 static int link_entry_into_array_plus_one(JournalFile *f,
1710                                           le64_t *extra,
1711                                           le64_t *first,
1712                                           le64_t *idx,
1713                                           uint64_t p) {
1714
1715         int r;
1716
1717         assert(f);
1718         assert(extra);
1719         assert(first);
1720         assert(idx);
1721         assert(p > 0);
1722
1723         if (*idx == 0)
1724                 *extra = htole64(p);
1725         else {
1726                 le64_t i;
1727
1728                 i = htole64(le64toh(*idx) - 1);
1729                 r = link_entry_into_array(f, first, &i, p);
1730                 if (r < 0)
1731                         return r;
1732         }
1733
1734         *idx = htole64(le64toh(*idx) + 1);
1735         return 0;
1736 }
1737
1738 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1739         uint64_t p;
1740         int r;
1741         assert(f);
1742         assert(o);
1743         assert(offset > 0);
1744
1745         p = le64toh(o->entry.items[i].object_offset);
1746         if (p == 0)
1747                 return -EINVAL;
1748
1749         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1750         if (r < 0)
1751                 return r;
1752
1753         return link_entry_into_array_plus_one(f,
1754                                               &o->data.entry_offset,
1755                                               &o->data.entry_array_offset,
1756                                               &o->data.n_entries,
1757                                               offset);
1758 }
1759
1760 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1761         uint64_t n, i;
1762         int r;
1763
1764         assert(f);
1765         assert(f->header);
1766         assert(o);
1767         assert(offset > 0);
1768
1769         if (o->object.type != OBJECT_ENTRY)
1770                 return -EINVAL;
1771
1772         __sync_synchronize();
1773
1774         /* Link up the entry itself */
1775         r = link_entry_into_array(f,
1776                                   &f->header->entry_array_offset,
1777                                   &f->header->n_entries,
1778                                   offset);
1779         if (r < 0)
1780                 return r;
1781
1782         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1783
1784         if (f->header->head_entry_realtime == 0)
1785                 f->header->head_entry_realtime = o->entry.realtime;
1786
1787         f->header->tail_entry_realtime = o->entry.realtime;
1788         f->header->tail_entry_monotonic = o->entry.monotonic;
1789
1790         /* Link up the items */
1791         n = journal_file_entry_n_items(o);
1792         for (i = 0; i < n; i++) {
1793                 r = journal_file_link_entry_item(f, o, offset, i);
1794                 if (r < 0)
1795                         return r;
1796         }
1797
1798         return 0;
1799 }
1800
1801 static int journal_file_append_entry_internal(
1802                 JournalFile *f,
1803                 const dual_timestamp *ts,
1804                 const sd_id128_t *boot_id,
1805                 uint64_t xor_hash,
1806                 const EntryItem items[], unsigned n_items,
1807                 uint64_t *seqnum,
1808                 Object **ret, uint64_t *offset) {
1809         uint64_t np;
1810         uint64_t osize;
1811         Object *o;
1812         int r;
1813
1814         assert(f);
1815         assert(f->header);
1816         assert(items || n_items == 0);
1817         assert(ts);
1818
1819         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1820
1821         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1822         if (r < 0)
1823                 return r;
1824
1825         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1826         memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
1827         o->entry.realtime = htole64(ts->realtime);
1828         o->entry.monotonic = htole64(ts->monotonic);
1829         o->entry.xor_hash = htole64(xor_hash);
1830         o->entry.boot_id = boot_id ? *boot_id : f->header->boot_id;
1831
1832 #if HAVE_GCRYPT
1833         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1834         if (r < 0)
1835                 return r;
1836 #endif
1837
1838         r = journal_file_link_entry(f, o, np);
1839         if (r < 0)
1840                 return r;
1841
1842         if (ret)
1843                 *ret = o;
1844
1845         if (offset)
1846                 *offset = np;
1847
1848         return 0;
1849 }
1850
1851 void journal_file_post_change(JournalFile *f) {
1852         assert(f);
1853
1854         /* inotify() does not receive IN_MODIFY events from file
1855          * accesses done via mmap(). After each access we hence
1856          * trigger IN_MODIFY by truncating the journal file to its
1857          * current size which triggers IN_MODIFY. */
1858
1859         __sync_synchronize();
1860
1861         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1862                 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1863 }
1864
1865 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1866         assert(userdata);
1867
1868         journal_file_post_change(userdata);
1869
1870         return 1;
1871 }
1872
1873 static void schedule_post_change(JournalFile *f) {
1874         sd_event_source *timer;
1875         int enabled, r;
1876         uint64_t now;
1877
1878         assert(f);
1879         assert(f->post_change_timer);
1880
1881         timer = f->post_change_timer;
1882
1883         r = sd_event_source_get_enabled(timer, &enabled);
1884         if (r < 0) {
1885                 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1886                 goto fail;
1887         }
1888
1889         if (enabled == SD_EVENT_ONESHOT)
1890                 return;
1891
1892         r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1893         if (r < 0) {
1894                 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1895                 goto fail;
1896         }
1897
1898         r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1899         if (r < 0) {
1900                 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1901                 goto fail;
1902         }
1903
1904         r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1905         if (r < 0) {
1906                 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1907                 goto fail;
1908         }
1909
1910         return;
1911
1912 fail:
1913         /* On failure, let's simply post the change immediately. */
1914         journal_file_post_change(f);
1915 }
1916
1917 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1918 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1919         _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1920         int r;
1921
1922         assert(f);
1923         assert_return(!f->post_change_timer, -EINVAL);
1924         assert(e);
1925         assert(t);
1926
1927         r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1928         if (r < 0)
1929                 return r;
1930
1931         r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1932         if (r < 0)
1933                 return r;
1934
1935         f->post_change_timer = TAKE_PTR(timer);
1936         f->post_change_timer_period = t;
1937
1938         return r;
1939 }
1940
1941 static int entry_item_cmp(const void *_a, const void *_b) {
1942         const EntryItem *a = _a, *b = _b;
1943
1944         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1945                 return -1;
1946         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1947                 return 1;
1948         return 0;
1949 }
1950
1951 int journal_file_append_entry(
1952                 JournalFile *f,
1953                 const dual_timestamp *ts,
1954                 const sd_id128_t *boot_id,
1955                 const struct iovec iovec[], unsigned n_iovec,
1956                 uint64_t *seqnum,
1957                 Object **ret, uint64_t *offset) {
1958
1959         unsigned i;
1960         EntryItem *items;
1961         int r;
1962         uint64_t xor_hash = 0;
1963         struct dual_timestamp _ts;
1964
1965         assert(f);
1966         assert(f->header);
1967         assert(iovec || n_iovec == 0);
1968
1969         if (ts) {
1970                 if (!VALID_REALTIME(ts->realtime)) {
1971                         log_debug("Invalid realtime timestamp %"PRIu64", refusing entry.", ts->realtime);
1972                         return -EBADMSG;
1973                 }
1974                 if (!VALID_MONOTONIC(ts->monotonic)) {
1975                         log_debug("Invalid monotomic timestamp %"PRIu64", refusing entry.", ts->monotonic);
1976                         return -EBADMSG;
1977                 }
1978         } else {
1979                 dual_timestamp_get(&_ts);
1980                 ts = &_ts;
1981         }
1982
1983 #if HAVE_GCRYPT
1984         r = journal_file_maybe_append_tag(f, ts->realtime);
1985         if (r < 0)
1986                 return r;
1987 #endif
1988
1989         /* alloca() can't take 0, hence let's allocate at least one */
1990         items = newa(EntryItem, MAX(1u, n_iovec));
1991
1992         for (i = 0; i < n_iovec; i++) {
1993                 uint64_t p;
1994                 Object *o;
1995
1996                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1997                 if (r < 0)
1998                         return r;
1999
2000                 xor_hash ^= le64toh(o->data.hash);
2001                 items[i].object_offset = htole64(p);
2002                 items[i].hash = o->data.hash;
2003         }
2004
2005         /* Order by the position on disk, in order to improve seek
2006          * times for rotating media. */
2007         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
2008
2009         r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, offset);
2010
2011         /* If the memory mapping triggered a SIGBUS then we return an
2012          * IO error and ignore the error code passed down to us, since
2013          * it is very likely just an effect of a nullified replacement
2014          * mapping page */
2015
2016         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
2017                 r = -EIO;
2018
2019         if (f->post_change_timer)
2020                 schedule_post_change(f);
2021         else
2022                 journal_file_post_change(f);
2023
2024         return r;
2025 }
2026
/* One item of a file's chain cache, stored in the hashmap under the offset of the first array of
 * an entry array chain. It records how far into that chain a previous lookup got, so that later
 * lookups may start from the cached array instead of from the beginning. */
2027 typedef struct ChainCacheItem {
2028         uint64_t first; /* offset of the array at the beginning of the chain; this is the hashmap key */
2029         uint64_t array; /* offset of the cached array */
2030         uint64_t begin; /* the first item stored in the cached array */
2031         uint64_t total; /* the total number of items in all arrays before this one in the chain */
2032         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
2033 } ChainCacheItem;
2034
2035 static void chain_cache_put(
2036                 OrderedHashmap *h,
2037                 ChainCacheItem *ci,
2038                 uint64_t first,
2039                 uint64_t array,
2040                 uint64_t begin,
2041                 uint64_t total,
2042                 uint64_t last_index) {
2043
2044         if (!ci) {
2045                 /* If the chain item to cache for this chain is the
2046                  * first one it's not worth caching anything */
2047                 if (array == first)
2048                         return;
2049
2050                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2051                         ci = ordered_hashmap_steal_first(h);
2052                         assert(ci);
2053                 } else {
2054                         ci = new(ChainCacheItem, 1);
2055                         if (!ci)
2056                                 return;
2057                 }
2058
2059                 ci->first = first;
2060
2061                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2062                         free(ci);
2063                         return;
2064                 }
2065         } else
2066                 assert(ci->first == first);
2067
2068         ci->array = array;
2069         ci->begin = begin;
2070         ci->total = total;
2071         ci->last_index = last_index;
2072 }
2073
2074 static int generic_array_get(
2075                 JournalFile *f,
2076                 uint64_t first,
2077                 uint64_t i,
2078                 Object **ret, uint64_t *offset) {
2079
2080         Object *o;
2081         uint64_t p = 0, a, t = 0;
2082         int r;
2083         ChainCacheItem *ci;
2084
2085         assert(f);
2086
2087         a = first;
2088
2089         /* Try the chain cache first */
2090         ci = ordered_hashmap_get(f->chain_cache, &first);
2091         if (ci && i > ci->total) {
2092                 a = ci->array;
2093                 i -= ci->total;
2094                 t = ci->total;
2095         }
2096
2097         while (a > 0) {
2098                 uint64_t k;
2099
2100                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2101                 if (r < 0)
2102                         return r;
2103
2104                 k = journal_file_entry_array_n_items(o);
2105                 if (i < k) {
2106                         p = le64toh(o->entry_array.items[i]);
2107                         goto found;
2108                 }
2109
2110                 i -= k;
2111                 t += k;
2112                 a = le64toh(o->entry_array.next_entry_array_offset);
2113         }
2114
2115         return 0;
2116
2117 found:
2118         /* Let's cache this item for the next invocation */
2119         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
2120
2121         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2122         if (r < 0)
2123                 return r;
2124
2125         if (ret)
2126                 *ret = o;
2127
2128         if (offset)
2129                 *offset = p;
2130
2131         return 1;
2132 }
2133
2134 static int generic_array_get_plus_one(
2135                 JournalFile *f,
2136                 uint64_t extra,
2137                 uint64_t first,
2138                 uint64_t i,
2139                 Object **ret, uint64_t *offset) {
2140
2141         Object *o;
2142
2143         assert(f);
2144
2145         if (i == 0) {
2146                 int r;
2147
2148                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2149                 if (r < 0)
2150                         return r;
2151
2152                 if (ret)
2153                         *ret = o;
2154
2155                 if (offset)
2156                         *offset = extra;
2157
2158                 return 1;
2159         }
2160
2161         return generic_array_get(f, first, i-1, ret, offset);
2162 }
2163
/* Verdicts returned by the test_object_*() callbacks that drive the bisection helpers */
2164 enum {
2165         TEST_FOUND, /* the tested value equals the needle */
2166         TEST_LEFT,  /* the tested value is smaller than the needle */
2167         TEST_RIGHT  /* the tested value is larger than the needle */
2168 };
2169
/* Bisects the chain of entry arrays starting at offset 'first', of which the first 'n' items are
 * considered, looking for 'needle'. Every candidate entry offset is classified by test_object(),
 * which returns TEST_FOUND, TEST_LEFT, TEST_RIGHT or a negative error.
 *
 * An exact match (TEST_FOUND) is handled like TEST_RIGHT for DIRECTION_DOWN and like TEST_LEFT
 * otherwise. With DIRECTION_DOWN the first item that does not test as TEST_LEFT is picked; with
 * DIRECTION_UP the item preceding the first one that tests as TEST_RIGHT is picked (or the last
 * item considered, if none does).
 *
 * Returns 1 if an entry was located (then *ret, *offset and *idx are filled in, each only if
 * non-NULL; *idx is the item's position counted from the beginning of the chain), 0 if there is
 * no suitable item, and a negative error otherwise. Items that are zero or that test_object()
 * rejects with -EBADMSG cut the search range short instead of failing the call, except for the
 * neighbor probes around a cached index, where a zero item yields -EBADMSG. */
2170 static int generic_array_bisect(
2171                 JournalFile *f,
2172                 uint64_t first,
2173                 uint64_t n,
2174                 uint64_t needle,
2175                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2176                 direction_t direction,
2177                 Object **ret,
2178                 uint64_t *offset,
2179                 uint64_t *idx) {
2180
     /* a: offset of the array currently looked at; t: number of items in the arrays skipped so
      * far; i: index within the current array; last_p: last item of the previous array;
      * last_index: index hint taken from the chain cache, (uint64_t) -1 if there is none */
2181         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
2182         bool subtract_one = false;
2183         Object *o, *array = NULL;
2184         int r;
2185         ChainCacheItem *ci;
2186
2187         assert(f);
2188         assert(test_object);
2189
2190         /* Start with the first array in the chain */
2191         a = first;
2192
2193         ci = ordered_hashmap_get(f->chain_cache, &first);
2194         if (ci && n > ci->total && ci->begin != 0) {
2195                 /* Ah, we have iterated this bisection array chain
2196                  * previously! Let's see if we can skip ahead in the
2197                  * chain, as far as the last time. But we can't jump
2198                  * backwards in the chain, so let's check that
2199                  * first. */
2200
2201                 r = test_object(f, ci->begin, needle);
2202                 if (r < 0)
2203                         return r;
2204
2205                 if (r == TEST_LEFT) {
2206                         /* OK, what we are looking for is right of the
2207                          * begin of this EntryArray, so let's jump
2208                          * straight to previously cached array in the
2209                          * chain */
2210
2211                         a = ci->array;
2212                         n -= ci->total;
2213                         t = ci->total;
2214                         last_index = ci->last_index;
2215                 }
2216         }
2217
     /* Walk the chain one array at a time. From here on n is the number of items still to be
      * considered, counted from the beginning of the current array. */
2218         while (a > 0) {
2219                 uint64_t left, right, k, lp;
2220
2221                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
2222                 if (r < 0)
2223                         return r;
2224
             /* Only the first MIN(k, n) slots of this array are looked at. */
2225                 k = journal_file_entry_array_n_items(array);
2226                 right = MIN(k, n);
2227                 if (right <= 0)
2228                         return 0;
2229
             /* Probe the last considered item first: unless it tests as TEST_RIGHT there is
              * nothing to bisect in this array. */
2230                 i = right - 1;
2231                 lp = p = le64toh(array->entry_array.items[i]);
2232                 if (p <= 0)
2233                         r = -EBADMSG;
2234                 else
2235                         r = test_object(f, p, needle);
2236                 if (r == -EBADMSG) {
                     /* Retry the same array, now limited to the i items before the bad one. */
2237                         log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2238                         n = i;
2239                         continue;
2240                 }
2241                 if (r < 0)
2242                         return r;
2243
2244                 if (r == TEST_FOUND)
2245                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2246
2247                 if (r == TEST_RIGHT) {
                     /* The boundary lies within this array; search [left, right], where 'right'
                      * is the index of the item probed above. */
2248                         left = 0;
2249                         right -= 1;
2250
2251                         if (last_index != (uint64_t) -1) {
2252                                 assert(last_index <= right);
2253
2254                                 /* If we cached the last index we
2255                                  * looked at, let's try to not to jump
2256                                  * too wildly around and see if we can
2257                                  * limit the range to look at early to
2258                                  * the immediate neighbors of the last
2259                                  * index we looked at. */
2260
2261                                 if (last_index > 0) {
2262                                         uint64_t x = last_index - 1;
2263
2264                                         p = le64toh(array->entry_array.items[x]);
2265                                         if (p <= 0)
2266                                                 return -EBADMSG;
2267
2268                                         r = test_object(f, p, needle);
2269                                         if (r < 0)
2270                                                 return r;
2271
2272                                         if (r == TEST_FOUND)
2273                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2274
2275                                         if (r == TEST_RIGHT)
2276                                                 right = x;
2277                                         else
2278                                                 left = x + 1;
2279                                 }
2280
2281                                 if (last_index < right) {
2282                                         uint64_t y = last_index + 1;
2283
2284                                         p = le64toh(array->entry_array.items[y]);
2285                                         if (p <= 0)
2286                                                 return -EBADMSG;
2287
2288                                         r = test_object(f, p, needle);
2289                                         if (r < 0)
2290                                                 return r;
2291
2292                                         if (r == TEST_FOUND)
2293                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2294
2295                                         if (r == TEST_RIGHT)
2296                                                 right = y;
2297                                         else
2298                                                 left = y + 1;
2299                                 }
2300                         }
2301
                     /* Plain binary search: narrow [left, right] until both ends meet. */
2302                         for (;;) {
2303                                 if (left == right) {
2304                                         if (direction == DIRECTION_UP)
2305                                                 subtract_one = true;
2306
2307                                         i = left;
2308                                         goto found;
2309                                 }
2310
2311                                 assert(left < right);
2312                                 i = (left + right) / 2;
2313
2314                                 p = le64toh(array->entry_array.items[i]);
2315                                 if (p <= 0)
2316                                         r = -EBADMSG;
2317                                 else
2318                                         r = test_object(f, p, needle);
2319                                 if (r == -EBADMSG) {
2320                                         log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2321                                         right = n = i;
2322                                         continue;
2323                                 }
2324                                 if (r < 0)
2325                                         return r;
2326
2327                                 if (r == TEST_FOUND)
2328                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2329
2330                                 if (r == TEST_RIGHT)
2331                                         right = i;
2332                                 else
2333                                         left = i + 1;
2334                         }
2335                 }
2336
             /* All remaining items are in this array and its last considered item did not test
              * as TEST_RIGHT: going up that last item is the result, going down there is
              * none. */
2337                 if (k >= n) {
2338                         if (direction == DIRECTION_UP) {
2339                                 i = n;
2340                                 subtract_one = true;
2341                                 goto found;
2342                         }
2343
2344                         return 0;
2345                 }
2346
             /* Remember the last item of this array: it is the result if a DIRECTION_UP search
              * ends up at index 0 of the next array. */
2347                 last_p = lp;
2348
2349                 n -= k;
2350                 t += k;
2351                 last_index = (uint64_t) -1;
2352                 a = le64toh(array->entry_array.next_entry_array_offset);
2353         }
2354
2355         return 0;
2356
2357 found:
     /* Stepping back from the very first item: there is nothing before it. */
2358         if (subtract_one && t == 0 && i == 0)
2359                 return 0;
2360
2361         /* Let's cache this item for the next invocation */
2362         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
2363
2364         if (subtract_one && i == 0)
2365                 p = last_p;
2366         else if (subtract_one)
2367                 p = le64toh(array->entry_array.items[i-1]);
2368         else
2369                 p = le64toh(array->entry_array.items[i]);
2370
2371         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2372         if (r < 0)
2373                 return r;
2374
2375         if (ret)
2376                 *ret = o;
2377
2378         if (offset)
2379                 *offset = p;
2380
2381         if (idx)
2382                 *idx = t + i + (subtract_one ? -1 : 0);
2383
2384         return 1;
2385 }
2386
2387 static int generic_array_bisect_plus_one(
2388                 JournalFile *f,
2389                 uint64_t extra,
2390                 uint64_t first,
2391                 uint64_t n,
2392                 uint64_t needle,
2393                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2394                 direction_t direction,
2395                 Object **ret,
2396                 uint64_t *offset,
2397                 uint64_t *idx) {
2398
2399         int r;
2400         bool step_back = false;
2401         Object *o;
2402
2403         assert(f);
2404         assert(test_object);
2405
2406         if (n <= 0)
2407                 return 0;
2408
2409         /* This bisects the array in object 'first', but first checks
2410          * an extra  */
2411         r = test_object(f, extra, needle);
2412         if (r < 0)
2413                 return r;
2414
2415         if (r == TEST_FOUND)
2416                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2417
2418         /* if we are looking with DIRECTION_UP then we need to first
2419            see if in the actual array there is a matching entry, and
2420            return the last one of that. But if there isn't any we need
2421            to return this one. Hence remember this, and return it
2422            below. */
2423         if (r == TEST_LEFT)
2424                 step_back = direction == DIRECTION_UP;
2425
2426         if (r == TEST_RIGHT) {
2427                 if (direction == DIRECTION_DOWN)
2428                         goto found;
2429                 else
2430                         return 0;
2431         }
2432
2433         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2434
2435         if (r == 0 && step_back)
2436                 goto found;
2437
2438         if (r > 0 && idx)
2439                 (*idx)++;
2440
2441         return r;
2442
2443 found:
2444         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2445         if (r < 0)
2446                 return r;
2447
2448         if (ret)
2449                 *ret = o;
2450
2451         if (offset)
2452                 *offset = extra;
2453
2454         if (idx)
2455                 *idx = 0;
2456
2457         return 1;
2458 }
2459
2460 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2461         assert(f);
2462         assert(p > 0);
2463
2464         if (p == needle)
2465                 return TEST_FOUND;
2466         else if (p < needle)
2467                 return TEST_LEFT;
2468         else
2469                 return TEST_RIGHT;
2470 }
2471
2472 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2473         Object *o;
2474         int r;
2475
2476         assert(f);
2477         assert(p > 0);
2478
2479         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2480         if (r < 0)
2481                 return r;
2482
2483         if (le64toh(o->entry.seqnum) == needle)
2484                 return TEST_FOUND;
2485         else if (le64toh(o->entry.seqnum) < needle)
2486                 return TEST_LEFT;
2487         else
2488                 return TEST_RIGHT;
2489 }
2490
2491 int journal_file_move_to_entry_by_seqnum(
2492                 JournalFile *f,
2493                 uint64_t seqnum,
2494                 direction_t direction,
2495                 Object **ret,
2496                 uint64_t *offset) {
2497         assert(f);
2498         assert(f->header);
2499
2500         return generic_array_bisect(f,
2501                                     le64toh(f->header->entry_array_offset),
2502                                     le64toh(f->header->n_entries),
2503                                     seqnum,
2504                                     test_object_seqnum,
2505                                     direction,
2506                                     ret, offset, NULL);
2507 }
2508
2509 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2510         Object *o;
2511         int r;
2512
2513         assert(f);
2514         assert(p > 0);
2515
2516         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2517         if (r < 0)
2518                 return r;
2519
2520         if (le64toh(o->entry.realtime) == needle)
2521                 return TEST_FOUND;
2522         else if (le64toh(o->entry.realtime) < needle)
2523                 return TEST_LEFT;
2524         else
2525                 return TEST_RIGHT;
2526 }
2527
2528 int journal_file_move_to_entry_by_realtime(
2529                 JournalFile *f,
2530                 uint64_t realtime,
2531                 direction_t direction,
2532                 Object **ret,
2533                 uint64_t *offset) {
2534         assert(f);
2535         assert(f->header);
2536
2537         return generic_array_bisect(f,
2538                                     le64toh(f->header->entry_array_offset),
2539                                     le64toh(f->header->n_entries),
2540                                     realtime,
2541                                     test_object_realtime,
2542                                     direction,
2543                                     ret, offset, NULL);
2544 }
2545
2546 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2547         Object *o;
2548         int r;
2549
2550         assert(f);
2551         assert(p > 0);
2552
2553         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2554         if (r < 0)
2555                 return r;
2556
2557         if (le64toh(o->entry.monotonic) == needle)
2558                 return TEST_FOUND;
2559         else if (le64toh(o->entry.monotonic) < needle)
2560                 return TEST_LEFT;
2561         else
2562                 return TEST_RIGHT;
2563 }
2564
2565 static int find_data_object_by_boot_id(
2566                 JournalFile *f,
2567                 sd_id128_t boot_id,
2568                 Object **o,
2569                 uint64_t *b) {
2570
2571         char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2572
2573         sd_id128_to_string(boot_id, t + 9);
2574         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2575 }
2576
2577 int journal_file_move_to_entry_by_monotonic(
2578                 JournalFile *f,
2579                 sd_id128_t boot_id,
2580                 uint64_t monotonic,
2581                 direction_t direction,
2582                 Object **ret,
2583                 uint64_t *offset) {
2584
2585         Object *o;
2586         int r;
2587
2588         assert(f);
2589
2590         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2591         if (r < 0)
2592                 return r;
2593         if (r == 0)
2594                 return -ENOENT;
2595
2596         return generic_array_bisect_plus_one(f,
2597                                              le64toh(o->data.entry_offset),
2598                                              le64toh(o->data.entry_array_offset),
2599                                              le64toh(o->data.n_entries),
2600                                              monotonic,
2601                                              test_object_monotonic,
2602                                              direction,
2603                                              ret, offset, NULL);
2604 }
2605
2606 void journal_file_reset_location(JournalFile *f) {
2607         f->location_type = LOCATION_HEAD;
2608         f->current_offset = 0;
2609         f->current_seqnum = 0;
2610         f->current_realtime = 0;
2611         f->current_monotonic = 0;
2612         zero(f->current_boot_id);
2613         f->current_xor_hash = 0;
2614 }
2615
2616 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2617         f->location_type = LOCATION_SEEK;
2618         f->current_offset = offset;
2619         f->current_seqnum = le64toh(o->entry.seqnum);
2620         f->current_realtime = le64toh(o->entry.realtime);
2621         f->current_monotonic = le64toh(o->entry.monotonic);
2622         f->current_boot_id = o->entry.boot_id;
2623         f->current_xor_hash = le64toh(o->entry.xor_hash);
2624 }
2625
2626 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2627         assert(af);
2628         assert(af->header);
2629         assert(bf);
2630         assert(bf->header);
2631         assert(af->location_type == LOCATION_SEEK);
2632         assert(bf->location_type == LOCATION_SEEK);
2633
2634         /* If contents and timestamps match, these entries are
2635          * identical, even if the seqnum does not match */
2636         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2637             af->current_monotonic == bf->current_monotonic &&
2638             af->current_realtime == bf->current_realtime &&
2639             af->current_xor_hash == bf->current_xor_hash)
2640                 return 0;
2641
2642         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2643
2644                 /* If this is from the same seqnum source, compare
2645                  * seqnums */
2646                 if (af->current_seqnum < bf->current_seqnum)
2647                         return -1;
2648                 if (af->current_seqnum > bf->current_seqnum)
2649                         return 1;
2650
2651                 /* Wow! This is weird, different data but the same
2652                  * seqnums? Something is borked, but let's make the
2653                  * best of it and compare by time. */
2654         }
2655
2656         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2657
2658                 /* If the boot id matches, compare monotonic time */
2659                 if (af->current_monotonic < bf->current_monotonic)
2660                         return -1;
2661                 if (af->current_monotonic > bf->current_monotonic)
2662                         return 1;
2663         }
2664
2665         /* Otherwise, compare UTC time */
2666         if (af->current_realtime < bf->current_realtime)
2667                 return -1;
2668         if (af->current_realtime > bf->current_realtime)
2669                 return 1;
2670
2671         /* Finally, compare by contents */
2672         if (af->current_xor_hash < bf->current_xor_hash)
2673                 return -1;
2674         if (af->current_xor_hash > bf->current_xor_hash)
2675                 return 1;
2676
2677         return 0;
2678 }
2679
2680 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2681
2682         /* Increase or decrease the specified index, in the right direction. */
2683
2684         if (direction == DIRECTION_DOWN) {
2685                 if (*i >= n - 1)
2686                         return 0;
2687
2688                 (*i) ++;
2689         } else {
2690                 if (*i <= 0)
2691                         return 0;
2692
2693                 (*i) --;
2694         }
2695
2696         return 1;
2697 }
2698
2699 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2700
2701         /* Consider it an error if any of the two offsets is uninitialized */
2702         if (old_offset == 0 || new_offset == 0)
2703                 return false;
2704
2705         /* If we go down, the new offset must be larger than the old one. */
2706         return direction == DIRECTION_DOWN ?
2707                 new_offset > old_offset  :
2708                 new_offset < old_offset;
2709 }
2710
2711 int journal_file_next_entry(
2712                 JournalFile *f,
2713                 uint64_t p,
2714                 direction_t direction,
2715                 Object **ret, uint64_t *offset) {
2716
2717         uint64_t i, n, ofs;
2718         int r;
2719
2720         assert(f);
2721         assert(f->header);
2722
2723         n = le64toh(f->header->n_entries);
2724         if (n <= 0)
2725                 return 0;
2726
2727         if (p == 0)
2728                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2729         else {
2730                 r = generic_array_bisect(f,
2731                                          le64toh(f->header->entry_array_offset),
2732                                          le64toh(f->header->n_entries),
2733                                          p,
2734                                          test_object_offset,
2735                                          DIRECTION_DOWN,
2736                                          NULL, NULL,
2737                                          &i);
2738                 if (r <= 0)
2739                         return r;
2740
2741                 r = bump_array_index(&i, direction, n);
2742                 if (r <= 0)
2743                         return r;
2744         }
2745
2746         /* And jump to it */
2747         for (;;) {
2748                 r = generic_array_get(f,
2749                                       le64toh(f->header->entry_array_offset),
2750                                       i,
2751                                       ret, &ofs);
2752                 if (r > 0)
2753                         break;
2754                 if (r != -EBADMSG)
2755                         return r;
2756
2757                 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2758                  * the next one might work for us instead. */
2759                 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2760
2761                 r = bump_array_index(&i, direction, n);
2762                 if (r <= 0)
2763                         return r;
2764         }
2765
2766         /* Ensure our array is properly ordered. */
2767         if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2768                 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
2769                 return -EBADMSG;
2770         }
2771
2772         if (offset)
2773                 *offset = ofs;
2774
2775         return 1;
2776 }
2777
2778 int journal_file_next_entry_for_data(
2779                 JournalFile *f,
2780                 Object *o, uint64_t p,
2781                 uint64_t data_offset,
2782                 direction_t direction,
2783                 Object **ret, uint64_t *offset) {
2784
2785         uint64_t i, n, ofs;
2786         Object *d;
2787         int r;
2788
2789         assert(f);
2790         assert(p > 0 || !o);
2791
2792         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2793         if (r < 0)
2794                 return r;
2795
2796         n = le64toh(d->data.n_entries);
2797         if (n <= 0)
2798                 return n;
2799
2800         if (!o)
2801                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2802         else {
2803                 if (o->object.type != OBJECT_ENTRY)
2804                         return -EINVAL;
2805
2806                 r = generic_array_bisect_plus_one(f,
2807                                                   le64toh(d->data.entry_offset),
2808                                                   le64toh(d->data.entry_array_offset),
2809                                                   le64toh(d->data.n_entries),
2810                                                   p,
2811                                                   test_object_offset,
2812                                                   DIRECTION_DOWN,
2813                                                   NULL, NULL,
2814                                                   &i);
2815
2816                 if (r <= 0)
2817                         return r;
2818
2819                 r = bump_array_index(&i, direction, n);
2820                 if (r <= 0)
2821                         return r;
2822         }
2823
2824         for (;;) {
2825                 r = generic_array_get_plus_one(f,
2826                                                le64toh(d->data.entry_offset),
2827                                                le64toh(d->data.entry_array_offset),
2828                                                i,
2829                                                ret, &ofs);
2830                 if (r > 0)
2831                         break;
2832                 if (r != -EBADMSG)
2833                         return r;
2834
2835                 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2836
2837                 r = bump_array_index(&i, direction, n);
2838                 if (r <= 0)
2839                         return r;
2840         }
2841
2842         /* Ensure our array is properly ordered. */
2843         if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2844                 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2845                 return -EBADMSG;
2846         }
2847
2848         if (offset)
2849                 *offset = ofs;
2850
2851         return 1;
2852 }
2853
2854 int journal_file_move_to_entry_by_offset_for_data(
2855                 JournalFile *f,
2856                 uint64_t data_offset,
2857                 uint64_t p,
2858                 direction_t direction,
2859                 Object **ret, uint64_t *offset) {
2860
2861         int r;
2862         Object *d;
2863
2864         assert(f);
2865
2866         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2867         if (r < 0)
2868                 return r;
2869
2870         return generic_array_bisect_plus_one(f,
2871                                              le64toh(d->data.entry_offset),
2872                                              le64toh(d->data.entry_array_offset),
2873                                              le64toh(d->data.n_entries),
2874                                              p,
2875                                              test_object_offset,
2876                                              direction,
2877                                              ret, offset, NULL);
2878 }
2879
2880 int journal_file_move_to_entry_by_monotonic_for_data(
2881                 JournalFile *f,
2882                 uint64_t data_offset,
2883                 sd_id128_t boot_id,
2884                 uint64_t monotonic,
2885                 direction_t direction,
2886                 Object **ret, uint64_t *offset) {
2887
2888         Object *o, *d;
2889         int r;
2890         uint64_t b, z;
2891
2892         assert(f);
2893
2894         /* First, seek by time */
2895         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2896         if (r < 0)
2897                 return r;
2898         if (r == 0)
2899                 return -ENOENT;
2900
2901         r = generic_array_bisect_plus_one(f,
2902                                           le64toh(o->data.entry_offset),
2903                                           le64toh(o->data.entry_array_offset),
2904                                           le64toh(o->data.n_entries),
2905                                           monotonic,
2906                                           test_object_monotonic,
2907                                           direction,
2908                                           NULL, &z, NULL);
2909         if (r <= 0)
2910                 return r;
2911
2912         /* And now, continue seeking until we find an entry that
2913          * exists in both bisection arrays */
2914
2915         for (;;) {
2916                 Object *qo;
2917                 uint64_t p, q;
2918
2919                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2920                 if (r < 0)
2921                         return r;
2922
2923                 r = generic_array_bisect_plus_one(f,
2924                                                   le64toh(d->data.entry_offset),
2925                                                   le64toh(d->data.entry_array_offset),
2926                                                   le64toh(d->data.n_entries),
2927                                                   z,
2928                                                   test_object_offset,
2929                                                   direction,
2930                                                   NULL, &p, NULL);
2931                 if (r <= 0)
2932                         return r;
2933
2934                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2935                 if (r < 0)
2936                         return r;
2937
2938                 r = generic_array_bisect_plus_one(f,
2939                                                   le64toh(o->data.entry_offset),
2940                                                   le64toh(o->data.entry_array_offset),
2941                                                   le64toh(o->data.n_entries),
2942                                                   p,
2943                                                   test_object_offset,
2944                                                   direction,
2945                                                   &qo, &q, NULL);
2946
2947                 if (r <= 0)
2948                         return r;
2949
2950                 if (p == q) {
2951                         if (ret)
2952                                 *ret = qo;
2953                         if (offset)
2954                                 *offset = q;
2955
2956                         return 1;
2957                 }
2958
2959                 z = q;
2960         }
2961 }
2962
2963 int journal_file_move_to_entry_by_seqnum_for_data(
2964                 JournalFile *f,
2965                 uint64_t data_offset,
2966                 uint64_t seqnum,
2967                 direction_t direction,
2968                 Object **ret, uint64_t *offset) {
2969
2970         Object *d;
2971         int r;
2972
2973         assert(f);
2974
2975         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2976         if (r < 0)
2977                 return r;
2978
2979         return generic_array_bisect_plus_one(f,
2980                                              le64toh(d->data.entry_offset),
2981                                              le64toh(d->data.entry_array_offset),
2982                                              le64toh(d->data.n_entries),
2983                                              seqnum,
2984                                              test_object_seqnum,
2985                                              direction,
2986                                              ret, offset, NULL);
2987 }
2988
2989 int journal_file_move_to_entry_by_realtime_for_data(
2990                 JournalFile *f,
2991                 uint64_t data_offset,
2992                 uint64_t realtime,
2993                 direction_t direction,
2994                 Object **ret, uint64_t *offset) {
2995
2996         Object *d;
2997         int r;
2998
2999         assert(f);
3000
3001         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
3002         if (r < 0)
3003                 return r;
3004
3005         return generic_array_bisect_plus_one(f,
3006                                              le64toh(d->data.entry_offset),
3007                                              le64toh(d->data.entry_array_offset),
3008                                              le64toh(d->data.n_entries),
3009                                              realtime,
3010                                              test_object_realtime,
3011                                              direction,
3012                                              ret, offset, NULL);
3013 }
3014
3015 void journal_file_dump(JournalFile *f) {
3016         Object *o;
3017         int r;
3018         uint64_t p;
3019
3020         assert(f);
3021         assert(f->header);
3022
3023         journal_file_print_header(f);
3024
3025         p = le64toh(f->header->header_size);
3026         while (p != 0) {
3027                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
3028                 if (r < 0)
3029                         goto fail;
3030
3031                 switch (o->object.type) {
3032
3033                 case OBJECT_UNUSED:
3034                         printf("Type: OBJECT_UNUSED\n");
3035                         break;
3036
3037                 case OBJECT_DATA:
3038                         printf("Type: OBJECT_DATA\n");
3039                         break;
3040
3041                 case OBJECT_FIELD:
3042                         printf("Type: OBJECT_FIELD\n");
3043                         break;
3044
3045                 case OBJECT_ENTRY:
3046                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3047                                le64toh(o->entry.seqnum),
3048                                le64toh(o->entry.monotonic),
3049                                le64toh(o->entry.realtime));
3050                         break;
3051
3052                 case OBJECT_FIELD_HASH_TABLE:
3053                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3054                         break;
3055
3056                 case OBJECT_DATA_HASH_TABLE:
3057                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
3058                         break;
3059
3060                 case OBJECT_ENTRY_ARRAY:
3061                         printf("Type: OBJECT_ENTRY_ARRAY\n");
3062                         break;
3063
3064                 case OBJECT_TAG:
3065                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3066                                le64toh(o->tag.seqnum),
3067                                le64toh(o->tag.epoch));
3068                         break;
3069
3070                 default:
3071                         printf("Type: unknown (%i)\n", o->object.type);
3072                         break;
3073                 }
3074
3075                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3076                         printf("Flags: %s\n",
3077                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3078
3079                 if (p == le64toh(f->header->tail_object_offset))
3080                         p = 0;
3081                 else
3082                         p = p + ALIGN64(le64toh(o->object.size));
3083         }
3084
3085         return;
3086 fail:
3087         log_error("File corrupt");
3088 }
3089
3090 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3091         const char *x;
3092
3093         x = format_timestamp(buf, l, t);
3094         if (x)
3095                 return x;
3096         return " --- ";
3097 }
3098
3099 void journal_file_print_header(JournalFile *f) {
3100         char a[33], b[33], c[33], d[33];
3101         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
3102         struct stat st;
3103         char bytes[FORMAT_BYTES_MAX];
3104
3105         assert(f);
3106         assert(f->header);
3107
3108         printf("File Path: %s\n"
3109                "File ID: %s\n"
3110                "Machine ID: %s\n"
3111                "Boot ID: %s\n"
3112                "Sequential Number ID: %s\n"
3113                "State: %s\n"
3114                "Compatible Flags:%s%s\n"
3115                "Incompatible Flags:%s%s%s\n"
3116                "Header size: %"PRIu64"\n"
3117                "Arena size: %"PRIu64"\n"
3118                "Data Hash Table Size: %"PRIu64"\n"
3119                "Field Hash Table Size: %"PRIu64"\n"
3120                "Rotate Suggested: %s\n"
3121                "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3122                "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3123                "Head Realtime Timestamp: %s (%"PRIx64")\n"
3124                "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3125                "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
3126                "Objects: %"PRIu64"\n"
3127                "Entry Objects: %"PRIu64"\n",
3128                f->path,
3129                sd_id128_to_string(f->header->file_id, a),
3130                sd_id128_to_string(f->header->machine_id, b),
3131                sd_id128_to_string(f->header->boot_id, c),
3132                sd_id128_to_string(f->header->seqnum_id, d),
3133                f->header->state == STATE_OFFLINE ? "OFFLINE" :
3134                f->header->state == STATE_ONLINE ? "ONLINE" :
3135                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3136                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3137                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3138                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3139                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3140                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3141                le64toh(f->header->header_size),
3142                le64toh(f->header->arena_size),
3143                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3144                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3145                yes_no(journal_file_rotate_suggested(f, 0)),
3146                le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3147                le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3148                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3149                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3150                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3151                le64toh(f->header->n_objects),
3152                le64toh(f->header->n_entries));
3153
3154         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3155                 printf("Data Objects: %"PRIu64"\n"
3156                        "Data Hash Table Fill: %.1f%%\n",
3157                        le64toh(f->header->n_data),
3158                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3159
3160         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3161                 printf("Field Objects: %"PRIu64"\n"
3162                        "Field Hash Table Fill: %.1f%%\n",
3163                        le64toh(f->header->n_fields),
3164                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3165
3166         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3167                 printf("Tag Objects: %"PRIu64"\n",
3168                        le64toh(f->header->n_tags));
3169         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3170                 printf("Entry Array Objects: %"PRIu64"\n",
3171                        le64toh(f->header->n_entry_arrays));
3172
3173         if (fstat(f->fd, &st) >= 0)
3174                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
3175 }
3176
3177 static int journal_file_warn_btrfs(JournalFile *f) {
3178         unsigned attrs;
3179         int r;
3180
3181         assert(f);
3182
3183         /* Before we write anything, check if the COW logic is turned
3184          * off on btrfs. Given our write pattern that is quite
3185          * unfriendly to COW file systems this should greatly improve
3186          * performance on COW file systems, such as btrfs, at the
3187          * expense of data integrity features (which shouldn't be too
3188          * bad, given that we do our own checksumming). */
3189
3190         r = btrfs_is_filesystem(f->fd);
3191         if (r < 0)
3192                 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3193         if (!r)
3194                 return 0;
3195
3196         r = read_attr_fd(f->fd, &attrs);
3197         if (r < 0)
3198                 return log_warning_errno(r, "Failed to read file attributes: %m");
3199
3200         if (attrs & FS_NOCOW_FL) {
3201                 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3202                 return 0;
3203         }
3204
3205         log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3206                    "This is likely to slow down journal access substantially, please consider turning "
3207                    "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3208
3209         return 1;
3210 }
3211
3212 int journal_file_open(
3213                 int fd,
3214                 const char *fname,
3215                 int flags,
3216                 mode_t mode,
3217                 bool compress,
3218                 uint64_t compress_threshold_bytes,
3219                 bool seal,
3220                 JournalMetrics *metrics,
3221                 MMapCache *mmap_cache,
3222                 Set *deferred_closes,
3223                 JournalFile *template,
3224                 JournalFile **ret) {
3225
3226         bool newly_created = false;
3227         JournalFile *f;
3228         void *h;
3229         int r;
3230         char bytes[FORMAT_BYTES_MAX];
3231
3232         assert(ret);
3233         assert(fd >= 0 || fname);
3234
3235         if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
3236                 return -EINVAL;
3237
3238         if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3239                 return -EINVAL;
3240
3241         f = new0(JournalFile, 1);
3242         if (!f)
3243                 return -ENOMEM;
3244
3245         f->fd = fd;
3246         f->mode = mode;
3247
3248         f->flags = flags;
3249         f->prot = prot_from_flags(flags);
3250         f->writable = (flags & O_ACCMODE) != O_RDONLY;
3251 #if HAVE_LZ4
3252         f->compress_lz4 = compress;
3253 #elif HAVE_XZ
3254         f->compress_xz = compress;
3255 #endif
3256
3257         if (compress_threshold_bytes == (uint64_t) -1)
3258                 f->compress_threshold_bytes = DEFAULT_COMPRESS_THRESHOLD;
3259         else
3260                 f->compress_threshold_bytes = MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes);
3261
3262 #if HAVE_GCRYPT
3263         f->seal = seal;
3264 #endif
3265
3266         log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3267                   yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
3268                   format_bytes(bytes, sizeof(bytes), f->compress_threshold_bytes));
3269
3270         if (mmap_cache)
3271                 f->mmap = mmap_cache_ref(mmap_cache);
3272         else {
3273                 f->mmap = mmap_cache_new();
3274                 if (!f->mmap) {
3275                         r = -ENOMEM;
3276                         goto fail;
3277                 }
3278         }
3279
3280         if (fname) {
3281                 f->path = strdup(fname);
3282                 if (!f->path) {
3283                         r = -ENOMEM;
3284                         goto fail;
3285                 }
3286         } else {
3287                 assert(fd >= 0);
3288
3289                 /* If we don't know the path, fill in something explanatory and vaguely useful */
3290                 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3291                         r = -ENOMEM;
3292                         goto fail;
3293                 }
3294         }
3295
3296         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3297         if (!f->chain_cache) {
3298                 r = -ENOMEM;
3299                 goto fail;
3300         }
3301
3302         if (f->fd < 0) {
3303                 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3304                  * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3305                  * it doesn't hurt in that case. */
3306
3307                 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
3308                 if (f->fd < 0) {
3309                         r = -errno;
3310                         goto fail;
3311                 }
3312
3313                 /* fds we opened here by us should also be closed by us. */
3314                 f->close_fd = true;
3315
3316                 r = fd_nonblock(f->fd, false);
3317                 if (r < 0)
3318                         goto fail;
3319         }
3320
3321         f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3322         if (!f->cache_fd) {
3323                 r = -ENOMEM;
3324                 goto fail;
3325         }
3326
3327         r = journal_file_fstat(f);
3328         if (r < 0)
3329                 goto fail;
3330
3331         if (f->last_stat.st_size == 0 && f->writable) {
3332
3333                 (void) journal_file_warn_btrfs(f);
3334
3335                 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3336                  * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3337                  * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3338                  * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3339                  * solely on mtime/atime/ctime of the file. */
3340                 (void) fd_setcrtime(f->fd, 0);
3341
3342 #if HAVE_GCRYPT
3343                 /* Try to load the FSPRG state, and if we can't, then
3344                  * just don't do sealing */
3345                 if (f->seal) {
3346                         r = journal_file_fss_load(f);
3347                         if (r < 0)
3348                                 f->seal = false;
3349                 }
3350 #endif
3351
3352                 r = journal_file_init_header(f, template);
3353                 if (r < 0)
3354                         goto fail;
3355
3356                 r = journal_file_fstat(f);
3357                 if (r < 0)
3358                         goto fail;
3359
3360                 newly_created = true;
3361         }
3362
3363         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3364                 r = -ENODATA;
3365                 goto fail;
3366         }
3367
3368         r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
3369         if (r < 0)
3370                 goto fail;
3371
3372         f->header = h;
3373
3374         if (!newly_created) {
3375                 set_clear_with_destructor(deferred_closes, journal_file_close);
3376
3377                 r = journal_file_verify_header(f);
3378                 if (r < 0)
3379                         goto fail;
3380         }
3381
3382 #if HAVE_GCRYPT
3383         if (!newly_created && f->writable) {
3384                 r = journal_file_fss_load(f);
3385                 if (r < 0)
3386                         goto fail;
3387         }
3388 #endif
3389
3390         if (f->writable) {
3391                 if (metrics) {
3392                         journal_default_metrics(metrics, f->fd);
3393                         f->metrics = *metrics;
3394                 } else if (template)
3395                         f->metrics = template->metrics;
3396
3397                 r = journal_file_refresh_header(f);
3398                 if (r < 0)
3399                         goto fail;
3400         }
3401
3402 #if HAVE_GCRYPT
3403         r = journal_file_hmac_setup(f);
3404         if (r < 0)
3405                 goto fail;
3406 #endif
3407
3408         if (newly_created) {
3409                 r = journal_file_setup_field_hash_table(f);
3410                 if (r < 0)
3411                         goto fail;
3412
3413                 r = journal_file_setup_data_hash_table(f);
3414                 if (r < 0)
3415                         goto fail;
3416
3417 #if HAVE_GCRYPT
3418                 r = journal_file_append_first_tag(f);
3419                 if (r < 0)
3420                         goto fail;
3421 #endif
3422         }
3423
3424         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
3425                 r = -EIO;
3426                 goto fail;
3427         }
3428
3429         if (template && template->post_change_timer) {
3430                 r = journal_file_enable_post_change_timer(
3431                                 f,
3432                                 sd_event_source_get_event(template->post_change_timer),
3433                                 template->post_change_timer_period);
3434
3435                 if (r < 0)
3436                         goto fail;
3437         }
3438
3439         /* The file is opened now successfully, thus we take possession of any passed in fd. */
3440         f->close_fd = true;
3441
3442         *ret = f;
3443         return 0;
3444
3445 fail:
3446         if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
3447                 r = -EIO;
3448
3449         (void) journal_file_close(f);
3450
3451         return r;
3452 }
3453
3454 int journal_file_rotate(JournalFile **f, bool compress, uint64_t compress_threshold_bytes, bool seal, Set *deferred_closes) {
3455         _cleanup_free_ char *p = NULL;
3456         size_t l;
3457         JournalFile *old_file, *new_file = NULL;
3458         int r;
3459
3460         assert(f);
3461         assert(*f);
3462
3463         old_file = *f;
3464
3465         if (!old_file->writable)
3466                 return -EINVAL;
3467
3468         /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3469          * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3470         if (path_startswith(old_file->path, "/proc/self/fd"))
3471                 return -EINVAL;
3472
3473         if (!endswith(old_file->path, ".journal"))
3474                 return -EINVAL;
3475
3476         l = strlen(old_file->path);
3477         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3478                      (int) l - 8, old_file->path,
3479                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3480                      le64toh((*f)->header->head_entry_seqnum),
3481                      le64toh((*f)->header->head_entry_realtime));
3482         if (r < 0)
3483                 return -ENOMEM;
3484
3485         /* Try to rename the file to the archived version. If the file
3486          * already was deleted, we'll get ENOENT, let's ignore that
3487          * case. */
3488         r = rename(old_file->path, p);
3489         if (r < 0 && errno != ENOENT)
3490                 return -errno;
3491
3492         /* Sync the rename to disk */
3493         (void) fsync_directory_of_file(old_file->fd);
3494
3495         /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3496          * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3497          * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3498          * would result in the rotated journal never getting fsync() called before closing.
3499          * Now we simply queue the archive state by setting an archive bit, leaving the state
3500          * as STATE_ONLINE so proper offlining occurs. */
3501         old_file->archive = true;
3502
3503         /* Currently, btrfs is not very good with out write patterns
3504          * and fragments heavily. Let's defrag our journal files when
3505          * we archive them */
3506         old_file->defrag_on_close = true;
3507
3508         r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress,
3509                               compress_threshold_bytes, seal, NULL, old_file->mmap, deferred_closes,
3510                               old_file, &new_file);
3511
3512         if (deferred_closes &&
3513             set_put(deferred_closes, old_file) >= 0)
3514                 (void) journal_file_set_offline(old_file, false);
3515         else
3516                 (void) journal_file_close(old_file);
3517
3518         *f = new_file;
3519         return r;
3520 }
3521
3522 int journal_file_open_reliably(
3523                 const char *fname,
3524                 int flags,
3525                 mode_t mode,
3526                 bool compress,
3527                 uint64_t compress_threshold_bytes,
3528                 bool seal,
3529                 JournalMetrics *metrics,
3530                 MMapCache *mmap_cache,
3531                 Set *deferred_closes,
3532                 JournalFile *template,
3533                 JournalFile **ret) {
3534
3535         int r;
3536         size_t l;
3537         _cleanup_free_ char *p = NULL;
3538
3539         r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3540                               deferred_closes, template, ret);
3541         if (!IN_SET(r,
3542                     -EBADMSG,           /* Corrupted */
3543                     -ENODATA,           /* Truncated */
3544                     -EHOSTDOWN,         /* Other machine */
3545                     -EPROTONOSUPPORT,   /* Incompatible feature */
3546                     -EBUSY,             /* Unclean shutdown */
3547                     -ESHUTDOWN,         /* Already archived */
3548                     -EIO,               /* IO error, including SIGBUS on mmap */
3549                     -EIDRM,             /* File has been deleted */
3550                     -ETXTBSY))          /* File is from the future */
3551                 return r;
3552
3553         if ((flags & O_ACCMODE) == O_RDONLY)
3554                 return r;
3555
3556         if (!(flags & O_CREAT))
3557                 return r;
3558
3559         if (!endswith(fname, ".journal"))
3560                 return r;
3561
3562         /* The file is corrupted. Rotate it away and try it again (but only once) */
3563
3564         l = strlen(fname);
3565         if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
3566                      (int) l - 8, fname,
3567                      now(CLOCK_REALTIME),
3568                      random_u64()) < 0)
3569                 return -ENOMEM;
3570
3571         if (rename(fname, p) < 0)
3572                 return -errno;
3573
3574         /* btrfs doesn't cope well with our write pattern and
3575          * fragments heavily. Let's defrag all files we rotate */
3576
3577         (void) chattr_path(p, 0, FS_NOCOW_FL);
3578         (void) btrfs_defrag(p);
3579
3580         log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3581
3582         return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3583                                  deferred_closes, template, ret);
3584 }
3585
3586 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
3587         uint64_t i, n;
3588         uint64_t q, xor_hash = 0;
3589         int r;
3590         EntryItem *items;
3591         dual_timestamp ts;
3592         const sd_id128_t *boot_id;
3593
3594         assert(from);
3595         assert(to);
3596         assert(o);
3597         assert(p);
3598
3599         if (!to->writable)
3600                 return -EPERM;
3601
3602         ts.monotonic = le64toh(o->entry.monotonic);
3603         ts.realtime = le64toh(o->entry.realtime);
3604         boot_id = &o->entry.boot_id;
3605
3606         n = journal_file_entry_n_items(o);
3607         /* alloca() can't take 0, hence let's allocate at least one */
3608         items = newa(EntryItem, MAX(1u, n));
3609
3610         for (i = 0; i < n; i++) {
3611                 uint64_t l, h;
3612                 le64_t le_hash;
3613                 size_t t;
3614                 void *data;
3615                 Object *u;
3616
3617                 q = le64toh(o->entry.items[i].object_offset);
3618                 le_hash = o->entry.items[i].hash;
3619
3620                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3621                 if (r < 0)
3622                         return r;
3623
3624                 if (le_hash != o->data.hash)
3625                         return -EBADMSG;
3626
3627                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3628                 t = (size_t) l;
3629
3630                 /* We hit the limit on 32bit machines */
3631                 if ((uint64_t) t != l)
3632                         return -E2BIG;
3633
3634                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3635 #if HAVE_XZ || HAVE_LZ4
3636                         size_t rsize = 0;
3637
3638                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3639                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3640                         if (r < 0)
3641                                 return r;
3642
3643                         data = from->compress_buffer;
3644                         l = rsize;
3645 #else
3646                         return -EPROTONOSUPPORT;
3647 #endif
3648                 } else
3649                         data = o->data.payload;
3650
3651                 r = journal_file_append_data(to, data, l, &u, &h);
3652                 if (r < 0)
3653                         return r;
3654
3655                 xor_hash ^= le64toh(u->data.hash);
3656                 items[i].object_offset = htole64(h);
3657                 items[i].hash = u->data.hash;
3658
3659                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3660                 if (r < 0)
3661                         return r;
3662         }
3663
3664         r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n,
3665                                                NULL, NULL, NULL);
3666
3667         if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3668                 return -EIO;
3669
3670         return r;
3671 }
3672
3673 void journal_reset_metrics(JournalMetrics *m) {
3674         assert(m);
3675
3676         /* Set everything to "pick automatic values". */
3677
3678         *m = (JournalMetrics) {
3679                 .min_use = (uint64_t) -1,
3680                 .max_use = (uint64_t) -1,
3681                 .min_size = (uint64_t) -1,
3682                 .max_size = (uint64_t) -1,
3683                 .keep_free = (uint64_t) -1,
3684                 .n_max_files = (uint64_t) -1,
3685         };
3686 }
3687
3688 void journal_default_metrics(JournalMetrics *m, int fd) {
3689         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3690         struct statvfs ss;
3691         uint64_t fs_size;
3692
3693         assert(m);
3694         assert(fd >= 0);
3695
3696         if (fstatvfs(fd, &ss) >= 0)
3697                 fs_size = ss.f_frsize * ss.f_blocks;
3698         else {
3699                 log_debug_errno(errno, "Failed to determine disk size: %m");
3700                 fs_size = 0;
3701         }
3702
3703         if (m->max_use == (uint64_t) -1) {
3704
3705                 if (fs_size > 0) {
3706                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3707
3708                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
3709                                 m->max_use = DEFAULT_MAX_USE_UPPER;
3710
3711                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
3712                                 m->max_use = DEFAULT_MAX_USE_LOWER;
3713                 } else
3714                         m->max_use = DEFAULT_MAX_USE_LOWER;
3715         } else {
3716                 m->max_use = PAGE_ALIGN(m->max_use);
3717
3718                 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3719                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3720         }
3721
3722         if (m->min_use == (uint64_t) -1)
3723                 m->min_use = DEFAULT_MIN_USE;
3724
3725         if (m->min_use > m->max_use)
3726                 m->min_use = m->max_use;
3727
3728         if (m->max_size == (uint64_t) -1) {
3729                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3730
3731                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3732                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
3733         } else
3734                 m->max_size = PAGE_ALIGN(m->max_size);
3735
3736         if (m->max_size != 0) {
3737                 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3738                         m->max_size = JOURNAL_FILE_SIZE_MIN;
3739
3740                 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3741                         m->max_use = m->max_size*2;
3742         }
3743
3744         if (m->min_size == (uint64_t) -1)
3745                 m->min_size = JOURNAL_FILE_SIZE_MIN;
3746         else {
3747                 m->min_size = PAGE_ALIGN(m->min_size);
3748
3749                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3750                         m->min_size = JOURNAL_FILE_SIZE_MIN;
3751
3752                 if (m->max_size != 0 && m->min_size > m->max_size)
3753                         m->max_size = m->min_size;
3754         }
3755
3756         if (m->keep_free == (uint64_t) -1) {
3757
3758                 if (fs_size > 0) {
3759                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3760
3761                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3762                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3763
3764                 } else
3765                         m->keep_free = DEFAULT_KEEP_FREE;
3766         }
3767
3768         if (m->n_max_files == (uint64_t) -1)
3769                 m->n_max_files = DEFAULT_N_MAX_FILES;
3770
3771         log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3772                   format_bytes(a, sizeof(a), m->min_use),
3773                   format_bytes(b, sizeof(b), m->max_use),
3774                   format_bytes(c, sizeof(c), m->max_size),
3775                   format_bytes(d, sizeof(d), m->min_size),
3776                   format_bytes(e, sizeof(e), m->keep_free),
3777                   m->n_max_files);
3778 }
3779
3780 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3781         assert(f);
3782         assert(f->header);
3783         assert(from || to);
3784
3785         if (from) {
3786                 if (f->header->head_entry_realtime == 0)
3787                         return -ENOENT;
3788
3789                 *from = le64toh(f->header->head_entry_realtime);
3790         }
3791
3792         if (to) {
3793                 if (f->header->tail_entry_realtime == 0)
3794                         return -ENOENT;
3795
3796                 *to = le64toh(f->header->tail_entry_realtime);
3797         }
3798
3799         return 1;
3800 }
3801
3802 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3803         Object *o;
3804         uint64_t p;
3805         int r;
3806
3807         assert(f);
3808         assert(from || to);
3809
3810         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3811         if (r <= 0)
3812                 return r;
3813
3814         if (le64toh(o->data.n_entries) <= 0)
3815                 return 0;
3816
3817         if (from) {
3818                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3819                 if (r < 0)
3820                         return r;
3821
3822                 *from = le64toh(o->entry.monotonic);
3823         }
3824
3825         if (to) {
3826                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3827                 if (r < 0)
3828                         return r;
3829
3830                 r = generic_array_get_plus_one(f,
3831                                                le64toh(o->data.entry_offset),
3832                                                le64toh(o->data.entry_array_offset),
3833                                                le64toh(o->data.n_entries)-1,
3834                                                &o, NULL);
3835                 if (r <= 0)
3836                         return r;
3837
3838                 *to = le64toh(o->entry.monotonic);
3839         }
3840
3841         return 1;
3842 }
3843
3844 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3845         assert(f);
3846         assert(f->header);
3847
3848         /* If we gained new header fields we gained new features,
3849          * hence suggest a rotation */
3850         if (le64toh(f->header->header_size) < sizeof(Header)) {
3851                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3852                 return true;
3853         }
3854
3855         /* Let's check if the hash tables grew over a certain fill
3856          * level (75%, borrowing this value from Java's hash table
3857          * implementation), and if so suggest a rotation. To calculate
3858          * the fill level we need the n_data field, which only exists
3859          * in newer versions. */
3860
3861         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3862                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3863                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3864                                   f->path,
3865                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3866                                   le64toh(f->header->n_data),
3867                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3868                                   (unsigned long long) f->last_stat.st_size,
3869                                   f->last_stat.st_size / le64toh(f->header->n_data));
3870                         return true;
3871                 }
3872
3873         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3874                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3875                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3876                                   f->path,
3877                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3878                                   le64toh(f->header->n_fields),
3879                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3880                         return true;
3881                 }
3882
3883         /* Are the data objects properly indexed by field objects? */
3884         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3885             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3886             le64toh(f->header->n_data) > 0 &&
3887             le64toh(f->header->n_fields) == 0)
3888                 return true;
3889
3890         if (max_file_usec > 0) {
3891                 usec_t t, h;
3892
3893                 h = le64toh(f->header->head_entry_realtime);
3894                 t = now(CLOCK_REALTIME);
3895
3896                 if (h > 0 && t > h + max_file_usec)
3897                         return true;
3898         }
3899
3900         return false;
3901 }