1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2011 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
36 #include "journal-authenticate.h"
37 #include "journal-def.h"
38 #include "journal-file.h"
40 #include "parse-util.h"
41 #include "path-util.h"
42 #include "random-util.h"
45 #include "string-util.h"
47 #include "xattr-util.h"
49 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
50 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
52 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
54 /* This is the minimum journal file size */
55 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
57 /* These are the lower and upper bounds if we deduce the max_use value
58 * from the file system size */
59 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
60 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
62 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
63 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
65 /* This is the upper bound if we deduce max_size from max_use */
66 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
68 /* This is the upper bound if we deduce the keep_free value from the
70 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
72 /* This is the keep_free value when we can't determine the system
74 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
76 /* This is the default maximum number of journal files to keep around. */
77 #define DEFAULT_N_MAX_FILES (100)
79 /* n_data was the first entry we added after the initial file format design */
80 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
82 /* How many entries to keep in the entry array chain cache at max */
83 #define CHAIN_CACHE_MAX 20
85 /* How much to increase the journal file size at once each time we allocate something new. */
86 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
88 /* Reread fstat() of the file for detecting deletions at least this often */
89 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
91 /* The mmap context to use for the header we pick as one above the last defined typed */
92 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
95 # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
98 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
99 * As a result we use atomic operations on f->offline_state for inter-thread communications with
100 * journal_file_set_offline() and journal_file_set_online(). */
101 static void journal_file_set_offline_internal(JournalFile
*f
) {
107 switch (f
->offline_state
) {
109 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_DONE
))
113 case OFFLINE_AGAIN_FROM_SYNCING
:
114 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_SYNCING
))
118 case OFFLINE_AGAIN_FROM_OFFLINING
:
119 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_SYNCING
))
123 case OFFLINE_SYNCING
:
126 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_OFFLINING
))
129 f
->header
->state
= f
->archive
? STATE_ARCHIVED
: STATE_OFFLINE
;
133 case OFFLINE_OFFLINING
:
134 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_DONE
))
141 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
147 static void * journal_file_set_offline_thread(void *arg
) {
148 JournalFile
*f
= arg
;
150 journal_file_set_offline_internal(f
);
155 static int journal_file_set_offline_thread_join(JournalFile
*f
) {
160 if (f
->offline_state
== OFFLINE_JOINED
)
163 r
= pthread_join(f
->offline_thread
, NULL
);
167 f
->offline_state
= OFFLINE_JOINED
;
169 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
175 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
176 static bool journal_file_set_offline_try_restart(JournalFile
*f
) {
178 switch (f
->offline_state
) {
179 case OFFLINE_AGAIN_FROM_SYNCING
:
180 case OFFLINE_AGAIN_FROM_OFFLINING
:
184 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_AGAIN_FROM_SYNCING
))
188 case OFFLINE_SYNCING
:
189 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_AGAIN_FROM_SYNCING
))
193 case OFFLINE_OFFLINING
:
194 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_AGAIN_FROM_OFFLINING
))
204 /* Sets a journal offline.
206 * If wait is false then an offline is dispatched in a separate thread for a
207 * subsequent journal_file_set_offline() or journal_file_set_online() of the
208 * same journal to synchronize with.
210 * If wait is true, then either an existing offline thread will be restarted
211 * and joined, or if none exists the offline is simply performed in this
212 * context without involving another thread.
214 int journal_file_set_offline(JournalFile
*f
, bool wait
) {
223 if (!(f
->fd
>= 0 && f
->header
))
226 /* An offlining journal is implicitly online and may modify f->header->state,
227 * we must also join any potentially lingering offline thread when not online. */
228 if (!journal_file_is_offlining(f
) && f
->header
->state
!= STATE_ONLINE
)
229 return journal_file_set_offline_thread_join(f
);
231 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
232 restarted
= journal_file_set_offline_try_restart(f
);
233 if ((restarted
&& wait
) || !restarted
) {
234 r
= journal_file_set_offline_thread_join(f
);
242 /* Initiate a new offline. */
243 f
->offline_state
= OFFLINE_SYNCING
;
245 if (wait
) /* Without using a thread if waiting. */
246 journal_file_set_offline_internal(f
);
248 r
= pthread_create(&f
->offline_thread
, NULL
, journal_file_set_offline_thread
, f
);
250 f
->offline_state
= OFFLINE_JOINED
;
258 static int journal_file_set_online(JournalFile
*f
) {
266 if (!(f
->fd
>= 0 && f
->header
))
270 switch (f
->offline_state
) {
272 /* No offline thread, no need to wait. */
276 case OFFLINE_SYNCING
:
277 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_CANCEL
))
279 /* Canceled syncing prior to offlining, no need to wait. */
282 case OFFLINE_AGAIN_FROM_SYNCING
:
283 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_CANCEL
))
285 /* Canceled restart from syncing, no need to wait. */
288 case OFFLINE_AGAIN_FROM_OFFLINING
:
289 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_CANCEL
))
291 /* Canceled restart from offlining, must wait for offlining to complete however. */
296 r
= journal_file_set_offline_thread_join(f
);
306 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
309 switch (f
->header
->state
) {
314 f
->header
->state
= STATE_ONLINE
;
323 bool journal_file_is_offlining(JournalFile
*f
) {
326 __sync_synchronize();
328 if (IN_SET(f
->offline_state
, OFFLINE_DONE
, OFFLINE_JOINED
))
334 JournalFile
* journal_file_close(JournalFile
*f
) {
338 /* Write the final tag */
339 if (f
->seal
&& f
->writable
) {
342 r
= journal_file_append_tag(f
);
344 log_error_errno(r
, "Failed to append tag when closing journal: %m");
348 if (f
->post_change_timer
) {
351 if (sd_event_source_get_enabled(f
->post_change_timer
, &enabled
) >= 0)
352 if (enabled
== SD_EVENT_ONESHOT
)
353 journal_file_post_change(f
);
355 (void) sd_event_source_set_enabled(f
->post_change_timer
, SD_EVENT_OFF
);
356 sd_event_source_unref(f
->post_change_timer
);
359 journal_file_set_offline(f
, true);
361 if (f
->mmap
&& f
->cache_fd
)
362 mmap_cache_free_fd(f
->mmap
, f
->cache_fd
);
364 if (f
->fd
>= 0 && f
->defrag_on_close
) {
366 /* Be friendly to btrfs: turn COW back on again now,
367 * and defragment the file. We won't write to the file
368 * ever again, hence remove all fragmentation, and
369 * reenable all the good bits COW usually provides
370 * (such as data checksumming). */
372 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
373 (void) btrfs_defrag_fd(f
->fd
);
380 mmap_cache_unref(f
->mmap
);
382 ordered_hashmap_free_free(f
->chain_cache
);
384 #if HAVE_XZ || HAVE_LZ4
385 free(f
->compress_buffer
);
390 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
392 free(f
->fsprg_state
);
397 gcry_md_close(f
->hmac
);
403 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
410 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
411 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
413 h
.incompatible_flags
|= htole32(
414 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
415 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
417 h
.compatible_flags
= htole32(
418 f
->seal
* HEADER_COMPATIBLE_SEALED
);
420 r
= sd_id128_randomize(&h
.file_id
);
425 h
.seqnum_id
= template->header
->seqnum_id
;
426 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
428 h
.seqnum_id
= h
.file_id
;
430 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
440 static int fsync_directory_of_file(int fd
) {
441 _cleanup_free_
char *path
= NULL
, *dn
= NULL
;
442 _cleanup_close_
int dfd
= -1;
446 if (fstat(fd
, &st
) < 0)
449 if (!S_ISREG(st
.st_mode
))
452 r
= fd_get_path(fd
, &path
);
456 if (!path_is_absolute(path
))
459 dn
= dirname_malloc(path
);
463 dfd
= open(dn
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
473 static int journal_file_refresh_header(JournalFile
*f
) {
480 r
= sd_id128_get_machine(&f
->header
->machine_id
);
484 r
= sd_id128_get_boot(&boot_id
);
488 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
489 f
->tail_entry_monotonic_valid
= true;
491 f
->header
->boot_id
= boot_id
;
493 r
= journal_file_set_online(f
);
495 /* Sync the online state to disk */
498 /* We likely just created a new file, also sync the directory this file is located in. */
499 (void) fsync_directory_of_file(f
->fd
);
504 static bool warn_wrong_flags(const JournalFile
*f
, bool compatible
) {
505 const uint32_t any
= compatible
? HEADER_COMPATIBLE_ANY
: HEADER_INCOMPATIBLE_ANY
,
506 supported
= compatible
? HEADER_COMPATIBLE_SUPPORTED
: HEADER_INCOMPATIBLE_SUPPORTED
;
507 const char *type
= compatible
? "compatible" : "incompatible";
510 flags
= le32toh(compatible
? f
->header
->compatible_flags
: f
->header
->incompatible_flags
);
512 if (flags
& ~supported
) {
514 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32
,
515 f
->path
, type
, flags
& ~any
);
516 flags
= (flags
& any
) & ~supported
;
520 _cleanup_free_
char *t
= NULL
;
522 if (compatible
&& (flags
& HEADER_COMPATIBLE_SEALED
))
523 strv
[n
++] = "sealed";
524 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_XZ
))
525 strv
[n
++] = "xz-compressed";
526 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_LZ4
))
527 strv
[n
++] = "lz4-compressed";
529 assert(n
< ELEMENTSOF(strv
));
531 t
= strv_join((char**) strv
, ", ");
532 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
533 f
->path
, type
, n
> 1 ? "flags" : "flag", strnull(t
));
541 static int journal_file_verify_header(JournalFile
*f
) {
542 uint64_t arena_size
, header_size
;
547 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
550 /* In both read and write mode we refuse to open files with incompatible
551 * flags we don't know. */
552 if (warn_wrong_flags(f
, false))
553 return -EPROTONOSUPPORT
;
555 /* When open for writing we refuse to open files with compatible flags, too. */
556 if (f
->writable
&& warn_wrong_flags(f
, true))
557 return -EPROTONOSUPPORT
;
559 if (f
->header
->state
>= _STATE_MAX
)
562 header_size
= le64toh(f
->header
->header_size
);
564 /* The first addition was n_data, so check that we are at least this large */
565 if (header_size
< HEADER_SIZE_MIN
)
568 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
571 arena_size
= le64toh(f
->header
->arena_size
);
573 if (UINT64_MAX
- header_size
< arena_size
|| header_size
+ arena_size
> (uint64_t) f
->last_stat
.st_size
)
576 if (le64toh(f
->header
->tail_object_offset
) > header_size
+ arena_size
)
579 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
580 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
581 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
582 !VALID64(le64toh(f
->header
->entry_array_offset
)))
586 sd_id128_t machine_id
;
590 r
= sd_id128_get_machine(&machine_id
);
594 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
597 state
= f
->header
->state
;
599 if (state
== STATE_ARCHIVED
)
600 return -ESHUTDOWN
; /* Already archived */
601 else if (state
== STATE_ONLINE
) {
602 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
604 } else if (state
!= STATE_OFFLINE
) {
605 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
609 if (f
->header
->field_hash_table_size
== 0 || f
->header
->data_hash_table_size
== 0)
612 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
613 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
615 if (le64toh(f
->header
->tail_entry_realtime
) > now(CLOCK_REALTIME
)) {
616 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f
->path
);
621 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
622 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
624 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
629 static int journal_file_fstat(JournalFile
*f
) {
633 if (fstat(f
->fd
, &f
->last_stat
) < 0)
636 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
638 /* Refuse appending to files that are already deleted */
639 if (f
->last_stat
.st_nlink
<= 0)
645 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
646 uint64_t old_size
, new_size
;
652 /* We assume that this file is not sparse, and we know that
653 * for sure, since we always call posix_fallocate()
656 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
660 le64toh(f
->header
->header_size
) +
661 le64toh(f
->header
->arena_size
);
663 new_size
= PAGE_ALIGN(offset
+ size
);
664 if (new_size
< le64toh(f
->header
->header_size
))
665 new_size
= le64toh(f
->header
->header_size
);
667 if (new_size
<= old_size
) {
669 /* We already pre-allocated enough space, but before
670 * we write to it, let's check with fstat() if the
671 * file got deleted, in order make sure we don't throw
672 * away the data immediately. Don't check fstat() for
673 * all writes though, but only once ever 10s. */
675 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
678 return journal_file_fstat(f
);
681 /* Allocate more space. */
683 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
686 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
689 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
692 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
694 if (new_size
- old_size
> available
)
699 /* Increase by larger blocks at once */
700 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
701 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
702 new_size
= f
->metrics
.max_size
;
704 /* Note that the glibc fallocate() fallback is very
705 inefficient, hence we try to minimize the allocation area
707 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
711 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
713 return journal_file_fstat(f
);
716 static unsigned type_to_context(ObjectType type
) {
717 /* One context for each type, plus one catch-all for the rest */
718 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
719 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
720 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
723 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
, size_t *ret_size
) {
732 /* Avoid SIGBUS on invalid accesses */
733 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
734 /* Hmm, out of range? Let's refresh the fstat() data
735 * first, before we trust that check. */
737 r
= journal_file_fstat(f
);
741 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
742 return -EADDRNOTAVAIL
;
745 return mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
, ret_size
);
748 static uint64_t minimum_header_size(Object
*o
) {
750 static const uint64_t table
[] = {
751 [OBJECT_DATA
] = sizeof(DataObject
),
752 [OBJECT_FIELD
] = sizeof(FieldObject
),
753 [OBJECT_ENTRY
] = sizeof(EntryObject
),
754 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
755 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
756 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
757 [OBJECT_TAG
] = sizeof(TagObject
),
760 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
761 return sizeof(ObjectHeader
);
763 return table
[o
->object
.type
];
766 /* Lightweight object checks. We want this to be fast, so that we won't
767 * slowdown every journal_file_move_to_object() call too much. */
768 static int journal_file_check_object(JournalFile
*f
, uint64_t offset
, Object
*o
) {
772 switch (o
->object
.type
) {
775 if ((le64toh(o
->data
.entry_offset
) == 0) ^ (le64toh(o
->data
.n_entries
) == 0)) {
776 log_debug("Bad n_entries: %"PRIu64
": %"PRIu64
,
777 le64toh(o
->data
.n_entries
), offset
);
781 if (le64toh(o
->object
.size
) - offsetof(DataObject
, payload
) <= 0) {
782 log_debug("Bad object size (<= %zu): %"PRIu64
": %"PRIu64
,
783 offsetof(DataObject
, payload
),
784 le64toh(o
->object
.size
),
789 if (!VALID64(le64toh(o
->data
.next_hash_offset
)) ||
790 !VALID64(le64toh(o
->data
.next_field_offset
)) ||
791 !VALID64(le64toh(o
->data
.entry_offset
)) ||
792 !VALID64(le64toh(o
->data
.entry_array_offset
))) {
793 log_debug("Invalid offset, next_hash_offset="OFSfmt
", next_field_offset="OFSfmt
794 ", entry_offset="OFSfmt
", entry_array_offset="OFSfmt
": %"PRIu64
,
795 le64toh(o
->data
.next_hash_offset
),
796 le64toh(o
->data
.next_field_offset
),
797 le64toh(o
->data
.entry_offset
),
798 le64toh(o
->data
.entry_array_offset
),
807 if (le64toh(o
->object
.size
) - offsetof(FieldObject
, payload
) <= 0) {
809 "Bad field size (<= %zu): %"PRIu64
": %"PRIu64
,
810 offsetof(FieldObject
, payload
),
811 le64toh(o
->object
.size
),
816 if (!VALID64(le64toh(o
->field
.next_hash_offset
)) ||
817 !VALID64(le64toh(o
->field
.head_data_offset
))) {
819 "Invalid offset, next_hash_offset="OFSfmt
820 ", head_data_offset="OFSfmt
": %"PRIu64
,
821 le64toh(o
->field
.next_hash_offset
),
822 le64toh(o
->field
.head_data_offset
),
829 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) % sizeof(EntryItem
) != 0) {
831 "Bad entry size (<= %zu): %"PRIu64
": %"PRIu64
,
832 offsetof(EntryObject
, items
),
833 le64toh(o
->object
.size
),
838 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
) <= 0) {
840 "Invalid number items in entry: %"PRIu64
": %"PRIu64
,
841 (le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
),
846 if (le64toh(o
->entry
.seqnum
) <= 0) {
848 "Invalid entry seqnum: %"PRIx64
": %"PRIu64
,
849 le64toh(o
->entry
.seqnum
),
854 if (!VALID_REALTIME(le64toh(o
->entry
.realtime
))) {
856 "Invalid entry realtime timestamp: %"PRIu64
": %"PRIu64
,
857 le64toh(o
->entry
.realtime
),
862 if (!VALID_MONOTONIC(le64toh(o
->entry
.monotonic
))) {
864 "Invalid entry monotonic timestamp: %"PRIu64
": %"PRIu64
,
865 le64toh(o
->entry
.monotonic
),
872 case OBJECT_DATA_HASH_TABLE
:
873 case OBJECT_FIELD_HASH_TABLE
:
874 if ((le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) % sizeof(HashItem
) != 0 ||
875 (le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) / sizeof(HashItem
) <= 0) {
877 "Invalid %s hash table size: %"PRIu64
": %"PRIu64
,
878 o
->object
.type
== OBJECT_DATA_HASH_TABLE
? "data" : "field",
879 le64toh(o
->object
.size
),
886 case OBJECT_ENTRY_ARRAY
:
887 if ((le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) % sizeof(le64_t
) != 0 ||
888 (le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) / sizeof(le64_t
) <= 0) {
890 "Invalid object entry array size: %"PRIu64
": %"PRIu64
,
891 le64toh(o
->object
.size
),
896 if (!VALID64(le64toh(o
->entry_array
.next_entry_array_offset
))) {
898 "Invalid object entry array next_entry_array_offset: "OFSfmt
": %"PRIu64
,
899 le64toh(o
->entry_array
.next_entry_array_offset
),
907 if (le64toh(o
->object
.size
) != sizeof(TagObject
)) {
909 "Invalid object tag size: %"PRIu64
": %"PRIu64
,
910 le64toh(o
->object
.size
),
915 if (!VALID_EPOCH(le64toh(o
->tag
.epoch
))) {
917 "Invalid object tag epoch: %"PRIu64
": %"PRIu64
,
918 le64toh(o
->tag
.epoch
),
929 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
939 /* Objects may only be located at multiple of 64 bit */
940 if (!VALID64(offset
)) {
941 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64
, offset
);
945 /* Object may not be located in the file header */
946 if (offset
< le64toh(f
->header
->header_size
)) {
947 log_debug("Attempt to move to object located in file header: %" PRIu64
, offset
);
951 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
, &tsize
);
956 s
= le64toh(o
->object
.size
);
959 log_debug("Attempt to move to uninitialized object: %" PRIu64
, offset
);
962 if (s
< sizeof(ObjectHeader
)) {
963 log_debug("Attempt to move to overly short object: %" PRIu64
, offset
);
967 if (o
->object
.type
<= OBJECT_UNUSED
) {
968 log_debug("Attempt to move to object with invalid type: %" PRIu64
, offset
);
972 if (s
< minimum_header_size(o
)) {
973 log_debug("Attempt to move to truncated object: %" PRIu64
, offset
);
977 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
) {
978 log_debug("Attempt to move to object of unexpected type: %" PRIu64
, offset
);
983 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
, NULL
);
990 r
= journal_file_check_object(f
, offset
, o
);
998 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
1004 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
1007 /* If an external seqnum counter was passed, we update
1008 * both the local and the external one, and set it to
1009 * the maximum of both */
1011 if (*seqnum
+ 1 > r
)
1017 f
->header
->tail_entry_seqnum
= htole64(r
);
1019 if (f
->header
->head_entry_seqnum
== 0)
1020 f
->header
->head_entry_seqnum
= htole64(r
);
1025 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
1033 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
1034 assert(size
>= sizeof(ObjectHeader
));
1038 r
= journal_file_set_online(f
);
1042 p
= le64toh(f
->header
->tail_object_offset
);
1044 p
= le64toh(f
->header
->header_size
);
1046 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
1050 p
+= ALIGN64(le64toh(tail
->object
.size
));
1053 r
= journal_file_allocate(f
, p
, size
);
1057 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
, NULL
);
1064 o
->object
.type
= type
;
1065 o
->object
.size
= htole64(size
);
1067 f
->header
->tail_object_offset
= htole64(p
);
1068 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
1076 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
1084 /* We estimate that we need 1 hash table entry per 768 bytes
1085 of journal file and we want to make sure we never get
1086 beyond 75% fill level. Calculate the hash table size for
1087 the maximum file size based on these metrics. */
1089 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
1090 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
1091 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
1093 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
1095 r
= journal_file_append_object(f
,
1096 OBJECT_DATA_HASH_TABLE
,
1097 offsetof(Object
, hash_table
.items
) + s
,
1102 memzero(o
->hash_table
.items
, s
);
1104 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1105 f
->header
->data_hash_table_size
= htole64(s
);
1110 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
1118 /* We use a fixed size hash table for the fields as this
1119 * number should grow very slowly only */
1121 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
1122 r
= journal_file_append_object(f
,
1123 OBJECT_FIELD_HASH_TABLE
,
1124 offsetof(Object
, hash_table
.items
) + s
,
1129 memzero(o
->hash_table
.items
, s
);
1131 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1132 f
->header
->field_hash_table_size
= htole64(s
);
1137 int journal_file_map_data_hash_table(JournalFile
*f
) {
1145 if (f
->data_hash_table
)
1148 p
= le64toh(f
->header
->data_hash_table_offset
);
1149 s
= le64toh(f
->header
->data_hash_table_size
);
1151 r
= journal_file_move_to(f
,
1152 OBJECT_DATA_HASH_TABLE
,
1159 f
->data_hash_table
= t
;
1163 int journal_file_map_field_hash_table(JournalFile
*f
) {
1171 if (f
->field_hash_table
)
1174 p
= le64toh(f
->header
->field_hash_table_offset
);
1175 s
= le64toh(f
->header
->field_hash_table_size
);
1177 r
= journal_file_move_to(f
,
1178 OBJECT_FIELD_HASH_TABLE
,
1185 f
->field_hash_table
= t
;
1189 static int journal_file_link_field(
1200 assert(f
->field_hash_table
);
1204 if (o
->object
.type
!= OBJECT_FIELD
)
1207 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1211 /* This might alter the window we are looking at */
1212 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
1215 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
1217 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
1219 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1223 o
->field
.next_hash_offset
= htole64(offset
);
1226 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1228 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
1229 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
1234 static int journal_file_link_data(
1245 assert(f
->data_hash_table
);
1249 if (o
->object
.type
!= OBJECT_DATA
)
1252 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1256 /* This might alter the window we are looking at */
1257 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
1258 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
1259 o
->data
.n_entries
= 0;
1262 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
1264 /* Only entry in the hash table is easy */
1265 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
1267 /* Move back to the previous data object, to patch in
1270 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1274 o
->data
.next_hash_offset
= htole64(offset
);
1277 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1279 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
1280 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
1285 int journal_file_find_field_object_with_hash(
1287 const void *field
, uint64_t size
, uint64_t hash
,
1288 Object
**ret
, uint64_t *offset
) {
1290 uint64_t p
, osize
, h
, m
;
1295 assert(field
&& size
> 0);
1297 /* If the field hash table is empty, we can't find anything */
1298 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
1301 /* Map the field hash table, if it isn't mapped yet. */
1302 r
= journal_file_map_field_hash_table(f
);
1306 osize
= offsetof(Object
, field
.payload
) + size
;
1308 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1313 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
1318 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1322 if (le64toh(o
->field
.hash
) == hash
&&
1323 le64toh(o
->object
.size
) == osize
&&
1324 memcmp(o
->field
.payload
, field
, size
) == 0) {
1334 p
= le64toh(o
->field
.next_hash_offset
);
1340 int journal_file_find_field_object(
1342 const void *field
, uint64_t size
,
1343 Object
**ret
, uint64_t *offset
) {
1348 assert(field
&& size
> 0);
1350 hash
= hash64(field
, size
);
1352 return journal_file_find_field_object_with_hash(f
,
1357 int journal_file_find_data_object_with_hash(
1359 const void *data
, uint64_t size
, uint64_t hash
,
1360 Object
**ret
, uint64_t *offset
) {
1362 uint64_t p
, osize
, h
, m
;
1367 assert(data
|| size
== 0);
1369 /* If there's no data hash table, then there's no entry. */
1370 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
1373 /* Map the data hash table, if it isn't mapped yet. */
1374 r
= journal_file_map_data_hash_table(f
);
1378 osize
= offsetof(Object
, data
.payload
) + size
;
1380 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1385 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
1390 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1394 if (le64toh(o
->data
.hash
) != hash
)
1397 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
1398 #if HAVE_XZ || HAVE_LZ4
1402 l
= le64toh(o
->object
.size
);
1403 if (l
<= offsetof(Object
, data
.payload
))
1406 l
-= offsetof(Object
, data
.payload
);
1408 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
1409 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
1413 if (rsize
== size
&&
1414 memcmp(f
->compress_buffer
, data
, size
) == 0) {
1425 return -EPROTONOSUPPORT
;
1427 } else if (le64toh(o
->object
.size
) == osize
&&
1428 memcmp(o
->data
.payload
, data
, size
) == 0) {
1440 p
= le64toh(o
->data
.next_hash_offset
);
1446 int journal_file_find_data_object(
1448 const void *data
, uint64_t size
,
1449 Object
**ret
, uint64_t *offset
) {
1454 assert(data
|| size
== 0);
1456 hash
= hash64(data
, size
);
1458 return journal_file_find_data_object_with_hash(f
,
1463 static int journal_file_append_field(
1465 const void *field
, uint64_t size
,
1466 Object
**ret
, uint64_t *offset
) {
1474 assert(field
&& size
> 0);
1476 hash
= hash64(field
, size
);
1478 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1492 osize
= offsetof(Object
, field
.payload
) + size
;
1493 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1497 o
->field
.hash
= htole64(hash
);
1498 memcpy(o
->field
.payload
, field
, size
);
1500 r
= journal_file_link_field(f
, o
, p
, hash
);
1504 /* The linking might have altered the window, so let's
1505 * refresh our pointer */
1506 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1511 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1525 static int journal_file_append_data(
1527 const void *data
, uint64_t size
,
1528 Object
**ret
, uint64_t *offset
) {
1533 int r
, compression
= 0;
1537 assert(data
|| size
== 0);
1539 hash
= hash64(data
, size
);
1541 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1555 osize
= offsetof(Object
, data
.payload
) + size
;
1556 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1560 o
->data
.hash
= htole64(hash
);
1562 #if HAVE_XZ || HAVE_LZ4
1563 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1566 compression
= compress_blob(data
, size
, o
->data
.payload
, size
- 1, &rsize
);
1568 if (compression
>= 0) {
1569 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1570 o
->object
.flags
|= compression
;
1572 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1573 size
, rsize
, object_compressed_to_string(compression
));
1575 /* Compression didn't work, we don't really care why, let's continue without compression */
1580 if (compression
== 0)
1581 memcpy_safe(o
->data
.payload
, data
, size
);
1583 r
= journal_file_link_data(f
, o
, p
, hash
);
1588 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1593 /* The linking might have altered the window, so let's
1594 * refresh our pointer */
1595 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1602 eq
= memchr(data
, '=', size
);
1603 if (eq
&& eq
> data
) {
1607 /* Create field object ... */
1608 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1612 /* ... and link it in. */
1613 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1614 fo
->field
.head_data_offset
= le64toh(p
);
1626 uint64_t journal_file_entry_n_items(Object
*o
) {
1629 if (o
->object
.type
!= OBJECT_ENTRY
)
1632 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1635 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1638 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1641 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1644 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1647 if (!IN_SET(o
->object
.type
, OBJECT_DATA_HASH_TABLE
, OBJECT_FIELD_HASH_TABLE
))
1650 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1653 static int link_entry_into_array(JournalFile
*f
,
1658 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1667 a
= le64toh(*first
);
1668 i
= hidx
= le64toh(*idx
);
1671 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1675 n
= journal_file_entry_array_n_items(o
);
1677 o
->entry_array
.items
[i
] = htole64(p
);
1678 *idx
= htole64(hidx
+ 1);
1684 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1695 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1696 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1702 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1707 o
->entry_array
.items
[i
] = htole64(p
);
1710 *first
= htole64(q
);
1712 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1716 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1719 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1720 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1722 *idx
= htole64(hidx
+ 1);
1727 static int link_entry_into_array_plus_one(JournalFile
*f
,
1742 *extra
= htole64(p
);
1746 i
= htole64(le64toh(*idx
) - 1);
1747 r
= link_entry_into_array(f
, first
, &i
, p
);
1752 *idx
= htole64(le64toh(*idx
) + 1);
1756 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1763 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1767 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1771 return link_entry_into_array_plus_one(f
,
1772 &o
->data
.entry_offset
,
1773 &o
->data
.entry_array_offset
,
1778 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1787 if (o
->object
.type
!= OBJECT_ENTRY
)
1790 __sync_synchronize();
1792 /* Link up the entry itself */
1793 r
= link_entry_into_array(f
,
1794 &f
->header
->entry_array_offset
,
1795 &f
->header
->n_entries
,
1800 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1802 if (f
->header
->head_entry_realtime
== 0)
1803 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1805 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1806 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1808 f
->tail_entry_monotonic_valid
= true;
1810 /* Link up the items */
1811 n
= journal_file_entry_n_items(o
);
1812 for (i
= 0; i
< n
; i
++) {
1813 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1821 static int journal_file_append_entry_internal(
1823 const dual_timestamp
*ts
,
1825 const EntryItem items
[], unsigned n_items
,
1827 Object
**ret
, uint64_t *offset
) {
1835 assert(items
|| n_items
== 0);
1838 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1840 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1844 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1845 memcpy_safe(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1846 o
->entry
.realtime
= htole64(ts
->realtime
);
1847 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1848 o
->entry
.xor_hash
= htole64(xor_hash
);
1849 o
->entry
.boot_id
= f
->header
->boot_id
;
1852 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1857 r
= journal_file_link_entry(f
, o
, np
);
1870 void journal_file_post_change(JournalFile
*f
) {
1873 /* inotify() does not receive IN_MODIFY events from file
1874 * accesses done via mmap(). After each access we hence
1875 * trigger IN_MODIFY by truncating the journal file to its
1876 * current size which triggers IN_MODIFY. */
1878 __sync_synchronize();
1880 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1881 log_debug_errno(errno
, "Failed to truncate file to its own size: %m");
1884 static int post_change_thunk(sd_event_source
*timer
, uint64_t usec
, void *userdata
) {
1887 journal_file_post_change(userdata
);
1892 static void schedule_post_change(JournalFile
*f
) {
1893 sd_event_source
*timer
;
1898 assert(f
->post_change_timer
);
1900 timer
= f
->post_change_timer
;
1902 r
= sd_event_source_get_enabled(timer
, &enabled
);
1904 log_debug_errno(r
, "Failed to get ftruncate timer state: %m");
1908 if (enabled
== SD_EVENT_ONESHOT
)
1911 r
= sd_event_now(sd_event_source_get_event(timer
), CLOCK_MONOTONIC
, &now
);
1913 log_debug_errno(r
, "Failed to get clock's now for scheduling ftruncate: %m");
1917 r
= sd_event_source_set_time(timer
, now
+f
->post_change_timer_period
);
1919 log_debug_errno(r
, "Failed to set time for scheduling ftruncate: %m");
1923 r
= sd_event_source_set_enabled(timer
, SD_EVENT_ONESHOT
);
1925 log_debug_errno(r
, "Failed to enable scheduled ftruncate: %m");
1932 /* On failure, let's simply post the change immediately. */
1933 journal_file_post_change(f
);
1936 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1937 int journal_file_enable_post_change_timer(JournalFile
*f
, sd_event
*e
, usec_t t
) {
1938 _cleanup_(sd_event_source_unrefp
) sd_event_source
*timer
= NULL
;
1942 assert_return(!f
->post_change_timer
, -EINVAL
);
1946 r
= sd_event_add_time(e
, &timer
, CLOCK_MONOTONIC
, 0, 0, post_change_thunk
, f
);
1950 r
= sd_event_source_set_enabled(timer
, SD_EVENT_OFF
);
1954 f
->post_change_timer
= timer
;
1956 f
->post_change_timer_period
= t
;
1961 static int entry_item_cmp(const void *_a
, const void *_b
) {
1962 const EntryItem
*a
= _a
, *b
= _b
;
1964 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1966 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1971 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1975 uint64_t xor_hash
= 0;
1976 struct dual_timestamp _ts
;
1980 assert(iovec
|| n_iovec
== 0);
1983 dual_timestamp_get(&_ts
);
1988 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1993 /* alloca() can't take 0, hence let's allocate at least one */
1994 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1996 for (i
= 0; i
< n_iovec
; i
++) {
2000 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
2004 xor_hash
^= le64toh(o
->data
.hash
);
2005 items
[i
].object_offset
= htole64(p
);
2006 items
[i
].hash
= o
->data
.hash
;
2009 /* Order by the position on disk, in order to improve seek
2010 * times for rotating media. */
2011 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
2013 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
2015 /* If the memory mapping triggered a SIGBUS then we return an
2016 * IO error and ignore the error code passed down to us, since
2017 * it is very likely just an effect of a nullified replacement
2020 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
2023 if (f
->post_change_timer
)
2024 schedule_post_change(f
);
2026 journal_file_post_change(f
);
/* Per-chain cache entry used to speed up repeated traversal/bisection
 * of entry-array chains (keyed by the chain's first array offset). */
typedef struct ChainCacheItem {
        uint64_t first;      /* the array at the beginning of the chain */
        uint64_t array;      /* the cached array */
        uint64_t begin;      /* the first item in the cached array */
        uint64_t total;      /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
2039 static void chain_cache_put(
2046 uint64_t last_index
) {
2049 /* If the chain item to cache for this chain is the
2050 * first one it's not worth caching anything */
2054 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
2055 ci
= ordered_hashmap_steal_first(h
);
2058 ci
= new(ChainCacheItem
, 1);
2065 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
2070 assert(ci
->first
== first
);
2075 ci
->last_index
= last_index
;
2078 static int generic_array_get(
2082 Object
**ret
, uint64_t *offset
) {
2085 uint64_t p
= 0, a
, t
= 0;
2093 /* Try the chain cache first */
2094 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2095 if (ci
&& i
> ci
->total
) {
2104 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
2108 k
= journal_file_entry_array_n_items(o
);
2110 p
= le64toh(o
->entry_array
.items
[i
]);
2116 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
2122 /* Let's cache this item for the next invocation */
2123 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
2125 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2138 static int generic_array_get_plus_one(
2143 Object
**ret
, uint64_t *offset
) {
2152 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2165 return generic_array_get(f
, first
, i
-1, ret
, offset
);
2174 static int generic_array_bisect(
2179 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2180 direction_t direction
,
2185 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
2186 bool subtract_one
= false;
2187 Object
*o
, *array
= NULL
;
2192 assert(test_object
);
2194 /* Start with the first array in the chain */
2197 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2198 if (ci
&& n
> ci
->total
) {
2199 /* Ah, we have iterated this bisection array chain
2200 * previously! Let's see if we can skip ahead in the
2201 * chain, as far as the last time. But we can't jump
2202 * backwards in the chain, so let's check that
2205 r
= test_object(f
, ci
->begin
, needle
);
2209 if (r
== TEST_LEFT
) {
2210 /* OK, what we are looking for is right of the
2211 * begin of this EntryArray, so let's jump
2212 * straight to previously cached array in the
2218 last_index
= ci
->last_index
;
2223 uint64_t left
, right
, k
, lp
;
2225 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
2229 k
= journal_file_entry_array_n_items(array
);
2235 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
2239 r
= test_object(f
, p
, needle
);
2240 if (r
== -EBADMSG
) {
2241 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2248 if (r
== TEST_FOUND
)
2249 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2251 if (r
== TEST_RIGHT
) {
2255 if (last_index
!= (uint64_t) -1) {
2256 assert(last_index
<= right
);
2258 /* If we cached the last index we
2259 * looked at, let's try to not to jump
2260 * too wildly around and see if we can
2261 * limit the range to look at early to
2262 * the immediate neighbors of the last
2263 * index we looked at. */
2265 if (last_index
> 0) {
2266 uint64_t x
= last_index
- 1;
2268 p
= le64toh(array
->entry_array
.items
[x
]);
2272 r
= test_object(f
, p
, needle
);
2276 if (r
== TEST_FOUND
)
2277 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2279 if (r
== TEST_RIGHT
)
2285 if (last_index
< right
) {
2286 uint64_t y
= last_index
+ 1;
2288 p
= le64toh(array
->entry_array
.items
[y
]);
2292 r
= test_object(f
, p
, needle
);
2296 if (r
== TEST_FOUND
)
2297 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2299 if (r
== TEST_RIGHT
)
2307 if (left
== right
) {
2308 if (direction
== DIRECTION_UP
)
2309 subtract_one
= true;
2315 assert(left
< right
);
2316 i
= (left
+ right
) / 2;
2318 p
= le64toh(array
->entry_array
.items
[i
]);
2322 r
= test_object(f
, p
, needle
);
2323 if (r
== -EBADMSG
) {
2324 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2331 if (r
== TEST_FOUND
)
2332 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2334 if (r
== TEST_RIGHT
)
2342 if (direction
== DIRECTION_UP
) {
2344 subtract_one
= true;
2355 last_index
= (uint64_t) -1;
2356 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
2362 if (subtract_one
&& t
== 0 && i
== 0)
2365 /* Let's cache this item for the next invocation */
2366 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
2368 if (subtract_one
&& i
== 0)
2370 else if (subtract_one
)
2371 p
= le64toh(array
->entry_array
.items
[i
-1]);
2373 p
= le64toh(array
->entry_array
.items
[i
]);
2375 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2386 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
2391 static int generic_array_bisect_plus_one(
2397 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2398 direction_t direction
,
2404 bool step_back
= false;
2408 assert(test_object
);
2413 /* This bisects the array in object 'first', but first checks
2415 r
= test_object(f
, extra
, needle
);
2419 if (r
== TEST_FOUND
)
2420 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2422 /* if we are looking with DIRECTION_UP then we need to first
2423 see if in the actual array there is a matching entry, and
2424 return the last one of that. But if there isn't any we need
2425 to return this one. Hence remember this, and return it
2428 step_back
= direction
== DIRECTION_UP
;
2430 if (r
== TEST_RIGHT
) {
2431 if (direction
== DIRECTION_DOWN
)
2437 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
2439 if (r
== 0 && step_back
)
2448 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2464 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2470 else if (p
< needle
)
2476 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2483 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2487 if (le64toh(o
->entry
.seqnum
) == needle
)
2489 else if (le64toh(o
->entry
.seqnum
) < needle
)
2495 int journal_file_move_to_entry_by_seqnum(
2498 direction_t direction
,
2504 return generic_array_bisect(f
,
2505 le64toh(f
->header
->entry_array_offset
),
2506 le64toh(f
->header
->n_entries
),
2513 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2520 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2524 if (le64toh(o
->entry
.realtime
) == needle
)
2526 else if (le64toh(o
->entry
.realtime
) < needle
)
2532 int journal_file_move_to_entry_by_realtime(
2535 direction_t direction
,
2541 return generic_array_bisect(f
,
2542 le64toh(f
->header
->entry_array_offset
),
2543 le64toh(f
->header
->n_entries
),
2545 test_object_realtime
,
2550 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2557 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2561 if (le64toh(o
->entry
.monotonic
) == needle
)
2563 else if (le64toh(o
->entry
.monotonic
) < needle
)
2569 static int find_data_object_by_boot_id(
2575 char t
[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2577 sd_id128_to_string(boot_id
, t
+ 9);
2578 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2581 int journal_file_move_to_entry_by_monotonic(
2585 direction_t direction
,
2594 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2600 return generic_array_bisect_plus_one(f
,
2601 le64toh(o
->data
.entry_offset
),
2602 le64toh(o
->data
.entry_array_offset
),
2603 le64toh(o
->data
.n_entries
),
2605 test_object_monotonic
,
2610 void journal_file_reset_location(JournalFile
*f
) {
2611 f
->location_type
= LOCATION_HEAD
;
2612 f
->current_offset
= 0;
2613 f
->current_seqnum
= 0;
2614 f
->current_realtime
= 0;
2615 f
->current_monotonic
= 0;
2616 zero(f
->current_boot_id
);
2617 f
->current_xor_hash
= 0;
2620 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2621 f
->location_type
= LOCATION_SEEK
;
2622 f
->current_offset
= offset
;
2623 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2624 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2625 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2626 f
->current_boot_id
= o
->entry
.boot_id
;
2627 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2630 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2635 assert(af
->location_type
== LOCATION_SEEK
);
2636 assert(bf
->location_type
== LOCATION_SEEK
);
2638 /* If contents and timestamps match, these entries are
2639 * identical, even if the seqnum does not match */
2640 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2641 af
->current_monotonic
== bf
->current_monotonic
&&
2642 af
->current_realtime
== bf
->current_realtime
&&
2643 af
->current_xor_hash
== bf
->current_xor_hash
)
2646 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2648 /* If this is from the same seqnum source, compare
2650 if (af
->current_seqnum
< bf
->current_seqnum
)
2652 if (af
->current_seqnum
> bf
->current_seqnum
)
2655 /* Wow! This is weird, different data but the same
2656 * seqnums? Something is borked, but let's make the
2657 * best of it and compare by time. */
2660 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2662 /* If the boot id matches, compare monotonic time */
2663 if (af
->current_monotonic
< bf
->current_monotonic
)
2665 if (af
->current_monotonic
> bf
->current_monotonic
)
2669 /* Otherwise, compare UTC time */
2670 if (af
->current_realtime
< bf
->current_realtime
)
2672 if (af
->current_realtime
> bf
->current_realtime
)
2675 /* Finally, compare by contents */
2676 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2678 if (af
->current_xor_hash
> bf
->current_xor_hash
)
/* Advance *i by one step in the given direction (down = increment,
 * up = decrement).  NOTE(review): the body handling both directions
 * and its behaviour at the array bounds (presumably returning 0 when
 * the index cannot move further, nonzero otherwise) is elided from
 * this view — confirm against the full implementation before editing. */
2684 static int bump_array_index(uint64_t *i
, direction_t direction
, uint64_t n
) {
2686 /* Increase or decrease the specified index, in the right direction. */
2688 if (direction
== DIRECTION_DOWN
) {
2703 static bool check_properly_ordered(uint64_t new_offset
, uint64_t old_offset
, direction_t direction
) {
2705 /* Consider it an error if any of the two offsets is uninitialized */
2706 if (old_offset
== 0 || new_offset
== 0)
2709 /* If we go down, the new offset must be larger than the old one. */
2710 return direction
== DIRECTION_DOWN
?
2711 new_offset
> old_offset
:
2712 new_offset
< old_offset
;
2715 int journal_file_next_entry(
2718 direction_t direction
,
2719 Object
**ret
, uint64_t *offset
) {
2727 n
= le64toh(f
->header
->n_entries
);
2732 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2734 r
= generic_array_bisect(f
,
2735 le64toh(f
->header
->entry_array_offset
),
2736 le64toh(f
->header
->n_entries
),
2745 r
= bump_array_index(&i
, direction
, n
);
2750 /* And jump to it */
2752 r
= generic_array_get(f
,
2753 le64toh(f
->header
->entry_array_offset
),
2761 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2762 * the next one might work for us instead. */
2763 log_debug_errno(r
, "Entry item %" PRIu64
" is bad, skipping over it.", i
);
2765 r
= bump_array_index(&i
, direction
, n
);
2770 /* Ensure our array is properly ordered. */
2771 if (p
> 0 && !check_properly_ordered(ofs
, p
, direction
)) {
2772 log_debug("%s: entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2782 int journal_file_next_entry_for_data(
2784 Object
*o
, uint64_t p
,
2785 uint64_t data_offset
,
2786 direction_t direction
,
2787 Object
**ret
, uint64_t *offset
) {
2794 assert(p
> 0 || !o
);
2796 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2800 n
= le64toh(d
->data
.n_entries
);
2805 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2807 if (o
->object
.type
!= OBJECT_ENTRY
)
2810 r
= generic_array_bisect_plus_one(f
,
2811 le64toh(d
->data
.entry_offset
),
2812 le64toh(d
->data
.entry_array_offset
),
2813 le64toh(d
->data
.n_entries
),
2823 r
= bump_array_index(&i
, direction
, n
);
2829 r
= generic_array_get_plus_one(f
,
2830 le64toh(d
->data
.entry_offset
),
2831 le64toh(d
->data
.entry_array_offset
),
2839 log_debug_errno(r
, "Data entry item %" PRIu64
" is bad, skipping over it.", i
);
2841 r
= bump_array_index(&i
, direction
, n
);
2846 /* Ensure our array is properly ordered. */
2847 if (p
> 0 && check_properly_ordered(ofs
, p
, direction
)) {
2848 log_debug("%s data entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2858 int journal_file_move_to_entry_by_offset_for_data(
2860 uint64_t data_offset
,
2862 direction_t direction
,
2863 Object
**ret
, uint64_t *offset
) {
2870 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2874 return generic_array_bisect_plus_one(f
,
2875 le64toh(d
->data
.entry_offset
),
2876 le64toh(d
->data
.entry_array_offset
),
2877 le64toh(d
->data
.n_entries
),
2884 int journal_file_move_to_entry_by_monotonic_for_data(
2886 uint64_t data_offset
,
2889 direction_t direction
,
2890 Object
**ret
, uint64_t *offset
) {
2898 /* First, seek by time */
2899 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2905 r
= generic_array_bisect_plus_one(f
,
2906 le64toh(o
->data
.entry_offset
),
2907 le64toh(o
->data
.entry_array_offset
),
2908 le64toh(o
->data
.n_entries
),
2910 test_object_monotonic
,
2916 /* And now, continue seeking until we find an entry that
2917 * exists in both bisection arrays */
2923 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2927 r
= generic_array_bisect_plus_one(f
,
2928 le64toh(d
->data
.entry_offset
),
2929 le64toh(d
->data
.entry_array_offset
),
2930 le64toh(d
->data
.n_entries
),
2938 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2942 r
= generic_array_bisect_plus_one(f
,
2943 le64toh(o
->data
.entry_offset
),
2944 le64toh(o
->data
.entry_array_offset
),
2945 le64toh(o
->data
.n_entries
),
2967 int journal_file_move_to_entry_by_seqnum_for_data(
2969 uint64_t data_offset
,
2971 direction_t direction
,
2972 Object
**ret
, uint64_t *offset
) {
2979 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2983 return generic_array_bisect_plus_one(f
,
2984 le64toh(d
->data
.entry_offset
),
2985 le64toh(d
->data
.entry_array_offset
),
2986 le64toh(d
->data
.n_entries
),
2993 int journal_file_move_to_entry_by_realtime_for_data(
2995 uint64_t data_offset
,
2997 direction_t direction
,
2998 Object
**ret
, uint64_t *offset
) {
3005 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
3009 return generic_array_bisect_plus_one(f
,
3010 le64toh(d
->data
.entry_offset
),
3011 le64toh(d
->data
.entry_array_offset
),
3012 le64toh(d
->data
.n_entries
),
3014 test_object_realtime
,
3019 void journal_file_dump(JournalFile
*f
) {
3027 journal_file_print_header(f
);
3029 p
= le64toh(f
->header
->header_size
);
3031 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
3035 switch (o
->object
.type
) {
3038 printf("Type: OBJECT_UNUSED\n");
3042 printf("Type: OBJECT_DATA\n");
3046 printf("Type: OBJECT_FIELD\n");
3050 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
3051 le64toh(o
->entry
.seqnum
),
3052 le64toh(o
->entry
.monotonic
),
3053 le64toh(o
->entry
.realtime
));
3056 case OBJECT_FIELD_HASH_TABLE
:
3057 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3060 case OBJECT_DATA_HASH_TABLE
:
3061 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3064 case OBJECT_ENTRY_ARRAY
:
3065 printf("Type: OBJECT_ENTRY_ARRAY\n");
3069 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
3070 le64toh(o
->tag
.seqnum
),
3071 le64toh(o
->tag
.epoch
));
3075 printf("Type: unknown (%i)\n", o
->object
.type
);
3079 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
3080 printf("Flags: %s\n",
3081 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
3083 if (p
== le64toh(f
->header
->tail_object_offset
))
3086 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
3091 log_error("File corrupt");
3094 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
3097 x
= format_timestamp(buf
, l
, t
);
3103 void journal_file_print_header(JournalFile
*f
) {
3104 char a
[33], b
[33], c
[33], d
[33];
3105 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
3107 char bytes
[FORMAT_BYTES_MAX
];
3112 printf("File Path: %s\n"
3116 "Sequential Number ID: %s\n"
3118 "Compatible Flags:%s%s\n"
3119 "Incompatible Flags:%s%s%s\n"
3120 "Header size: %"PRIu64
"\n"
3121 "Arena size: %"PRIu64
"\n"
3122 "Data Hash Table Size: %"PRIu64
"\n"
3123 "Field Hash Table Size: %"PRIu64
"\n"
3124 "Rotate Suggested: %s\n"
3125 "Head Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3126 "Tail Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3127 "Head Realtime Timestamp: %s (%"PRIx64
")\n"
3128 "Tail Realtime Timestamp: %s (%"PRIx64
")\n"
3129 "Tail Monotonic Timestamp: %s (%"PRIx64
")\n"
3130 "Objects: %"PRIu64
"\n"
3131 "Entry Objects: %"PRIu64
"\n",
3133 sd_id128_to_string(f
->header
->file_id
, a
),
3134 sd_id128_to_string(f
->header
->machine_id
, b
),
3135 sd_id128_to_string(f
->header
->boot_id
, c
),
3136 sd_id128_to_string(f
->header
->seqnum_id
, d
),
3137 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
3138 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
3139 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
3140 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
3141 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
3142 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
3143 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
3144 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
3145 le64toh(f
->header
->header_size
),
3146 le64toh(f
->header
->arena_size
),
3147 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3148 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
3149 yes_no(journal_file_rotate_suggested(f
, 0)),
3150 le64toh(f
->header
->head_entry_seqnum
), le64toh(f
->header
->head_entry_seqnum
),
3151 le64toh(f
->header
->tail_entry_seqnum
), le64toh(f
->header
->tail_entry_seqnum
),
3152 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)), le64toh(f
->header
->head_entry_realtime
),
3153 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)), le64toh(f
->header
->tail_entry_realtime
),
3154 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
), le64toh(f
->header
->tail_entry_monotonic
),
3155 le64toh(f
->header
->n_objects
),
3156 le64toh(f
->header
->n_entries
));
3158 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3159 printf("Data Objects: %"PRIu64
"\n"
3160 "Data Hash Table Fill: %.1f%%\n",
3161 le64toh(f
->header
->n_data
),
3162 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
3164 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3165 printf("Field Objects: %"PRIu64
"\n"
3166 "Field Hash Table Fill: %.1f%%\n",
3167 le64toh(f
->header
->n_fields
),
3168 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
3170 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
3171 printf("Tag Objects: %"PRIu64
"\n",
3172 le64toh(f
->header
->n_tags
));
3173 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
3174 printf("Entry Array Objects: %"PRIu64
"\n",
3175 le64toh(f
->header
->n_entry_arrays
));
3177 if (fstat(f
->fd
, &st
) >= 0)
3178 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
3181 static int journal_file_warn_btrfs(JournalFile
*f
) {
3187 /* Before we write anything, check if the COW logic is turned
3188 * off on btrfs. Given our write pattern that is quite
3189 * unfriendly to COW file systems this should greatly improve
3190 * performance on COW file systems, such as btrfs, at the
3191 * expense of data integrity features (which shouldn't be too
3192 * bad, given that we do our own checksumming). */
3194 r
= btrfs_is_filesystem(f
->fd
);
3196 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
3200 r
= read_attr_fd(f
->fd
, &attrs
);
3202 return log_warning_errno(r
, "Failed to read file attributes: %m");
3204 if (attrs
& FS_NOCOW_FL
) {
3205 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3209 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3210 "This is likely to slow down journal access substantially, please consider turning "
3211 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
3216 int journal_file_open(
3223 JournalMetrics
*metrics
,
3224 MMapCache
*mmap_cache
,
3225 Set
*deferred_closes
,
3226 JournalFile
*template,
3227 JournalFile
**ret
) {
3229 bool newly_created
= false;
3235 assert(fd
>= 0 || fname
);
3237 if (!IN_SET((flags
& O_ACCMODE
), O_RDONLY
, O_RDWR
))
3241 if (!endswith(fname
, ".journal") &&
3242 !endswith(fname
, ".journal~"))
3246 f
= new0(JournalFile
, 1);
3254 f
->prot
= prot_from_flags(flags
);
3255 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
3257 f
->compress_lz4
= compress
;
3259 f
->compress_xz
= compress
;
3266 f
->mmap
= mmap_cache_ref(mmap_cache
);
3268 f
->mmap
= mmap_cache_new();
3276 f
->path
= strdup(fname
);
3282 /* If we don't know the path, fill in something explanatory and vaguely useful */
3283 if (asprintf(&f
->path
, "/proc/self/%i", fd
) < 0) {
3289 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
3290 if (!f
->chain_cache
) {
3296 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
3302 /* fds we opened here by us should also be closed by us. */
3306 f
->cache_fd
= mmap_cache_add_fd(f
->mmap
, f
->fd
);
3312 r
= journal_file_fstat(f
);
3316 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
3318 (void) journal_file_warn_btrfs(f
);
3320 /* Let's attach the creation time to the journal file,
3321 * so that the vacuuming code knows the age of this
3322 * file even if the file might end up corrupted one
3323 * day... Ideally we'd just use the creation time many
3324 * file systems maintain for each file, but there is
3325 * currently no usable API to query this, hence let's
3326 * emulate this via extended attributes. If extended
3327 * attributes are not supported we'll just skip this,
3328 * and rely solely on mtime/atime/ctime of the file. */
3330 fd_setcrtime(f
->fd
, 0);
3333 /* Try to load the FSPRG state, and if we can't, then
3334 * just don't do sealing */
3336 r
= journal_file_fss_load(f
);
3342 r
= journal_file_init_header(f
, template);
3346 r
= journal_file_fstat(f
);
3350 newly_created
= true;
3353 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
3358 r
= mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
, NULL
);
3364 if (!newly_created
) {
3365 set_clear_with_destructor(deferred_closes
, journal_file_close
);
3367 r
= journal_file_verify_header(f
);
3373 if (!newly_created
&& f
->writable
) {
3374 r
= journal_file_fss_load(f
);
3382 journal_default_metrics(metrics
, f
->fd
);
3383 f
->metrics
= *metrics
;
3384 } else if (template)
3385 f
->metrics
= template->metrics
;
3387 r
= journal_file_refresh_header(f
);
3393 r
= journal_file_hmac_setup(f
);
3398 if (newly_created
) {
3399 r
= journal_file_setup_field_hash_table(f
);
3403 r
= journal_file_setup_data_hash_table(f
);
3408 r
= journal_file_append_first_tag(f
);
3414 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
)) {
3419 if (template && template->post_change_timer
) {
3420 r
= journal_file_enable_post_change_timer(
3422 sd_event_source_get_event(template->post_change_timer
),
3423 template->post_change_timer_period
);
3429 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3436 if (f
->cache_fd
&& mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
3439 (void) journal_file_close(f
);
3444 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
, Set
*deferred_closes
) {
3445 _cleanup_free_
char *p
= NULL
;
3447 JournalFile
*old_file
, *new_file
= NULL
;
3455 if (!old_file
->writable
)
3458 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3459 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3460 if (path_startswith(old_file
->path
, "/proc/self/fd"))
3463 if (!endswith(old_file
->path
, ".journal"))
3466 l
= strlen(old_file
->path
);
3467 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
3468 (int) l
- 8, old_file
->path
,
3469 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
3470 le64toh((*f
)->header
->head_entry_seqnum
),
3471 le64toh((*f
)->header
->head_entry_realtime
));
3475 /* Try to rename the file to the archived version. If the file
3476 * already was deleted, we'll get ENOENT, let's ignore that
3478 r
= rename(old_file
->path
, p
);
3479 if (r
< 0 && errno
!= ENOENT
)
3482 /* Sync the rename to disk */
3483 (void) fsync_directory_of_file(old_file
->fd
);
3485 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3486 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3487 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3488 * would result in the rotated journal never getting fsync() called before closing.
3489 * Now we simply queue the archive state by setting an archive bit, leaving the state
3490 * as STATE_ONLINE so proper offlining occurs. */
3491 old_file
->archive
= true;
3493 /* Currently, btrfs is not very good with out write patterns
3494 * and fragments heavily. Let's defrag our journal files when
3495 * we archive them */
3496 old_file
->defrag_on_close
= true;
3498 r
= journal_file_open(-1, old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, deferred_closes
, old_file
, &new_file
);
3500 if (deferred_closes
&&
3501 set_put(deferred_closes
, old_file
) >= 0)
3502 (void) journal_file_set_offline(old_file
, false);
3504 (void) journal_file_close(old_file
);
/* Like journal_file_open(), but if the file turns out to be corrupted or was
 * shut down uncleanly, renames it out of the way (with a ".journal~" suffix)
 * and retries the open exactly once, creating a fresh file.
 *
 * NOTE(review): this excerpt appears truncated — the leading parameters
 * (presumably fname, flags, mode, compress, seal) and several error-return
 * lines are not visible here; confirm against the full source. */
int journal_file_open_reliably(
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                Set *deferred_closes,
                JournalFile *template,
                JournalFile **ret) {

        _cleanup_free_ char *p = NULL;

        /* First attempt: a plain open. */
        r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
        /* Only these errors indicate a damaged/unusable file worth rotating
         * away; anything else is propagated to the caller unchanged. */
                    -EBADMSG,           /* Corrupted */
                    -ENODATA,           /* Truncated */
                    -EHOSTDOWN,         /* Other machine */
                    -EPROTONOSUPPORT,   /* Incompatible feature */
                    -EBUSY,             /* Unclean shutdown */
                    -ESHUTDOWN,         /* Already archived */
                    -EIO,               /* IO error, including SIGBUS on mmap */
                    -EIDRM,             /* File has been deleted */
                    -ETXTBSY))          /* File is from the future */

        /* Read-only callers may not rename/replace the file. */
        if ((flags & O_ACCMODE) == O_RDONLY)

        /* Without O_CREAT we would not be allowed to create the replacement. */
        if (!(flags & O_CREAT))

        if (!endswith(fname, ".journal"))

        /* The file is corrupted. Rotate it away and try it again (but only once) */

        /* Archived name: "<prefix>@<realtime>-<random>.journal~" (the "~"
         * marks it as a damaged file set aside, not a regular archive). */
        if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
                     now(CLOCK_REALTIME),

        if (rename(fname, p) < 0)

        /* btrfs doesn't cope well with our write pattern and
         * fragments heavily. Let's defrag all files we rotate */

        /* Drop NOCOW on the set-aside copy so btrfs can actually defrag it. */
        (void) chattr_path(p, 0, FS_NOCOW_FL);
        (void) btrfs_defrag(p);

        log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);

        /* Second (and final) attempt, now with a freshly created file. */
        return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
/* Copies a single entry object o (located at offset p in journal file "from")
 * into journal file "to": each referenced data object is re-appended to the
 * target (decompressing if needed), then the entry itself is appended with a
 * freshly computed XOR hash. seqnum/ret/offset are forwarded to the append.
 *
 * NOTE(review): this excerpt appears truncated — local declarations (i, n, r,
 * items, ts, and the per-item locals) and the error-return lines after each
 * call are not visible here; confirm against the full source. */
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
        uint64_t q, xor_hash = 0;

        /* Preserve the source entry's timestamps (stored little-endian on disk). */
        ts.monotonic = le64toh(o->entry.monotonic);
        ts.realtime = le64toh(o->entry.realtime);

        n = journal_file_entry_n_items(o);
        /* alloca() can't take 0, hence let's allocate at least one */
        items = alloca(sizeof(EntryItem) * MAX(1u, n));

        for (i = 0; i < n; i++) {

                /* Offset and recorded hash of the i-th data object the entry references. */
                q = le64toh(o->entry.items[i].object_offset);
                le_hash = o->entry.items[i].hash;

                /* NB: this re-points o at the data object — it no longer
                 * references the entry afterwards. */
                r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);

                /* Cross-check the entry's stored hash against the data object's own. */
                if (le_hash != o->data.hash)

                /* Payload length = object size minus the data-object header. */
                l = le64toh(o->object.size) - offsetof(Object, data.payload);

                /* We hit the limit on 32bit machines */
                if ((uint64_t) t != l)

                if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if HAVE_XZ || HAVE_LZ4
                        /* Decompress into from's scratch buffer, which is reused
                         * (and grown as needed) across items. */
                        r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
                                            o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);

                        data = from->compress_buffer;

                        /* Built without compression support: compressed items cannot be copied. */
                        return -EPROTONOSUPPORT;

                        data = o->data.payload;

                /* Append the (possibly decompressed) payload to the target file;
                 * u/h receive the new data object and its offset there. */
                r = journal_file_append_data(to, data, l, &u, &h);

                /* The entry's XOR hash must be recomputed from the *target*
                 * file's data-object hashes, not copied from the source. */
                xor_hash ^= le64toh(u->data.hash);
                items[i].object_offset = htole64(h);
                items[i].hash = u->data.hash;

                /* Re-map the source entry at p, since o was re-pointed at a
                 * data object earlier in this iteration. */
                r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);

        r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);

        /* Check whether the mmap window took a SIGBUS during the writes
         * (presumably mapped to an I/O error below — confirm in full source). */
        if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3655 void journal_reset_metrics(JournalMetrics
*m
) {
3658 /* Set everything to "pick automatic values". */
3660 *m
= (JournalMetrics
) {
3661 .min_use
= (uint64_t) -1,
3662 .max_use
= (uint64_t) -1,
3663 .min_size
= (uint64_t) -1,
3664 .max_size
= (uint64_t) -1,
3665 .keep_free
= (uint64_t) -1,
3666 .n_max_files
= (uint64_t) -1,
3670 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
3671 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
3678 if (fstatvfs(fd
, &ss
) >= 0)
3679 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
3681 log_debug_errno(errno
, "Failed to detremine disk size: %m");
3685 if (m
->max_use
== (uint64_t) -1) {
3688 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3690 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3691 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3693 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3694 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3696 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3698 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3700 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3701 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3704 if (m
->min_use
== (uint64_t) -1)
3705 m
->min_use
= DEFAULT_MIN_USE
;
3707 if (m
->min_use
> m
->max_use
)
3708 m
->min_use
= m
->max_use
;
3710 if (m
->max_size
== (uint64_t) -1) {
3711 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3713 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3714 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3716 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3718 if (m
->max_size
!= 0) {
3719 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3720 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3722 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3723 m
->max_use
= m
->max_size
*2;
3726 if (m
->min_size
== (uint64_t) -1)
3727 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3729 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3731 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3732 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3734 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3735 m
->max_size
= m
->min_size
;
3738 if (m
->keep_free
== (uint64_t) -1) {
3741 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3743 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3744 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3747 m
->keep_free
= DEFAULT_KEEP_FREE
;
3750 if (m
->n_max_files
== (uint64_t) -1)
3751 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3753 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3754 format_bytes(a
, sizeof(a
), m
->min_use
),
3755 format_bytes(b
, sizeof(b
), m
->max_use
),
3756 format_bytes(c
, sizeof(c
), m
->max_size
),
3757 format_bytes(d
, sizeof(d
), m
->min_size
),
3758 format_bytes(e
, sizeof(e
), m
->keep_free
),
/* Reports the realtime (wall-clock) timestamp range covered by the entries in
 * journal file f, read straight from the mapped file header: *from gets the
 * head entry's timestamp, *to the tail entry's.
 *
 * NOTE(review): this excerpt appears truncated — the NULL-pointer guards
 * around each output and the return statements are not visible here. */
int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {

        /* A zero head timestamp presumably means the file holds no entries
         * yet, so there is no cutoff to report — confirm in full source. */
        if (f->header->head_entry_realtime == 0)

        /* Header fields are little-endian on disk; convert to host order. */
        *from = le64toh(f->header->head_entry_realtime);

        if (f->header->tail_entry_realtime == 0)

        *to = le64toh(f->header->tail_entry_realtime);
/* Determines the monotonic timestamp range (*from, *to) of all entries in f
 * belonging to the boot identified by boot_id, by walking from the matching
 * _BOOT_ID= data object to its first and last linked entries.
 *
 * NOTE(review): this excerpt appears truncated — declarations of r/o/p, the
 * error checks after each call, the trailing arguments of the
 * generic_array_get_plus_one() call and the return statements are not
 * visible here. */
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {

        /* Look up the data object for this boot ID; o and p receive the
         * mapped object and its file offset. */
        r = find_data_object_by_boot_id(f, boot_id, &o, &p);

        /* No entries reference this boot at all. */
        if (le64toh(o->data.n_entries) <= 0)

        /* The first entry linked from the data object carries the range start. */
        r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);

        *from = le64toh(o->entry.monotonic);

        /* Re-map the data object at p — o was re-pointed at an entry above. */
        r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);

        /* Fetch entry number n_entries-1, i.e. the last entry of this boot. */
        r = generic_array_get_plus_one(f,
                                       le64toh(o->data.entry_offset),
                                       le64toh(o->data.entry_array_offset),
                                       le64toh(o->data.n_entries)-1,

        *to = le64toh(o->entry.monotonic);
3826 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3830 /* If we gained new header fields we gained new features,
3831 * hence suggest a rotation */
3832 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3833 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3837 /* Let's check if the hash tables grew over a certain fill
3838 * level (75%, borrowing this value from Java's hash table
3839 * implementation), and if so suggest a rotation. To calculate
3840 * the fill level we need the n_data field, which only exists
3841 * in newer versions. */
3843 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3844 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3845 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3847 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3848 le64toh(f
->header
->n_data
),
3849 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3850 (unsigned long long) f
->last_stat
.st_size
,
3851 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3855 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3856 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3857 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3859 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3860 le64toh(f
->header
->n_fields
),
3861 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3865 /* Are the data objects properly indexed by field objects? */
3866 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3867 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3868 le64toh(f
->header
->n_data
) > 0 &&
3869 le64toh(f
->header
->n_fields
) == 0)
3872 if (max_file_usec
> 0) {
3875 h
= le64toh(f
->header
->head_entry_realtime
);
3876 t
= now(CLOCK_REALTIME
);
3878 if (h
> 0 && t
> h
+ max_file_usec
)