/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2011 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stddef.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <unistd.h>

#include "alloc-util.h"
#include "btrfs-util.h"
#include "chattr-util.h"
#include "journal-authenticate.h"
#include "journal-def.h"
#include "journal-file.h"
#include "parse-util.h"
#include "path-util.h"
#include "random-util.h"
#include "string-util.h"
#include "xattr-util.h"
/* Default on-disk hash table sizes (in bytes) for the data and field hash tables. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Payloads smaller than this are never worth compressing. */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL)                 /* 512 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header we pick as one above the last defined typed */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX

/* The journal header is a packed struct; taking addresses of its members is deliberate here. */
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
98 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
99 * As a result we use atomic operations on f->offline_state for inter-thread communications with
100 * journal_file_set_offline() and journal_file_set_online(). */
101 static void journal_file_set_offline_internal(JournalFile
*f
) {
107 switch (f
->offline_state
) {
109 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_DONE
))
113 case OFFLINE_AGAIN_FROM_SYNCING
:
114 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_SYNCING
))
118 case OFFLINE_AGAIN_FROM_OFFLINING
:
119 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_SYNCING
))
123 case OFFLINE_SYNCING
:
126 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_OFFLINING
))
129 f
->header
->state
= f
->archive
? STATE_ARCHIVED
: STATE_OFFLINE
;
133 case OFFLINE_OFFLINING
:
134 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_DONE
))
141 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
147 static void * journal_file_set_offline_thread(void *arg
) {
148 JournalFile
*f
= arg
;
150 journal_file_set_offline_internal(f
);
155 static int journal_file_set_offline_thread_join(JournalFile
*f
) {
160 if (f
->offline_state
== OFFLINE_JOINED
)
163 r
= pthread_join(f
->offline_thread
, NULL
);
167 f
->offline_state
= OFFLINE_JOINED
;
169 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
175 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
176 static bool journal_file_set_offline_try_restart(JournalFile
*f
) {
178 switch (f
->offline_state
) {
179 case OFFLINE_AGAIN_FROM_SYNCING
:
180 case OFFLINE_AGAIN_FROM_OFFLINING
:
184 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_AGAIN_FROM_SYNCING
))
188 case OFFLINE_SYNCING
:
189 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_AGAIN_FROM_SYNCING
))
193 case OFFLINE_OFFLINING
:
194 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_AGAIN_FROM_OFFLINING
))
204 /* Sets a journal offline.
206 * If wait is false then an offline is dispatched in a separate thread for a
207 * subsequent journal_file_set_offline() or journal_file_set_online() of the
208 * same journal to synchronize with.
210 * If wait is true, then either an existing offline thread will be restarted
211 * and joined, or if none exists the offline is simply performed in this
212 * context without involving another thread.
214 int journal_file_set_offline(JournalFile
*f
, bool wait
) {
223 if (!(f
->fd
>= 0 && f
->header
))
226 /* An offlining journal is implicitly online and may modify f->header->state,
227 * we must also join any potentially lingering offline thread when not online. */
228 if (!journal_file_is_offlining(f
) && f
->header
->state
!= STATE_ONLINE
)
229 return journal_file_set_offline_thread_join(f
);
231 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
232 restarted
= journal_file_set_offline_try_restart(f
);
233 if ((restarted
&& wait
) || !restarted
) {
234 r
= journal_file_set_offline_thread_join(f
);
242 /* Initiate a new offline. */
243 f
->offline_state
= OFFLINE_SYNCING
;
245 if (wait
) /* Without using a thread if waiting. */
246 journal_file_set_offline_internal(f
);
248 r
= pthread_create(&f
->offline_thread
, NULL
, journal_file_set_offline_thread
, f
);
250 f
->offline_state
= OFFLINE_JOINED
;
258 static int journal_file_set_online(JournalFile
*f
) {
266 if (!(f
->fd
>= 0 && f
->header
))
270 switch (f
->offline_state
) {
272 /* No offline thread, no need to wait. */
276 case OFFLINE_SYNCING
:
277 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_CANCEL
))
279 /* Canceled syncing prior to offlining, no need to wait. */
282 case OFFLINE_AGAIN_FROM_SYNCING
:
283 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_CANCEL
))
285 /* Canceled restart from syncing, no need to wait. */
288 case OFFLINE_AGAIN_FROM_OFFLINING
:
289 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_CANCEL
))
291 /* Canceled restart from offlining, must wait for offlining to complete however. */
296 r
= journal_file_set_offline_thread_join(f
);
306 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
309 switch (f
->header
->state
) {
314 f
->header
->state
= STATE_ONLINE
;
323 bool journal_file_is_offlining(JournalFile
*f
) {
326 __sync_synchronize();
328 if (IN_SET(f
->offline_state
, OFFLINE_DONE
, OFFLINE_JOINED
))
334 JournalFile
* journal_file_close(JournalFile
*f
) {
338 /* Write the final tag */
339 if (f
->seal
&& f
->writable
) {
342 r
= journal_file_append_tag(f
);
344 log_error_errno(r
, "Failed to append tag when closing journal: %m");
348 if (f
->post_change_timer
) {
351 if (sd_event_source_get_enabled(f
->post_change_timer
, &enabled
) >= 0)
352 if (enabled
== SD_EVENT_ONESHOT
)
353 journal_file_post_change(f
);
355 (void) sd_event_source_set_enabled(f
->post_change_timer
, SD_EVENT_OFF
);
356 sd_event_source_unref(f
->post_change_timer
);
359 journal_file_set_offline(f
, true);
361 if (f
->mmap
&& f
->cache_fd
)
362 mmap_cache_free_fd(f
->mmap
, f
->cache_fd
);
364 if (f
->fd
>= 0 && f
->defrag_on_close
) {
366 /* Be friendly to btrfs: turn COW back on again now,
367 * and defragment the file. We won't write to the file
368 * ever again, hence remove all fragmentation, and
369 * reenable all the good bits COW usually provides
370 * (such as data checksumming). */
372 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
373 (void) btrfs_defrag_fd(f
->fd
);
380 mmap_cache_unref(f
->mmap
);
382 ordered_hashmap_free_free(f
->chain_cache
);
384 #if HAVE_XZ || HAVE_LZ4
385 free(f
->compress_buffer
);
390 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
392 free(f
->fsprg_state
);
397 gcry_md_close(f
->hmac
);
403 void journal_file_close_set(Set
*s
) {
408 while ((f
= set_steal_first(s
)))
409 (void) journal_file_close(f
);
412 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
419 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
420 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
422 h
.incompatible_flags
|= htole32(
423 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
424 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
426 h
.compatible_flags
= htole32(
427 f
->seal
* HEADER_COMPATIBLE_SEALED
);
429 r
= sd_id128_randomize(&h
.file_id
);
434 h
.seqnum_id
= template->header
->seqnum_id
;
435 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
437 h
.seqnum_id
= h
.file_id
;
439 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
449 static int fsync_directory_of_file(int fd
) {
450 _cleanup_free_
char *path
= NULL
, *dn
= NULL
;
451 _cleanup_close_
int dfd
= -1;
455 if (fstat(fd
, &st
) < 0)
458 if (!S_ISREG(st
.st_mode
))
461 r
= fd_get_path(fd
, &path
);
465 if (!path_is_absolute(path
))
468 dn
= dirname_malloc(path
);
472 dfd
= open(dn
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
482 static int journal_file_refresh_header(JournalFile
*f
) {
489 r
= sd_id128_get_machine(&f
->header
->machine_id
);
493 r
= sd_id128_get_boot(&boot_id
);
497 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
498 f
->tail_entry_monotonic_valid
= true;
500 f
->header
->boot_id
= boot_id
;
502 r
= journal_file_set_online(f
);
504 /* Sync the online state to disk */
507 /* We likely just created a new file, also sync the directory this file is located in. */
508 (void) fsync_directory_of_file(f
->fd
);
513 static bool warn_wrong_flags(const JournalFile
*f
, bool compatible
) {
514 const uint32_t any
= compatible
? HEADER_COMPATIBLE_ANY
: HEADER_INCOMPATIBLE_ANY
,
515 supported
= compatible
? HEADER_COMPATIBLE_SUPPORTED
: HEADER_INCOMPATIBLE_SUPPORTED
;
516 const char *type
= compatible
? "compatible" : "incompatible";
519 flags
= le32toh(compatible
? f
->header
->compatible_flags
: f
->header
->incompatible_flags
);
521 if (flags
& ~supported
) {
523 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32
,
524 f
->path
, type
, flags
& ~any
);
525 flags
= (flags
& any
) & ~supported
;
529 _cleanup_free_
char *t
= NULL
;
531 if (compatible
&& (flags
& HEADER_COMPATIBLE_SEALED
))
532 strv
[n
++] = "sealed";
533 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_XZ
))
534 strv
[n
++] = "xz-compressed";
535 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_LZ4
))
536 strv
[n
++] = "lz4-compressed";
538 assert(n
< ELEMENTSOF(strv
));
540 t
= strv_join((char**) strv
, ", ");
541 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
542 f
->path
, type
, n
> 1 ? "flags" : "flag", strnull(t
));
550 static int journal_file_verify_header(JournalFile
*f
) {
551 uint64_t arena_size
, header_size
;
556 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
559 /* In both read and write mode we refuse to open files with incompatible
560 * flags we don't know. */
561 if (warn_wrong_flags(f
, false))
562 return -EPROTONOSUPPORT
;
564 /* When open for writing we refuse to open files with compatible flags, too. */
565 if (f
->writable
&& warn_wrong_flags(f
, true))
566 return -EPROTONOSUPPORT
;
568 if (f
->header
->state
>= _STATE_MAX
)
571 header_size
= le64toh(f
->header
->header_size
);
573 /* The first addition was n_data, so check that we are at least this large */
574 if (header_size
< HEADER_SIZE_MIN
)
577 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
580 arena_size
= le64toh(f
->header
->arena_size
);
582 if (UINT64_MAX
- header_size
< arena_size
|| header_size
+ arena_size
> (uint64_t) f
->last_stat
.st_size
)
585 if (le64toh(f
->header
->tail_object_offset
) > header_size
+ arena_size
)
588 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
589 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
590 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
591 !VALID64(le64toh(f
->header
->entry_array_offset
)))
595 sd_id128_t machine_id
;
599 r
= sd_id128_get_machine(&machine_id
);
603 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
606 state
= f
->header
->state
;
608 if (state
== STATE_ARCHIVED
)
609 return -ESHUTDOWN
; /* Already archived */
610 else if (state
== STATE_ONLINE
) {
611 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
613 } else if (state
!= STATE_OFFLINE
) {
614 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
618 if (f
->header
->field_hash_table_size
== 0 || f
->header
->data_hash_table_size
== 0)
621 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
622 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
624 if (le64toh(f
->header
->tail_entry_realtime
) > now(CLOCK_REALTIME
)) {
625 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f
->path
);
630 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
631 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
633 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
638 static int journal_file_fstat(JournalFile
*f
) {
642 if (fstat(f
->fd
, &f
->last_stat
) < 0)
645 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
647 /* Refuse appending to files that are already deleted */
648 if (f
->last_stat
.st_nlink
<= 0)
654 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
655 uint64_t old_size
, new_size
;
661 /* We assume that this file is not sparse, and we know that
662 * for sure, since we always call posix_fallocate()
665 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
669 le64toh(f
->header
->header_size
) +
670 le64toh(f
->header
->arena_size
);
672 new_size
= PAGE_ALIGN(offset
+ size
);
673 if (new_size
< le64toh(f
->header
->header_size
))
674 new_size
= le64toh(f
->header
->header_size
);
676 if (new_size
<= old_size
) {
678 /* We already pre-allocated enough space, but before
679 * we write to it, let's check with fstat() if the
680 * file got deleted, in order make sure we don't throw
681 * away the data immediately. Don't check fstat() for
682 * all writes though, but only once ever 10s. */
684 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
687 return journal_file_fstat(f
);
690 /* Allocate more space. */
692 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
695 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
698 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
701 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
703 if (new_size
- old_size
> available
)
708 /* Increase by larger blocks at once */
709 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
710 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
711 new_size
= f
->metrics
.max_size
;
713 /* Note that the glibc fallocate() fallback is very
714 inefficient, hence we try to minimize the allocation area
716 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
720 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
722 return journal_file_fstat(f
);
725 static unsigned type_to_context(ObjectType type
) {
726 /* One context for each type, plus one catch-all for the rest */
727 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
728 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
729 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
732 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
, size_t *ret_size
) {
741 /* Avoid SIGBUS on invalid accesses */
742 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
743 /* Hmm, out of range? Let's refresh the fstat() data
744 * first, before we trust that check. */
746 r
= journal_file_fstat(f
);
750 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
751 return -EADDRNOTAVAIL
;
754 return mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
, ret_size
);
757 static uint64_t minimum_header_size(Object
*o
) {
759 static const uint64_t table
[] = {
760 [OBJECT_DATA
] = sizeof(DataObject
),
761 [OBJECT_FIELD
] = sizeof(FieldObject
),
762 [OBJECT_ENTRY
] = sizeof(EntryObject
),
763 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
764 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
765 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
766 [OBJECT_TAG
] = sizeof(TagObject
),
769 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
770 return sizeof(ObjectHeader
);
772 return table
[o
->object
.type
];
775 /* Lightweight object checks. We want this to be fast, so that we won't
776 * slowdown every journal_file_move_to_object() call too much. */
777 static int journal_file_check_object(JournalFile
*f
, uint64_t offset
, Object
*o
) {
781 switch (o
->object
.type
) {
784 if ((le64toh(o
->data
.entry_offset
) == 0) ^ (le64toh(o
->data
.n_entries
) == 0)) {
785 log_debug("Bad n_entries: %"PRIu64
": %"PRIu64
,
786 le64toh(o
->data
.n_entries
), offset
);
790 if (le64toh(o
->object
.size
) - offsetof(DataObject
, payload
) <= 0) {
791 log_debug("Bad object size (<= %zu): %"PRIu64
": %"PRIu64
,
792 offsetof(DataObject
, payload
),
793 le64toh(o
->object
.size
),
798 if (!VALID64(le64toh(o
->data
.next_hash_offset
)) ||
799 !VALID64(le64toh(o
->data
.next_field_offset
)) ||
800 !VALID64(le64toh(o
->data
.entry_offset
)) ||
801 !VALID64(le64toh(o
->data
.entry_array_offset
))) {
802 log_debug("Invalid offset, next_hash_offset="OFSfmt
", next_field_offset="OFSfmt
803 ", entry_offset="OFSfmt
", entry_array_offset="OFSfmt
": %"PRIu64
,
804 le64toh(o
->data
.next_hash_offset
),
805 le64toh(o
->data
.next_field_offset
),
806 le64toh(o
->data
.entry_offset
),
807 le64toh(o
->data
.entry_array_offset
),
816 if (le64toh(o
->object
.size
) - offsetof(FieldObject
, payload
) <= 0) {
818 "Bad field size (<= %zu): %"PRIu64
": %"PRIu64
,
819 offsetof(FieldObject
, payload
),
820 le64toh(o
->object
.size
),
825 if (!VALID64(le64toh(o
->field
.next_hash_offset
)) ||
826 !VALID64(le64toh(o
->field
.head_data_offset
))) {
828 "Invalid offset, next_hash_offset="OFSfmt
829 ", head_data_offset="OFSfmt
": %"PRIu64
,
830 le64toh(o
->field
.next_hash_offset
),
831 le64toh(o
->field
.head_data_offset
),
838 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) % sizeof(EntryItem
) != 0) {
840 "Bad entry size (<= %zu): %"PRIu64
": %"PRIu64
,
841 offsetof(EntryObject
, items
),
842 le64toh(o
->object
.size
),
847 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
) <= 0) {
849 "Invalid number items in entry: %"PRIu64
": %"PRIu64
,
850 (le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
),
855 if (le64toh(o
->entry
.seqnum
) <= 0) {
857 "Invalid entry seqnum: %"PRIx64
": %"PRIu64
,
858 le64toh(o
->entry
.seqnum
),
863 if (!VALID_REALTIME(le64toh(o
->entry
.realtime
))) {
865 "Invalid entry realtime timestamp: %"PRIu64
": %"PRIu64
,
866 le64toh(o
->entry
.realtime
),
871 if (!VALID_MONOTONIC(le64toh(o
->entry
.monotonic
))) {
873 "Invalid entry monotonic timestamp: %"PRIu64
": %"PRIu64
,
874 le64toh(o
->entry
.monotonic
),
881 case OBJECT_DATA_HASH_TABLE
:
882 case OBJECT_FIELD_HASH_TABLE
:
883 if ((le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) % sizeof(HashItem
) != 0 ||
884 (le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) / sizeof(HashItem
) <= 0) {
886 "Invalid %s hash table size: %"PRIu64
": %"PRIu64
,
887 o
->object
.type
== OBJECT_DATA_HASH_TABLE
? "data" : "field",
888 le64toh(o
->object
.size
),
895 case OBJECT_ENTRY_ARRAY
:
896 if ((le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) % sizeof(le64_t
) != 0 ||
897 (le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) / sizeof(le64_t
) <= 0) {
899 "Invalid object entry array size: %"PRIu64
": %"PRIu64
,
900 le64toh(o
->object
.size
),
905 if (!VALID64(le64toh(o
->entry_array
.next_entry_array_offset
))) {
907 "Invalid object entry array next_entry_array_offset: "OFSfmt
": %"PRIu64
,
908 le64toh(o
->entry_array
.next_entry_array_offset
),
916 if (le64toh(o
->object
.size
) != sizeof(TagObject
)) {
918 "Invalid object tag size: %"PRIu64
": %"PRIu64
,
919 le64toh(o
->object
.size
),
924 if (!VALID_EPOCH(le64toh(o
->tag
.epoch
))) {
926 "Invalid object tag epoch: %"PRIu64
": %"PRIu64
,
927 le64toh(o
->tag
.epoch
),
938 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
948 /* Objects may only be located at multiple of 64 bit */
949 if (!VALID64(offset
)) {
950 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64
, offset
);
954 /* Object may not be located in the file header */
955 if (offset
< le64toh(f
->header
->header_size
)) {
956 log_debug("Attempt to move to object located in file header: %" PRIu64
, offset
);
960 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
, &tsize
);
965 s
= le64toh(o
->object
.size
);
968 log_debug("Attempt to move to uninitialized object: %" PRIu64
, offset
);
971 if (s
< sizeof(ObjectHeader
)) {
972 log_debug("Attempt to move to overly short object: %" PRIu64
, offset
);
976 if (o
->object
.type
<= OBJECT_UNUSED
) {
977 log_debug("Attempt to move to object with invalid type: %" PRIu64
, offset
);
981 if (s
< minimum_header_size(o
)) {
982 log_debug("Attempt to move to truncated object: %" PRIu64
, offset
);
986 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
) {
987 log_debug("Attempt to move to object of unexpected type: %" PRIu64
, offset
);
992 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
, NULL
);
999 r
= journal_file_check_object(f
, offset
, o
);
1007 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
1013 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
1016 /* If an external seqnum counter was passed, we update
1017 * both the local and the external one, and set it to
1018 * the maximum of both */
1020 if (*seqnum
+ 1 > r
)
1026 f
->header
->tail_entry_seqnum
= htole64(r
);
1028 if (f
->header
->head_entry_seqnum
== 0)
1029 f
->header
->head_entry_seqnum
= htole64(r
);
1034 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
1042 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
1043 assert(size
>= sizeof(ObjectHeader
));
1047 r
= journal_file_set_online(f
);
1051 p
= le64toh(f
->header
->tail_object_offset
);
1053 p
= le64toh(f
->header
->header_size
);
1055 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
1059 p
+= ALIGN64(le64toh(tail
->object
.size
));
1062 r
= journal_file_allocate(f
, p
, size
);
1066 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
, NULL
);
1073 o
->object
.type
= type
;
1074 o
->object
.size
= htole64(size
);
1076 f
->header
->tail_object_offset
= htole64(p
);
1077 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
1085 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
1093 /* We estimate that we need 1 hash table entry per 768 bytes
1094 of journal file and we want to make sure we never get
1095 beyond 75% fill level. Calculate the hash table size for
1096 the maximum file size based on these metrics. */
1098 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
1099 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
1100 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
1102 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
1104 r
= journal_file_append_object(f
,
1105 OBJECT_DATA_HASH_TABLE
,
1106 offsetof(Object
, hash_table
.items
) + s
,
1111 memzero(o
->hash_table
.items
, s
);
1113 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1114 f
->header
->data_hash_table_size
= htole64(s
);
1119 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
1127 /* We use a fixed size hash table for the fields as this
1128 * number should grow very slowly only */
1130 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
1131 r
= journal_file_append_object(f
,
1132 OBJECT_FIELD_HASH_TABLE
,
1133 offsetof(Object
, hash_table
.items
) + s
,
1138 memzero(o
->hash_table
.items
, s
);
1140 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1141 f
->header
->field_hash_table_size
= htole64(s
);
1146 int journal_file_map_data_hash_table(JournalFile
*f
) {
1154 if (f
->data_hash_table
)
1157 p
= le64toh(f
->header
->data_hash_table_offset
);
1158 s
= le64toh(f
->header
->data_hash_table_size
);
1160 r
= journal_file_move_to(f
,
1161 OBJECT_DATA_HASH_TABLE
,
1168 f
->data_hash_table
= t
;
1172 int journal_file_map_field_hash_table(JournalFile
*f
) {
1180 if (f
->field_hash_table
)
1183 p
= le64toh(f
->header
->field_hash_table_offset
);
1184 s
= le64toh(f
->header
->field_hash_table_size
);
1186 r
= journal_file_move_to(f
,
1187 OBJECT_FIELD_HASH_TABLE
,
1194 f
->field_hash_table
= t
;
1198 static int journal_file_link_field(
1209 assert(f
->field_hash_table
);
1213 if (o
->object
.type
!= OBJECT_FIELD
)
1216 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1220 /* This might alter the window we are looking at */
1221 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
1224 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
1226 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
1228 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1232 o
->field
.next_hash_offset
= htole64(offset
);
1235 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1237 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
1238 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
1243 static int journal_file_link_data(
1254 assert(f
->data_hash_table
);
1258 if (o
->object
.type
!= OBJECT_DATA
)
1261 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1265 /* This might alter the window we are looking at */
1266 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
1267 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
1268 o
->data
.n_entries
= 0;
1271 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
1273 /* Only entry in the hash table is easy */
1274 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
1276 /* Move back to the previous data object, to patch in
1279 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1283 o
->data
.next_hash_offset
= htole64(offset
);
1286 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1288 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
1289 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
1294 int journal_file_find_field_object_with_hash(
1296 const void *field
, uint64_t size
, uint64_t hash
,
1297 Object
**ret
, uint64_t *offset
) {
1299 uint64_t p
, osize
, h
, m
;
1304 assert(field
&& size
> 0);
1306 /* If the field hash table is empty, we can't find anything */
1307 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
1310 /* Map the field hash table, if it isn't mapped yet. */
1311 r
= journal_file_map_field_hash_table(f
);
1315 osize
= offsetof(Object
, field
.payload
) + size
;
1317 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1322 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
1327 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1331 if (le64toh(o
->field
.hash
) == hash
&&
1332 le64toh(o
->object
.size
) == osize
&&
1333 memcmp(o
->field
.payload
, field
, size
) == 0) {
1343 p
= le64toh(o
->field
.next_hash_offset
);
1349 int journal_file_find_field_object(
1351 const void *field
, uint64_t size
,
1352 Object
**ret
, uint64_t *offset
) {
1357 assert(field
&& size
> 0);
1359 hash
= hash64(field
, size
);
1361 return journal_file_find_field_object_with_hash(f
,
1366 int journal_file_find_data_object_with_hash(
1368 const void *data
, uint64_t size
, uint64_t hash
,
1369 Object
**ret
, uint64_t *offset
) {
1371 uint64_t p
, osize
, h
, m
;
1376 assert(data
|| size
== 0);
1378 /* If there's no data hash table, then there's no entry. */
1379 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
1382 /* Map the data hash table, if it isn't mapped yet. */
1383 r
= journal_file_map_data_hash_table(f
);
1387 osize
= offsetof(Object
, data
.payload
) + size
;
1389 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1394 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
1399 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1403 if (le64toh(o
->data
.hash
) != hash
)
1406 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
1407 #if HAVE_XZ || HAVE_LZ4
1411 l
= le64toh(o
->object
.size
);
1412 if (l
<= offsetof(Object
, data
.payload
))
1415 l
-= offsetof(Object
, data
.payload
);
1417 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
1418 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
1422 if (rsize
== size
&&
1423 memcmp(f
->compress_buffer
, data
, size
) == 0) {
1434 return -EPROTONOSUPPORT
;
1436 } else if (le64toh(o
->object
.size
) == osize
&&
1437 memcmp(o
->data
.payload
, data
, size
) == 0) {
1449 p
= le64toh(o
->data
.next_hash_offset
);
1455 int journal_file_find_data_object(
1457 const void *data
, uint64_t size
,
1458 Object
**ret
, uint64_t *offset
) {
1463 assert(data
|| size
== 0);
1465 hash
= hash64(data
, size
);
1467 return journal_file_find_data_object_with_hash(f
,
1472 static int journal_file_append_field(
1474 const void *field
, uint64_t size
,
1475 Object
**ret
, uint64_t *offset
) {
1483 assert(field
&& size
> 0);
1485 hash
= hash64(field
, size
);
1487 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1501 osize
= offsetof(Object
, field
.payload
) + size
;
1502 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1506 o
->field
.hash
= htole64(hash
);
1507 memcpy(o
->field
.payload
, field
, size
);
1509 r
= journal_file_link_field(f
, o
, p
, hash
);
1513 /* The linking might have altered the window, so let's
1514 * refresh our pointer */
1515 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1520 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1534 static int journal_file_append_data(
1536 const void *data
, uint64_t size
,
1537 Object
**ret
, uint64_t *offset
) {
1542 int r
, compression
= 0;
1546 assert(data
|| size
== 0);
1548 hash
= hash64(data
, size
);
1550 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1564 osize
= offsetof(Object
, data
.payload
) + size
;
1565 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1569 o
->data
.hash
= htole64(hash
);
1571 #if HAVE_XZ || HAVE_LZ4
1572 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1575 compression
= compress_blob(data
, size
, o
->data
.payload
, size
- 1, &rsize
);
1577 if (compression
>= 0) {
1578 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1579 o
->object
.flags
|= compression
;
1581 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1582 size
, rsize
, object_compressed_to_string(compression
));
1584 /* Compression didn't work, we don't really care why, let's continue without compression */
1589 if (compression
== 0)
1590 memcpy_safe(o
->data
.payload
, data
, size
);
1592 r
= journal_file_link_data(f
, o
, p
, hash
);
1597 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1602 /* The linking might have altered the window, so let's
1603 * refresh our pointer */
1604 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1611 eq
= memchr(data
, '=', size
);
1612 if (eq
&& eq
> data
) {
1616 /* Create field object ... */
1617 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1621 /* ... and link it in. */
1622 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1623 fo
->field
.head_data_offset
= le64toh(p
);
1635 uint64_t journal_file_entry_n_items(Object
*o
) {
1638 if (o
->object
.type
!= OBJECT_ENTRY
)
1641 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1644 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1647 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1650 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1653 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1656 if (!IN_SET(o
->object
.type
, OBJECT_DATA_HASH_TABLE
, OBJECT_FIELD_HASH_TABLE
))
1659 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1662 static int link_entry_into_array(JournalFile
*f
,
1667 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1676 a
= le64toh(*first
);
1677 i
= hidx
= le64toh(*idx
);
1680 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1684 n
= journal_file_entry_array_n_items(o
);
1686 o
->entry_array
.items
[i
] = htole64(p
);
1687 *idx
= htole64(hidx
+ 1);
1693 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1704 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1705 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1711 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1716 o
->entry_array
.items
[i
] = htole64(p
);
1719 *first
= htole64(q
);
1721 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1725 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1728 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1729 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1731 *idx
= htole64(hidx
+ 1);
1736 static int link_entry_into_array_plus_one(JournalFile
*f
,
1751 *extra
= htole64(p
);
1755 i
= htole64(le64toh(*idx
) - 1);
1756 r
= link_entry_into_array(f
, first
, &i
, p
);
1761 *idx
= htole64(le64toh(*idx
) + 1);
1765 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1772 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1776 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1780 return link_entry_into_array_plus_one(f
,
1781 &o
->data
.entry_offset
,
1782 &o
->data
.entry_array_offset
,
1787 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1796 if (o
->object
.type
!= OBJECT_ENTRY
)
1799 __sync_synchronize();
1801 /* Link up the entry itself */
1802 r
= link_entry_into_array(f
,
1803 &f
->header
->entry_array_offset
,
1804 &f
->header
->n_entries
,
1809 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1811 if (f
->header
->head_entry_realtime
== 0)
1812 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1814 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1815 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1817 f
->tail_entry_monotonic_valid
= true;
1819 /* Link up the items */
1820 n
= journal_file_entry_n_items(o
);
1821 for (i
= 0; i
< n
; i
++) {
1822 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1830 static int journal_file_append_entry_internal(
1832 const dual_timestamp
*ts
,
1834 const EntryItem items
[], unsigned n_items
,
1836 Object
**ret
, uint64_t *offset
) {
1844 assert(items
|| n_items
== 0);
1847 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1849 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1853 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1854 memcpy_safe(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1855 o
->entry
.realtime
= htole64(ts
->realtime
);
1856 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1857 o
->entry
.xor_hash
= htole64(xor_hash
);
1858 o
->entry
.boot_id
= f
->header
->boot_id
;
1861 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1866 r
= journal_file_link_entry(f
, o
, np
);
1879 void journal_file_post_change(JournalFile
*f
) {
1882 /* inotify() does not receive IN_MODIFY events from file
1883 * accesses done via mmap(). After each access we hence
1884 * trigger IN_MODIFY by truncating the journal file to its
1885 * current size which triggers IN_MODIFY. */
1887 __sync_synchronize();
1889 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1890 log_debug_errno(errno
, "Failed to truncate file to its own size: %m");
1893 static int post_change_thunk(sd_event_source
*timer
, uint64_t usec
, void *userdata
) {
1896 journal_file_post_change(userdata
);
1901 static void schedule_post_change(JournalFile
*f
) {
1902 sd_event_source
*timer
;
1907 assert(f
->post_change_timer
);
1909 timer
= f
->post_change_timer
;
1911 r
= sd_event_source_get_enabled(timer
, &enabled
);
1913 log_debug_errno(r
, "Failed to get ftruncate timer state: %m");
1917 if (enabled
== SD_EVENT_ONESHOT
)
1920 r
= sd_event_now(sd_event_source_get_event(timer
), CLOCK_MONOTONIC
, &now
);
1922 log_debug_errno(r
, "Failed to get clock's now for scheduling ftruncate: %m");
1926 r
= sd_event_source_set_time(timer
, now
+f
->post_change_timer_period
);
1928 log_debug_errno(r
, "Failed to set time for scheduling ftruncate: %m");
1932 r
= sd_event_source_set_enabled(timer
, SD_EVENT_ONESHOT
);
1934 log_debug_errno(r
, "Failed to enable scheduled ftruncate: %m");
1941 /* On failure, let's simply post the change immediately. */
1942 journal_file_post_change(f
);
1945 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1946 int journal_file_enable_post_change_timer(JournalFile
*f
, sd_event
*e
, usec_t t
) {
1947 _cleanup_(sd_event_source_unrefp
) sd_event_source
*timer
= NULL
;
1951 assert_return(!f
->post_change_timer
, -EINVAL
);
1955 r
= sd_event_add_time(e
, &timer
, CLOCK_MONOTONIC
, 0, 0, post_change_thunk
, f
);
1959 r
= sd_event_source_set_enabled(timer
, SD_EVENT_OFF
);
1963 f
->post_change_timer
= timer
;
1965 f
->post_change_timer_period
= t
;
1970 static int entry_item_cmp(const void *_a
, const void *_b
) {
1971 const EntryItem
*a
= _a
, *b
= _b
;
1973 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1975 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1980 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1984 uint64_t xor_hash
= 0;
1985 struct dual_timestamp _ts
;
1989 assert(iovec
|| n_iovec
== 0);
1992 dual_timestamp_get(&_ts
);
1997 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
2002 /* alloca() can't take 0, hence let's allocate at least one */
2003 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
2005 for (i
= 0; i
< n_iovec
; i
++) {
2009 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
2013 xor_hash
^= le64toh(o
->data
.hash
);
2014 items
[i
].object_offset
= htole64(p
);
2015 items
[i
].hash
= o
->data
.hash
;
2018 /* Order by the position on disk, in order to improve seek
2019 * times for rotating media. */
2020 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
2022 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
2024 /* If the memory mapping triggered a SIGBUS then we return an
2025 * IO error and ignore the error code passed down to us, since
2026 * it is very likely just an effect of a nullified replacement
2029 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
2032 if (f
->post_change_timer
)
2033 schedule_post_change(f
);
2035 journal_file_post_change(f
);
2040 typedef struct ChainCacheItem
{
2041 uint64_t first
; /* the array at the beginning of the chain */
2042 uint64_t array
; /* the cached array */
2043 uint64_t begin
; /* the first item in the cached array */
2044 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
2045 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
2048 static void chain_cache_put(
2055 uint64_t last_index
) {
2058 /* If the chain item to cache for this chain is the
2059 * first one it's not worth caching anything */
2063 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
2064 ci
= ordered_hashmap_steal_first(h
);
2067 ci
= new(ChainCacheItem
, 1);
2074 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
2079 assert(ci
->first
== first
);
2084 ci
->last_index
= last_index
;
2087 static int generic_array_get(
2091 Object
**ret
, uint64_t *offset
) {
2094 uint64_t p
= 0, a
, t
= 0;
2102 /* Try the chain cache first */
2103 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2104 if (ci
&& i
> ci
->total
) {
2113 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
2117 k
= journal_file_entry_array_n_items(o
);
2119 p
= le64toh(o
->entry_array
.items
[i
]);
2125 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
2131 /* Let's cache this item for the next invocation */
2132 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
2134 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2147 static int generic_array_get_plus_one(
2152 Object
**ret
, uint64_t *offset
) {
2161 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2174 return generic_array_get(f
, first
, i
-1, ret
, offset
);
2183 static int generic_array_bisect(
2188 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2189 direction_t direction
,
2194 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
2195 bool subtract_one
= false;
2196 Object
*o
, *array
= NULL
;
2201 assert(test_object
);
2203 /* Start with the first array in the chain */
2206 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2207 if (ci
&& n
> ci
->total
) {
2208 /* Ah, we have iterated this bisection array chain
2209 * previously! Let's see if we can skip ahead in the
2210 * chain, as far as the last time. But we can't jump
2211 * backwards in the chain, so let's check that
2214 r
= test_object(f
, ci
->begin
, needle
);
2218 if (r
== TEST_LEFT
) {
2219 /* OK, what we are looking for is right of the
2220 * begin of this EntryArray, so let's jump
2221 * straight to previously cached array in the
2227 last_index
= ci
->last_index
;
2232 uint64_t left
, right
, k
, lp
;
2234 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
2238 k
= journal_file_entry_array_n_items(array
);
2244 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
2248 r
= test_object(f
, p
, needle
);
2249 if (r
== -EBADMSG
) {
2250 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2257 if (r
== TEST_FOUND
)
2258 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2260 if (r
== TEST_RIGHT
) {
2264 if (last_index
!= (uint64_t) -1) {
2265 assert(last_index
<= right
);
2267 /* If we cached the last index we
2268 * looked at, let's try to not to jump
2269 * too wildly around and see if we can
2270 * limit the range to look at early to
2271 * the immediate neighbors of the last
2272 * index we looked at. */
2274 if (last_index
> 0) {
2275 uint64_t x
= last_index
- 1;
2277 p
= le64toh(array
->entry_array
.items
[x
]);
2281 r
= test_object(f
, p
, needle
);
2285 if (r
== TEST_FOUND
)
2286 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2288 if (r
== TEST_RIGHT
)
2294 if (last_index
< right
) {
2295 uint64_t y
= last_index
+ 1;
2297 p
= le64toh(array
->entry_array
.items
[y
]);
2301 r
= test_object(f
, p
, needle
);
2305 if (r
== TEST_FOUND
)
2306 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2308 if (r
== TEST_RIGHT
)
2316 if (left
== right
) {
2317 if (direction
== DIRECTION_UP
)
2318 subtract_one
= true;
2324 assert(left
< right
);
2325 i
= (left
+ right
) / 2;
2327 p
= le64toh(array
->entry_array
.items
[i
]);
2331 r
= test_object(f
, p
, needle
);
2332 if (r
== -EBADMSG
) {
2333 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2340 if (r
== TEST_FOUND
)
2341 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2343 if (r
== TEST_RIGHT
)
2351 if (direction
== DIRECTION_UP
) {
2353 subtract_one
= true;
2364 last_index
= (uint64_t) -1;
2365 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
2371 if (subtract_one
&& t
== 0 && i
== 0)
2374 /* Let's cache this item for the next invocation */
2375 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
2377 if (subtract_one
&& i
== 0)
2379 else if (subtract_one
)
2380 p
= le64toh(array
->entry_array
.items
[i
-1]);
2382 p
= le64toh(array
->entry_array
.items
[i
]);
2384 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2395 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
2400 static int generic_array_bisect_plus_one(
2406 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2407 direction_t direction
,
2413 bool step_back
= false;
2417 assert(test_object
);
2422 /* This bisects the array in object 'first', but first checks
2424 r
= test_object(f
, extra
, needle
);
2428 if (r
== TEST_FOUND
)
2429 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2431 /* if we are looking with DIRECTION_UP then we need to first
2432 see if in the actual array there is a matching entry, and
2433 return the last one of that. But if there isn't any we need
2434 to return this one. Hence remember this, and return it
2437 step_back
= direction
== DIRECTION_UP
;
2439 if (r
== TEST_RIGHT
) {
2440 if (direction
== DIRECTION_DOWN
)
2446 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
2448 if (r
== 0 && step_back
)
2457 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2473 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2479 else if (p
< needle
)
2485 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2492 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2496 if (le64toh(o
->entry
.seqnum
) == needle
)
2498 else if (le64toh(o
->entry
.seqnum
) < needle
)
2504 int journal_file_move_to_entry_by_seqnum(
2507 direction_t direction
,
2513 return generic_array_bisect(f
,
2514 le64toh(f
->header
->entry_array_offset
),
2515 le64toh(f
->header
->n_entries
),
2522 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2529 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2533 if (le64toh(o
->entry
.realtime
) == needle
)
2535 else if (le64toh(o
->entry
.realtime
) < needle
)
2541 int journal_file_move_to_entry_by_realtime(
2544 direction_t direction
,
2550 return generic_array_bisect(f
,
2551 le64toh(f
->header
->entry_array_offset
),
2552 le64toh(f
->header
->n_entries
),
2554 test_object_realtime
,
2559 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2566 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2570 if (le64toh(o
->entry
.monotonic
) == needle
)
2572 else if (le64toh(o
->entry
.monotonic
) < needle
)
2578 static int find_data_object_by_boot_id(
2584 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2586 sd_id128_to_string(boot_id
, t
+ 9);
2587 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2590 int journal_file_move_to_entry_by_monotonic(
2594 direction_t direction
,
2603 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2609 return generic_array_bisect_plus_one(f
,
2610 le64toh(o
->data
.entry_offset
),
2611 le64toh(o
->data
.entry_array_offset
),
2612 le64toh(o
->data
.n_entries
),
2614 test_object_monotonic
,
2619 void journal_file_reset_location(JournalFile
*f
) {
2620 f
->location_type
= LOCATION_HEAD
;
2621 f
->current_offset
= 0;
2622 f
->current_seqnum
= 0;
2623 f
->current_realtime
= 0;
2624 f
->current_monotonic
= 0;
2625 zero(f
->current_boot_id
);
2626 f
->current_xor_hash
= 0;
2629 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2630 f
->location_type
= LOCATION_SEEK
;
2631 f
->current_offset
= offset
;
2632 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2633 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2634 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2635 f
->current_boot_id
= o
->entry
.boot_id
;
2636 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2639 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2644 assert(af
->location_type
== LOCATION_SEEK
);
2645 assert(bf
->location_type
== LOCATION_SEEK
);
2647 /* If contents and timestamps match, these entries are
2648 * identical, even if the seqnum does not match */
2649 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2650 af
->current_monotonic
== bf
->current_monotonic
&&
2651 af
->current_realtime
== bf
->current_realtime
&&
2652 af
->current_xor_hash
== bf
->current_xor_hash
)
2655 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2657 /* If this is from the same seqnum source, compare
2659 if (af
->current_seqnum
< bf
->current_seqnum
)
2661 if (af
->current_seqnum
> bf
->current_seqnum
)
2664 /* Wow! This is weird, different data but the same
2665 * seqnums? Something is borked, but let's make the
2666 * best of it and compare by time. */
2669 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2671 /* If the boot id matches, compare monotonic time */
2672 if (af
->current_monotonic
< bf
->current_monotonic
)
2674 if (af
->current_monotonic
> bf
->current_monotonic
)
2678 /* Otherwise, compare UTC time */
2679 if (af
->current_realtime
< bf
->current_realtime
)
2681 if (af
->current_realtime
> bf
->current_realtime
)
2684 /* Finally, compare by contents */
2685 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2687 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2693 static int bump_array_index(uint64_t *i
, direction_t direction
, uint64_t n
) {
2695 /* Increase or decrease the specified index, in the right direction. */
2697 if (direction
== DIRECTION_DOWN
) {
2712 static bool check_properly_ordered(uint64_t new_offset
, uint64_t old_offset
, direction_t direction
) {
2714 /* Consider it an error if any of the two offsets is uninitialized */
2715 if (old_offset
== 0 || new_offset
== 0)
2718 /* If we go down, the new offset must be larger than the old one. */
2719 return direction
== DIRECTION_DOWN
?
2720 new_offset
> old_offset
:
2721 new_offset
< old_offset
;
2724 int journal_file_next_entry(
2727 direction_t direction
,
2728 Object
**ret
, uint64_t *offset
) {
2736 n
= le64toh(f
->header
->n_entries
);
2741 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2743 r
= generic_array_bisect(f
,
2744 le64toh(f
->header
->entry_array_offset
),
2745 le64toh(f
->header
->n_entries
),
2754 r
= bump_array_index(&i
, direction
, n
);
2759 /* And jump to it */
2761 r
= generic_array_get(f
,
2762 le64toh(f
->header
->entry_array_offset
),
2770 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2771 * the next one might work for us instead. */
2772 log_debug_errno(r
, "Entry item %" PRIu64
" is bad, skipping over it.", i
);
2774 r
= bump_array_index(&i
, direction
, n
);
2779 /* Ensure our array is properly ordered. */
2780 if (p
> 0 && !check_properly_ordered(ofs
, p
, direction
)) {
2781 log_debug("%s: entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2791 int journal_file_next_entry_for_data(
2793 Object
*o
, uint64_t p
,
2794 uint64_t data_offset
,
2795 direction_t direction
,
2796 Object
**ret
, uint64_t *offset
) {
2803 assert(p
> 0 || !o
);
2805 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2809 n
= le64toh(d
->data
.n_entries
);
2814 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2816 if (o
->object
.type
!= OBJECT_ENTRY
)
2819 r
= generic_array_bisect_plus_one(f
,
2820 le64toh(d
->data
.entry_offset
),
2821 le64toh(d
->data
.entry_array_offset
),
2822 le64toh(d
->data
.n_entries
),
2832 r
= bump_array_index(&i
, direction
, n
);
2838 r
= generic_array_get_plus_one(f
,
2839 le64toh(d
->data
.entry_offset
),
2840 le64toh(d
->data
.entry_array_offset
),
2848 log_debug_errno(r
, "Data entry item %" PRIu64
" is bad, skipping over it.", i
);
2850 r
= bump_array_index(&i
, direction
, n
);
2855 /* Ensure our array is properly ordered. */
2856 if (p
> 0 && check_properly_ordered(ofs
, p
, direction
)) {
2857 log_debug("%s data entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2867 int journal_file_move_to_entry_by_offset_for_data(
2869 uint64_t data_offset
,
2871 direction_t direction
,
2872 Object
**ret
, uint64_t *offset
) {
2879 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2883 return generic_array_bisect_plus_one(f
,
2884 le64toh(d
->data
.entry_offset
),
2885 le64toh(d
->data
.entry_array_offset
),
2886 le64toh(d
->data
.n_entries
),
2893 int journal_file_move_to_entry_by_monotonic_for_data(
2895 uint64_t data_offset
,
2898 direction_t direction
,
2899 Object
**ret
, uint64_t *offset
) {
2907 /* First, seek by time */
2908 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2914 r
= generic_array_bisect_plus_one(f
,
2915 le64toh(o
->data
.entry_offset
),
2916 le64toh(o
->data
.entry_array_offset
),
2917 le64toh(o
->data
.n_entries
),
2919 test_object_monotonic
,
2925 /* And now, continue seeking until we find an entry that
2926 * exists in both bisection arrays */
2932 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2936 r
= generic_array_bisect_plus_one(f
,
2937 le64toh(d
->data
.entry_offset
),
2938 le64toh(d
->data
.entry_array_offset
),
2939 le64toh(d
->data
.n_entries
),
2947 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2951 r
= generic_array_bisect_plus_one(f
,
2952 le64toh(o
->data
.entry_offset
),
2953 le64toh(o
->data
.entry_array_offset
),
2954 le64toh(o
->data
.n_entries
),
2976 int journal_file_move_to_entry_by_seqnum_for_data(
2978 uint64_t data_offset
,
2980 direction_t direction
,
2981 Object
**ret
, uint64_t *offset
) {
2988 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2992 return generic_array_bisect_plus_one(f
,
2993 le64toh(d
->data
.entry_offset
),
2994 le64toh(d
->data
.entry_array_offset
),
2995 le64toh(d
->data
.n_entries
),
3002 int journal_file_move_to_entry_by_realtime_for_data(
3004 uint64_t data_offset
,
3006 direction_t direction
,
3007 Object
**ret
, uint64_t *offset
) {
3014 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
3018 return generic_array_bisect_plus_one(f
,
3019 le64toh(d
->data
.entry_offset
),
3020 le64toh(d
->data
.entry_array_offset
),
3021 le64toh(d
->data
.n_entries
),
3023 test_object_realtime
,
3028 void journal_file_dump(JournalFile
*f
) {
3036 journal_file_print_header(f
);
3038 p
= le64toh(f
->header
->header_size
);
3040 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
3044 switch (o
->object
.type
) {
3047 printf("Type: OBJECT_UNUSED\n");
3051 printf("Type: OBJECT_DATA\n");
3055 printf("Type: OBJECT_FIELD\n");
3059 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
3060 le64toh(o
->entry
.seqnum
),
3061 le64toh(o
->entry
.monotonic
),
3062 le64toh(o
->entry
.realtime
));
3065 case OBJECT_FIELD_HASH_TABLE
:
3066 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3069 case OBJECT_DATA_HASH_TABLE
:
3070 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3073 case OBJECT_ENTRY_ARRAY
:
3074 printf("Type: OBJECT_ENTRY_ARRAY\n");
3078 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
3079 le64toh(o
->tag
.seqnum
),
3080 le64toh(o
->tag
.epoch
));
3084 printf("Type: unknown (%i)\n", o
->object
.type
);
3088 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
3089 printf("Flags: %s\n",
3090 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
3092 if (p
== le64toh(f
->header
->tail_object_offset
))
3095 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
3100 log_error("File corrupt");
3103 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
3106 x
= format_timestamp(buf
, l
, t
);
3112 void journal_file_print_header(JournalFile
*f
) {
3113 char a
[33], b
[33], c
[33], d
[33];
3114 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
3116 char bytes
[FORMAT_BYTES_MAX
];
3121 printf("File Path: %s\n"
3125 "Sequential Number ID: %s\n"
3127 "Compatible Flags:%s%s\n"
3128 "Incompatible Flags:%s%s%s\n"
3129 "Header size: %"PRIu64
"\n"
3130 "Arena size: %"PRIu64
"\n"
3131 "Data Hash Table Size: %"PRIu64
"\n"
3132 "Field Hash Table Size: %"PRIu64
"\n"
3133 "Rotate Suggested: %s\n"
3134 "Head Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3135 "Tail Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3136 "Head Realtime Timestamp: %s (%"PRIx64
")\n"
3137 "Tail Realtime Timestamp: %s (%"PRIx64
")\n"
3138 "Tail Monotonic Timestamp: %s (%"PRIx64
")\n"
3139 "Objects: %"PRIu64
"\n"
3140 "Entry Objects: %"PRIu64
"\n",
3142 sd_id128_to_string(f
->header
->file_id
, a
),
3143 sd_id128_to_string(f
->header
->machine_id
, b
),
3144 sd_id128_to_string(f
->header
->boot_id
, c
),
3145 sd_id128_to_string(f
->header
->seqnum_id
, d
),
3146 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
3147 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
3148 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
3149 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
3150 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
3151 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
3152 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
3153 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
3154 le64toh(f
->header
->header_size
),
3155 le64toh(f
->header
->arena_size
),
3156 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3157 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
3158 yes_no(journal_file_rotate_suggested(f
, 0)),
3159 le64toh(f
->header
->head_entry_seqnum
), le64toh(f
->header
->head_entry_seqnum
),
3160 le64toh(f
->header
->tail_entry_seqnum
), le64toh(f
->header
->tail_entry_seqnum
),
3161 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)), le64toh(f
->header
->head_entry_realtime
),
3162 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)), le64toh(f
->header
->tail_entry_realtime
),
3163 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
), le64toh(f
->header
->tail_entry_monotonic
),
3164 le64toh(f
->header
->n_objects
),
3165 le64toh(f
->header
->n_entries
));
3167 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3168 printf("Data Objects: %"PRIu64
"\n"
3169 "Data Hash Table Fill: %.1f%%\n",
3170 le64toh(f
->header
->n_data
),
3171 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
3173 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3174 printf("Field Objects: %"PRIu64
"\n"
3175 "Field Hash Table Fill: %.1f%%\n",
3176 le64toh(f
->header
->n_fields
),
3177 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
3179 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
3180 printf("Tag Objects: %"PRIu64
"\n",
3181 le64toh(f
->header
->n_tags
));
3182 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
3183 printf("Entry Array Objects: %"PRIu64
"\n",
3184 le64toh(f
->header
->n_entry_arrays
));
3186 if (fstat(f
->fd
, &st
) >= 0)
3187 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
3190 static int journal_file_warn_btrfs(JournalFile
*f
) {
3196 /* Before we write anything, check if the COW logic is turned
3197 * off on btrfs. Given our write pattern that is quite
3198 * unfriendly to COW file systems this should greatly improve
3199 * performance on COW file systems, such as btrfs, at the
3200 * expense of data integrity features (which shouldn't be too
3201 * bad, given that we do our own checksumming). */
3203 r
= btrfs_is_filesystem(f
->fd
);
3205 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
3209 r
= read_attr_fd(f
->fd
, &attrs
);
3211 return log_warning_errno(r
, "Failed to read file attributes: %m");
3213 if (attrs
& FS_NOCOW_FL
) {
3214 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3218 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3219 "This is likely to slow down journal access substantially, please consider turning "
3220 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
3225 int journal_file_open(
3232 JournalMetrics
*metrics
,
3233 MMapCache
*mmap_cache
,
3234 Set
*deferred_closes
,
3235 JournalFile
*template,
3236 JournalFile
**ret
) {
3238 bool newly_created
= false;
3244 assert(fd
>= 0 || fname
);
3246 if (!IN_SET((flags
& O_ACCMODE
), O_RDONLY
, O_RDWR
))
3250 if (!endswith(fname
, ".journal") &&
3251 !endswith(fname
, ".journal~"))
3255 f
= new0(JournalFile
, 1);
3263 f
->prot
= prot_from_flags(flags
);
3264 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
3266 f
->compress_lz4
= compress
;
3268 f
->compress_xz
= compress
;
3275 f
->mmap
= mmap_cache_ref(mmap_cache
);
3277 f
->mmap
= mmap_cache_new();
3285 f
->path
= strdup(fname
);
3291 /* If we don't know the path, fill in something explanatory and vaguely useful */
3292 if (asprintf(&f
->path
, "/proc/self/%i", fd
) < 0) {
3298 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
3299 if (!f
->chain_cache
) {
3305 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
3311 /* fds we opened here by us should also be closed by us. */
3315 f
->cache_fd
= mmap_cache_add_fd(f
->mmap
, f
->fd
);
3321 r
= journal_file_fstat(f
);
3325 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
3327 (void) journal_file_warn_btrfs(f
);
3329 /* Let's attach the creation time to the journal file,
3330 * so that the vacuuming code knows the age of this
3331 * file even if the file might end up corrupted one
3332 * day... Ideally we'd just use the creation time many
3333 * file systems maintain for each file, but there is
3334 * currently no usable API to query this, hence let's
3335 * emulate this via extended attributes. If extended
3336 * attributes are not supported we'll just skip this,
3337 * and rely solely on mtime/atime/ctime of the file. */
3339 fd_setcrtime(f
->fd
, 0);
3342 /* Try to load the FSPRG state, and if we can't, then
3343 * just don't do sealing */
3345 r
= journal_file_fss_load(f
);
3351 r
= journal_file_init_header(f
, template);
3355 r
= journal_file_fstat(f
);
3359 newly_created
= true;
3362 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
3367 r
= mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
, NULL
);
3373 if (!newly_created
) {
3374 if (deferred_closes
)
3375 journal_file_close_set(deferred_closes
);
3377 r
= journal_file_verify_header(f
);
3383 if (!newly_created
&& f
->writable
) {
3384 r
= journal_file_fss_load(f
);
3392 journal_default_metrics(metrics
, f
->fd
);
3393 f
->metrics
= *metrics
;
3394 } else if (template)
3395 f
->metrics
= template->metrics
;
3397 r
= journal_file_refresh_header(f
);
3403 r
= journal_file_hmac_setup(f
);
3408 if (newly_created
) {
3409 r
= journal_file_setup_field_hash_table(f
);
3413 r
= journal_file_setup_data_hash_table(f
);
3418 r
= journal_file_append_first_tag(f
);
3424 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
)) {
3429 if (template && template->post_change_timer
) {
3430 r
= journal_file_enable_post_change_timer(
3432 sd_event_source_get_event(template->post_change_timer
),
3433 template->post_change_timer_period
);
3439 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3446 if (f
->cache_fd
&& mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
3449 (void) journal_file_close(f
);
3454 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
, Set
*deferred_closes
) {
3455 _cleanup_free_
char *p
= NULL
;
3457 JournalFile
*old_file
, *new_file
= NULL
;
3465 if (!old_file
->writable
)
3468 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3469 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3470 if (path_startswith(old_file
->path
, "/proc/self/fd"))
3473 if (!endswith(old_file
->path
, ".journal"))
3476 l
= strlen(old_file
->path
);
3477 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
3478 (int) l
- 8, old_file
->path
,
3479 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
3480 le64toh((*f
)->header
->head_entry_seqnum
),
3481 le64toh((*f
)->header
->head_entry_realtime
));
3485 /* Try to rename the file to the archived version. If the file
3486 * already was deleted, we'll get ENOENT, let's ignore that
3488 r
= rename(old_file
->path
, p
);
3489 if (r
< 0 && errno
!= ENOENT
)
3492 /* Sync the rename to disk */
3493 (void) fsync_directory_of_file(old_file
->fd
);
3495 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3496 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3497 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3498 * would result in the rotated journal never getting fsync() called before closing.
3499 * Now we simply queue the archive state by setting an archive bit, leaving the state
3500 * as STATE_ONLINE so proper offlining occurs. */
3501 old_file
->archive
= true;
3503 /* Currently, btrfs is not very good with out write patterns
3504 * and fragments heavily. Let's defrag our journal files when
3505 * we archive them */
3506 old_file
->defrag_on_close
= true;
3508 r
= journal_file_open(-1, old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, deferred_closes
, old_file
, &new_file
);
3510 if (deferred_closes
&&
3511 set_put(deferred_closes
, old_file
) >= 0)
3512 (void) journal_file_set_offline(old_file
, false);
3514 (void) journal_file_close(old_file
);
3520 int journal_file_open_reliably(
3526 JournalMetrics
*metrics
,
3527 MMapCache
*mmap_cache
,
3528 Set
*deferred_closes
,
3529 JournalFile
*template,
3530 JournalFile
**ret
) {
3534 _cleanup_free_
char *p
= NULL
;
3536 r
= journal_file_open(-1, fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, deferred_closes
, template, ret
);
3538 -EBADMSG
, /* Corrupted */
3539 -ENODATA
, /* Truncated */
3540 -EHOSTDOWN
, /* Other machine */
3541 -EPROTONOSUPPORT
, /* Incompatible feature */
3542 -EBUSY
, /* Unclean shutdown */
3543 -ESHUTDOWN
, /* Already archived */
3544 -EIO
, /* IO error, including SIGBUS on mmap */
3545 -EIDRM
, /* File has been deleted */
3546 -ETXTBSY
)) /* File is from the future */
3549 if ((flags
& O_ACCMODE
) == O_RDONLY
)
3552 if (!(flags
& O_CREAT
))
3555 if (!endswith(fname
, ".journal"))
3558 /* The file is corrupted. Rotate it away and try it again (but only once) */
3561 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
3563 now(CLOCK_REALTIME
),
3567 if (rename(fname
, p
) < 0)
3570 /* btrfs doesn't cope well with our write pattern and
3571 * fragments heavily. Let's defrag all files we rotate */
3573 (void) chattr_path(p
, 0, FS_NOCOW_FL
);
3574 (void) btrfs_defrag(p
);
3576 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
3578 return journal_file_open(-1, fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, deferred_closes
, template, ret
);
3581 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
3583 uint64_t q
, xor_hash
= 0;
3596 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
3597 ts
.realtime
= le64toh(o
->entry
.realtime
);
3599 n
= journal_file_entry_n_items(o
);
3600 /* alloca() can't take 0, hence let's allocate at least one */
3601 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
3603 for (i
= 0; i
< n
; i
++) {
3610 q
= le64toh(o
->entry
.items
[i
].object_offset
);
3611 le_hash
= o
->entry
.items
[i
].hash
;
3613 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
3617 if (le_hash
!= o
->data
.hash
)
3620 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
3623 /* We hit the limit on 32bit machines */
3624 if ((uint64_t) t
!= l
)
3627 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
3628 #if HAVE_XZ || HAVE_LZ4
3631 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
3632 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
3636 data
= from
->compress_buffer
;
3639 return -EPROTONOSUPPORT
;
3642 data
= o
->data
.payload
;
3644 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
3648 xor_hash
^= le64toh(u
->data
.hash
);
3649 items
[i
].object_offset
= htole64(h
);
3650 items
[i
].hash
= u
->data
.hash
;
3652 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
3657 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
3659 if (mmap_cache_got_sigbus(to
->mmap
, to
->cache_fd
))
3665 void journal_reset_metrics(JournalMetrics
*m
) {
3668 /* Set everything to "pick automatic values". */
3670 *m
= (JournalMetrics
) {
3671 .min_use
= (uint64_t) -1,
3672 .max_use
= (uint64_t) -1,
3673 .min_size
= (uint64_t) -1,
3674 .max_size
= (uint64_t) -1,
3675 .keep_free
= (uint64_t) -1,
3676 .n_max_files
= (uint64_t) -1,
3680 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
3681 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
3688 if (fstatvfs(fd
, &ss
) >= 0)
3689 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
3691 log_debug_errno(errno
, "Failed to detremine disk size: %m");
3695 if (m
->max_use
== (uint64_t) -1) {
3698 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3700 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3701 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3703 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3704 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3706 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3708 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3710 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3711 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3714 if (m
->min_use
== (uint64_t) -1)
3715 m
->min_use
= DEFAULT_MIN_USE
;
3717 if (m
->min_use
> m
->max_use
)
3718 m
->min_use
= m
->max_use
;
3720 if (m
->max_size
== (uint64_t) -1) {
3721 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3723 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3724 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3726 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3728 if (m
->max_size
!= 0) {
3729 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3730 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3732 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3733 m
->max_use
= m
->max_size
*2;
3736 if (m
->min_size
== (uint64_t) -1)
3737 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3739 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3741 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3742 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3744 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3745 m
->max_size
= m
->min_size
;
3748 if (m
->keep_free
== (uint64_t) -1) {
3751 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3753 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3754 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3757 m
->keep_free
= DEFAULT_KEEP_FREE
;
3760 if (m
->n_max_files
== (uint64_t) -1)
3761 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3763 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3764 format_bytes(a
, sizeof(a
), m
->min_use
),
3765 format_bytes(b
, sizeof(b
), m
->max_use
),
3766 format_bytes(c
, sizeof(c
), m
->max_size
),
3767 format_bytes(d
, sizeof(d
), m
->min_size
),
3768 format_bytes(e
, sizeof(e
), m
->keep_free
),
3772 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3778 if (f
->header
->head_entry_realtime
== 0)
3781 *from
= le64toh(f
->header
->head_entry_realtime
);
3785 if (f
->header
->tail_entry_realtime
== 0)
3788 *to
= le64toh(f
->header
->tail_entry_realtime
);
3794 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3802 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3806 if (le64toh(o
->data
.n_entries
) <= 0)
3810 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3814 *from
= le64toh(o
->entry
.monotonic
);
3818 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3822 r
= generic_array_get_plus_one(f
,
3823 le64toh(o
->data
.entry_offset
),
3824 le64toh(o
->data
.entry_array_offset
),
3825 le64toh(o
->data
.n_entries
)-1,
3830 *to
= le64toh(o
->entry
.monotonic
);
3836 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3840 /* If we gained new header fields we gained new features,
3841 * hence suggest a rotation */
3842 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3843 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3847 /* Let's check if the hash tables grew over a certain fill
3848 * level (75%, borrowing this value from Java's hash table
3849 * implementation), and if so suggest a rotation. To calculate
3850 * the fill level we need the n_data field, which only exists
3851 * in newer versions. */
3853 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3854 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3855 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3857 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3858 le64toh(f
->header
->n_data
),
3859 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3860 (unsigned long long) f
->last_stat
.st_size
,
3861 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3865 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3866 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3867 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3869 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3870 le64toh(f
->header
->n_fields
),
3871 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3875 /* Are the data objects properly indexed by field objects? */
3876 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3877 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3878 le64toh(f
->header
->n_data
) > 0 &&
3879 le64toh(f
->header
->n_fields
) == 0)
3882 if (max_file_usec
> 0) {
3885 h
= le64toh(f
->header
->head_entry_realtime
);
3886 t
= now(CLOCK_REALTIME
);
3888 if (h
> 0 && t
> h
+ max_file_usec
)