src/journal/journal-file.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2011 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <linux/fs.h>
  24 #include <pthread.h>
  25 #include <stddef.h>
  26 #include <sys/mman.h>
  27 #include <sys/statvfs.h>
  28 #include <sys/uio.h>
  29 #include <unistd.h>
  30
  31 #include "alloc-util.h"
  32 #include "btrfs-util.h"
  33 #include "chattr-util.h"
  34 #include "compress.h"
  35 #include "fd-util.h"
  36 #include "fs-util.h"
  37 #include "journal-authenticate.h"
  38 #include "journal-def.h"
  39 #include "journal-file.h"
  40 #include "lookup3.h"
  41 #include "parse-util.h"
  42 #include "path-util.h"
  43 #include "random-util.h"
  44 #include "sd-event.h"
  45 #include "set.h"
  46 #include "stat-util.h"
  47 #include "string-util.h"
  48 #include "strv.h"
  49 #include "xattr-util.h"
  50
  51 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
  52 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
  53
  54 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
  55
  56 /* This is the minimum journal file size */
  57 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL)                 /* 512 KiB */
  58
  59 /* These are the lower and upper bounds if we deduce the max_use value
  60  * from the file system size */
  61 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
  62 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
  63
  64 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
  65 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */
  66
  67 /* This is the upper bound if we deduce max_size from max_use */
  68 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
  69
  70 /* This is the upper bound if we deduce the keep_free value from the
  71  * file system size */
  72 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
  73
  74 /* This is the keep_free value when we can't determine the system
  75  * size */
  76 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
  77
  78 /* This is the default maximum number of journal files to keep around. */
  79 #define DEFAULT_N_MAX_FILES (100)
  80
  81 /* n_data was the first entry we added after the initial file format design */
  82 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
  83
  84 /* How many entries to keep in the entry array chain cache at max */
  85 #define CHAIN_CACHE_MAX 20
  86
  87 /* How much to increase the journal file size at once each time we allocate something new. */
  88 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
  89
  90 /* Reread fstat() of the file for detecting deletions at least this often */
  91 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
  92
  93 /* The mmap context to use for the header we pick as one above the last defined typed */
  94 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
  95
  96 #ifdef __clang__
  97 #  pragma GCC diagnostic ignored "-Waddress-of-packed-member"
  98 #endif
  99
 100 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
 101  * As a result we use atomic operations on f->offline_state for inter-thread communications with
 102  * journal_file_set_offline() and journal_file_set_online(). */
 103 static void journal_file_set_offline_internal(JournalFile *f) {
 104         assert(f);
 105         assert(f->fd >= 0);
 106         assert(f->header);
 107
 108         for (;;) {
 109                 switch (f->offline_state) {
 110                 case OFFLINE_CANCEL:
 111                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
 112                                 continue;
 113                         return;
 114
 115                 case OFFLINE_AGAIN_FROM_SYNCING:
 116                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
 117                                 continue;
 118                         break;
 119
 120                 case OFFLINE_AGAIN_FROM_OFFLINING:
 121                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
 122                                 continue;
 123                         break;
 124
 125                 case OFFLINE_SYNCING:
 126                         (void) fsync(f->fd);
 127
 128                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
 129                                 continue;
 130
 131                         f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
 132                         (void) fsync(f->fd);
 133                         break;
 134
 135                 case OFFLINE_OFFLINING:
 136                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
 137                                 continue;
 138                         _fallthrough_;
 139                 case OFFLINE_DONE:
 140                         return;
 141
 142                 case OFFLINE_JOINED:
 143                         log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
 144                         return;
 145                 }
 146         }
 147 }
 148
 149 static void * journal_file_set_offline_thread(void *arg) {
 150         JournalFile *f = arg;
 151
 152         (void) pthread_setname_np(pthread_self(), "journal-offline");
 153
 154         journal_file_set_offline_internal(f);
 155
 156         return NULL;
 157 }
 158
 159 static int journal_file_set_offline_thread_join(JournalFile *f) {
 160         int r;
 161
 162         assert(f);
 163
 164         if (f->offline_state == OFFLINE_JOINED)
 165                 return 0;
 166
 167         r = pthread_join(f->offline_thread, NULL);
 168         if (r)
 169                 return -r;
 170
 171         f->offline_state = OFFLINE_JOINED;
 172
 173         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
 174                 return -EIO;
 175
 176         return 0;
 177 }
 178
 179 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
 180 static bool journal_file_set_offline_try_restart(JournalFile *f) {
 181         for (;;) {
 182                 switch (f->offline_state) {
 183                 case OFFLINE_AGAIN_FROM_SYNCING:
 184                 case OFFLINE_AGAIN_FROM_OFFLINING:
 185                         return true;
 186
 187                 case OFFLINE_CANCEL:
 188                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
 189                                 continue;
 190                         return true;
 191
 192                 case OFFLINE_SYNCING:
 193                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
 194                                 continue;
 195                         return true;
 196
 197                 case OFFLINE_OFFLINING:
 198                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
 199                                 continue;
 200                         return true;
 201
 202                 default:
 203                         return false;
 204                 }
 205         }
 206 }
 207
 208 /* Sets a journal offline.
 209  *
 210  * If wait is false then an offline is dispatched in a separate thread for a
 211  * subsequent journal_file_set_offline() or journal_file_set_online() of the
 212  * same journal to synchronize with.
 213  *
 214  * If wait is true, then either an existing offline thread will be restarted
 215  * and joined, or if none exists the offline is simply performed in this
 216  * context without involving another thread.
 217  */
 218 int journal_file_set_offline(JournalFile *f, bool wait) {
 219         bool restarted;
 220         int r;
 221
 222         assert(f);
 223
 224         if (!f->writable)
 225                 return -EPERM;
 226
 227         if (!(f->fd >= 0 && f->header))
 228                 return -EINVAL;
 229
 230         /* An offlining journal is implicitly online and may modify f->header->state,
 231          * we must also join any potentially lingering offline thread when not online. */
 232         if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
 233                 return journal_file_set_offline_thread_join(f);
 234
 235         /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
 236         restarted = journal_file_set_offline_try_restart(f);
 237         if ((restarted && wait) || !restarted) {
 238                 r = journal_file_set_offline_thread_join(f);
 239                 if (r < 0)
 240                         return r;
 241         }
 242
 243         if (restarted)
 244                 return 0;
 245
 246         /* Initiate a new offline. */
 247         f->offline_state = OFFLINE_SYNCING;
 248
 249         if (wait) /* Without using a thread if waiting. */
 250                 journal_file_set_offline_internal(f);
 251         else {
 252                 sigset_t ss, saved_ss;
 253                 int k;
 254
 255                 if (sigfillset(&ss) < 0)
 256                         return -errno;
 257
 258                 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
 259                 if (r > 0)
 260                         return -r;
 261
 262                 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
 263
 264                 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
 265                 if (r > 0) {
 266                         f->offline_state = OFFLINE_JOINED;
 267                         return -r;
 268                 }
 269                 if (k > 0)
 270                         return -k;
 271         }
 272
 273         return 0;
 274 }
 275
 276 static int journal_file_set_online(JournalFile *f) {
 277         bool joined = false;
 278
 279         assert(f);
 280
 281         if (!f->writable)
 282                 return -EPERM;
 283
 284         if (!(f->fd >= 0 && f->header))
 285                 return -EINVAL;
 286
 287         while (!joined) {
 288                 switch (f->offline_state) {
 289                 case OFFLINE_JOINED:
 290                         /* No offline thread, no need to wait. */
 291                         joined = true;
 292                         break;
 293
 294                 case OFFLINE_SYNCING:
 295                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
 296                                 continue;
 297                         /* Canceled syncing prior to offlining, no need to wait. */
 298                         break;
 299
 300                 case OFFLINE_AGAIN_FROM_SYNCING:
 301                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
 302                                 continue;
 303                         /* Canceled restart from syncing, no need to wait. */
 304                         break;
 305
 306                 case OFFLINE_AGAIN_FROM_OFFLINING:
 307                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
 308                                 continue;
 309                         /* Canceled restart from offlining, must wait for offlining to complete however. */
 310                         _fallthrough_;
 311                 default: {
 312                         int r;
 313
 314                         r = journal_file_set_offline_thread_join(f);
 315                         if (r < 0)
 316                                 return r;
 317
 318                         joined = true;
 319                         break;
 320                 }
 321                 }
 322         }
 323
 324         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
 325                 return -EIO;
 326
 327         switch (f->header->state) {
 328                 case STATE_ONLINE:
 329                         return 0;
 330
 331                 case STATE_OFFLINE:
 332                         f->header->state = STATE_ONLINE;
 333                         (void) fsync(f->fd);
 334                         return 0;
 335
 336                 default:
 337                         return -EINVAL;
 338         }
 339 }
 340
 341 bool journal_file_is_offlining(JournalFile *f) {
 342         assert(f);
 343
 344         __sync_synchronize();
 345
 346         if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
 347                 return false;
 348
 349         return true;
 350 }
 351
 352 JournalFile* journal_file_close(JournalFile *f) {
 353         assert(f);
 354
 355 #if HAVE_GCRYPT
 356         /* Write the final tag */
 357         if (f->seal && f->writable) {
 358                 int r;
 359
 360                 r = journal_file_append_tag(f);
 361                 if (r < 0)
 362                         log_error_errno(r, "Failed to append tag when closing journal: %m");
 363         }
 364 #endif
 365
 366         if (f->post_change_timer) {
 367                 int enabled;
 368
 369                 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
 370                         if (enabled == SD_EVENT_ONESHOT)
 371                                 journal_file_post_change(f);
 372
 373                 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
 374                 sd_event_source_unref(f->post_change_timer);
 375         }
 376
 377         journal_file_set_offline(f, true);
 378
 379         if (f->mmap && f->cache_fd)
 380                 mmap_cache_free_fd(f->mmap, f->cache_fd);
 381
 382         if (f->fd >= 0 && f->defrag_on_close) {
 383
 384                 /* Be friendly to btrfs: turn COW back on again now,
 385                  * and defragment the file. We won't write to the file
 386                  * ever again, hence remove all fragmentation, and
 387                  * reenable all the good bits COW usually provides
 388                  * (such as data checksumming). */
 389
 390                 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
 391                 (void) btrfs_defrag_fd(f->fd);
 392         }
 393
 394         if (f->close_fd)
 395                 safe_close(f->fd);
 396         free(f->path);
 397
 398         mmap_cache_unref(f->mmap);
 399
 400         ordered_hashmap_free_free(f->chain_cache);
 401
 402 #if HAVE_XZ || HAVE_LZ4
 403         free(f->compress_buffer);
 404 #endif
 405
 406 #if HAVE_GCRYPT
 407         if (f->fss_file)
 408                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
 409         else
 410                 free(f->fsprg_state);
 411
 412         free(f->fsprg_seed);
 413
 414         if (f->hmac)
 415                 gcry_md_close(f->hmac);
 416 #endif
 417
 418         return mfree(f);
 419 }
 420
 421 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
 422         Header h = {};
 423         ssize_t k;
 424         int r;
 425
 426         assert(f);
 427
 428         memcpy(h.signature, HEADER_SIGNATURE, 8);
 429         h.header_size = htole64(ALIGN64(sizeof(h)));
 430
 431         h.incompatible_flags |= htole32(
 432                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
 433                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
 434
 435         h.compatible_flags = htole32(
 436                 f->seal * HEADER_COMPATIBLE_SEALED);
 437
 438         r = sd_id128_randomize(&h.file_id);
 439         if (r < 0)
 440                 return r;
 441
 442         if (template) {
 443                 h.seqnum_id = template->header->seqnum_id;
 444                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
 445         } else
 446                 h.seqnum_id = h.file_id;
 447
 448         k = pwrite(f->fd, &h, sizeof(h), 0);
 449         if (k < 0)
 450                 return -errno;
 451
 452         if (k != sizeof(h))
 453                 return -EIO;
 454
 455         return 0;
 456 }
 457
 458 static int journal_file_refresh_header(JournalFile *f) {
 459         sd_id128_t boot_id;
 460         int r;
 461
 462         assert(f);
 463         assert(f->header);
 464
 465         r = sd_id128_get_machine(&f->header->machine_id);
 466         if (r < 0)
 467                 return r;
 468
 469         r = sd_id128_get_boot(&boot_id);
 470         if (r < 0)
 471                 return r;
 472
 473         if (sd_id128_equal(boot_id, f->header->boot_id))
 474                 f->tail_entry_monotonic_valid = true;
 475
 476         f->header->boot_id = boot_id;
 477
 478         r = journal_file_set_online(f);
 479
 480         /* Sync the online state to disk */
 481         (void) fsync(f->fd);
 482
 483         /* We likely just created a new file, also sync the directory this file is located in. */
 484         (void) fsync_directory_of_file(f->fd);
 485
 486         return r;
 487 }
 488
 489 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
 490         const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
 491                 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
 492         const char *type = compatible ? "compatible" : "incompatible";
 493         uint32_t flags;
 494
 495         flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
 496
 497         if (flags & ~supported) {
 498                 if (flags & ~any)
 499                         log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
 500                                   f->path, type, flags & ~any);
 501                 flags = (flags & any) & ~supported;
 502                 if (flags) {
 503                         const char* strv[3];
 504                         unsigned n = 0;
 505                         _cleanup_free_ char *t = NULL;
 506
 507                         if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
 508                                 strv[n++] = "sealed";
 509                         if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
 510                                 strv[n++] = "xz-compressed";
 511                         if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
 512                                 strv[n++] = "lz4-compressed";
 513                         strv[n] = NULL;
 514                         assert(n < ELEMENTSOF(strv));
 515
 516                         t = strv_join((char**) strv, ", ");
 517                         log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
 518                                   f->path, type, n > 1 ? "flags" : "flag", strnull(t));
 519                 }
 520                 return true;
 521         }
 522
 523         return false;
 524 }
 525
 526 static int journal_file_verify_header(JournalFile *f) {
 527         uint64_t arena_size, header_size;
 528
 529         assert(f);
 530         assert(f->header);
 531
 532         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
 533                 return -EBADMSG;
 534
 535         /* In both read and write mode we refuse to open files with incompatible
 536          * flags we don't know. */
 537         if (warn_wrong_flags(f, false))
 538                 return -EPROTONOSUPPORT;
 539
 540         /* When open for writing we refuse to open files with compatible flags, too. */
 541         if (f->writable && warn_wrong_flags(f, true))
 542                 return -EPROTONOSUPPORT;
 543
 544         if (f->header->state >= _STATE_MAX)
 545                 return -EBADMSG;
 546
 547         header_size = le64toh(f->header->header_size);
 548
 549         /* The first addition was n_data, so check that we are at least this large */
 550         if (header_size < HEADER_SIZE_MIN)
 551                 return -EBADMSG;
 552
 553         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
 554                 return -EBADMSG;
 555
 556         arena_size = le64toh(f->header->arena_size);
 557
 558         if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
 559                 return -ENODATA;
 560
 561         if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
 562                 return -ENODATA;
 563
 564         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
 565             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
 566             !VALID64(le64toh(f->header->tail_object_offset)) ||
 567             !VALID64(le64toh(f->header->entry_array_offset)))
 568                 return -ENODATA;
 569
 570         if (f->writable) {
 571                 sd_id128_t machine_id;
 572                 uint8_t state;
 573                 int r;
 574
 575                 r = sd_id128_get_machine(&machine_id);
 576                 if (r < 0)
 577                         return r;
 578
 579                 if (!sd_id128_equal(machine_id, f->header->machine_id))
 580                         return -EHOSTDOWN;
 581
 582                 state = f->header->state;
 583
 584                 if (state == STATE_ARCHIVED)
 585                         return -ESHUTDOWN; /* Already archived */
 586                 else if (state == STATE_ONLINE) {
 587                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
 588                         return -EBUSY;
 589                 } else if (state != STATE_OFFLINE) {
 590                         log_debug("Journal file %s has unknown state %i.", f->path, state);
 591                         return -EBUSY;
 592                 }
 593
 594                 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
 595                         return -EBADMSG;
 596
 597                 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
 598                  * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
 599                  * bisection. */
 600                 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
 601                         log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
 602                         return -ETXTBSY;
 603                 }
 604         }
 605
 606         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
 607         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
 608
 609         f->seal = JOURNAL_HEADER_SEALED(f->header);
 610
 611         return 0;
 612 }
 613
 614 static int journal_file_fstat(JournalFile *f) {
 615         int r;
 616
 617         assert(f);
 618         assert(f->fd >= 0);
 619
 620         if (fstat(f->fd, &f->last_stat) < 0)
 621                 return -errno;
 622
 623         f->last_stat_usec = now(CLOCK_MONOTONIC);
 624
 625         /* Refuse dealing with with files that aren't regular */
 626         r = stat_verify_regular(&f->last_stat);
 627         if (r < 0)
 628                 return r;
 629
 630         /* Refuse appending to files that are already deleted */
 631         if (f->last_stat.st_nlink <= 0)
 632                 return -EIDRM;
 633
 634         return 0;
 635 }
 636
 637 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
 638         uint64_t old_size, new_size;
 639         int r;
 640
 641         assert(f);
 642         assert(f->header);
 643
 644         /* We assume that this file is not sparse, and we know that
 645          * for sure, since we always call posix_fallocate()
 646          * ourselves */
 647
 648         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
 649                 return -EIO;
 650
 651         old_size =
 652                 le64toh(f->header->header_size) +
 653                 le64toh(f->header->arena_size);
 654
 655         new_size = PAGE_ALIGN(offset + size);
 656         if (new_size < le64toh(f->header->header_size))
 657                 new_size = le64toh(f->header->header_size);
 658
 659         if (new_size <= old_size) {
 660
 661                 /* We already pre-allocated enough space, but before
 662                  * we write to it, let's check with fstat() if the
 663                  * file got deleted, in order make sure we don't throw
 664                  * away the data immediately. Don't check fstat() for
 665                  * all writes though, but only once ever 10s. */
 666
 667                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
 668                         return 0;
 669
 670                 return journal_file_fstat(f);
 671         }
 672
 673         /* Allocate more space. */
 674
 675         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 676                 return -E2BIG;
 677
 678         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
 679                 struct statvfs svfs;
 680
 681                 if (fstatvfs(f->fd, &svfs) >= 0) {
 682                         uint64_t available;
 683
 684                         available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
 685
 686                         if (new_size - old_size > available)
 687                                 return -E2BIG;
 688                 }
 689         }
 690
 691         /* Increase by larger blocks at once */
 692         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
 693         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 694                 new_size = f->metrics.max_size;
 695
 696         /* Note that the glibc fallocate() fallback is very
 697            inefficient, hence we try to minimize the allocation area
 698            as we can. */
 699         r = posix_fallocate(f->fd, old_size, new_size - old_size);
 700         if (r != 0)
 701                 return -r;
 702
 703         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
 704
 705         return journal_file_fstat(f);
 706 }
 707
 708 static unsigned type_to_context(ObjectType type) {
 709         /* One context for each type, plus one catch-all for the rest */
 710         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
 711         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
 712         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
 713 }
 714
 715 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
 716         int r;
 717
 718         assert(f);
 719         assert(ret);
 720
 721         if (size <= 0)
 722                 return -EINVAL;
 723
 724         /* Avoid SIGBUS on invalid accesses */
 725         if (offset + size > (uint64_t) f->last_stat.st_size) {
 726                 /* Hmm, out of range? Let's refresh the fstat() data
 727                  * first, before we trust that check. */
 728
 729                 r = journal_file_fstat(f);
 730                 if (r < 0)
 731                         return r;
 732
 733                 if (offset + size > (uint64_t) f->last_stat.st_size)
 734                         return -EADDRNOTAVAIL;
 735         }
 736
 737         return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
 738 }
 739
 740 static uint64_t minimum_header_size(Object *o) {
 741
 742         static const uint64_t table[] = {
 743                 [OBJECT_DATA] = sizeof(DataObject),
 744                 [OBJECT_FIELD] = sizeof(FieldObject),
 745                 [OBJECT_ENTRY] = sizeof(EntryObject),
 746                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
 747                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
 748                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
 749                 [OBJECT_TAG] = sizeof(TagObject),
 750         };
 751
 752         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
 753                 return sizeof(ObjectHeader);
 754
 755         return table[o->object.type];
 756 }
 757
 758 /* Lightweight object checks. We want this to be fast, so that we won't
 759  * slowdown every journal_file_move_to_object() call too much. */
 760 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
 761         assert(f);
 762         assert(o);
 763
 764         switch (o->object.type) {
 765
 766         case OBJECT_DATA: {
 767                 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
 768                         log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
 769                                         le64toh(o->data.n_entries), offset);
 770                         return -EBADMSG;
 771                 }
 772
 773                 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
 774                         log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
 775                               offsetof(DataObject, payload),
 776                               le64toh(o->object.size),
 777                               offset);
 778                         return -EBADMSG;
 779                 }
 780
 781                 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
 782                     !VALID64(le64toh(o->data.next_field_offset)) ||
 783                     !VALID64(le64toh(o->data.entry_offset)) ||
 784                     !VALID64(le64toh(o->data.entry_array_offset))) {
 785                         log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
 786                                 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
 787                               le64toh(o->data.next_hash_offset),
 788                               le64toh(o->data.next_field_offset),
 789                               le64toh(o->data.entry_offset),
 790                               le64toh(o->data.entry_array_offset),
 791                               offset);
 792                         return -EBADMSG;
 793                 }
 794
 795                 break;
 796         }
 797
 798         case OBJECT_FIELD:
 799                 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
 800                         log_debug(
 801                               "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
 802                               offsetof(FieldObject, payload),
 803                               le64toh(o->object.size),
 804                               offset);
 805                         return -EBADMSG;
 806                 }
 807
 808                 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
 809                     !VALID64(le64toh(o->field.head_data_offset))) {
 810                         log_debug(
 811                               "Invalid offset, next_hash_offset="OFSfmt
 812                               ", head_data_offset="OFSfmt": %"PRIu64,
 813                               le64toh(o->field.next_hash_offset),
 814                               le64toh(o->field.head_data_offset),
 815                               offset);
 816                         return -EBADMSG;
 817                 }
 818                 break;
 819
 820         case OBJECT_ENTRY:
 821                 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
 822                         log_debug(
 823                               "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
 824                               offsetof(EntryObject, items),
 825                               le64toh(o->object.size),
 826                               offset);
 827                         return -EBADMSG;
 828                 }
 829
 830                 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
 831                         log_debug(
 832                               "Invalid number items in entry: %"PRIu64": %"PRIu64,
 833                               (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
 834                               offset);
 835                         return -EBADMSG;
 836                 }
 837
 838                 if (le64toh(o->entry.seqnum) <= 0) {
 839                         log_debug(
 840                               "Invalid entry seqnum: %"PRIx64": %"PRIu64,
 841                               le64toh(o->entry.seqnum),
 842                               offset);
 843                         return -EBADMSG;
 844                 }
 845
 846                 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
 847                         log_debug(
 848                               "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
 849                               le64toh(o->entry.realtime),
 850                               offset);
 851                         return -EBADMSG;
 852                 }
 853
 854                 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
 855                         log_debug(
 856                               "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
 857                               le64toh(o->entry.monotonic),
 858                               offset);
 859                         return -EBADMSG;
 860                 }
 861
 862                 break;
 863
 864         case OBJECT_DATA_HASH_TABLE:
 865         case OBJECT_FIELD_HASH_TABLE:
 866                 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
 867                     (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
 868                         log_debug(
 869                               "Invalid %s hash table size: %"PRIu64": %"PRIu64,
 870                               o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
 871                               le64toh(o->object.size),
 872                               offset);
 873                         return -EBADMSG;
 874                 }
 875
 876                 break;
 877
 878         case OBJECT_ENTRY_ARRAY:
 879                 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
 880                     (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
 881                         log_debug(
 882                               "Invalid object entry array size: %"PRIu64": %"PRIu64,
 883                               le64toh(o->object.size),
 884                               offset);
 885                         return -EBADMSG;
 886                 }
 887
 888                 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
 889                         log_debug(
 890                               "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
 891                               le64toh(o->entry_array.next_entry_array_offset),
 892                               offset);
 893                         return -EBADMSG;
 894                 }
 895
 896                 break;
 897
 898         case OBJECT_TAG:
 899                 if (le64toh(o->object.size) != sizeof(TagObject)) {
 900                         log_debug(
 901                               "Invalid object tag size: %"PRIu64": %"PRIu64,
 902                               le64toh(o->object.size),
 903                               offset);
 904                         return -EBADMSG;
 905                 }
 906
 907                 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
 908                         log_debug(
 909                               "Invalid object tag epoch: %"PRIu64": %"PRIu64,
 910                               le64toh(o->tag.epoch),
 911                               offset);
 912                         return -EBADMSG;
 913                 }
 914
 915                 break;
 916         }
 917
 918         return 0;
 919 }
 920
 921 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
 922         int r;
 923         void *t;
 924         size_t tsize;
 925         Object *o;
 926         uint64_t s;
 927
 928         assert(f);
 929         assert(ret);
 930
 931         /* Objects may only be located at multiple of 64 bit */
 932         if (!VALID64(offset)) {
 933                 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
 934                 return -EBADMSG;
 935         }
 936
 937         /* Object may not be located in the file header */
 938         if (offset < le64toh(f->header->header_size)) {
 939                 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
 940                 return -EBADMSG;
 941         }
 942
 943         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
 944         if (r < 0)
 945                 return r;
 946
 947         o = (Object*) t;
 948         s = le64toh(o->object.size);
 949
 950         if (s == 0) {
 951                 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
 952                 return -EBADMSG;
 953         }
 954         if (s < sizeof(ObjectHeader)) {
 955                 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
 956                 return -EBADMSG;
 957         }
 958
 959         if (o->object.type <= OBJECT_UNUSED) {
 960                 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
 961                 return -EBADMSG;
 962         }
 963
 964         if (s < minimum_header_size(o)) {
 965                 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
 966                 return -EBADMSG;
 967         }
 968
 969         if (type > OBJECT_UNUSED && o->object.type != type) {
 970                 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
 971                 return -EBADMSG;
 972         }
 973
 974         if (s > tsize) {
 975                 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
 976                 if (r < 0)
 977                         return r;
 978
 979                 o = (Object*) t;
 980         }
 981
 982         r = journal_file_check_object(f, offset, o);
 983         if (r < 0)
 984                 return r;
 985
 986         *ret = o;
 987         return 0;
 988 }
 989
 990 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
 991         uint64_t r;
 992
 993         assert(f);
 994         assert(f->header);
 995
 996         r = le64toh(f->header->tail_entry_seqnum) + 1;
 997
 998         if (seqnum) {
 999                 /* If an external seqnum counter was passed, we update
1000                  * both the local and the external one, and set it to
1001                  * the maximum of both */
1002
1003                 if (*seqnum + 1 > r)
1004                         r = *seqnum + 1;
1005
1006                 *seqnum = r;
1007         }
1008
1009         f->header->tail_entry_seqnum = htole64(r);
1010
1011         if (f->header->head_entry_seqnum == 0)
1012                 f->header->head_entry_seqnum = htole64(r);
1013
1014         return r;
1015 }
1016
1017 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
1018         int r;
1019         uint64_t p;
1020         Object *tail, *o;
1021         void *t;
1022
1023         assert(f);
1024         assert(f->header);
1025         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
1026         assert(size >= sizeof(ObjectHeader));
1027         assert(offset);
1028         assert(ret);
1029
1030         r = journal_file_set_online(f);
1031         if (r < 0)
1032                 return r;
1033
1034         p = le64toh(f->header->tail_object_offset);
1035         if (p == 0)
1036                 p = le64toh(f->header->header_size);
1037         else {
1038                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
1039                 if (r < 0)
1040                         return r;
1041
1042                 p += ALIGN64(le64toh(tail->object.size));
1043         }
1044
1045         r = journal_file_allocate(f, p, size);
1046         if (r < 0)
1047                 return r;
1048
1049         r = journal_file_move_to(f, type, false, p, size, &t, NULL);
1050         if (r < 0)
1051                 return r;
1052
1053         o = (Object*) t;
1054
1055         zero(o->object);
1056         o->object.type = type;
1057         o->object.size = htole64(size);
1058
1059         f->header->tail_object_offset = htole64(p);
1060         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1061
1062         *ret = o;
1063         *offset = p;
1064
1065         return 0;
1066 }
1067
1068 static int journal_file_setup_data_hash_table(JournalFile *f) {
1069         uint64_t s, p;
1070         Object *o;
1071         int r;
1072
1073         assert(f);
1074         assert(f->header);
1075
1076         /* We estimate that we need 1 hash table entry per 768 bytes
1077            of journal file and we want to make sure we never get
1078            beyond 75% fill level. Calculate the hash table size for
1079            the maximum file size based on these metrics. */
1080
1081         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
1082         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1083                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1084
1085         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
1086
1087         r = journal_file_append_object(f,
1088                                        OBJECT_DATA_HASH_TABLE,
1089                                        offsetof(Object, hash_table.items) + s,
1090                                        &o, &p);
1091         if (r < 0)
1092                 return r;
1093
1094         memzero(o->hash_table.items, s);
1095
1096         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1097         f->header->data_hash_table_size = htole64(s);
1098
1099         return 0;
1100 }
1101
1102 static int journal_file_setup_field_hash_table(JournalFile *f) {
1103         uint64_t s, p;
1104         Object *o;
1105         int r;
1106
1107         assert(f);
1108         assert(f->header);
1109
1110         /* We use a fixed size hash table for the fields as this
1111          * number should grow very slowly only */
1112
1113         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1114         r = journal_file_append_object(f,
1115                                        OBJECT_FIELD_HASH_TABLE,
1116                                        offsetof(Object, hash_table.items) + s,
1117                                        &o, &p);
1118         if (r < 0)
1119                 return r;
1120
1121         memzero(o->hash_table.items, s);
1122
1123         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1124         f->header->field_hash_table_size = htole64(s);
1125
1126         return 0;
1127 }
1128
1129 int journal_file_map_data_hash_table(JournalFile *f) {
1130         uint64_t s, p;
1131         void *t;
1132         int r;
1133
1134         assert(f);
1135         assert(f->header);
1136
1137         if (f->data_hash_table)
1138                 return 0;
1139
1140         p = le64toh(f->header->data_hash_table_offset);
1141         s = le64toh(f->header->data_hash_table_size);
1142
1143         r = journal_file_move_to(f,
1144                                  OBJECT_DATA_HASH_TABLE,
1145                                  true,
1146                                  p, s,
1147                                  &t, NULL);
1148         if (r < 0)
1149                 return r;
1150
1151         f->data_hash_table = t;
1152         return 0;
1153 }
1154
1155 int journal_file_map_field_hash_table(JournalFile *f) {
1156         uint64_t s, p;
1157         void *t;
1158         int r;
1159
1160         assert(f);
1161         assert(f->header);
1162
1163         if (f->field_hash_table)
1164                 return 0;
1165
1166         p = le64toh(f->header->field_hash_table_offset);
1167         s = le64toh(f->header->field_hash_table_size);
1168
1169         r = journal_file_move_to(f,
1170                                  OBJECT_FIELD_HASH_TABLE,
1171                                  true,
1172                                  p, s,
1173                                  &t, NULL);
1174         if (r < 0)
1175                 return r;
1176
1177         f->field_hash_table = t;
1178         return 0;
1179 }
1180
1181 static int journal_file_link_field(
1182                 JournalFile *f,
1183                 Object *o,
1184                 uint64_t offset,
1185                 uint64_t hash) {
1186
1187         uint64_t p, h, m;
1188         int r;
1189
1190         assert(f);
1191         assert(f->header);
1192         assert(f->field_hash_table);
1193         assert(o);
1194         assert(offset > 0);
1195
1196         if (o->object.type != OBJECT_FIELD)
1197                 return -EINVAL;
1198
1199         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1200         if (m <= 0)
1201                 return -EBADMSG;
1202
1203         /* This might alter the window we are looking at */
1204         o->field.next_hash_offset = o->field.head_data_offset = 0;
1205
1206         h = hash % m;
1207         p = le64toh(f->field_hash_table[h].tail_hash_offset);
1208         if (p == 0)
1209                 f->field_hash_table[h].head_hash_offset = htole64(offset);
1210         else {
1211                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1212                 if (r < 0)
1213                         return r;
1214
1215                 o->field.next_hash_offset = htole64(offset);
1216         }
1217
1218         f->field_hash_table[h].tail_hash_offset = htole64(offset);
1219
1220         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1221                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1222
1223         return 0;
1224 }
1225
1226 static int journal_file_link_data(
1227                 JournalFile *f,
1228                 Object *o,
1229                 uint64_t offset,
1230                 uint64_t hash) {
1231
1232         uint64_t p, h, m;
1233         int r;
1234
1235         assert(f);
1236         assert(f->header);
1237         assert(f->data_hash_table);
1238         assert(o);
1239         assert(offset > 0);
1240
1241         if (o->object.type != OBJECT_DATA)
1242                 return -EINVAL;
1243
1244         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1245         if (m <= 0)
1246                 return -EBADMSG;
1247
1248         /* This might alter the window we are looking at */
1249         o->data.next_hash_offset = o->data.next_field_offset = 0;
1250         o->data.entry_offset = o->data.entry_array_offset = 0;
1251         o->data.n_entries = 0;
1252
1253         h = hash % m;
1254         p = le64toh(f->data_hash_table[h].tail_hash_offset);
1255         if (p == 0)
1256                 /* Only entry in the hash table is easy */
1257                 f->data_hash_table[h].head_hash_offset = htole64(offset);
1258         else {
1259                 /* Move back to the previous data object, to patch in
1260                  * pointer */
1261
1262                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1263                 if (r < 0)
1264                         return r;
1265
1266                 o->data.next_hash_offset = htole64(offset);
1267         }
1268
1269         f->data_hash_table[h].tail_hash_offset = htole64(offset);
1270
1271         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1272                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1273
1274         return 0;
1275 }
1276
1277 int journal_file_find_field_object_with_hash(
1278                 JournalFile *f,
1279                 const void *field, uint64_t size, uint64_t hash,
1280                 Object **ret, uint64_t *offset) {
1281
1282         uint64_t p, osize, h, m;
1283         int r;
1284
1285         assert(f);
1286         assert(f->header);
1287         assert(field && size > 0);
1288
1289         /* If the field hash table is empty, we can't find anything */
1290         if (le64toh(f->header->field_hash_table_size) <= 0)
1291                 return 0;
1292
1293         /* Map the field hash table, if it isn't mapped yet. */
1294         r = journal_file_map_field_hash_table(f);
1295         if (r < 0)
1296                 return r;
1297
1298         osize = offsetof(Object, field.payload) + size;
1299
1300         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1301         if (m <= 0)
1302                 return -EBADMSG;
1303
1304         h = hash % m;
1305         p = le64toh(f->field_hash_table[h].head_hash_offset);
1306
1307         while (p > 0) {
1308                 Object *o;
1309
1310                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1311                 if (r < 0)
1312                         return r;
1313
1314                 if (le64toh(o->field.hash) == hash &&
1315                     le64toh(o->object.size) == osize &&
1316                     memcmp(o->field.payload, field, size) == 0) {
1317
1318                         if (ret)
1319                                 *ret = o;
1320                         if (offset)
1321                                 *offset = p;
1322
1323                         return 1;
1324                 }
1325
1326                 p = le64toh(o->field.next_hash_offset);
1327         }
1328
1329         return 0;
1330 }
1331
1332 int journal_file_find_field_object(
1333                 JournalFile *f,
1334                 const void *field, uint64_t size,
1335                 Object **ret, uint64_t *offset) {
1336
1337         uint64_t hash;
1338
1339         assert(f);
1340         assert(field && size > 0);
1341
1342         hash = hash64(field, size);
1343
1344         return journal_file_find_field_object_with_hash(f,
1345                                                         field, size, hash,
1346                                                         ret, offset);
1347 }
1348
1349 int journal_file_find_data_object_with_hash(
1350                 JournalFile *f,
1351                 const void *data, uint64_t size, uint64_t hash,
1352                 Object **ret, uint64_t *offset) {
1353
1354         uint64_t p, osize, h, m;
1355         int r;
1356
1357         assert(f);
1358         assert(f->header);
1359         assert(data || size == 0);
1360
1361         /* If there's no data hash table, then there's no entry. */
1362         if (le64toh(f->header->data_hash_table_size) <= 0)
1363                 return 0;
1364
1365         /* Map the data hash table, if it isn't mapped yet. */
1366         r = journal_file_map_data_hash_table(f);
1367         if (r < 0)
1368                 return r;
1369
1370         osize = offsetof(Object, data.payload) + size;
1371
1372         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1373         if (m <= 0)
1374                 return -EBADMSG;
1375
1376         h = hash % m;
1377         p = le64toh(f->data_hash_table[h].head_hash_offset);
1378
1379         while (p > 0) {
1380                 Object *o;
1381
1382                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1383                 if (r < 0)
1384                         return r;
1385
1386                 if (le64toh(o->data.hash) != hash)
1387                         goto next;
1388
1389                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
1390 #if HAVE_XZ || HAVE_LZ4
1391                         uint64_t l;
1392                         size_t rsize = 0;
1393
1394                         l = le64toh(o->object.size);
1395                         if (l <= offsetof(Object, data.payload))
1396                                 return -EBADMSG;
1397
1398                         l -= offsetof(Object, data.payload);
1399
1400                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1401                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1402                         if (r < 0)
1403                                 return r;
1404
1405                         if (rsize == size &&
1406                             memcmp(f->compress_buffer, data, size) == 0) {
1407
1408                                 if (ret)
1409                                         *ret = o;
1410
1411                                 if (offset)
1412                                         *offset = p;
1413
1414                                 return 1;
1415                         }
1416 #else
1417                         return -EPROTONOSUPPORT;
1418 #endif
1419                 } else if (le64toh(o->object.size) == osize &&
1420                            memcmp(o->data.payload, data, size) == 0) {
1421
1422                         if (ret)
1423                                 *ret = o;
1424
1425                         if (offset)
1426                                 *offset = p;
1427
1428                         return 1;
1429                 }
1430
1431         next:
1432                 p = le64toh(o->data.next_hash_offset);
1433         }
1434
1435         return 0;
1436 }
1437
1438 int journal_file_find_data_object(
1439                 JournalFile *f,
1440                 const void *data, uint64_t size,
1441                 Object **ret, uint64_t *offset) {
1442
1443         uint64_t hash;
1444
1445         assert(f);
1446         assert(data || size == 0);
1447
1448         hash = hash64(data, size);
1449
1450         return journal_file_find_data_object_with_hash(f,
1451                                                        data, size, hash,
1452                                                        ret, offset);
1453 }
1454
1455 static int journal_file_append_field(
1456                 JournalFile *f,
1457                 const void *field, uint64_t size,
1458                 Object **ret, uint64_t *offset) {
1459
1460         uint64_t hash, p;
1461         uint64_t osize;
1462         Object *o;
1463         int r;
1464
1465         assert(f);
1466         assert(field && size > 0);
1467
1468         hash = hash64(field, size);
1469
1470         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1471         if (r < 0)
1472                 return r;
1473         else if (r > 0) {
1474
1475                 if (ret)
1476                         *ret = o;
1477
1478                 if (offset)
1479                         *offset = p;
1480
1481                 return 0;
1482         }
1483
1484         osize = offsetof(Object, field.payload) + size;
1485         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1486         if (r < 0)
1487                 return r;
1488
1489         o->field.hash = htole64(hash);
1490         memcpy(o->field.payload, field, size);
1491
1492         r = journal_file_link_field(f, o, p, hash);
1493         if (r < 0)
1494                 return r;
1495
1496         /* The linking might have altered the window, so let's
1497          * refresh our pointer */
1498         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1499         if (r < 0)
1500                 return r;
1501
1502 #if HAVE_GCRYPT
1503         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1504         if (r < 0)
1505                 return r;
1506 #endif
1507
1508         if (ret)
1509                 *ret = o;
1510
1511         if (offset)
1512                 *offset = p;
1513
1514         return 0;
1515 }
1516
1517 static int journal_file_append_data(
1518                 JournalFile *f,
1519                 const void *data, uint64_t size,
1520                 Object **ret, uint64_t *offset) {
1521
1522         uint64_t hash, p;
1523         uint64_t osize;
1524         Object *o;
1525         int r, compression = 0;
1526         const void *eq;
1527
1528         assert(f);
1529         assert(data || size == 0);
1530
1531         hash = hash64(data, size);
1532
1533         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1534         if (r < 0)
1535                 return r;
1536         if (r > 0) {
1537
1538                 if (ret)
1539                         *ret = o;
1540
1541                 if (offset)
1542                         *offset = p;
1543
1544                 return 0;
1545         }
1546
1547         osize = offsetof(Object, data.payload) + size;
1548         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1549         if (r < 0)
1550                 return r;
1551
1552         o->data.hash = htole64(hash);
1553
1554 #if HAVE_XZ || HAVE_LZ4
1555         if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1556                 size_t rsize = 0;
1557
1558                 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1559
1560                 if (compression >= 0) {
1561                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1562                         o->object.flags |= compression;
1563
1564                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1565                                   size, rsize, object_compressed_to_string(compression));
1566                 } else
1567                         /* Compression didn't work, we don't really care why, let's continue without compression */
1568                         compression = 0;
1569         }
1570 #endif
1571
1572         if (compression == 0)
1573                 memcpy_safe(o->data.payload, data, size);
1574
1575         r = journal_file_link_data(f, o, p, hash);
1576         if (r < 0)
1577                 return r;
1578
1579 #if HAVE_GCRYPT
1580         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1581         if (r < 0)
1582                 return r;
1583 #endif
1584
1585         /* The linking might have altered the window, so let's
1586          * refresh our pointer */
1587         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1588         if (r < 0)
1589                 return r;
1590
1591         if (!data)
1592                 eq = NULL;
1593         else
1594                 eq = memchr(data, '=', size);
1595         if (eq && eq > data) {
1596                 Object *fo = NULL;
1597                 uint64_t fp;
1598
1599                 /* Create field object ... */
1600                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1601                 if (r < 0)
1602                         return r;
1603
1604                 /* ... and link it in. */
1605                 o->data.next_field_offset = fo->field.head_data_offset;
1606                 fo->field.head_data_offset = le64toh(p);
1607         }
1608
1609         if (ret)
1610                 *ret = o;
1611
1612         if (offset)
1613                 *offset = p;
1614
1615         return 0;
1616 }
1617
1618 uint64_t journal_file_entry_n_items(Object *o) {
1619         assert(o);
1620
1621         if (o->object.type != OBJECT_ENTRY)
1622                 return 0;
1623
1624         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1625 }
1626
1627 uint64_t journal_file_entry_array_n_items(Object *o) {
1628         assert(o);
1629
1630         if (o->object.type != OBJECT_ENTRY_ARRAY)
1631                 return 0;
1632
1633         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1634 }
1635
1636 uint64_t journal_file_hash_table_n_items(Object *o) {
1637         assert(o);
1638
1639         if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1640                 return 0;
1641
1642         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1643 }
1644
1645 static int link_entry_into_array(JournalFile *f,
1646                                  le64_t *first,
1647                                  le64_t *idx,
1648                                  uint64_t p) {
1649         int r;
1650         uint64_t n = 0, ap = 0, q, i, a, hidx;
1651         Object *o;
1652
1653         assert(f);
1654         assert(f->header);
1655         assert(first);
1656         assert(idx);
1657         assert(p > 0);
1658
1659         a = le64toh(*first);
1660         i = hidx = le64toh(*idx);
1661         while (a > 0) {
1662
1663                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1664                 if (r < 0)
1665                         return r;
1666
1667                 n = journal_file_entry_array_n_items(o);
1668                 if (i < n) {
1669                         o->entry_array.items[i] = htole64(p);
1670                         *idx = htole64(hidx + 1);
1671                         return 0;
1672                 }
1673
1674                 i -= n;
1675                 ap = a;
1676                 a = le64toh(o->entry_array.next_entry_array_offset);
1677         }
1678
1679         if (hidx > n)
1680                 n = (hidx+1) * 2;
1681         else
1682                 n = n * 2;
1683
1684         if (n < 4)
1685                 n = 4;
1686
1687         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1688                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1689                                        &o, &q);
1690         if (r < 0)
1691                 return r;
1692
1693 #if HAVE_GCRYPT
1694         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1695         if (r < 0)
1696                 return r;
1697 #endif
1698
1699         o->entry_array.items[i] = htole64(p);
1700
1701         if (ap == 0)
1702                 *first = htole64(q);
1703         else {
1704                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1705                 if (r < 0)
1706                         return r;
1707
1708                 o->entry_array.next_entry_array_offset = htole64(q);
1709         }
1710
1711         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1712                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1713
1714         *idx = htole64(hidx + 1);
1715
1716         return 0;
1717 }
1718
1719 static int link_entry_into_array_plus_one(JournalFile *f,
1720                                           le64_t *extra,
1721                                           le64_t *first,
1722                                           le64_t *idx,
1723                                           uint64_t p) {
1724
1725         int r;
1726
1727         assert(f);
1728         assert(extra);
1729         assert(first);
1730         assert(idx);
1731         assert(p > 0);
1732
1733         if (*idx == 0)
1734                 *extra = htole64(p);
1735         else {
1736                 le64_t i;
1737
1738                 i = htole64(le64toh(*idx) - 1);
1739                 r = link_entry_into_array(f, first, &i, p);
1740                 if (r < 0)
1741                         return r;
1742         }
1743
1744         *idx = htole64(le64toh(*idx) + 1);
1745         return 0;
1746 }
1747
1748 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1749         uint64_t p;
1750         int r;
1751         assert(f);
1752         assert(o);
1753         assert(offset > 0);
1754
1755         p = le64toh(o->entry.items[i].object_offset);
1756         if (p == 0)
1757                 return -EINVAL;
1758
1759         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1760         if (r < 0)
1761                 return r;
1762
1763         return link_entry_into_array_plus_one(f,
1764                                               &o->data.entry_offset,
1765                                               &o->data.entry_array_offset,
1766                                               &o->data.n_entries,
1767                                               offset);
1768 }
1769
1770 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1771         uint64_t n, i;
1772         int r;
1773
1774         assert(f);
1775         assert(f->header);
1776         assert(o);
1777         assert(offset > 0);
1778
1779         if (o->object.type != OBJECT_ENTRY)
1780                 return -EINVAL;
1781
1782         __sync_synchronize();
1783
1784         /* Link up the entry itself */
1785         r = link_entry_into_array(f,
1786                                   &f->header->entry_array_offset,
1787                                   &f->header->n_entries,
1788                                   offset);
1789         if (r < 0)
1790                 return r;
1791
1792         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1793
1794         if (f->header->head_entry_realtime == 0)
1795                 f->header->head_entry_realtime = o->entry.realtime;
1796
1797         f->header->tail_entry_realtime = o->entry.realtime;
1798         f->header->tail_entry_monotonic = o->entry.monotonic;
1799
1800         f->tail_entry_monotonic_valid = true;
1801
1802         /* Link up the items */
1803         n = journal_file_entry_n_items(o);
1804         for (i = 0; i < n; i++) {
1805                 r = journal_file_link_entry_item(f, o, offset, i);
1806                 if (r < 0)
1807                         return r;
1808         }
1809
1810         return 0;
1811 }
1812
1813 static int journal_file_append_entry_internal(
1814                 JournalFile *f,
1815                 const dual_timestamp *ts,
1816                 uint64_t xor_hash,
1817                 const EntryItem items[], unsigned n_items,
1818                 uint64_t *seqnum,
1819                 Object **ret, uint64_t *offset) {
1820         uint64_t np;
1821         uint64_t osize;
1822         Object *o;
1823         int r;
1824
1825         assert(f);
1826         assert(f->header);
1827         assert(items || n_items == 0);
1828         assert(ts);
1829
1830         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1831
1832         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1833         if (r < 0)
1834                 return r;
1835
1836         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1837         memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
1838         o->entry.realtime = htole64(ts->realtime);
1839         o->entry.monotonic = htole64(ts->monotonic);
1840         o->entry.xor_hash = htole64(xor_hash);
1841         o->entry.boot_id = f->header->boot_id;
1842
1843 #if HAVE_GCRYPT
1844         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1845         if (r < 0)
1846                 return r;
1847 #endif
1848
1849         r = journal_file_link_entry(f, o, np);
1850         if (r < 0)
1851                 return r;
1852
1853         if (ret)
1854                 *ret = o;
1855
1856         if (offset)
1857                 *offset = np;
1858
1859         return 0;
1860 }
1861
1862 void journal_file_post_change(JournalFile *f) {
1863         assert(f);
1864
1865         /* inotify() does not receive IN_MODIFY events from file
1866          * accesses done via mmap(). After each access we hence
1867          * trigger IN_MODIFY by truncating the journal file to its
1868          * current size which triggers IN_MODIFY. */
1869
1870         __sync_synchronize();
1871
1872         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1873                 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1874 }
1875
1876 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1877         assert(userdata);
1878
1879         journal_file_post_change(userdata);
1880
1881         return 1;
1882 }
1883
1884 static void schedule_post_change(JournalFile *f) {
1885         sd_event_source *timer;
1886         int enabled, r;
1887         uint64_t now;
1888
1889         assert(f);
1890         assert(f->post_change_timer);
1891
1892         timer = f->post_change_timer;
1893
1894         r = sd_event_source_get_enabled(timer, &enabled);
1895         if (r < 0) {
1896                 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1897                 goto fail;
1898         }
1899
1900         if (enabled == SD_EVENT_ONESHOT)
1901                 return;
1902
1903         r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1904         if (r < 0) {
1905                 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1906                 goto fail;
1907         }
1908
1909         r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1910         if (r < 0) {
1911                 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1912                 goto fail;
1913         }
1914
1915         r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1916         if (r < 0) {
1917                 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1918                 goto fail;
1919         }
1920
1921         return;
1922
1923 fail:
1924         /* On failure, let's simply post the change immediately. */
1925         journal_file_post_change(f);
1926 }
1927
1928 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1929 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1930         _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1931         int r;
1932
1933         assert(f);
1934         assert_return(!f->post_change_timer, -EINVAL);
1935         assert(e);
1936         assert(t);
1937
1938         r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1939         if (r < 0)
1940                 return r;
1941
1942         r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1943         if (r < 0)
1944                 return r;
1945
1946         f->post_change_timer = timer;
1947         timer = NULL;
1948         f->post_change_timer_period = t;
1949
1950         return r;
1951 }
1952
1953 static int entry_item_cmp(const void *_a, const void *_b) {
1954         const EntryItem *a = _a, *b = _b;
1955
1956         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1957                 return -1;
1958         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1959                 return 1;
1960         return 0;
1961 }
1962
1963 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1964         unsigned i;
1965         EntryItem *items;
1966         int r;
1967         uint64_t xor_hash = 0;
1968         struct dual_timestamp _ts;
1969
1970         assert(f);
1971         assert(f->header);
1972         assert(iovec || n_iovec == 0);
1973
1974         if (!ts) {
1975                 dual_timestamp_get(&_ts);
1976                 ts = &_ts;
1977         }
1978
1979 #if HAVE_GCRYPT
1980         r = journal_file_maybe_append_tag(f, ts->realtime);
1981         if (r < 0)
1982                 return r;
1983 #endif
1984
1985         /* alloca() can't take 0, hence let's allocate at least one */
1986         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1987
1988         for (i = 0; i < n_iovec; i++) {
1989                 uint64_t p;
1990                 Object *o;
1991
1992                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1993                 if (r < 0)
1994                         return r;
1995
1996                 xor_hash ^= le64toh(o->data.hash);
1997                 items[i].object_offset = htole64(p);
1998                 items[i].hash = o->data.hash;
1999         }
2000
2001         /* Order by the position on disk, in order to improve seek
2002          * times for rotating media. */
2003         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
2004
2005         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
2006
2007         /* If the memory mapping triggered a SIGBUS then we return an
2008          * IO error and ignore the error code passed down to us, since
2009          * it is very likely just an effect of a nullified replacement
2010          * mapping page */
2011
2012         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
2013                 r = -EIO;
2014
2015         if (f->post_change_timer)
2016                 schedule_post_change(f);
2017         else
2018                 journal_file_post_change(f);
2019
2020         return r;
2021 }
2022
2023 typedef struct ChainCacheItem {
2024         uint64_t first; /* the array at the beginning of the chain */
2025         uint64_t array; /* the cached array */
2026         uint64_t begin; /* the first item in the cached array */
2027         uint64_t total; /* the total number of items in all arrays before this one in the chain */
2028         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
2029 } ChainCacheItem;
2030
2031 static void chain_cache_put(
2032                 OrderedHashmap *h,
2033                 ChainCacheItem *ci,
2034                 uint64_t first,
2035                 uint64_t array,
2036                 uint64_t begin,
2037                 uint64_t total,
2038                 uint64_t last_index) {
2039
2040         if (!ci) {
2041                 /* If the chain item to cache for this chain is the
2042                  * first one it's not worth caching anything */
2043                 if (array == first)
2044                         return;
2045
2046                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2047                         ci = ordered_hashmap_steal_first(h);
2048                         assert(ci);
2049                 } else {
2050                         ci = new(ChainCacheItem, 1);
2051                         if (!ci)
2052                                 return;
2053                 }
2054
2055                 ci->first = first;
2056
2057                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2058                         free(ci);
2059                         return;
2060                 }
2061         } else
2062                 assert(ci->first == first);
2063
2064         ci->array = array;
2065         ci->begin = begin;
2066         ci->total = total;
2067         ci->last_index = last_index;
2068 }
2069
2070 static int generic_array_get(
2071                 JournalFile *f,
2072                 uint64_t first,
2073                 uint64_t i,
2074                 Object **ret, uint64_t *offset) {
2075
2076         Object *o;
2077         uint64_t p = 0, a, t = 0;
2078         int r;
2079         ChainCacheItem *ci;
2080
2081         assert(f);
2082
2083         a = first;
2084
2085         /* Try the chain cache first */
2086         ci = ordered_hashmap_get(f->chain_cache, &first);
2087         if (ci && i > ci->total) {
2088                 a = ci->array;
2089                 i -= ci->total;
2090                 t = ci->total;
2091         }
2092
2093         while (a > 0) {
2094                 uint64_t k;
2095
2096                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2097                 if (r < 0)
2098                         return r;
2099
2100                 k = journal_file_entry_array_n_items(o);
2101                 if (i < k) {
2102                         p = le64toh(o->entry_array.items[i]);
2103                         goto found;
2104                 }
2105
2106                 i -= k;
2107                 t += k;
2108                 a = le64toh(o->entry_array.next_entry_array_offset);
2109         }
2110
2111         return 0;
2112
2113 found:
2114         /* Let's cache this item for the next invocation */
2115         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
2116
2117         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2118         if (r < 0)
2119                 return r;
2120
2121         if (ret)
2122                 *ret = o;
2123
2124         if (offset)
2125                 *offset = p;
2126
2127         return 1;
2128 }
2129
2130 static int generic_array_get_plus_one(
2131                 JournalFile *f,
2132                 uint64_t extra,
2133                 uint64_t first,
2134                 uint64_t i,
2135                 Object **ret, uint64_t *offset) {
2136
2137         Object *o;
2138
2139         assert(f);
2140
2141         if (i == 0) {
2142                 int r;
2143
2144                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2145                 if (r < 0)
2146                         return r;
2147
2148                 if (ret)
2149                         *ret = o;
2150
2151                 if (offset)
2152                         *offset = extra;
2153
2154                 return 1;
2155         }
2156
2157         return generic_array_get(f, first, i-1, ret, offset);
2158 }
2159
2160 enum {
2161         TEST_FOUND,
2162         TEST_LEFT,
2163         TEST_RIGHT
2164 };
2165
2166 static int generic_array_bisect(
2167                 JournalFile *f,
2168                 uint64_t first,
2169                 uint64_t n,
2170                 uint64_t needle,
2171                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2172                 direction_t direction,
2173                 Object **ret,
2174                 uint64_t *offset,
2175                 uint64_t *idx) {
2176
2177         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
2178         bool subtract_one = false;
2179         Object *o, *array = NULL;
2180         int r;
2181         ChainCacheItem *ci;
2182
2183         assert(f);
2184         assert(test_object);
2185
2186         /* Start with the first array in the chain */
2187         a = first;
2188
2189         ci = ordered_hashmap_get(f->chain_cache, &first);
2190         if (ci && n > ci->total) {
2191                 /* Ah, we have iterated this bisection array chain
2192                  * previously! Let's see if we can skip ahead in the
2193                  * chain, as far as the last time. But we can't jump
2194                  * backwards in the chain, so let's check that
2195                  * first. */
2196
2197                 r = test_object(f, ci->begin, needle);
2198                 if (r < 0)
2199                         return r;
2200
2201                 if (r == TEST_LEFT) {
2202                         /* OK, what we are looking for is right of the
2203                          * begin of this EntryArray, so let's jump
2204                          * straight to previously cached array in the
2205                          * chain */
2206
2207                         a = ci->array;
2208                         n -= ci->total;
2209                         t = ci->total;
2210                         last_index = ci->last_index;
2211                 }
2212         }
2213
2214         while (a > 0) {
2215                 uint64_t left, right, k, lp;
2216
2217                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
2218                 if (r < 0)
2219                         return r;
2220
2221                 k = journal_file_entry_array_n_items(array);
2222                 right = MIN(k, n);
2223                 if (right <= 0)
2224                         return 0;
2225
2226                 i = right - 1;
2227                 lp = p = le64toh(array->entry_array.items[i]);
2228                 if (p <= 0)
2229                         r = -EBADMSG;
2230                 else
2231                         r = test_object(f, p, needle);
2232                 if (r == -EBADMSG) {
2233                         log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2234                         n = i;
2235                         continue;
2236                 }
2237                 if (r < 0)
2238                         return r;
2239
2240                 if (r == TEST_FOUND)
2241                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2242
2243                 if (r == TEST_RIGHT) {
2244                         left = 0;
2245                         right -= 1;
2246
2247                         if (last_index != (uint64_t) -1) {
2248                                 assert(last_index <= right);
2249
2250                                 /* If we cached the last index we
2251                                  * looked at, let's try to not to jump
2252                                  * too wildly around and see if we can
2253                                  * limit the range to look at early to
2254                                  * the immediate neighbors of the last
2255                                  * index we looked at. */
2256
2257                                 if (last_index > 0) {
2258                                         uint64_t x = last_index - 1;
2259
2260                                         p = le64toh(array->entry_array.items[x]);
2261                                         if (p <= 0)
2262                                                 return -EBADMSG;
2263
2264                                         r = test_object(f, p, needle);
2265                                         if (r < 0)
2266                                                 return r;
2267
2268                                         if (r == TEST_FOUND)
2269                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2270
2271                                         if (r == TEST_RIGHT)
2272                                                 right = x;
2273                                         else
2274                                                 left = x + 1;
2275                                 }
2276
2277                                 if (last_index < right) {
2278                                         uint64_t y = last_index + 1;
2279
2280                                         p = le64toh(array->entry_array.items[y]);
2281                                         if (p <= 0)
2282                                                 return -EBADMSG;
2283
2284                                         r = test_object(f, p, needle);
2285                                         if (r < 0)
2286                                                 return r;
2287
2288                                         if (r == TEST_FOUND)
2289                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2290
2291                                         if (r == TEST_RIGHT)
2292                                                 right = y;
2293                                         else
2294                                                 left = y + 1;
2295                                 }
2296                         }
2297
2298                         for (;;) {
2299                                 if (left == right) {
2300                                         if (direction == DIRECTION_UP)
2301                                                 subtract_one = true;
2302
2303                                         i = left;
2304                                         goto found;
2305                                 }
2306
2307                                 assert(left < right);
2308                                 i = (left + right) / 2;
2309
2310                                 p = le64toh(array->entry_array.items[i]);
2311                                 if (p <= 0)
2312                                         r = -EBADMSG;
2313                                 else
2314                                         r = test_object(f, p, needle);
2315                                 if (r == -EBADMSG) {
2316                                         log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2317                                         right = n = i;
2318                                         continue;
2319                                 }
2320                                 if (r < 0)
2321                                         return r;
2322
2323                                 if (r == TEST_FOUND)
2324                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2325
2326                                 if (r == TEST_RIGHT)
2327                                         right = i;
2328                                 else
2329                                         left = i + 1;
2330                         }
2331                 }
2332
2333                 if (k >= n) {
2334                         if (direction == DIRECTION_UP) {
2335                                 i = n;
2336                                 subtract_one = true;
2337                                 goto found;
2338                         }
2339
2340                         return 0;
2341                 }
2342
2343                 last_p = lp;
2344
2345                 n -= k;
2346                 t += k;
2347                 last_index = (uint64_t) -1;
2348                 a = le64toh(array->entry_array.next_entry_array_offset);
2349         }
2350
2351         return 0;
2352
2353 found:
2354         if (subtract_one && t == 0 && i == 0)
2355                 return 0;
2356
2357         /* Let's cache this item for the next invocation */
2358         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
2359
2360         if (subtract_one && i == 0)
2361                 p = last_p;
2362         else if (subtract_one)
2363                 p = le64toh(array->entry_array.items[i-1]);
2364         else
2365                 p = le64toh(array->entry_array.items[i]);
2366
2367         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2368         if (r < 0)
2369                 return r;
2370
2371         if (ret)
2372                 *ret = o;
2373
2374         if (offset)
2375                 *offset = p;
2376
2377         if (idx)
2378                 *idx = t + i + (subtract_one ? -1 : 0);
2379
2380         return 1;
2381 }
2382
2383 static int generic_array_bisect_plus_one(
2384                 JournalFile *f,
2385                 uint64_t extra,
2386                 uint64_t first,
2387                 uint64_t n,
2388                 uint64_t needle,
2389                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2390                 direction_t direction,
2391                 Object **ret,
2392                 uint64_t *offset,
2393                 uint64_t *idx) {
2394
2395         int r;
2396         bool step_back = false;
2397         Object *o;
2398
2399         assert(f);
2400         assert(test_object);
2401
2402         if (n <= 0)
2403                 return 0;
2404
2405         /* This bisects the array in object 'first', but first checks
2406          * an extra  */
2407         r = test_object(f, extra, needle);
2408         if (r < 0)
2409                 return r;
2410
2411         if (r == TEST_FOUND)
2412                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2413
2414         /* if we are looking with DIRECTION_UP then we need to first
2415            see if in the actual array there is a matching entry, and
2416            return the last one of that. But if there isn't any we need
2417            to return this one. Hence remember this, and return it
2418            below. */
2419         if (r == TEST_LEFT)
2420                 step_back = direction == DIRECTION_UP;
2421
2422         if (r == TEST_RIGHT) {
2423                 if (direction == DIRECTION_DOWN)
2424                         goto found;
2425                 else
2426                         return 0;
2427         }
2428
2429         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2430
2431         if (r == 0 && step_back)
2432                 goto found;
2433
2434         if (r > 0 && idx)
2435                 (*idx)++;
2436
2437         return r;
2438
2439 found:
2440         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2441         if (r < 0)
2442                 return r;
2443
2444         if (ret)
2445                 *ret = o;
2446
2447         if (offset)
2448                 *offset = extra;
2449
2450         if (idx)
2451                 *idx = 0;
2452
2453         return 1;
2454 }
2455
2456 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2457         assert(f);
2458         assert(p > 0);
2459
2460         if (p == needle)
2461                 return TEST_FOUND;
2462         else if (p < needle)
2463                 return TEST_LEFT;
2464         else
2465                 return TEST_RIGHT;
2466 }
2467
2468 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2469         Object *o;
2470         int r;
2471
2472         assert(f);
2473         assert(p > 0);
2474
2475         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2476         if (r < 0)
2477                 return r;
2478
2479         if (le64toh(o->entry.seqnum) == needle)
2480                 return TEST_FOUND;
2481         else if (le64toh(o->entry.seqnum) < needle)
2482                 return TEST_LEFT;
2483         else
2484                 return TEST_RIGHT;
2485 }
2486
2487 int journal_file_move_to_entry_by_seqnum(
2488                 JournalFile *f,
2489                 uint64_t seqnum,
2490                 direction_t direction,
2491                 Object **ret,
2492                 uint64_t *offset) {
2493         assert(f);
2494         assert(f->header);
2495
2496         return generic_array_bisect(f,
2497                                     le64toh(f->header->entry_array_offset),
2498                                     le64toh(f->header->n_entries),
2499                                     seqnum,
2500                                     test_object_seqnum,
2501                                     direction,
2502                                     ret, offset, NULL);
2503 }
2504
2505 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2506         Object *o;
2507         int r;
2508
2509         assert(f);
2510         assert(p > 0);
2511
2512         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2513         if (r < 0)
2514                 return r;
2515
2516         if (le64toh(o->entry.realtime) == needle)
2517                 return TEST_FOUND;
2518         else if (le64toh(o->entry.realtime) < needle)
2519                 return TEST_LEFT;
2520         else
2521                 return TEST_RIGHT;
2522 }
2523
2524 int journal_file_move_to_entry_by_realtime(
2525                 JournalFile *f,
2526                 uint64_t realtime,
2527                 direction_t direction,
2528                 Object **ret,
2529                 uint64_t *offset) {
2530         assert(f);
2531         assert(f->header);
2532
2533         return generic_array_bisect(f,
2534                                     le64toh(f->header->entry_array_offset),
2535                                     le64toh(f->header->n_entries),
2536                                     realtime,
2537                                     test_object_realtime,
2538                                     direction,
2539                                     ret, offset, NULL);
2540 }
2541
2542 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2543         Object *o;
2544         int r;
2545
2546         assert(f);
2547         assert(p > 0);
2548
2549         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2550         if (r < 0)
2551                 return r;
2552
2553         if (le64toh(o->entry.monotonic) == needle)
2554                 return TEST_FOUND;
2555         else if (le64toh(o->entry.monotonic) < needle)
2556                 return TEST_LEFT;
2557         else
2558                 return TEST_RIGHT;
2559 }
2560
2561 static int find_data_object_by_boot_id(
2562                 JournalFile *f,
2563                 sd_id128_t boot_id,
2564                 Object **o,
2565                 uint64_t *b) {
2566
2567         char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2568
2569         sd_id128_to_string(boot_id, t + 9);
2570         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2571 }
2572
2573 int journal_file_move_to_entry_by_monotonic(
2574                 JournalFile *f,
2575                 sd_id128_t boot_id,
2576                 uint64_t monotonic,
2577                 direction_t direction,
2578                 Object **ret,
2579                 uint64_t *offset) {
2580
2581         Object *o;
2582         int r;
2583
2584         assert(f);
2585
2586         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2587         if (r < 0)
2588                 return r;
2589         if (r == 0)
2590                 return -ENOENT;
2591
2592         return generic_array_bisect_plus_one(f,
2593                                              le64toh(o->data.entry_offset),
2594                                              le64toh(o->data.entry_array_offset),
2595                                              le64toh(o->data.n_entries),
2596                                              monotonic,
2597                                              test_object_monotonic,
2598                                              direction,
2599                                              ret, offset, NULL);
2600 }
2601
2602 void journal_file_reset_location(JournalFile *f) {
2603         f->location_type = LOCATION_HEAD;
2604         f->current_offset = 0;
2605         f->current_seqnum = 0;
2606         f->current_realtime = 0;
2607         f->current_monotonic = 0;
2608         zero(f->current_boot_id);
2609         f->current_xor_hash = 0;
2610 }
2611
2612 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2613         f->location_type = LOCATION_SEEK;
2614         f->current_offset = offset;
2615         f->current_seqnum = le64toh(o->entry.seqnum);
2616         f->current_realtime = le64toh(o->entry.realtime);
2617         f->current_monotonic = le64toh(o->entry.monotonic);
2618         f->current_boot_id = o->entry.boot_id;
2619         f->current_xor_hash = le64toh(o->entry.xor_hash);
2620 }
2621
2622 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2623         assert(af);
2624         assert(af->header);
2625         assert(bf);
2626         assert(bf->header);
2627         assert(af->location_type == LOCATION_SEEK);
2628         assert(bf->location_type == LOCATION_SEEK);
2629
2630         /* If contents and timestamps match, these entries are
2631          * identical, even if the seqnum does not match */
2632         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2633             af->current_monotonic == bf->current_monotonic &&
2634             af->current_realtime == bf->current_realtime &&
2635             af->current_xor_hash == bf->current_xor_hash)
2636                 return 0;
2637
2638         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2639
2640                 /* If this is from the same seqnum source, compare
2641                  * seqnums */
2642                 if (af->current_seqnum < bf->current_seqnum)
2643                         return -1;
2644                 if (af->current_seqnum > bf->current_seqnum)
2645                         return 1;
2646
2647                 /* Wow! This is weird, different data but the same
2648                  * seqnums? Something is borked, but let's make the
2649                  * best of it and compare by time. */
2650         }
2651
2652         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2653
2654                 /* If the boot id matches, compare monotonic time */
2655                 if (af->current_monotonic < bf->current_monotonic)
2656                         return -1;
2657                 if (af->current_monotonic > bf->current_monotonic)
2658                         return 1;
2659         }
2660
2661         /* Otherwise, compare UTC time */
2662         if (af->current_realtime < bf->current_realtime)
2663                 return -1;
2664         if (af->current_realtime > bf->current_realtime)
2665                 return 1;
2666
2667         /* Finally, compare by contents */
2668         if (af->current_xor_hash < bf->current_xor_hash)
2669                 return -1;
2670         if (af->current_xor_hash > bf->current_xor_hash)
2671                 return 1;
2672
2673         return 0;
2674 }
2675
2676 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2677
2678         /* Increase or decrease the specified index, in the right direction. */
2679
2680         if (direction == DIRECTION_DOWN) {
2681                 if (*i >= n - 1)
2682                         return 0;
2683
2684                 (*i) ++;
2685         } else {
2686                 if (*i <= 0)
2687                         return 0;
2688
2689                 (*i) --;
2690         }
2691
2692         return 1;
2693 }
2694
2695 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2696
2697         /* Consider it an error if any of the two offsets is uninitialized */
2698         if (old_offset == 0 || new_offset == 0)
2699                 return false;
2700
2701         /* If we go down, the new offset must be larger than the old one. */
2702         return direction == DIRECTION_DOWN ?
2703                 new_offset > old_offset  :
2704                 new_offset < old_offset;
2705 }
2706
2707 int journal_file_next_entry(
2708                 JournalFile *f,
2709                 uint64_t p,
2710                 direction_t direction,
2711                 Object **ret, uint64_t *offset) {
2712
2713         uint64_t i, n, ofs;
2714         int r;
2715
2716         assert(f);
2717         assert(f->header);
2718
2719         n = le64toh(f->header->n_entries);
2720         if (n <= 0)
2721                 return 0;
2722
2723         if (p == 0)
2724                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2725         else {
2726                 r = generic_array_bisect(f,
2727                                          le64toh(f->header->entry_array_offset),
2728                                          le64toh(f->header->n_entries),
2729                                          p,
2730                                          test_object_offset,
2731                                          DIRECTION_DOWN,
2732                                          NULL, NULL,
2733                                          &i);
2734                 if (r <= 0)
2735                         return r;
2736
2737                 r = bump_array_index(&i, direction, n);
2738                 if (r <= 0)
2739                         return r;
2740         }
2741
2742         /* And jump to it */
2743         for (;;) {
2744                 r = generic_array_get(f,
2745                                       le64toh(f->header->entry_array_offset),
2746                                       i,
2747                                       ret, &ofs);
2748                 if (r > 0)
2749                         break;
2750                 if (r != -EBADMSG)
2751                         return r;
2752
2753                 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2754                  * the next one might work for us instead. */
2755                 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2756
2757                 r = bump_array_index(&i, direction, n);
2758                 if (r <= 0)
2759                         return r;
2760         }
2761
2762         /* Ensure our array is properly ordered. */
2763         if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2764                 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
2765                 return -EBADMSG;
2766         }
2767
2768         if (offset)
2769                 *offset = ofs;
2770
2771         return 1;
2772 }
2773
2774 int journal_file_next_entry_for_data(
2775                 JournalFile *f,
2776                 Object *o, uint64_t p,
2777                 uint64_t data_offset,
2778                 direction_t direction,
2779                 Object **ret, uint64_t *offset) {
2780
2781         uint64_t i, n, ofs;
2782         Object *d;
2783         int r;
2784
2785         assert(f);
2786         assert(p > 0 || !o);
2787
2788         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2789         if (r < 0)
2790                 return r;
2791
2792         n = le64toh(d->data.n_entries);
2793         if (n <= 0)
2794                 return n;
2795
2796         if (!o)
2797                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2798         else {
2799                 if (o->object.type != OBJECT_ENTRY)
2800                         return -EINVAL;
2801
2802                 r = generic_array_bisect_plus_one(f,
2803                                                   le64toh(d->data.entry_offset),
2804                                                   le64toh(d->data.entry_array_offset),
2805                                                   le64toh(d->data.n_entries),
2806                                                   p,
2807                                                   test_object_offset,
2808                                                   DIRECTION_DOWN,
2809                                                   NULL, NULL,
2810                                                   &i);
2811
2812                 if (r <= 0)
2813                         return r;
2814
2815                 r = bump_array_index(&i, direction, n);
2816                 if (r <= 0)
2817                         return r;
2818         }
2819
2820         for (;;) {
2821                 r = generic_array_get_plus_one(f,
2822                                                le64toh(d->data.entry_offset),
2823                                                le64toh(d->data.entry_array_offset),
2824                                                i,
2825                                                ret, &ofs);
2826                 if (r > 0)
2827                         break;
2828                 if (r != -EBADMSG)
2829                         return r;
2830
2831                 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2832
2833                 r = bump_array_index(&i, direction, n);
2834                 if (r <= 0)
2835                         return r;
2836         }
2837
2838         /* Ensure our array is properly ordered. */
2839         if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2840                 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2841                 return -EBADMSG;
2842         }
2843
2844         if (offset)
2845                 *offset = ofs;
2846
2847         return 1;
2848 }
2849
2850 int journal_file_move_to_entry_by_offset_for_data(
2851                 JournalFile *f,
2852                 uint64_t data_offset,
2853                 uint64_t p,
2854                 direction_t direction,
2855                 Object **ret, uint64_t *offset) {
2856
2857         int r;
2858         Object *d;
2859
2860         assert(f);
2861
2862         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2863         if (r < 0)
2864                 return r;
2865
2866         return generic_array_bisect_plus_one(f,
2867                                              le64toh(d->data.entry_offset),
2868                                              le64toh(d->data.entry_array_offset),
2869                                              le64toh(d->data.n_entries),
2870                                              p,
2871                                              test_object_offset,
2872                                              direction,
2873                                              ret, offset, NULL);
2874 }
2875
2876 int journal_file_move_to_entry_by_monotonic_for_data(
2877                 JournalFile *f,
2878                 uint64_t data_offset,
2879                 sd_id128_t boot_id,
2880                 uint64_t monotonic,
2881                 direction_t direction,
2882                 Object **ret, uint64_t *offset) {
2883
2884         Object *o, *d;
2885         int r;
2886         uint64_t b, z;
2887
2888         assert(f);
2889
2890         /* First, seek by time */
2891         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2892         if (r < 0)
2893                 return r;
2894         if (r == 0)
2895                 return -ENOENT;
2896
2897         r = generic_array_bisect_plus_one(f,
2898                                           le64toh(o->data.entry_offset),
2899                                           le64toh(o->data.entry_array_offset),
2900                                           le64toh(o->data.n_entries),
2901                                           monotonic,
2902                                           test_object_monotonic,
2903                                           direction,
2904                                           NULL, &z, NULL);
2905         if (r <= 0)
2906                 return r;
2907
2908         /* And now, continue seeking until we find an entry that
2909          * exists in both bisection arrays */
2910
2911         for (;;) {
2912                 Object *qo;
2913                 uint64_t p, q;
2914
2915                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2916                 if (r < 0)
2917                         return r;
2918
2919                 r = generic_array_bisect_plus_one(f,
2920                                                   le64toh(d->data.entry_offset),
2921                                                   le64toh(d->data.entry_array_offset),
2922                                                   le64toh(d->data.n_entries),
2923                                                   z,
2924                                                   test_object_offset,
2925                                                   direction,
2926                                                   NULL, &p, NULL);
2927                 if (r <= 0)
2928                         return r;
2929
2930                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2931                 if (r < 0)
2932                         return r;
2933
2934                 r = generic_array_bisect_plus_one(f,
2935                                                   le64toh(o->data.entry_offset),
2936                                                   le64toh(o->data.entry_array_offset),
2937                                                   le64toh(o->data.n_entries),
2938                                                   p,
2939                                                   test_object_offset,
2940                                                   direction,
2941                                                   &qo, &q, NULL);
2942
2943                 if (r <= 0)
2944                         return r;
2945
2946                 if (p == q) {
2947                         if (ret)
2948                                 *ret = qo;
2949                         if (offset)
2950                                 *offset = q;
2951
2952                         return 1;
2953                 }
2954
2955                 z = q;
2956         }
2957 }
2958
2959 int journal_file_move_to_entry_by_seqnum_for_data(
2960                 JournalFile *f,
2961                 uint64_t data_offset,
2962                 uint64_t seqnum,
2963                 direction_t direction,
2964                 Object **ret, uint64_t *offset) {
2965
2966         Object *d;
2967         int r;
2968
2969         assert(f);
2970
2971         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2972         if (r < 0)
2973                 return r;
2974
2975         return generic_array_bisect_plus_one(f,
2976                                              le64toh(d->data.entry_offset),
2977                                              le64toh(d->data.entry_array_offset),
2978                                              le64toh(d->data.n_entries),
2979                                              seqnum,
2980                                              test_object_seqnum,
2981                                              direction,
2982                                              ret, offset, NULL);
2983 }
2984
2985 int journal_file_move_to_entry_by_realtime_for_data(
2986                 JournalFile *f,
2987                 uint64_t data_offset,
2988                 uint64_t realtime,
2989                 direction_t direction,
2990                 Object **ret, uint64_t *offset) {
2991
2992         Object *d;
2993         int r;
2994
2995         assert(f);
2996
2997         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2998         if (r < 0)
2999                 return r;
3000
3001         return generic_array_bisect_plus_one(f,
3002                                              le64toh(d->data.entry_offset),
3003                                              le64toh(d->data.entry_array_offset),
3004                                              le64toh(d->data.n_entries),
3005                                              realtime,
3006                                              test_object_realtime,
3007                                              direction,
3008                                              ret, offset, NULL);
3009 }
3010
3011 void journal_file_dump(JournalFile *f) {
3012         Object *o;
3013         int r;
3014         uint64_t p;
3015
3016         assert(f);
3017         assert(f->header);
3018
3019         journal_file_print_header(f);
3020
3021         p = le64toh(f->header->header_size);
3022         while (p != 0) {
3023                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
3024                 if (r < 0)
3025                         goto fail;
3026
3027                 switch (o->object.type) {
3028
3029                 case OBJECT_UNUSED:
3030                         printf("Type: OBJECT_UNUSED\n");
3031                         break;
3032
3033                 case OBJECT_DATA:
3034                         printf("Type: OBJECT_DATA\n");
3035                         break;
3036
3037                 case OBJECT_FIELD:
3038                         printf("Type: OBJECT_FIELD\n");
3039                         break;
3040
3041                 case OBJECT_ENTRY:
3042                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3043                                le64toh(o->entry.seqnum),
3044                                le64toh(o->entry.monotonic),
3045                                le64toh(o->entry.realtime));
3046                         break;
3047
3048                 case OBJECT_FIELD_HASH_TABLE:
3049                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3050                         break;
3051
3052                 case OBJECT_DATA_HASH_TABLE:
3053                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
3054                         break;
3055
3056                 case OBJECT_ENTRY_ARRAY:
3057                         printf("Type: OBJECT_ENTRY_ARRAY\n");
3058                         break;
3059
3060                 case OBJECT_TAG:
3061                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3062                                le64toh(o->tag.seqnum),
3063                                le64toh(o->tag.epoch));
3064                         break;
3065
3066                 default:
3067                         printf("Type: unknown (%i)\n", o->object.type);
3068                         break;
3069                 }
3070
3071                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3072                         printf("Flags: %s\n",
3073                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3074
3075                 if (p == le64toh(f->header->tail_object_offset))
3076                         p = 0;
3077                 else
3078                         p = p + ALIGN64(le64toh(o->object.size));
3079         }
3080
3081         return;
3082 fail:
3083         log_error("File corrupt");
3084 }
3085
3086 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3087         const char *x;
3088
3089         x = format_timestamp(buf, l, t);
3090         if (x)
3091                 return x;
3092         return " --- ";
3093 }
3094
3095 void journal_file_print_header(JournalFile *f) {
3096         char a[33], b[33], c[33], d[33];
3097         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
3098         struct stat st;
3099         char bytes[FORMAT_BYTES_MAX];
3100
3101         assert(f);
3102         assert(f->header);
3103
3104         printf("File Path: %s\n"
3105                "File ID: %s\n"
3106                "Machine ID: %s\n"
3107                "Boot ID: %s\n"
3108                "Sequential Number ID: %s\n"
3109                "State: %s\n"
3110                "Compatible Flags:%s%s\n"
3111                "Incompatible Flags:%s%s%s\n"
3112                "Header size: %"PRIu64"\n"
3113                "Arena size: %"PRIu64"\n"
3114                "Data Hash Table Size: %"PRIu64"\n"
3115                "Field Hash Table Size: %"PRIu64"\n"
3116                "Rotate Suggested: %s\n"
3117                "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3118                "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3119                "Head Realtime Timestamp: %s (%"PRIx64")\n"
3120                "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3121                "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
3122                "Objects: %"PRIu64"\n"
3123                "Entry Objects: %"PRIu64"\n",
3124                f->path,
3125                sd_id128_to_string(f->header->file_id, a),
3126                sd_id128_to_string(f->header->machine_id, b),
3127                sd_id128_to_string(f->header->boot_id, c),
3128                sd_id128_to_string(f->header->seqnum_id, d),
3129                f->header->state == STATE_OFFLINE ? "OFFLINE" :
3130                f->header->state == STATE_ONLINE ? "ONLINE" :
3131                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3132                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3133                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3134                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3135                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3136                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3137                le64toh(f->header->header_size),
3138                le64toh(f->header->arena_size),
3139                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3140                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3141                yes_no(journal_file_rotate_suggested(f, 0)),
3142                le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3143                le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3144                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3145                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3146                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3147                le64toh(f->header->n_objects),
3148                le64toh(f->header->n_entries));
3149
3150         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3151                 printf("Data Objects: %"PRIu64"\n"
3152                        "Data Hash Table Fill: %.1f%%\n",
3153                        le64toh(f->header->n_data),
3154                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3155
3156         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3157                 printf("Field Objects: %"PRIu64"\n"
3158                        "Field Hash Table Fill: %.1f%%\n",
3159                        le64toh(f->header->n_fields),
3160                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3161
3162         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3163                 printf("Tag Objects: %"PRIu64"\n",
3164                        le64toh(f->header->n_tags));
3165         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3166                 printf("Entry Array Objects: %"PRIu64"\n",
3167                        le64toh(f->header->n_entry_arrays));
3168
3169         if (fstat(f->fd, &st) >= 0)
3170                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
3171 }
3172
3173 static int journal_file_warn_btrfs(JournalFile *f) {
3174         unsigned attrs;
3175         int r;
3176
3177         assert(f);
3178
3179         /* Before we write anything, check if the COW logic is turned
3180          * off on btrfs. Given our write pattern that is quite
3181          * unfriendly to COW file systems this should greatly improve
3182          * performance on COW file systems, such as btrfs, at the
3183          * expense of data integrity features (which shouldn't be too
3184          * bad, given that we do our own checksumming). */
3185
3186         r = btrfs_is_filesystem(f->fd);
3187         if (r < 0)
3188                 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3189         if (!r)
3190                 return 0;
3191
3192         r = read_attr_fd(f->fd, &attrs);
3193         if (r < 0)
3194                 return log_warning_errno(r, "Failed to read file attributes: %m");
3195
3196         if (attrs & FS_NOCOW_FL) {
3197                 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3198                 return 0;
3199         }
3200
3201         log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3202                    "This is likely to slow down journal access substantially, please consider turning "
3203                    "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3204
3205         return 1;
3206 }
3207
3208 int journal_file_open(
3209                 int fd,
3210                 const char *fname,
3211                 int flags,
3212                 mode_t mode,
3213                 bool compress,
3214                 bool seal,
3215                 JournalMetrics *metrics,
3216                 MMapCache *mmap_cache,
3217                 Set *deferred_closes,
3218                 JournalFile *template,
3219                 JournalFile **ret) {
3220
3221         bool newly_created = false;
3222         JournalFile *f;
3223         void *h;
3224         int r;
3225
3226         assert(ret);
3227         assert(fd >= 0 || fname);
3228
3229         if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
3230                 return -EINVAL;
3231
3232         if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3233                 return -EINVAL;
3234
3235         f = new0(JournalFile, 1);
3236         if (!f)
3237                 return -ENOMEM;
3238
3239         f->fd = fd;
3240         f->mode = mode;
3241
3242         f->flags = flags;
3243         f->prot = prot_from_flags(flags);
3244         f->writable = (flags & O_ACCMODE) != O_RDONLY;
3245 #if HAVE_LZ4
3246         f->compress_lz4 = compress;
3247 #elif HAVE_XZ
3248         f->compress_xz = compress;
3249 #endif
3250 #if HAVE_GCRYPT
3251         f->seal = seal;
3252 #endif
3253
3254         if (mmap_cache)
3255                 f->mmap = mmap_cache_ref(mmap_cache);
3256         else {
3257                 f->mmap = mmap_cache_new();
3258                 if (!f->mmap) {
3259                         r = -ENOMEM;
3260                         goto fail;
3261                 }
3262         }
3263
3264         if (fname) {
3265                 f->path = strdup(fname);
3266                 if (!f->path) {
3267                         r = -ENOMEM;
3268                         goto fail;
3269                 }
3270         } else {
3271                 assert(fd >= 0);
3272
3273                 /* If we don't know the path, fill in something explanatory and vaguely useful */
3274                 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3275                         r = -ENOMEM;
3276                         goto fail;
3277                 }
3278         }
3279
3280         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3281         if (!f->chain_cache) {
3282                 r = -ENOMEM;
3283                 goto fail;
3284         }
3285
3286         if (f->fd < 0) {
3287                 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3288                  * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3289                  * it doesn't hurt in that case. */
3290
3291                 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
3292                 if (f->fd < 0) {
3293                         r = -errno;
3294                         goto fail;
3295                 }
3296
3297                 /* fds we opened here by us should also be closed by us. */
3298                 f->close_fd = true;
3299
3300                 r = fd_nonblock(f->fd, false);
3301                 if (r < 0)
3302                         goto fail;
3303         }
3304
3305         f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3306         if (!f->cache_fd) {
3307                 r = -ENOMEM;
3308                 goto fail;
3309         }
3310
3311         r = journal_file_fstat(f);
3312         if (r < 0)
3313                 goto fail;
3314
3315         if (f->last_stat.st_size == 0 && f->writable) {
3316
3317                 (void) journal_file_warn_btrfs(f);
3318
3319                 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3320                  * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3321                  * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3322                  * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3323                  * solely on mtime/atime/ctime of the file. */
3324                 (void) fd_setcrtime(f->fd, 0);
3325
3326 #if HAVE_GCRYPT
3327                 /* Try to load the FSPRG state, and if we can't, then
3328                  * just don't do sealing */
3329                 if (f->seal) {
3330                         r = journal_file_fss_load(f);
3331                         if (r < 0)
3332                                 f->seal = false;
3333                 }
3334 #endif
3335
3336                 r = journal_file_init_header(f, template);
3337                 if (r < 0)
3338                         goto fail;
3339
3340                 r = journal_file_fstat(f);
3341                 if (r < 0)
3342                         goto fail;
3343
3344                 newly_created = true;
3345         }
3346
3347         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3348                 r = -ENODATA;
3349                 goto fail;
3350         }
3351
3352         r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
3353         if (r < 0)
3354                 goto fail;
3355
3356         f->header = h;
3357
3358         if (!newly_created) {
3359                 set_clear_with_destructor(deferred_closes, journal_file_close);
3360
3361                 r = journal_file_verify_header(f);
3362                 if (r < 0)
3363                         goto fail;
3364         }
3365
3366 #if HAVE_GCRYPT
3367         if (!newly_created && f->writable) {
3368                 r = journal_file_fss_load(f);
3369                 if (r < 0)
3370                         goto fail;
3371         }
3372 #endif
3373
3374         if (f->writable) {
3375                 if (metrics) {
3376                         journal_default_metrics(metrics, f->fd);
3377                         f->metrics = *metrics;
3378                 } else if (template)
3379                         f->metrics = template->metrics;
3380
3381                 r = journal_file_refresh_header(f);
3382                 if (r < 0)
3383                         goto fail;
3384         }
3385
3386 #if HAVE_GCRYPT
3387         r = journal_file_hmac_setup(f);
3388         if (r < 0)
3389                 goto fail;
3390 #endif
3391
3392         if (newly_created) {
3393                 r = journal_file_setup_field_hash_table(f);
3394                 if (r < 0)
3395                         goto fail;
3396
3397                 r = journal_file_setup_data_hash_table(f);
3398                 if (r < 0)
3399                         goto fail;
3400
3401 #if HAVE_GCRYPT
3402                 r = journal_file_append_first_tag(f);
3403                 if (r < 0)
3404                         goto fail;
3405 #endif
3406         }
3407
3408         if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
3409                 r = -EIO;
3410                 goto fail;
3411         }
3412
3413         if (template && template->post_change_timer) {
3414                 r = journal_file_enable_post_change_timer(
3415                                 f,
3416                                 sd_event_source_get_event(template->post_change_timer),
3417                                 template->post_change_timer_period);
3418
3419                 if (r < 0)
3420                         goto fail;
3421         }
3422
3423         /* The file is opened now successfully, thus we take possession of any passed in fd. */
3424         f->close_fd = true;
3425
3426         *ret = f;
3427         return 0;
3428
3429 fail:
3430         if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
3431                 r = -EIO;
3432
3433         (void) journal_file_close(f);
3434
3435         return r;
3436 }
3437
3438 int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
3439         _cleanup_free_ char *p = NULL;
3440         size_t l;
3441         JournalFile *old_file, *new_file = NULL;
3442         int r;
3443
3444         assert(f);
3445         assert(*f);
3446
3447         old_file = *f;
3448
3449         if (!old_file->writable)
3450                 return -EINVAL;
3451
3452         /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3453          * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3454         if (path_startswith(old_file->path, "/proc/self/fd"))
3455                 return -EINVAL;
3456
3457         if (!endswith(old_file->path, ".journal"))
3458                 return -EINVAL;
3459
3460         l = strlen(old_file->path);
3461         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3462                      (int) l - 8, old_file->path,
3463                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3464                      le64toh((*f)->header->head_entry_seqnum),
3465                      le64toh((*f)->header->head_entry_realtime));
3466         if (r < 0)
3467                 return -ENOMEM;
3468
3469         /* Try to rename the file to the archived version. If the file
3470          * already was deleted, we'll get ENOENT, let's ignore that
3471          * case. */
3472         r = rename(old_file->path, p);
3473         if (r < 0 && errno != ENOENT)
3474                 return -errno;
3475
3476         /* Sync the rename to disk */
3477         (void) fsync_directory_of_file(old_file->fd);
3478
3479         /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3480          * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3481          * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3482          * would result in the rotated journal never getting fsync() called before closing.
3483          * Now we simply queue the archive state by setting an archive bit, leaving the state
3484          * as STATE_ONLINE so proper offlining occurs. */
3485         old_file->archive = true;
3486
3487         /* Currently, btrfs is not very good with out write patterns
3488          * and fragments heavily. Let's defrag our journal files when
3489          * we archive them */
3490         old_file->defrag_on_close = true;
3491
3492         r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
3493
3494         if (deferred_closes &&
3495             set_put(deferred_closes, old_file) >= 0)
3496                 (void) journal_file_set_offline(old_file, false);
3497         else
3498                 (void) journal_file_close(old_file);
3499
3500         *f = new_file;
3501         return r;
3502 }
3503
3504 int journal_file_open_reliably(
3505                 const char *fname,
3506                 int flags,
3507                 mode_t mode,
3508                 bool compress,
3509                 bool seal,
3510                 JournalMetrics *metrics,
3511                 MMapCache *mmap_cache,
3512                 Set *deferred_closes,
3513                 JournalFile *template,
3514                 JournalFile **ret) {
3515
3516         int r;
3517         size_t l;
3518         _cleanup_free_ char *p = NULL;
3519
3520         r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
3521         if (!IN_SET(r,
3522                     -EBADMSG,           /* Corrupted */
3523                     -ENODATA,           /* Truncated */
3524                     -EHOSTDOWN,         /* Other machine */
3525                     -EPROTONOSUPPORT,   /* Incompatible feature */
3526                     -EBUSY,             /* Unclean shutdown */
3527                     -ESHUTDOWN,         /* Already archived */
3528                     -EIO,               /* IO error, including SIGBUS on mmap */
3529                     -EIDRM,             /* File has been deleted */
3530                     -ETXTBSY))          /* File is from the future */
3531                 return r;
3532
3533         if ((flags & O_ACCMODE) == O_RDONLY)
3534                 return r;
3535
3536         if (!(flags & O_CREAT))
3537                 return r;
3538
3539         if (!endswith(fname, ".journal"))
3540                 return r;
3541
3542         /* The file is corrupted. Rotate it away and try it again (but only once) */
3543
3544         l = strlen(fname);
3545         if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
3546                      (int) l - 8, fname,
3547                      now(CLOCK_REALTIME),
3548                      random_u64()) < 0)
3549                 return -ENOMEM;
3550
3551         if (rename(fname, p) < 0)
3552                 return -errno;
3553
3554         /* btrfs doesn't cope well with our write pattern and
3555          * fragments heavily. Let's defrag all files we rotate */
3556
3557         (void) chattr_path(p, 0, FS_NOCOW_FL);
3558         (void) btrfs_defrag(p);
3559
3560         log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3561
3562         return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
3563 }
3564
3565 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3566         uint64_t i, n;
3567         uint64_t q, xor_hash = 0;
3568         int r;
3569         EntryItem *items;
3570         dual_timestamp ts;
3571
3572         assert(from);
3573         assert(to);
3574         assert(o);
3575         assert(p);
3576
3577         if (!to->writable)
3578                 return -EPERM;
3579
3580         ts.monotonic = le64toh(o->entry.monotonic);
3581         ts.realtime = le64toh(o->entry.realtime);
3582
3583         n = journal_file_entry_n_items(o);
3584         /* alloca() can't take 0, hence let's allocate at least one */
3585         items = alloca(sizeof(EntryItem) * MAX(1u, n));
3586
3587         for (i = 0; i < n; i++) {
3588                 uint64_t l, h;
3589                 le64_t le_hash;
3590                 size_t t;
3591                 void *data;
3592                 Object *u;
3593
3594                 q = le64toh(o->entry.items[i].object_offset);
3595                 le_hash = o->entry.items[i].hash;
3596
3597                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3598                 if (r < 0)
3599                         return r;
3600
3601                 if (le_hash != o->data.hash)
3602                         return -EBADMSG;
3603
3604                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3605                 t = (size_t) l;
3606
3607                 /* We hit the limit on 32bit machines */
3608                 if ((uint64_t) t != l)
3609                         return -E2BIG;
3610
3611                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3612 #if HAVE_XZ || HAVE_LZ4
3613                         size_t rsize = 0;
3614
3615                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3616                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3617                         if (r < 0)
3618                                 return r;
3619
3620                         data = from->compress_buffer;
3621                         l = rsize;
3622 #else
3623                         return -EPROTONOSUPPORT;
3624 #endif
3625                 } else
3626                         data = o->data.payload;
3627
3628                 r = journal_file_append_data(to, data, l, &u, &h);
3629                 if (r < 0)
3630                         return r;
3631
3632                 xor_hash ^= le64toh(u->data.hash);
3633                 items[i].object_offset = htole64(h);
3634                 items[i].hash = u->data.hash;
3635
3636                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3637                 if (r < 0)
3638                         return r;
3639         }
3640
3641         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3642
3643         if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3644                 return -EIO;
3645
3646         return r;
3647 }
3648
3649 void journal_reset_metrics(JournalMetrics *m) {
3650         assert(m);
3651
3652         /* Set everything to "pick automatic values". */
3653
3654         *m = (JournalMetrics) {
3655                 .min_use = (uint64_t) -1,
3656                 .max_use = (uint64_t) -1,
3657                 .min_size = (uint64_t) -1,
3658                 .max_size = (uint64_t) -1,
3659                 .keep_free = (uint64_t) -1,
3660                 .n_max_files = (uint64_t) -1,
3661         };
3662 }
3663
3664 void journal_default_metrics(JournalMetrics *m, int fd) {
3665         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3666         struct statvfs ss;
3667         uint64_t fs_size;
3668
3669         assert(m);
3670         assert(fd >= 0);
3671
3672         if (fstatvfs(fd, &ss) >= 0)
3673                 fs_size = ss.f_frsize * ss.f_blocks;
3674         else {
3675                 log_debug_errno(errno, "Failed to determine disk size: %m");
3676                 fs_size = 0;
3677         }
3678
3679         if (m->max_use == (uint64_t) -1) {
3680
3681                 if (fs_size > 0) {
3682                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3683
3684                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
3685                                 m->max_use = DEFAULT_MAX_USE_UPPER;
3686
3687                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
3688                                 m->max_use = DEFAULT_MAX_USE_LOWER;
3689                 } else
3690                         m->max_use = DEFAULT_MAX_USE_LOWER;
3691         } else {
3692                 m->max_use = PAGE_ALIGN(m->max_use);
3693
3694                 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3695                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3696         }
3697
3698         if (m->min_use == (uint64_t) -1)
3699                 m->min_use = DEFAULT_MIN_USE;
3700
3701         if (m->min_use > m->max_use)
3702                 m->min_use = m->max_use;
3703
3704         if (m->max_size == (uint64_t) -1) {
3705                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3706
3707                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3708                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
3709         } else
3710                 m->max_size = PAGE_ALIGN(m->max_size);
3711
3712         if (m->max_size != 0) {
3713                 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3714                         m->max_size = JOURNAL_FILE_SIZE_MIN;
3715
3716                 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3717                         m->max_use = m->max_size*2;
3718         }
3719
3720         if (m->min_size == (uint64_t) -1)
3721                 m->min_size = JOURNAL_FILE_SIZE_MIN;
3722         else {
3723                 m->min_size = PAGE_ALIGN(m->min_size);
3724
3725                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3726                         m->min_size = JOURNAL_FILE_SIZE_MIN;
3727
3728                 if (m->max_size != 0 && m->min_size > m->max_size)
3729                         m->max_size = m->min_size;
3730         }
3731
3732         if (m->keep_free == (uint64_t) -1) {
3733
3734                 if (fs_size > 0) {
3735                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3736
3737                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3738                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3739
3740                 } else
3741                         m->keep_free = DEFAULT_KEEP_FREE;
3742         }
3743
3744         if (m->n_max_files == (uint64_t) -1)
3745                 m->n_max_files = DEFAULT_N_MAX_FILES;
3746
3747         log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3748                   format_bytes(a, sizeof(a), m->min_use),
3749                   format_bytes(b, sizeof(b), m->max_use),
3750                   format_bytes(c, sizeof(c), m->max_size),
3751                   format_bytes(d, sizeof(d), m->min_size),
3752                   format_bytes(e, sizeof(e), m->keep_free),
3753                   m->n_max_files);
3754 }
3755
3756 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3757         assert(f);
3758         assert(f->header);
3759         assert(from || to);
3760
3761         if (from) {
3762                 if (f->header->head_entry_realtime == 0)
3763                         return -ENOENT;
3764
3765                 *from = le64toh(f->header->head_entry_realtime);
3766         }
3767
3768         if (to) {
3769                 if (f->header->tail_entry_realtime == 0)
3770                         return -ENOENT;
3771
3772                 *to = le64toh(f->header->tail_entry_realtime);
3773         }
3774
3775         return 1;
3776 }
3777
3778 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3779         Object *o;
3780         uint64_t p;
3781         int r;
3782
3783         assert(f);
3784         assert(from || to);
3785
3786         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3787         if (r <= 0)
3788                 return r;
3789
3790         if (le64toh(o->data.n_entries) <= 0)
3791                 return 0;
3792
3793         if (from) {
3794                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3795                 if (r < 0)
3796                         return r;
3797
3798                 *from = le64toh(o->entry.monotonic);
3799         }
3800
3801         if (to) {
3802                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3803                 if (r < 0)
3804                         return r;
3805
3806                 r = generic_array_get_plus_one(f,
3807                                                le64toh(o->data.entry_offset),
3808                                                le64toh(o->data.entry_array_offset),
3809                                                le64toh(o->data.n_entries)-1,
3810                                                &o, NULL);
3811                 if (r <= 0)
3812                         return r;
3813
3814                 *to = le64toh(o->entry.monotonic);
3815         }
3816
3817         return 1;
3818 }
3819
3820 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3821         assert(f);
3822         assert(f->header);
3823
3824         /* If we gained new header fields we gained new features,
3825          * hence suggest a rotation */
3826         if (le64toh(f->header->header_size) < sizeof(Header)) {
3827                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3828                 return true;
3829         }
3830
3831         /* Let's check if the hash tables grew over a certain fill
3832          * level (75%, borrowing this value from Java's hash table
3833          * implementation), and if so suggest a rotation. To calculate
3834          * the fill level we need the n_data field, which only exists
3835          * in newer versions. */
3836
3837         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3838                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3839                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3840                                   f->path,
3841                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3842                                   le64toh(f->header->n_data),
3843                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3844                                   (unsigned long long) f->last_stat.st_size,
3845                                   f->last_stat.st_size / le64toh(f->header->n_data));
3846                         return true;
3847                 }
3848
3849         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3850                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3851                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3852                                   f->path,
3853                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3854                                   le64toh(f->header->n_fields),
3855                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3856                         return true;
3857                 }
3858
3859         /* Are the data objects properly indexed by field objects? */
3860         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3861             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3862             le64toh(f->header->n_data) > 0 &&
3863             le64toh(f->header->n_fields) == 0)
3864                 return true;
3865
3866         if (max_file_usec > 0) {
3867                 usec_t t, h;
3868
3869                 h = le64toh(f->header->head_entry_realtime);
3870                 t = now(CLOCK_REALTIME);
3871
3872                 if (h > 0 && t > h + max_file_usec)
3873                         return true;
3874         }
3875
3876         return false;
3877 }