1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2011 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
#include <errno.h>
#include <fcntl.h>
#include <linux/fs.h>
#include <linux/magic.h>
#include <pthread.h>
#include <stddef.h>
#include <sys/mman.h>
#include <sys/statvfs.h>
#include <sys/uio.h>
#include <unistd.h>

#include "sd-event.h"

#include "alloc-util.h"
#include "btrfs-util.h"
#include "chattr-util.h"
#include "compress.h"
#include "fd-util.h"
#include "fs-util.h"
#include "journal-authenticate.h"
#include "journal-def.h"
#include "journal-file.h"
#include "lookup3.h"
#include "parse-util.h"
#include "path-util.h"
#include "random-util.h"
#include "set.h"
#include "stat-util.h"
#include "string-util.h"
#include "strv.h"
#include "xattr-util.h"
51 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
52 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
54 #define DEFAULT_COMPRESS_THRESHOLD (512ULL)
55 #define MIN_COMPRESS_THRESHOLD (8ULL)
57 /* This is the minimum journal file size */
58 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
60 /* These are the lower and upper bounds if we deduce the max_use value
61 * from the file system size */
62 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
63 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
65 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
66 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
68 /* This is the upper bound if we deduce max_size from max_use */
69 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
79 /* This is the default maximum number of journal files to keep around. */
80 #define DEFAULT_N_MAX_FILES (100)
82 /* n_data was the first entry we added after the initial file format design */
83 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
85 /* How many entries to keep in the entry array chain cache at max */
86 #define CHAIN_CACHE_MAX 20
88 /* How much to increase the journal file size at once each time we allocate something new. */
89 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
91 /* Reread fstat() of the file for detecting deletions at least this often */
92 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
94 /* The mmap context to use for the header we pick as one above the last defined typed */
95 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
#ifdef __clang__
#  pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
101 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
102 * As a result we use atomic operations on f->offline_state for inter-thread communications with
103 * journal_file_set_offline() and journal_file_set_online(). */
104 static void journal_file_set_offline_internal(JournalFile
*f
) {
110 switch (f
->offline_state
) {
112 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_DONE
))
116 case OFFLINE_AGAIN_FROM_SYNCING
:
117 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_SYNCING
))
121 case OFFLINE_AGAIN_FROM_OFFLINING
:
122 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_SYNCING
))
126 case OFFLINE_SYNCING
:
129 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_OFFLINING
))
132 f
->header
->state
= f
->archive
? STATE_ARCHIVED
: STATE_OFFLINE
;
136 case OFFLINE_OFFLINING
:
137 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_DONE
))
144 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
150 static void * journal_file_set_offline_thread(void *arg
) {
151 JournalFile
*f
= arg
;
153 (void) pthread_setname_np(pthread_self(), "journal-offline");
155 journal_file_set_offline_internal(f
);
160 static int journal_file_set_offline_thread_join(JournalFile
*f
) {
165 if (f
->offline_state
== OFFLINE_JOINED
)
168 r
= pthread_join(f
->offline_thread
, NULL
);
172 f
->offline_state
= OFFLINE_JOINED
;
174 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
180 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
181 static bool journal_file_set_offline_try_restart(JournalFile
*f
) {
183 switch (f
->offline_state
) {
184 case OFFLINE_AGAIN_FROM_SYNCING
:
185 case OFFLINE_AGAIN_FROM_OFFLINING
:
189 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_AGAIN_FROM_SYNCING
))
193 case OFFLINE_SYNCING
:
194 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_AGAIN_FROM_SYNCING
))
198 case OFFLINE_OFFLINING
:
199 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_AGAIN_FROM_OFFLINING
))
209 /* Sets a journal offline.
211 * If wait is false then an offline is dispatched in a separate thread for a
212 * subsequent journal_file_set_offline() or journal_file_set_online() of the
213 * same journal to synchronize with.
215 * If wait is true, then either an existing offline thread will be restarted
216 * and joined, or if none exists the offline is simply performed in this
217 * context without involving another thread.
219 int journal_file_set_offline(JournalFile
*f
, bool wait
) {
228 if (!(f
->fd
>= 0 && f
->header
))
231 /* An offlining journal is implicitly online and may modify f->header->state,
232 * we must also join any potentially lingering offline thread when not online. */
233 if (!journal_file_is_offlining(f
) && f
->header
->state
!= STATE_ONLINE
)
234 return journal_file_set_offline_thread_join(f
);
236 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
237 restarted
= journal_file_set_offline_try_restart(f
);
238 if ((restarted
&& wait
) || !restarted
) {
239 r
= journal_file_set_offline_thread_join(f
);
247 /* Initiate a new offline. */
248 f
->offline_state
= OFFLINE_SYNCING
;
250 if (wait
) /* Without using a thread if waiting. */
251 journal_file_set_offline_internal(f
);
253 sigset_t ss
, saved_ss
;
256 if (sigfillset(&ss
) < 0)
259 r
= pthread_sigmask(SIG_BLOCK
, &ss
, &saved_ss
);
263 r
= pthread_create(&f
->offline_thread
, NULL
, journal_file_set_offline_thread
, f
);
265 k
= pthread_sigmask(SIG_SETMASK
, &saved_ss
, NULL
);
267 f
->offline_state
= OFFLINE_JOINED
;
277 static int journal_file_set_online(JournalFile
*f
) {
285 if (!(f
->fd
>= 0 && f
->header
))
289 switch (f
->offline_state
) {
291 /* No offline thread, no need to wait. */
295 case OFFLINE_SYNCING
:
296 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_CANCEL
))
298 /* Canceled syncing prior to offlining, no need to wait. */
301 case OFFLINE_AGAIN_FROM_SYNCING
:
302 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_CANCEL
))
304 /* Canceled restart from syncing, no need to wait. */
307 case OFFLINE_AGAIN_FROM_OFFLINING
:
308 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_CANCEL
))
310 /* Canceled restart from offlining, must wait for offlining to complete however. */
315 r
= journal_file_set_offline_thread_join(f
);
325 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
328 switch (f
->header
->state
) {
333 f
->header
->state
= STATE_ONLINE
;
342 bool journal_file_is_offlining(JournalFile
*f
) {
345 __sync_synchronize();
347 if (IN_SET(f
->offline_state
, OFFLINE_DONE
, OFFLINE_JOINED
))
353 JournalFile
* journal_file_close(JournalFile
*f
) {
357 /* Write the final tag */
358 if (f
->seal
&& f
->writable
) {
361 r
= journal_file_append_tag(f
);
363 log_error_errno(r
, "Failed to append tag when closing journal: %m");
367 if (f
->post_change_timer
) {
370 if (sd_event_source_get_enabled(f
->post_change_timer
, &enabled
) >= 0)
371 if (enabled
== SD_EVENT_ONESHOT
)
372 journal_file_post_change(f
);
374 (void) sd_event_source_set_enabled(f
->post_change_timer
, SD_EVENT_OFF
);
375 sd_event_source_unref(f
->post_change_timer
);
378 journal_file_set_offline(f
, true);
380 if (f
->mmap
&& f
->cache_fd
)
381 mmap_cache_free_fd(f
->mmap
, f
->cache_fd
);
383 if (f
->fd
>= 0 && f
->defrag_on_close
) {
385 /* Be friendly to btrfs: turn COW back on again now,
386 * and defragment the file. We won't write to the file
387 * ever again, hence remove all fragmentation, and
388 * reenable all the good bits COW usually provides
389 * (such as data checksumming). */
391 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
392 (void) btrfs_defrag_fd(f
->fd
);
399 mmap_cache_unref(f
->mmap
);
401 ordered_hashmap_free_free(f
->chain_cache
);
403 #if HAVE_XZ || HAVE_LZ4
404 free(f
->compress_buffer
);
409 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
411 free(f
->fsprg_state
);
416 gcry_md_close(f
->hmac
);
422 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
429 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
430 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
432 h
.incompatible_flags
|= htole32(
433 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
434 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
436 h
.compatible_flags
= htole32(
437 f
->seal
* HEADER_COMPATIBLE_SEALED
);
439 r
= sd_id128_randomize(&h
.file_id
);
444 h
.seqnum_id
= template->header
->seqnum_id
;
445 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
447 h
.seqnum_id
= h
.file_id
;
449 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
459 static int journal_file_refresh_header(JournalFile
*f
) {
466 r
= sd_id128_get_machine(&f
->header
->machine_id
);
470 r
= sd_id128_get_boot(&boot_id
);
474 f
->header
->boot_id
= boot_id
;
476 r
= journal_file_set_online(f
);
478 /* Sync the online state to disk */
481 /* We likely just created a new file, also sync the directory this file is located in. */
482 (void) fsync_directory_of_file(f
->fd
);
487 static bool warn_wrong_flags(const JournalFile
*f
, bool compatible
) {
488 const uint32_t any
= compatible
? HEADER_COMPATIBLE_ANY
: HEADER_INCOMPATIBLE_ANY
,
489 supported
= compatible
? HEADER_COMPATIBLE_SUPPORTED
: HEADER_INCOMPATIBLE_SUPPORTED
;
490 const char *type
= compatible
? "compatible" : "incompatible";
493 flags
= le32toh(compatible
? f
->header
->compatible_flags
: f
->header
->incompatible_flags
);
495 if (flags
& ~supported
) {
497 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32
,
498 f
->path
, type
, flags
& ~any
);
499 flags
= (flags
& any
) & ~supported
;
503 _cleanup_free_
char *t
= NULL
;
505 if (compatible
&& (flags
& HEADER_COMPATIBLE_SEALED
))
506 strv
[n
++] = "sealed";
507 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_XZ
))
508 strv
[n
++] = "xz-compressed";
509 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_LZ4
))
510 strv
[n
++] = "lz4-compressed";
512 assert(n
< ELEMENTSOF(strv
));
514 t
= strv_join((char**) strv
, ", ");
515 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
516 f
->path
, type
, n
> 1 ? "flags" : "flag", strnull(t
));
524 static int journal_file_verify_header(JournalFile
*f
) {
525 uint64_t arena_size
, header_size
;
530 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
533 /* In both read and write mode we refuse to open files with incompatible
534 * flags we don't know. */
535 if (warn_wrong_flags(f
, false))
536 return -EPROTONOSUPPORT
;
538 /* When open for writing we refuse to open files with compatible flags, too. */
539 if (f
->writable
&& warn_wrong_flags(f
, true))
540 return -EPROTONOSUPPORT
;
542 if (f
->header
->state
>= _STATE_MAX
)
545 header_size
= le64toh(f
->header
->header_size
);
547 /* The first addition was n_data, so check that we are at least this large */
548 if (header_size
< HEADER_SIZE_MIN
)
551 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
554 arena_size
= le64toh(f
->header
->arena_size
);
556 if (UINT64_MAX
- header_size
< arena_size
|| header_size
+ arena_size
> (uint64_t) f
->last_stat
.st_size
)
559 if (le64toh(f
->header
->tail_object_offset
) > header_size
+ arena_size
)
562 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
563 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
564 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
565 !VALID64(le64toh(f
->header
->entry_array_offset
)))
569 sd_id128_t machine_id
;
573 r
= sd_id128_get_machine(&machine_id
);
577 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
580 state
= f
->header
->state
;
582 if (state
== STATE_ARCHIVED
)
583 return -ESHUTDOWN
; /* Already archived */
584 else if (state
== STATE_ONLINE
) {
585 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
587 } else if (state
!= STATE_OFFLINE
) {
588 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
592 if (f
->header
->field_hash_table_size
== 0 || f
->header
->data_hash_table_size
== 0)
595 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
596 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
598 if (le64toh(f
->header
->tail_entry_realtime
) > now(CLOCK_REALTIME
)) {
599 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f
->path
);
604 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
605 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
607 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
612 static int journal_file_fstat(JournalFile
*f
) {
618 if (fstat(f
->fd
, &f
->last_stat
) < 0)
621 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
623 /* Refuse dealing with with files that aren't regular */
624 r
= stat_verify_regular(&f
->last_stat
);
628 /* Refuse appending to files that are already deleted */
629 if (f
->last_stat
.st_nlink
<= 0)
635 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
636 uint64_t old_size
, new_size
;
642 /* We assume that this file is not sparse, and we know that
643 * for sure, since we always call posix_fallocate()
646 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
650 le64toh(f
->header
->header_size
) +
651 le64toh(f
->header
->arena_size
);
653 new_size
= PAGE_ALIGN(offset
+ size
);
654 if (new_size
< le64toh(f
->header
->header_size
))
655 new_size
= le64toh(f
->header
->header_size
);
657 if (new_size
<= old_size
) {
659 /* We already pre-allocated enough space, but before
660 * we write to it, let's check with fstat() if the
661 * file got deleted, in order make sure we don't throw
662 * away the data immediately. Don't check fstat() for
663 * all writes though, but only once ever 10s. */
665 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
668 return journal_file_fstat(f
);
671 /* Allocate more space. */
673 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
676 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
679 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
682 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
684 if (new_size
- old_size
> available
)
689 /* Increase by larger blocks at once */
690 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
691 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
692 new_size
= f
->metrics
.max_size
;
694 /* Note that the glibc fallocate() fallback is very
695 inefficient, hence we try to minimize the allocation area
697 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
701 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
703 return journal_file_fstat(f
);
706 static unsigned type_to_context(ObjectType type
) {
707 /* One context for each type, plus one catch-all for the rest */
708 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
709 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
710 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
713 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
, size_t *ret_size
) {
722 /* Avoid SIGBUS on invalid accesses */
723 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
724 /* Hmm, out of range? Let's refresh the fstat() data
725 * first, before we trust that check. */
727 r
= journal_file_fstat(f
);
731 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
732 return -EADDRNOTAVAIL
;
735 return mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
, ret_size
);
738 static uint64_t minimum_header_size(Object
*o
) {
740 static const uint64_t table
[] = {
741 [OBJECT_DATA
] = sizeof(DataObject
),
742 [OBJECT_FIELD
] = sizeof(FieldObject
),
743 [OBJECT_ENTRY
] = sizeof(EntryObject
),
744 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
745 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
746 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
747 [OBJECT_TAG
] = sizeof(TagObject
),
750 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
751 return sizeof(ObjectHeader
);
753 return table
[o
->object
.type
];
756 /* Lightweight object checks. We want this to be fast, so that we won't
757 * slowdown every journal_file_move_to_object() call too much. */
758 static int journal_file_check_object(JournalFile
*f
, uint64_t offset
, Object
*o
) {
762 switch (o
->object
.type
) {
765 if ((le64toh(o
->data
.entry_offset
) == 0) ^ (le64toh(o
->data
.n_entries
) == 0)) {
766 log_debug("Bad n_entries: %"PRIu64
": %"PRIu64
,
767 le64toh(o
->data
.n_entries
), offset
);
771 if (le64toh(o
->object
.size
) - offsetof(DataObject
, payload
) <= 0) {
772 log_debug("Bad object size (<= %zu): %"PRIu64
": %"PRIu64
,
773 offsetof(DataObject
, payload
),
774 le64toh(o
->object
.size
),
779 if (!VALID64(le64toh(o
->data
.next_hash_offset
)) ||
780 !VALID64(le64toh(o
->data
.next_field_offset
)) ||
781 !VALID64(le64toh(o
->data
.entry_offset
)) ||
782 !VALID64(le64toh(o
->data
.entry_array_offset
))) {
783 log_debug("Invalid offset, next_hash_offset="OFSfmt
", next_field_offset="OFSfmt
784 ", entry_offset="OFSfmt
", entry_array_offset="OFSfmt
": %"PRIu64
,
785 le64toh(o
->data
.next_hash_offset
),
786 le64toh(o
->data
.next_field_offset
),
787 le64toh(o
->data
.entry_offset
),
788 le64toh(o
->data
.entry_array_offset
),
797 if (le64toh(o
->object
.size
) - offsetof(FieldObject
, payload
) <= 0) {
799 "Bad field size (<= %zu): %"PRIu64
": %"PRIu64
,
800 offsetof(FieldObject
, payload
),
801 le64toh(o
->object
.size
),
806 if (!VALID64(le64toh(o
->field
.next_hash_offset
)) ||
807 !VALID64(le64toh(o
->field
.head_data_offset
))) {
809 "Invalid offset, next_hash_offset="OFSfmt
810 ", head_data_offset="OFSfmt
": %"PRIu64
,
811 le64toh(o
->field
.next_hash_offset
),
812 le64toh(o
->field
.head_data_offset
),
819 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) % sizeof(EntryItem
) != 0) {
821 "Bad entry size (<= %zu): %"PRIu64
": %"PRIu64
,
822 offsetof(EntryObject
, items
),
823 le64toh(o
->object
.size
),
828 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
) <= 0) {
830 "Invalid number items in entry: %"PRIu64
": %"PRIu64
,
831 (le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
),
836 if (le64toh(o
->entry
.seqnum
) <= 0) {
838 "Invalid entry seqnum: %"PRIx64
": %"PRIu64
,
839 le64toh(o
->entry
.seqnum
),
844 if (!VALID_REALTIME(le64toh(o
->entry
.realtime
))) {
846 "Invalid entry realtime timestamp: %"PRIu64
": %"PRIu64
,
847 le64toh(o
->entry
.realtime
),
852 if (!VALID_MONOTONIC(le64toh(o
->entry
.monotonic
))) {
854 "Invalid entry monotonic timestamp: %"PRIu64
": %"PRIu64
,
855 le64toh(o
->entry
.monotonic
),
862 case OBJECT_DATA_HASH_TABLE
:
863 case OBJECT_FIELD_HASH_TABLE
:
864 if ((le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) % sizeof(HashItem
) != 0 ||
865 (le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) / sizeof(HashItem
) <= 0) {
867 "Invalid %s hash table size: %"PRIu64
": %"PRIu64
,
868 o
->object
.type
== OBJECT_DATA_HASH_TABLE
? "data" : "field",
869 le64toh(o
->object
.size
),
876 case OBJECT_ENTRY_ARRAY
:
877 if ((le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) % sizeof(le64_t
) != 0 ||
878 (le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) / sizeof(le64_t
) <= 0) {
880 "Invalid object entry array size: %"PRIu64
": %"PRIu64
,
881 le64toh(o
->object
.size
),
886 if (!VALID64(le64toh(o
->entry_array
.next_entry_array_offset
))) {
888 "Invalid object entry array next_entry_array_offset: "OFSfmt
": %"PRIu64
,
889 le64toh(o
->entry_array
.next_entry_array_offset
),
897 if (le64toh(o
->object
.size
) != sizeof(TagObject
)) {
899 "Invalid object tag size: %"PRIu64
": %"PRIu64
,
900 le64toh(o
->object
.size
),
905 if (!VALID_EPOCH(le64toh(o
->tag
.epoch
))) {
907 "Invalid object tag epoch: %"PRIu64
": %"PRIu64
,
908 le64toh(o
->tag
.epoch
),
919 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
929 /* Objects may only be located at multiple of 64 bit */
930 if (!VALID64(offset
)) {
931 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64
, offset
);
935 /* Object may not be located in the file header */
936 if (offset
< le64toh(f
->header
->header_size
)) {
937 log_debug("Attempt to move to object located in file header: %" PRIu64
, offset
);
941 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
, &tsize
);
946 s
= le64toh(o
->object
.size
);
949 log_debug("Attempt to move to uninitialized object: %" PRIu64
, offset
);
952 if (s
< sizeof(ObjectHeader
)) {
953 log_debug("Attempt to move to overly short object: %" PRIu64
, offset
);
957 if (o
->object
.type
<= OBJECT_UNUSED
) {
958 log_debug("Attempt to move to object with invalid type: %" PRIu64
, offset
);
962 if (s
< minimum_header_size(o
)) {
963 log_debug("Attempt to move to truncated object: %" PRIu64
, offset
);
967 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
) {
968 log_debug("Attempt to move to object of unexpected type: %" PRIu64
, offset
);
973 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
, NULL
);
980 r
= journal_file_check_object(f
, offset
, o
);
988 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
994 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
997 /* If an external seqnum counter was passed, we update
998 * both the local and the external one, and set it to
999 * the maximum of both */
1001 if (*seqnum
+ 1 > r
)
1007 f
->header
->tail_entry_seqnum
= htole64(r
);
1009 if (f
->header
->head_entry_seqnum
== 0)
1010 f
->header
->head_entry_seqnum
= htole64(r
);
1015 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
1023 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
1024 assert(size
>= sizeof(ObjectHeader
));
1028 r
= journal_file_set_online(f
);
1032 p
= le64toh(f
->header
->tail_object_offset
);
1034 p
= le64toh(f
->header
->header_size
);
1036 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
1040 p
+= ALIGN64(le64toh(tail
->object
.size
));
1043 r
= journal_file_allocate(f
, p
, size
);
1047 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
, NULL
);
1054 o
->object
.type
= type
;
1055 o
->object
.size
= htole64(size
);
1057 f
->header
->tail_object_offset
= htole64(p
);
1058 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
1066 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
1074 /* We estimate that we need 1 hash table entry per 768 bytes
1075 of journal file and we want to make sure we never get
1076 beyond 75% fill level. Calculate the hash table size for
1077 the maximum file size based on these metrics. */
1079 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
1080 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
1081 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
1083 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
1085 r
= journal_file_append_object(f
,
1086 OBJECT_DATA_HASH_TABLE
,
1087 offsetof(Object
, hash_table
.items
) + s
,
1092 memzero(o
->hash_table
.items
, s
);
1094 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1095 f
->header
->data_hash_table_size
= htole64(s
);
1100 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
1108 /* We use a fixed size hash table for the fields as this
1109 * number should grow very slowly only */
1111 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
1112 r
= journal_file_append_object(f
,
1113 OBJECT_FIELD_HASH_TABLE
,
1114 offsetof(Object
, hash_table
.items
) + s
,
1119 memzero(o
->hash_table
.items
, s
);
1121 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1122 f
->header
->field_hash_table_size
= htole64(s
);
1127 int journal_file_map_data_hash_table(JournalFile
*f
) {
1135 if (f
->data_hash_table
)
1138 p
= le64toh(f
->header
->data_hash_table_offset
);
1139 s
= le64toh(f
->header
->data_hash_table_size
);
1141 r
= journal_file_move_to(f
,
1142 OBJECT_DATA_HASH_TABLE
,
1149 f
->data_hash_table
= t
;
1153 int journal_file_map_field_hash_table(JournalFile
*f
) {
1161 if (f
->field_hash_table
)
1164 p
= le64toh(f
->header
->field_hash_table_offset
);
1165 s
= le64toh(f
->header
->field_hash_table_size
);
1167 r
= journal_file_move_to(f
,
1168 OBJECT_FIELD_HASH_TABLE
,
1175 f
->field_hash_table
= t
;
1179 static int journal_file_link_field(
1190 assert(f
->field_hash_table
);
1194 if (o
->object
.type
!= OBJECT_FIELD
)
1197 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1201 /* This might alter the window we are looking at */
1202 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
1205 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
1207 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
1209 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1213 o
->field
.next_hash_offset
= htole64(offset
);
1216 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1218 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
1219 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
1224 static int journal_file_link_data(
1235 assert(f
->data_hash_table
);
1239 if (o
->object
.type
!= OBJECT_DATA
)
1242 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1246 /* This might alter the window we are looking at */
1247 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
1248 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
1249 o
->data
.n_entries
= 0;
1252 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
1254 /* Only entry in the hash table is easy */
1255 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
1257 /* Move back to the previous data object, to patch in
1260 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1264 o
->data
.next_hash_offset
= htole64(offset
);
1267 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1269 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
1270 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
1275 int journal_file_find_field_object_with_hash(
1277 const void *field
, uint64_t size
, uint64_t hash
,
1278 Object
**ret
, uint64_t *offset
) {
1280 uint64_t p
, osize
, h
, m
;
1285 assert(field
&& size
> 0);
1287 /* If the field hash table is empty, we can't find anything */
1288 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
1291 /* Map the field hash table, if it isn't mapped yet. */
1292 r
= journal_file_map_field_hash_table(f
);
1296 osize
= offsetof(Object
, field
.payload
) + size
;
1298 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1303 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
1308 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1312 if (le64toh(o
->field
.hash
) == hash
&&
1313 le64toh(o
->object
.size
) == osize
&&
1314 memcmp(o
->field
.payload
, field
, size
) == 0) {
1324 p
= le64toh(o
->field
.next_hash_offset
);
1330 int journal_file_find_field_object(
1332 const void *field
, uint64_t size
,
1333 Object
**ret
, uint64_t *offset
) {
1338 assert(field
&& size
> 0);
1340 hash
= hash64(field
, size
);
1342 return journal_file_find_field_object_with_hash(f
,
1347 int journal_file_find_data_object_with_hash(
1349 const void *data
, uint64_t size
, uint64_t hash
,
1350 Object
**ret
, uint64_t *offset
) {
1352 uint64_t p
, osize
, h
, m
;
1357 assert(data
|| size
== 0);
1359 /* If there's no data hash table, then there's no entry. */
1360 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
1363 /* Map the data hash table, if it isn't mapped yet. */
1364 r
= journal_file_map_data_hash_table(f
);
1368 osize
= offsetof(Object
, data
.payload
) + size
;
1370 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1375 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
1380 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1384 if (le64toh(o
->data
.hash
) != hash
)
1387 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
1388 #if HAVE_XZ || HAVE_LZ4
1392 l
= le64toh(o
->object
.size
);
1393 if (l
<= offsetof(Object
, data
.payload
))
1396 l
-= offsetof(Object
, data
.payload
);
1398 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
1399 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
1403 if (rsize
== size
&&
1404 memcmp(f
->compress_buffer
, data
, size
) == 0) {
1415 return -EPROTONOSUPPORT
;
1417 } else if (le64toh(o
->object
.size
) == osize
&&
1418 memcmp(o
->data
.payload
, data
, size
) == 0) {
1430 p
= le64toh(o
->data
.next_hash_offset
);
1436 int journal_file_find_data_object(
1438 const void *data
, uint64_t size
,
1439 Object
**ret
, uint64_t *offset
) {
1444 assert(data
|| size
== 0);
1446 hash
= hash64(data
, size
);
1448 return journal_file_find_data_object_with_hash(f
,
1453 static int journal_file_append_field(
1455 const void *field
, uint64_t size
,
1456 Object
**ret
, uint64_t *offset
) {
1464 assert(field
&& size
> 0);
1466 hash
= hash64(field
, size
);
1468 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1482 osize
= offsetof(Object
, field
.payload
) + size
;
1483 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1487 o
->field
.hash
= htole64(hash
);
1488 memcpy(o
->field
.payload
, field
, size
);
1490 r
= journal_file_link_field(f
, o
, p
, hash
);
1494 /* The linking might have altered the window, so let's
1495 * refresh our pointer */
1496 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1501 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1515 static int journal_file_append_data(
1517 const void *data
, uint64_t size
,
1518 Object
**ret
, uint64_t *offset
) {
1523 int r
, compression
= 0;
1527 assert(data
|| size
== 0);
1529 hash
= hash64(data
, size
);
1531 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1545 osize
= offsetof(Object
, data
.payload
) + size
;
1546 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1550 o
->data
.hash
= htole64(hash
);
1552 #if HAVE_XZ || HAVE_LZ4
1553 if (JOURNAL_FILE_COMPRESS(f
) && size
>= f
->compress_threshold_bytes
) {
1556 compression
= compress_blob(data
, size
, o
->data
.payload
, size
- 1, &rsize
);
1558 if (compression
>= 0) {
1559 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1560 o
->object
.flags
|= compression
;
1562 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1563 size
, rsize
, object_compressed_to_string(compression
));
1565 /* Compression didn't work, we don't really care why, let's continue without compression */
1570 if (compression
== 0)
1571 memcpy_safe(o
->data
.payload
, data
, size
);
1573 r
= journal_file_link_data(f
, o
, p
, hash
);
1578 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1583 /* The linking might have altered the window, so let's
1584 * refresh our pointer */
1585 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1592 eq
= memchr(data
, '=', size
);
1593 if (eq
&& eq
> data
) {
1597 /* Create field object ... */
1598 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1602 /* ... and link it in. */
1603 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1604 fo
->field
.head_data_offset
= le64toh(p
);
1616 uint64_t journal_file_entry_n_items(Object
*o
) {
1619 if (o
->object
.type
!= OBJECT_ENTRY
)
1622 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1625 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1628 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1631 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1634 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1637 if (!IN_SET(o
->object
.type
, OBJECT_DATA_HASH_TABLE
, OBJECT_FIELD_HASH_TABLE
))
1640 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1643 static int link_entry_into_array(JournalFile
*f
,
1648 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1657 a
= le64toh(*first
);
1658 i
= hidx
= le64toh(*idx
);
1661 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1665 n
= journal_file_entry_array_n_items(o
);
1667 o
->entry_array
.items
[i
] = htole64(p
);
1668 *idx
= htole64(hidx
+ 1);
1674 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1685 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1686 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1692 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1697 o
->entry_array
.items
[i
] = htole64(p
);
1700 *first
= htole64(q
);
1702 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1706 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1709 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1710 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1712 *idx
= htole64(hidx
+ 1);
1717 static int link_entry_into_array_plus_one(JournalFile
*f
,
1732 *extra
= htole64(p
);
1736 i
= htole64(le64toh(*idx
) - 1);
1737 r
= link_entry_into_array(f
, first
, &i
, p
);
1742 *idx
= htole64(le64toh(*idx
) + 1);
1746 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1753 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1757 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1761 return link_entry_into_array_plus_one(f
,
1762 &o
->data
.entry_offset
,
1763 &o
->data
.entry_array_offset
,
1768 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1777 if (o
->object
.type
!= OBJECT_ENTRY
)
1780 __sync_synchronize();
1782 /* Link up the entry itself */
1783 r
= link_entry_into_array(f
,
1784 &f
->header
->entry_array_offset
,
1785 &f
->header
->n_entries
,
1790 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1792 if (f
->header
->head_entry_realtime
== 0)
1793 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1795 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1796 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1798 /* Link up the items */
1799 n
= journal_file_entry_n_items(o
);
1800 for (i
= 0; i
< n
; i
++) {
1801 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1809 static int journal_file_append_entry_internal(
1811 const dual_timestamp
*ts
,
1813 const EntryItem items
[], unsigned n_items
,
1815 Object
**ret
, uint64_t *offset
) {
1823 assert(items
|| n_items
== 0);
1826 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1828 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1832 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1833 memcpy_safe(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1834 o
->entry
.realtime
= htole64(ts
->realtime
);
1835 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1836 o
->entry
.xor_hash
= htole64(xor_hash
);
1837 o
->entry
.boot_id
= f
->header
->boot_id
;
1840 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1845 r
= journal_file_link_entry(f
, o
, np
);
1858 void journal_file_post_change(JournalFile
*f
) {
1861 /* inotify() does not receive IN_MODIFY events from file
1862 * accesses done via mmap(). After each access we hence
1863 * trigger IN_MODIFY by truncating the journal file to its
1864 * current size which triggers IN_MODIFY. */
1866 __sync_synchronize();
1868 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1869 log_debug_errno(errno
, "Failed to truncate file to its own size: %m");
1872 static int post_change_thunk(sd_event_source
*timer
, uint64_t usec
, void *userdata
) {
1875 journal_file_post_change(userdata
);
1880 static void schedule_post_change(JournalFile
*f
) {
1881 sd_event_source
*timer
;
1886 assert(f
->post_change_timer
);
1888 timer
= f
->post_change_timer
;
1890 r
= sd_event_source_get_enabled(timer
, &enabled
);
1892 log_debug_errno(r
, "Failed to get ftruncate timer state: %m");
1896 if (enabled
== SD_EVENT_ONESHOT
)
1899 r
= sd_event_now(sd_event_source_get_event(timer
), CLOCK_MONOTONIC
, &now
);
1901 log_debug_errno(r
, "Failed to get clock's now for scheduling ftruncate: %m");
1905 r
= sd_event_source_set_time(timer
, now
+f
->post_change_timer_period
);
1907 log_debug_errno(r
, "Failed to set time for scheduling ftruncate: %m");
1911 r
= sd_event_source_set_enabled(timer
, SD_EVENT_ONESHOT
);
1913 log_debug_errno(r
, "Failed to enable scheduled ftruncate: %m");
1920 /* On failure, let's simply post the change immediately. */
1921 journal_file_post_change(f
);
1924 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1925 int journal_file_enable_post_change_timer(JournalFile
*f
, sd_event
*e
, usec_t t
) {
1926 _cleanup_(sd_event_source_unrefp
) sd_event_source
*timer
= NULL
;
1930 assert_return(!f
->post_change_timer
, -EINVAL
);
1934 r
= sd_event_add_time(e
, &timer
, CLOCK_MONOTONIC
, 0, 0, post_change_thunk
, f
);
1938 r
= sd_event_source_set_enabled(timer
, SD_EVENT_OFF
);
1942 f
->post_change_timer
= timer
;
1944 f
->post_change_timer_period
= t
;
1949 static int entry_item_cmp(const void *_a
, const void *_b
) {
1950 const EntryItem
*a
= _a
, *b
= _b
;
1952 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1954 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1959 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1963 uint64_t xor_hash
= 0;
1964 struct dual_timestamp _ts
;
1968 assert(iovec
|| n_iovec
== 0);
1971 dual_timestamp_get(&_ts
);
1976 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1981 /* alloca() can't take 0, hence let's allocate at least one */
1982 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1984 for (i
= 0; i
< n_iovec
; i
++) {
1988 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1992 xor_hash
^= le64toh(o
->data
.hash
);
1993 items
[i
].object_offset
= htole64(p
);
1994 items
[i
].hash
= o
->data
.hash
;
1997 /* Order by the position on disk, in order to improve seek
1998 * times for rotating media. */
1999 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
2001 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
2003 /* If the memory mapping triggered a SIGBUS then we return an
2004 * IO error and ignore the error code passed down to us, since
2005 * it is very likely just an effect of a nullified replacement
2008 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
2011 if (f
->post_change_timer
)
2012 schedule_post_change(f
);
2014 journal_file_post_change(f
);
/* Cached position within an entry-array chain, used to avoid rescanning
 * the chain from its head on repeated lookups. */
typedef struct ChainCacheItem {
        uint64_t first;      /* the array at the beginning of the chain */
        uint64_t array;      /* the cached array */
        uint64_t begin;      /* the first item in the cached array */
        uint64_t total;      /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
2027 static void chain_cache_put(
2034 uint64_t last_index
) {
2037 /* If the chain item to cache for this chain is the
2038 * first one it's not worth caching anything */
2042 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
2043 ci
= ordered_hashmap_steal_first(h
);
2046 ci
= new(ChainCacheItem
, 1);
2053 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
2058 assert(ci
->first
== first
);
2063 ci
->last_index
= last_index
;
2066 static int generic_array_get(
2070 Object
**ret
, uint64_t *offset
) {
2073 uint64_t p
= 0, a
, t
= 0;
2081 /* Try the chain cache first */
2082 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2083 if (ci
&& i
> ci
->total
) {
2092 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
2096 k
= journal_file_entry_array_n_items(o
);
2098 p
= le64toh(o
->entry_array
.items
[i
]);
2104 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
2110 /* Let's cache this item for the next invocation */
2111 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
2113 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2126 static int generic_array_get_plus_one(
2131 Object
**ret
, uint64_t *offset
) {
2140 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2153 return generic_array_get(f
, first
, i
-1, ret
, offset
);
2162 static int generic_array_bisect(
2167 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2168 direction_t direction
,
2173 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
2174 bool subtract_one
= false;
2175 Object
*o
, *array
= NULL
;
2180 assert(test_object
);
2182 /* Start with the first array in the chain */
2185 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2186 if (ci
&& n
> ci
->total
) {
2187 /* Ah, we have iterated this bisection array chain
2188 * previously! Let's see if we can skip ahead in the
2189 * chain, as far as the last time. But we can't jump
2190 * backwards in the chain, so let's check that
2193 r
= test_object(f
, ci
->begin
, needle
);
2197 if (r
== TEST_LEFT
) {
2198 /* OK, what we are looking for is right of the
2199 * begin of this EntryArray, so let's jump
2200 * straight to previously cached array in the
2206 last_index
= ci
->last_index
;
2211 uint64_t left
, right
, k
, lp
;
2213 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
2217 k
= journal_file_entry_array_n_items(array
);
2223 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
2227 r
= test_object(f
, p
, needle
);
2228 if (r
== -EBADMSG
) {
2229 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2236 if (r
== TEST_FOUND
)
2237 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2239 if (r
== TEST_RIGHT
) {
2243 if (last_index
!= (uint64_t) -1) {
2244 assert(last_index
<= right
);
2246 /* If we cached the last index we
2247 * looked at, let's try to not to jump
2248 * too wildly around and see if we can
2249 * limit the range to look at early to
2250 * the immediate neighbors of the last
2251 * index we looked at. */
2253 if (last_index
> 0) {
2254 uint64_t x
= last_index
- 1;
2256 p
= le64toh(array
->entry_array
.items
[x
]);
2260 r
= test_object(f
, p
, needle
);
2264 if (r
== TEST_FOUND
)
2265 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2267 if (r
== TEST_RIGHT
)
2273 if (last_index
< right
) {
2274 uint64_t y
= last_index
+ 1;
2276 p
= le64toh(array
->entry_array
.items
[y
]);
2280 r
= test_object(f
, p
, needle
);
2284 if (r
== TEST_FOUND
)
2285 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2287 if (r
== TEST_RIGHT
)
2295 if (left
== right
) {
2296 if (direction
== DIRECTION_UP
)
2297 subtract_one
= true;
2303 assert(left
< right
);
2304 i
= (left
+ right
) / 2;
2306 p
= le64toh(array
->entry_array
.items
[i
]);
2310 r
= test_object(f
, p
, needle
);
2311 if (r
== -EBADMSG
) {
2312 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2319 if (r
== TEST_FOUND
)
2320 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2322 if (r
== TEST_RIGHT
)
2330 if (direction
== DIRECTION_UP
) {
2332 subtract_one
= true;
2343 last_index
= (uint64_t) -1;
2344 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
2350 if (subtract_one
&& t
== 0 && i
== 0)
2353 /* Let's cache this item for the next invocation */
2354 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
2356 if (subtract_one
&& i
== 0)
2358 else if (subtract_one
)
2359 p
= le64toh(array
->entry_array
.items
[i
-1]);
2361 p
= le64toh(array
->entry_array
.items
[i
]);
2363 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2374 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
2379 static int generic_array_bisect_plus_one(
2385 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2386 direction_t direction
,
2392 bool step_back
= false;
2396 assert(test_object
);
2401 /* This bisects the array in object 'first', but first checks
2403 r
= test_object(f
, extra
, needle
);
2407 if (r
== TEST_FOUND
)
2408 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2410 /* if we are looking with DIRECTION_UP then we need to first
2411 see if in the actual array there is a matching entry, and
2412 return the last one of that. But if there isn't any we need
2413 to return this one. Hence remember this, and return it
2416 step_back
= direction
== DIRECTION_UP
;
2418 if (r
== TEST_RIGHT
) {
2419 if (direction
== DIRECTION_DOWN
)
2425 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
2427 if (r
== 0 && step_back
)
2436 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2452 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2458 else if (p
< needle
)
2464 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2471 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2475 if (le64toh(o
->entry
.seqnum
) == needle
)
2477 else if (le64toh(o
->entry
.seqnum
) < needle
)
2483 int journal_file_move_to_entry_by_seqnum(
2486 direction_t direction
,
2492 return generic_array_bisect(f
,
2493 le64toh(f
->header
->entry_array_offset
),
2494 le64toh(f
->header
->n_entries
),
2501 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2508 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2512 if (le64toh(o
->entry
.realtime
) == needle
)
2514 else if (le64toh(o
->entry
.realtime
) < needle
)
2520 int journal_file_move_to_entry_by_realtime(
2523 direction_t direction
,
2529 return generic_array_bisect(f
,
2530 le64toh(f
->header
->entry_array_offset
),
2531 le64toh(f
->header
->n_entries
),
2533 test_object_realtime
,
2538 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2545 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2549 if (le64toh(o
->entry
.monotonic
) == needle
)
2551 else if (le64toh(o
->entry
.monotonic
) < needle
)
2557 static int find_data_object_by_boot_id(
2563 char t
[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2565 sd_id128_to_string(boot_id
, t
+ 9);
2566 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2569 int journal_file_move_to_entry_by_monotonic(
2573 direction_t direction
,
2582 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2588 return generic_array_bisect_plus_one(f
,
2589 le64toh(o
->data
.entry_offset
),
2590 le64toh(o
->data
.entry_array_offset
),
2591 le64toh(o
->data
.n_entries
),
2593 test_object_monotonic
,
2598 void journal_file_reset_location(JournalFile
*f
) {
2599 f
->location_type
= LOCATION_HEAD
;
2600 f
->current_offset
= 0;
2601 f
->current_seqnum
= 0;
2602 f
->current_realtime
= 0;
2603 f
->current_monotonic
= 0;
2604 zero(f
->current_boot_id
);
2605 f
->current_xor_hash
= 0;
2608 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2609 f
->location_type
= LOCATION_SEEK
;
2610 f
->current_offset
= offset
;
2611 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2612 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2613 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2614 f
->current_boot_id
= o
->entry
.boot_id
;
2615 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2618 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2623 assert(af
->location_type
== LOCATION_SEEK
);
2624 assert(bf
->location_type
== LOCATION_SEEK
);
2626 /* If contents and timestamps match, these entries are
2627 * identical, even if the seqnum does not match */
2628 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2629 af
->current_monotonic
== bf
->current_monotonic
&&
2630 af
->current_realtime
== bf
->current_realtime
&&
2631 af
->current_xor_hash
== bf
->current_xor_hash
)
2634 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2636 /* If this is from the same seqnum source, compare
2638 if (af
->current_seqnum
< bf
->current_seqnum
)
2640 if (af
->current_seqnum
> bf
->current_seqnum
)
2643 /* Wow! This is weird, different data but the same
2644 * seqnums? Something is borked, but let's make the
2645 * best of it and compare by time. */
2648 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2650 /* If the boot id matches, compare monotonic time */
2651 if (af
->current_monotonic
< bf
->current_monotonic
)
2653 if (af
->current_monotonic
> bf
->current_monotonic
)
2657 /* Otherwise, compare UTC time */
2658 if (af
->current_realtime
< bf
->current_realtime
)
2660 if (af
->current_realtime
> bf
->current_realtime
)
2663 /* Finally, compare by contents */
2664 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2666 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2672 static int bump_array_index(uint64_t *i
, direction_t direction
, uint64_t n
) {
2674 /* Increase or decrease the specified index, in the right direction. */
2676 if (direction
== DIRECTION_DOWN
) {
2691 static bool check_properly_ordered(uint64_t new_offset
, uint64_t old_offset
, direction_t direction
) {
2693 /* Consider it an error if any of the two offsets is uninitialized */
2694 if (old_offset
== 0 || new_offset
== 0)
2697 /* If we go down, the new offset must be larger than the old one. */
2698 return direction
== DIRECTION_DOWN
?
2699 new_offset
> old_offset
:
2700 new_offset
< old_offset
;
2703 int journal_file_next_entry(
2706 direction_t direction
,
2707 Object
**ret
, uint64_t *offset
) {
2715 n
= le64toh(f
->header
->n_entries
);
2720 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2722 r
= generic_array_bisect(f
,
2723 le64toh(f
->header
->entry_array_offset
),
2724 le64toh(f
->header
->n_entries
),
2733 r
= bump_array_index(&i
, direction
, n
);
2738 /* And jump to it */
2740 r
= generic_array_get(f
,
2741 le64toh(f
->header
->entry_array_offset
),
2749 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2750 * the next one might work for us instead. */
2751 log_debug_errno(r
, "Entry item %" PRIu64
" is bad, skipping over it.", i
);
2753 r
= bump_array_index(&i
, direction
, n
);
2758 /* Ensure our array is properly ordered. */
2759 if (p
> 0 && !check_properly_ordered(ofs
, p
, direction
)) {
2760 log_debug("%s: entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2770 int journal_file_next_entry_for_data(
2772 Object
*o
, uint64_t p
,
2773 uint64_t data_offset
,
2774 direction_t direction
,
2775 Object
**ret
, uint64_t *offset
) {
2782 assert(p
> 0 || !o
);
2784 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2788 n
= le64toh(d
->data
.n_entries
);
2793 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2795 if (o
->object
.type
!= OBJECT_ENTRY
)
2798 r
= generic_array_bisect_plus_one(f
,
2799 le64toh(d
->data
.entry_offset
),
2800 le64toh(d
->data
.entry_array_offset
),
2801 le64toh(d
->data
.n_entries
),
2811 r
= bump_array_index(&i
, direction
, n
);
2817 r
= generic_array_get_plus_one(f
,
2818 le64toh(d
->data
.entry_offset
),
2819 le64toh(d
->data
.entry_array_offset
),
2827 log_debug_errno(r
, "Data entry item %" PRIu64
" is bad, skipping over it.", i
);
2829 r
= bump_array_index(&i
, direction
, n
);
2834 /* Ensure our array is properly ordered. */
2835 if (p
> 0 && check_properly_ordered(ofs
, p
, direction
)) {
2836 log_debug("%s data entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2846 int journal_file_move_to_entry_by_offset_for_data(
2848 uint64_t data_offset
,
2850 direction_t direction
,
2851 Object
**ret
, uint64_t *offset
) {
2858 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2862 return generic_array_bisect_plus_one(f
,
2863 le64toh(d
->data
.entry_offset
),
2864 le64toh(d
->data
.entry_array_offset
),
2865 le64toh(d
->data
.n_entries
),
2872 int journal_file_move_to_entry_by_monotonic_for_data(
2874 uint64_t data_offset
,
2877 direction_t direction
,
2878 Object
**ret
, uint64_t *offset
) {
2886 /* First, seek by time */
2887 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2893 r
= generic_array_bisect_plus_one(f
,
2894 le64toh(o
->data
.entry_offset
),
2895 le64toh(o
->data
.entry_array_offset
),
2896 le64toh(o
->data
.n_entries
),
2898 test_object_monotonic
,
2904 /* And now, continue seeking until we find an entry that
2905 * exists in both bisection arrays */
2911 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2915 r
= generic_array_bisect_plus_one(f
,
2916 le64toh(d
->data
.entry_offset
),
2917 le64toh(d
->data
.entry_array_offset
),
2918 le64toh(d
->data
.n_entries
),
2926 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2930 r
= generic_array_bisect_plus_one(f
,
2931 le64toh(o
->data
.entry_offset
),
2932 le64toh(o
->data
.entry_array_offset
),
2933 le64toh(o
->data
.n_entries
),
2955 int journal_file_move_to_entry_by_seqnum_for_data(
2957 uint64_t data_offset
,
2959 direction_t direction
,
2960 Object
**ret
, uint64_t *offset
) {
2967 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2971 return generic_array_bisect_plus_one(f
,
2972 le64toh(d
->data
.entry_offset
),
2973 le64toh(d
->data
.entry_array_offset
),
2974 le64toh(d
->data
.n_entries
),
2981 int journal_file_move_to_entry_by_realtime_for_data(
2983 uint64_t data_offset
,
2985 direction_t direction
,
2986 Object
**ret
, uint64_t *offset
) {
2993 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2997 return generic_array_bisect_plus_one(f
,
2998 le64toh(d
->data
.entry_offset
),
2999 le64toh(d
->data
.entry_array_offset
),
3000 le64toh(d
->data
.n_entries
),
3002 test_object_realtime
,
3007 void journal_file_dump(JournalFile
*f
) {
3015 journal_file_print_header(f
);
3017 p
= le64toh(f
->header
->header_size
);
3019 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
3023 switch (o
->object
.type
) {
3026 printf("Type: OBJECT_UNUSED\n");
3030 printf("Type: OBJECT_DATA\n");
3034 printf("Type: OBJECT_FIELD\n");
3038 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
3039 le64toh(o
->entry
.seqnum
),
3040 le64toh(o
->entry
.monotonic
),
3041 le64toh(o
->entry
.realtime
));
3044 case OBJECT_FIELD_HASH_TABLE
:
3045 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3048 case OBJECT_DATA_HASH_TABLE
:
3049 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3052 case OBJECT_ENTRY_ARRAY
:
3053 printf("Type: OBJECT_ENTRY_ARRAY\n");
3057 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
3058 le64toh(o
->tag
.seqnum
),
3059 le64toh(o
->tag
.epoch
));
3063 printf("Type: unknown (%i)\n", o
->object
.type
);
3067 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
3068 printf("Flags: %s\n",
3069 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
3071 if (p
== le64toh(f
->header
->tail_object_offset
))
3074 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
3079 log_error("File corrupt");
3082 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
3085 x
= format_timestamp(buf
, l
, t
);
3091 void journal_file_print_header(JournalFile
*f
) {
3092 char a
[33], b
[33], c
[33], d
[33];
3093 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
3095 char bytes
[FORMAT_BYTES_MAX
];
3100 printf("File Path: %s\n"
3104 "Sequential Number ID: %s\n"
3106 "Compatible Flags:%s%s\n"
3107 "Incompatible Flags:%s%s%s\n"
3108 "Header size: %"PRIu64
"\n"
3109 "Arena size: %"PRIu64
"\n"
3110 "Data Hash Table Size: %"PRIu64
"\n"
3111 "Field Hash Table Size: %"PRIu64
"\n"
3112 "Rotate Suggested: %s\n"
3113 "Head Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3114 "Tail Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3115 "Head Realtime Timestamp: %s (%"PRIx64
")\n"
3116 "Tail Realtime Timestamp: %s (%"PRIx64
")\n"
3117 "Tail Monotonic Timestamp: %s (%"PRIx64
")\n"
3118 "Objects: %"PRIu64
"\n"
3119 "Entry Objects: %"PRIu64
"\n",
3121 sd_id128_to_string(f
->header
->file_id
, a
),
3122 sd_id128_to_string(f
->header
->machine_id
, b
),
3123 sd_id128_to_string(f
->header
->boot_id
, c
),
3124 sd_id128_to_string(f
->header
->seqnum_id
, d
),
3125 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
3126 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
3127 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
3128 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
3129 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
3130 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
3131 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
3132 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
3133 le64toh(f
->header
->header_size
),
3134 le64toh(f
->header
->arena_size
),
3135 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3136 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
3137 yes_no(journal_file_rotate_suggested(f
, 0)),
3138 le64toh(f
->header
->head_entry_seqnum
), le64toh(f
->header
->head_entry_seqnum
),
3139 le64toh(f
->header
->tail_entry_seqnum
), le64toh(f
->header
->tail_entry_seqnum
),
3140 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)), le64toh(f
->header
->head_entry_realtime
),
3141 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)), le64toh(f
->header
->tail_entry_realtime
),
3142 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
), le64toh(f
->header
->tail_entry_monotonic
),
3143 le64toh(f
->header
->n_objects
),
3144 le64toh(f
->header
->n_entries
));
3146 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3147 printf("Data Objects: %"PRIu64
"\n"
3148 "Data Hash Table Fill: %.1f%%\n",
3149 le64toh(f
->header
->n_data
),
3150 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
3152 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3153 printf("Field Objects: %"PRIu64
"\n"
3154 "Field Hash Table Fill: %.1f%%\n",
3155 le64toh(f
->header
->n_fields
),
3156 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
3158 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
3159 printf("Tag Objects: %"PRIu64
"\n",
3160 le64toh(f
->header
->n_tags
));
3161 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
3162 printf("Entry Array Objects: %"PRIu64
"\n",
3163 le64toh(f
->header
->n_entry_arrays
));
3165 if (fstat(f
->fd
, &st
) >= 0)
3166 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
3169 static int journal_file_warn_btrfs(JournalFile
*f
) {
3175 /* Before we write anything, check if the COW logic is turned
3176 * off on btrfs. Given our write pattern that is quite
3177 * unfriendly to COW file systems this should greatly improve
3178 * performance on COW file systems, such as btrfs, at the
3179 * expense of data integrity features (which shouldn't be too
3180 * bad, given that we do our own checksumming). */
3182 r
= btrfs_is_filesystem(f
->fd
);
3184 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
3188 r
= read_attr_fd(f
->fd
, &attrs
);
3190 return log_warning_errno(r
, "Failed to read file attributes: %m");
3192 if (attrs
& FS_NOCOW_FL
) {
3193 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3197 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3198 "This is likely to slow down journal access substantially, please consider turning "
3199 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
3204 int journal_file_open(
3210 uint64_t compress_threshold_bytes
,
3212 JournalMetrics
*metrics
,
3213 MMapCache
*mmap_cache
,
3214 Set
*deferred_closes
,
3215 JournalFile
*template,
3216 JournalFile
**ret
) {
3218 bool newly_created
= false;
3222 char bytes
[FORMAT_BYTES_MAX
];
3225 assert(fd
>= 0 || fname
);
3227 if (!IN_SET((flags
& O_ACCMODE
), O_RDONLY
, O_RDWR
))
3230 if (fname
&& (flags
& O_CREAT
) && !endswith(fname
, ".journal"))
3233 f
= new0(JournalFile
, 1);
3241 f
->prot
= prot_from_flags(flags
);
3242 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
3244 f
->compress_lz4
= compress
;
3246 f
->compress_xz
= compress
;
3249 if (compress_threshold_bytes
== (uint64_t) -1)
3250 f
->compress_threshold_bytes
= DEFAULT_COMPRESS_THRESHOLD
;
3252 f
->compress_threshold_bytes
= MAX(MIN_COMPRESS_THRESHOLD
, compress_threshold_bytes
);
3258 log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3259 yes_no(f
->seal
), yes_no(JOURNAL_FILE_COMPRESS(f
)),
3260 format_bytes(bytes
, sizeof(bytes
), f
->compress_threshold_bytes
));
3263 f
->mmap
= mmap_cache_ref(mmap_cache
);
3265 f
->mmap
= mmap_cache_new();
3273 f
->path
= strdup(fname
);
3281 /* If we don't know the path, fill in something explanatory and vaguely useful */
3282 if (asprintf(&f
->path
, "/proc/self/%i", fd
) < 0) {
3288 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
3289 if (!f
->chain_cache
) {
3295 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3296 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3297 * it doesn't hurt in that case. */
3299 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
|O_NONBLOCK
, f
->mode
);
3305 /* fds we opened here by us should also be closed by us. */
3308 r
= fd_nonblock(f
->fd
, false);
3313 f
->cache_fd
= mmap_cache_add_fd(f
->mmap
, f
->fd
);
3319 r
= journal_file_fstat(f
);
3323 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
3325 (void) journal_file_warn_btrfs(f
);
3327 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3328 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3329 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3330 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3331 * solely on mtime/atime/ctime of the file. */
3332 (void) fd_setcrtime(f
->fd
, 0);
3335 /* Try to load the FSPRG state, and if we can't, then
3336 * just don't do sealing */
3338 r
= journal_file_fss_load(f
);
3344 r
= journal_file_init_header(f
, template);
3348 r
= journal_file_fstat(f
);
3352 newly_created
= true;
3355 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
3360 r
= mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
, NULL
);
3366 if (!newly_created
) {
3367 set_clear_with_destructor(deferred_closes
, journal_file_close
);
3369 r
= journal_file_verify_header(f
);
3375 if (!newly_created
&& f
->writable
) {
3376 r
= journal_file_fss_load(f
);
3384 journal_default_metrics(metrics
, f
->fd
);
3385 f
->metrics
= *metrics
;
3386 } else if (template)
3387 f
->metrics
= template->metrics
;
3389 r
= journal_file_refresh_header(f
);
3395 r
= journal_file_hmac_setup(f
);
3400 if (newly_created
) {
3401 r
= journal_file_setup_field_hash_table(f
);
3405 r
= journal_file_setup_data_hash_table(f
);
3410 r
= journal_file_append_first_tag(f
);
3416 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
)) {
3421 if (template && template->post_change_timer
) {
3422 r
= journal_file_enable_post_change_timer(
3424 sd_event_source_get_event(template->post_change_timer
),
3425 template->post_change_timer_period
);
3431 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3438 if (f
->cache_fd
&& mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
3441 (void) journal_file_close(f
);
3446 int journal_file_rotate(JournalFile
**f
, bool compress
, uint64_t compress_threshold_bytes
, bool seal
, Set
*deferred_closes
) {
3447 _cleanup_free_
char *p
= NULL
;
3449 JournalFile
*old_file
, *new_file
= NULL
;
3457 if (!old_file
->writable
)
3460 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3461 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3462 if (path_startswith(old_file
->path
, "/proc/self/fd"))
3465 if (!endswith(old_file
->path
, ".journal"))
3468 l
= strlen(old_file
->path
);
3469 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
3470 (int) l
- 8, old_file
->path
,
3471 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
3472 le64toh((*f
)->header
->head_entry_seqnum
),
3473 le64toh((*f
)->header
->head_entry_realtime
));
3477 /* Try to rename the file to the archived version. If the file
3478 * already was deleted, we'll get ENOENT, let's ignore that
3480 r
= rename(old_file
->path
, p
);
3481 if (r
< 0 && errno
!= ENOENT
)
3484 /* Sync the rename to disk */
3485 (void) fsync_directory_of_file(old_file
->fd
);
3487 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3488 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3489 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3490 * would result in the rotated journal never getting fsync() called before closing.
3491 * Now we simply queue the archive state by setting an archive bit, leaving the state
3492 * as STATE_ONLINE so proper offlining occurs. */
3493 old_file
->archive
= true;
3495 /* Currently, btrfs is not very good with out write patterns
3496 * and fragments heavily. Let's defrag our journal files when
3497 * we archive them */
3498 old_file
->defrag_on_close
= true;
3500 r
= journal_file_open(-1, old_file
->path
, old_file
->flags
, old_file
->mode
, compress
,
3501 compress_threshold_bytes
, seal
, NULL
, old_file
->mmap
, deferred_closes
,
3502 old_file
, &new_file
);
3504 if (deferred_closes
&&
3505 set_put(deferred_closes
, old_file
) >= 0)
3506 (void) journal_file_set_offline(old_file
, false);
3508 (void) journal_file_close(old_file
);
3514 int journal_file_open_reliably(
3519 uint64_t compress_threshold_bytes
,
3521 JournalMetrics
*metrics
,
3522 MMapCache
*mmap_cache
,
3523 Set
*deferred_closes
,
3524 JournalFile
*template,
3525 JournalFile
**ret
) {
3529 _cleanup_free_
char *p
= NULL
;
3531 r
= journal_file_open(-1, fname
, flags
, mode
, compress
, compress_threshold_bytes
, seal
, metrics
, mmap_cache
,
3532 deferred_closes
, template, ret
);
3534 -EBADMSG
, /* Corrupted */
3535 -ENODATA
, /* Truncated */
3536 -EHOSTDOWN
, /* Other machine */
3537 -EPROTONOSUPPORT
, /* Incompatible feature */
3538 -EBUSY
, /* Unclean shutdown */
3539 -ESHUTDOWN
, /* Already archived */
3540 -EIO
, /* IO error, including SIGBUS on mmap */
3541 -EIDRM
, /* File has been deleted */
3542 -ETXTBSY
)) /* File is from the future */
3545 if ((flags
& O_ACCMODE
) == O_RDONLY
)
3548 if (!(flags
& O_CREAT
))
3551 if (!endswith(fname
, ".journal"))
3554 /* The file is corrupted. Rotate it away and try it again (but only once) */
3557 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
3559 now(CLOCK_REALTIME
),
3563 if (rename(fname
, p
) < 0)
3566 /* btrfs doesn't cope well with our write pattern and
3567 * fragments heavily. Let's defrag all files we rotate */
3569 (void) chattr_path(p
, 0, FS_NOCOW_FL
);
3570 (void) btrfs_defrag(p
);
3572 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
3574 return journal_file_open(-1, fname
, flags
, mode
, compress
, compress_threshold_bytes
, seal
, metrics
, mmap_cache
,
3575 deferred_closes
, template, ret
);
3578 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
3580 uint64_t q
, xor_hash
= 0;
3593 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
3594 ts
.realtime
= le64toh(o
->entry
.realtime
);
3596 n
= journal_file_entry_n_items(o
);
3597 /* alloca() can't take 0, hence let's allocate at least one */
3598 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
3600 for (i
= 0; i
< n
; i
++) {
3607 q
= le64toh(o
->entry
.items
[i
].object_offset
);
3608 le_hash
= o
->entry
.items
[i
].hash
;
3610 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
3614 if (le_hash
!= o
->data
.hash
)
3617 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
3620 /* We hit the limit on 32bit machines */
3621 if ((uint64_t) t
!= l
)
3624 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
3625 #if HAVE_XZ || HAVE_LZ4
3628 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
3629 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
3633 data
= from
->compress_buffer
;
3636 return -EPROTONOSUPPORT
;
3639 data
= o
->data
.payload
;
3641 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
3645 xor_hash
^= le64toh(u
->data
.hash
);
3646 items
[i
].object_offset
= htole64(h
);
3647 items
[i
].hash
= u
->data
.hash
;
3649 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
3654 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
3656 if (mmap_cache_got_sigbus(to
->mmap
, to
->cache_fd
))
3662 void journal_reset_metrics(JournalMetrics
*m
) {
3665 /* Set everything to "pick automatic values". */
3667 *m
= (JournalMetrics
) {
3668 .min_use
= (uint64_t) -1,
3669 .max_use
= (uint64_t) -1,
3670 .min_size
= (uint64_t) -1,
3671 .max_size
= (uint64_t) -1,
3672 .keep_free
= (uint64_t) -1,
3673 .n_max_files
= (uint64_t) -1,
3677 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
3678 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
3685 if (fstatvfs(fd
, &ss
) >= 0)
3686 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
3688 log_debug_errno(errno
, "Failed to determine disk size: %m");
3692 if (m
->max_use
== (uint64_t) -1) {
3695 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3697 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3698 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3700 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3701 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3703 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3705 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3707 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3708 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3711 if (m
->min_use
== (uint64_t) -1)
3712 m
->min_use
= DEFAULT_MIN_USE
;
3714 if (m
->min_use
> m
->max_use
)
3715 m
->min_use
= m
->max_use
;
3717 if (m
->max_size
== (uint64_t) -1) {
3718 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3720 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3721 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3723 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3725 if (m
->max_size
!= 0) {
3726 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3727 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3729 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3730 m
->max_use
= m
->max_size
*2;
3733 if (m
->min_size
== (uint64_t) -1)
3734 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3736 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3738 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3739 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3741 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3742 m
->max_size
= m
->min_size
;
3745 if (m
->keep_free
== (uint64_t) -1) {
3748 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3750 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3751 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3754 m
->keep_free
= DEFAULT_KEEP_FREE
;
3757 if (m
->n_max_files
== (uint64_t) -1)
3758 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3760 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3761 format_bytes(a
, sizeof(a
), m
->min_use
),
3762 format_bytes(b
, sizeof(b
), m
->max_use
),
3763 format_bytes(c
, sizeof(c
), m
->max_size
),
3764 format_bytes(d
, sizeof(d
), m
->min_size
),
3765 format_bytes(e
, sizeof(e
), m
->keep_free
),
3769 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3775 if (f
->header
->head_entry_realtime
== 0)
3778 *from
= le64toh(f
->header
->head_entry_realtime
);
3782 if (f
->header
->tail_entry_realtime
== 0)
3785 *to
= le64toh(f
->header
->tail_entry_realtime
);
3791 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3799 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3803 if (le64toh(o
->data
.n_entries
) <= 0)
3807 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3811 *from
= le64toh(o
->entry
.monotonic
);
3815 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3819 r
= generic_array_get_plus_one(f
,
3820 le64toh(o
->data
.entry_offset
),
3821 le64toh(o
->data
.entry_array_offset
),
3822 le64toh(o
->data
.n_entries
)-1,
3827 *to
= le64toh(o
->entry
.monotonic
);
3833 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3837 /* If we gained new header fields we gained new features,
3838 * hence suggest a rotation */
3839 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3840 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3844 /* Let's check if the hash tables grew over a certain fill
3845 * level (75%, borrowing this value from Java's hash table
3846 * implementation), and if so suggest a rotation. To calculate
3847 * the fill level we need the n_data field, which only exists
3848 * in newer versions. */
3850 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3851 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3852 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3854 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3855 le64toh(f
->header
->n_data
),
3856 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3857 (unsigned long long) f
->last_stat
.st_size
,
3858 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3862 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3863 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3864 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3866 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3867 le64toh(f
->header
->n_fields
),
3868 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3872 /* Are the data objects properly indexed by field objects? */
3873 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3874 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3875 le64toh(f
->header
->n_data
) > 0 &&
3876 le64toh(f
->header
->n_fields
) == 0)
3879 if (max_file_usec
> 0) {
3882 h
= le64toh(f
->header
->head_entry_realtime
);
3883 t
= now(CLOCK_REALTIME
);
3885 if (h
> 0 && t
> h
+ max_file_usec
)