src/journal/journal-file.c

   1 /***
   2   This file is part of systemd.
   3
   4   Copyright 2011 Lennart Poettering
   5
   6   systemd is free software; you can redistribute it and/or modify it
   7   under the terms of the GNU Lesser General Public License as published by
   8   the Free Software Foundation; either version 2.1 of the License, or
   9   (at your option) any later version.
  10
  11   systemd is distributed in the hope that it will be useful, but
  12   WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public License
  17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  18 ***/
  19
  20 #include <errno.h>
  21 #include <fcntl.h>
  22 #include <linux/fs.h>
  23 #include <pthread.h>
  24 #include <stddef.h>
  25 #include <sys/mman.h>
  26 #include <sys/statvfs.h>
  27 #include <sys/uio.h>
  28 #include <unistd.h>
  29
  30 #include "alloc-util.h"
  31 #include "btrfs-util.h"
  32 #include "chattr-util.h"
  33 #include "compress.h"
  34 #include "fd-util.h"
  35 #include "journal-authenticate.h"
  36 #include "journal-def.h"
  37 #include "journal-file.h"
  38 #include "lookup3.h"
  39 #include "parse-util.h"
  40 #include "path-util.h"
  41 #include "random-util.h"
  42 #include "sd-event.h"
  43 #include "set.h"
  44 #include "string-util.h"
  45 #include "strv.h"
  46 #include "xattr-util.h"
  47
  48 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
  49 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
  50
  51 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
  52
  53 /* This is the minimum journal file size */
  54 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL)                 /* 512 KiB */
  55
  56 /* These are the lower and upper bounds if we deduce the max_use value
  57  * from the file system size */
  58 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
  59 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
  60
  61 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
  62 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */
  63
  64 /* This is the upper bound if we deduce max_size from max_use */
  65 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
  66
  67 /* This is the upper bound if we deduce the keep_free value from the
  68  * file system size */
  69 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
  70
  71 /* This is the keep_free value when we can't determine the system
  72  * size */
  73 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
  74
  75 /* This is the default maximum number of journal files to keep around. */
  76 #define DEFAULT_N_MAX_FILES (100)
  77
  78 /* n_data was the first entry we added after the initial file format design */
  79 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
  80
  81 /* How many entries to keep in the entry array chain cache at max */
  82 #define CHAIN_CACHE_MAX 20
  83
  84 /* How much to increase the journal file size at once each time we allocate something new. */
  85 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
  86
  87 /* Reread fstat() of the file for detecting deletions at least this often */
  88 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
  89
  90 /* The mmap context to use for the header we pick as one above the last defined typed */
  91 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
  92
  93 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
  94  * As a result we use atomic operations on f->offline_state for inter-thread communications with
  95  * journal_file_set_offline() and journal_file_set_online(). */
  96 static void journal_file_set_offline_internal(JournalFile *f) {
  97         assert(f);
  98         assert(f->fd >= 0);
  99         assert(f->header);
 100
 101         for (;;) {
 102                 switch (f->offline_state) {
 103                 case OFFLINE_CANCEL:
 104                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
 105                                 continue;
 106                         return;
 107
 108                 case OFFLINE_AGAIN_FROM_SYNCING:
 109                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
 110                                 continue;
 111                         break;
 112
 113                 case OFFLINE_AGAIN_FROM_OFFLINING:
 114                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
 115                                 continue;
 116                         break;
 117
 118                 case OFFLINE_SYNCING:
 119                         (void) fsync(f->fd);
 120
 121                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
 122                                 continue;
 123
 124                         f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
 125                         (void) fsync(f->fd);
 126                         break;
 127
 128                 case OFFLINE_OFFLINING:
 129                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
 130                                 continue;
 131                         /* fall through */
 132
 133                 case OFFLINE_DONE:
 134                         return;
 135
 136                 case OFFLINE_JOINED:
 137                         log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
 138                         return;
 139                 }
 140         }
 141 }
 142
 143 static void * journal_file_set_offline_thread(void *arg) {
 144         JournalFile *f = arg;
 145
 146         journal_file_set_offline_internal(f);
 147
 148         return NULL;
 149 }
 150
 151 static int journal_file_set_offline_thread_join(JournalFile *f) {
 152         int r;
 153
 154         assert(f);
 155
 156         if (f->offline_state == OFFLINE_JOINED)
 157                 return 0;
 158
 159         r = pthread_join(f->offline_thread, NULL);
 160         if (r)
 161                 return -r;
 162
 163         f->offline_state = OFFLINE_JOINED;
 164
 165         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 166                 return -EIO;
 167
 168         return 0;
 169 }
 170
 171 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
 172 static bool journal_file_set_offline_try_restart(JournalFile *f) {
 173         for (;;) {
 174                 switch (f->offline_state) {
 175                 case OFFLINE_AGAIN_FROM_SYNCING:
 176                 case OFFLINE_AGAIN_FROM_OFFLINING:
 177                         return true;
 178
 179                 case OFFLINE_CANCEL:
 180                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
 181                                 continue;
 182                         return true;
 183
 184                 case OFFLINE_SYNCING:
 185                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
 186                                 continue;
 187                         return true;
 188
 189                 case OFFLINE_OFFLINING:
 190                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
 191                                 continue;
 192                         return true;
 193
 194                 default:
 195                         return false;
 196                 }
 197         }
 198 }
 199
 200 /* Sets a journal offline.
 201  *
 202  * If wait is false then an offline is dispatched in a separate thread for a
 203  * subsequent journal_file_set_offline() or journal_file_set_online() of the
 204  * same journal to synchronize with.
 205  *
 206  * If wait is true, then either an existing offline thread will be restarted
 207  * and joined, or if none exists the offline is simply performed in this
 208  * context without involving another thread.
 209  */
 210 int journal_file_set_offline(JournalFile *f, bool wait) {
 211         bool restarted;
 212         int r;
 213
 214         assert(f);
 215
 216         if (!f->writable)
 217                 return -EPERM;
 218
 219         if (!(f->fd >= 0 && f->header))
 220                 return -EINVAL;
 221
 222         /* An offlining journal is implicitly online and may modify f->header->state,
 223          * we must also join any potentially lingering offline thread when not online. */
 224         if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
 225                 return journal_file_set_offline_thread_join(f);
 226
 227         /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
 228         restarted = journal_file_set_offline_try_restart(f);
 229         if ((restarted && wait) || !restarted) {
 230                 r = journal_file_set_offline_thread_join(f);
 231                 if (r < 0)
 232                         return r;
 233         }
 234
 235         if (restarted)
 236                 return 0;
 237
 238         /* Initiate a new offline. */
 239         f->offline_state = OFFLINE_SYNCING;
 240
 241         if (wait) /* Without using a thread if waiting. */
 242                 journal_file_set_offline_internal(f);
 243         else {
 244                 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
 245                 if (r > 0) {
 246                         f->offline_state = OFFLINE_JOINED;
 247                         return -r;
 248                 }
 249         }
 250
 251         return 0;
 252 }
 253
 254 static int journal_file_set_online(JournalFile *f) {
 255         bool joined = false;
 256
 257         assert(f);
 258
 259         if (!f->writable)
 260                 return -EPERM;
 261
 262         if (!(f->fd >= 0 && f->header))
 263                 return -EINVAL;
 264
 265         while (!joined) {
 266                 switch (f->offline_state) {
 267                 case OFFLINE_JOINED:
 268                         /* No offline thread, no need to wait. */
 269                         joined = true;
 270                         break;
 271
 272                 case OFFLINE_SYNCING:
 273                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
 274                                 continue;
 275                         /* Canceled syncing prior to offlining, no need to wait. */
 276                         break;
 277
 278                 case OFFLINE_AGAIN_FROM_SYNCING:
 279                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
 280                                 continue;
 281                         /* Canceled restart from syncing, no need to wait. */
 282                         break;
 283
 284                 case OFFLINE_AGAIN_FROM_OFFLINING:
 285                         if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
 286                                 continue;
 287                         /* Canceled restart from offlining, must wait for offlining to complete however. */
 288
 289                         /* fall through */
 290                 default: {
 291                         int r;
 292
 293                         r = journal_file_set_offline_thread_join(f);
 294                         if (r < 0)
 295                                 return r;
 296
 297                         joined = true;
 298                         break;
 299                 }
 300                 }
 301         }
 302
 303         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 304                 return -EIO;
 305
 306         switch (f->header->state) {
 307                 case STATE_ONLINE:
 308                         return 0;
 309
 310                 case STATE_OFFLINE:
 311                         f->header->state = STATE_ONLINE;
 312                         (void) fsync(f->fd);
 313                         return 0;
 314
 315                 default:
 316                         return -EINVAL;
 317         }
 318 }
 319
 320 bool journal_file_is_offlining(JournalFile *f) {
 321         assert(f);
 322
 323         __sync_synchronize();
 324
 325         if (f->offline_state == OFFLINE_DONE ||
 326             f->offline_state == OFFLINE_JOINED)
 327                 return false;
 328
 329         return true;
 330 }
 331
 332 JournalFile* journal_file_close(JournalFile *f) {
 333         assert(f);
 334
 335 #ifdef HAVE_GCRYPT
 336         /* Write the final tag */
 337         if (f->seal && f->writable) {
 338                 int r;
 339
 340                 r = journal_file_append_tag(f);
 341                 if (r < 0)
 342                         log_error_errno(r, "Failed to append tag when closing journal: %m");
 343         }
 344 #endif
 345
 346         if (f->post_change_timer) {
 347                 int enabled;
 348
 349                 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
 350                         if (enabled == SD_EVENT_ONESHOT)
 351                                 journal_file_post_change(f);
 352
 353                 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
 354                 sd_event_source_unref(f->post_change_timer);
 355         }
 356
 357         journal_file_set_offline(f, true);
 358
 359         if (f->mmap && f->fd >= 0)
 360                 mmap_cache_close_fd(f->mmap, f->fd);
 361
 362         if (f->fd >= 0 && f->defrag_on_close) {
 363
 364                 /* Be friendly to btrfs: turn COW back on again now,
 365                  * and defragment the file. We won't write to the file
 366                  * ever again, hence remove all fragmentation, and
 367                  * reenable all the good bits COW usually provides
 368                  * (such as data checksumming). */
 369
 370                 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
 371                 (void) btrfs_defrag_fd(f->fd);
 372         }
 373
 374         if (f->close_fd)
 375                 safe_close(f->fd);
 376         free(f->path);
 377
 378         mmap_cache_unref(f->mmap);
 379
 380         ordered_hashmap_free_free(f->chain_cache);
 381
 382 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 383         free(f->compress_buffer);
 384 #endif
 385
 386 #ifdef HAVE_GCRYPT
 387         if (f->fss_file)
 388                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
 389         else
 390                 free(f->fsprg_state);
 391
 392         free(f->fsprg_seed);
 393
 394         if (f->hmac)
 395                 gcry_md_close(f->hmac);
 396 #endif
 397
 398         return mfree(f);
 399 }
 400
 401 void journal_file_close_set(Set *s) {
 402         JournalFile *f;
 403
 404         assert(s);
 405
 406         while ((f = set_steal_first(s)))
 407                 (void) journal_file_close(f);
 408 }
 409
 410 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
 411         Header h = {};
 412         ssize_t k;
 413         int r;
 414
 415         assert(f);
 416
 417         memcpy(h.signature, HEADER_SIGNATURE, 8);
 418         h.header_size = htole64(ALIGN64(sizeof(h)));
 419
 420         h.incompatible_flags |= htole32(
 421                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
 422                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
 423
 424         h.compatible_flags = htole32(
 425                 f->seal * HEADER_COMPATIBLE_SEALED);
 426
 427         r = sd_id128_randomize(&h.file_id);
 428         if (r < 0)
 429                 return r;
 430
 431         if (template) {
 432                 h.seqnum_id = template->header->seqnum_id;
 433                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
 434         } else
 435                 h.seqnum_id = h.file_id;
 436
 437         k = pwrite(f->fd, &h, sizeof(h), 0);
 438         if (k < 0)
 439                 return -errno;
 440
 441         if (k != sizeof(h))
 442                 return -EIO;
 443
 444         return 0;
 445 }
 446
 447 static int fsync_directory_of_file(int fd) {
 448         _cleanup_free_ char *path = NULL, *dn = NULL;
 449         _cleanup_close_ int dfd = -1;
 450         struct stat st;
 451         int r;
 452
 453         if (fstat(fd, &st) < 0)
 454                 return -errno;
 455
 456         if (!S_ISREG(st.st_mode))
 457                 return -EBADFD;
 458
 459         r = fd_get_path(fd, &path);
 460         if (r < 0)
 461                 return r;
 462
 463         if (!path_is_absolute(path))
 464                 return -EINVAL;
 465
 466         dn = dirname_malloc(path);
 467         if (!dn)
 468                 return -ENOMEM;
 469
 470         dfd = open(dn, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
 471         if (dfd < 0)
 472                 return -errno;
 473
 474         if (fsync(dfd) < 0)
 475                 return -errno;
 476
 477         return 0;
 478 }
 479
 480 static int journal_file_refresh_header(JournalFile *f) {
 481         sd_id128_t boot_id;
 482         int r;
 483
 484         assert(f);
 485         assert(f->header);
 486
 487         r = sd_id128_get_machine(&f->header->machine_id);
 488         if (r < 0)
 489                 return r;
 490
 491         r = sd_id128_get_boot(&boot_id);
 492         if (r < 0)
 493                 return r;
 494
 495         if (sd_id128_equal(boot_id, f->header->boot_id))
 496                 f->tail_entry_monotonic_valid = true;
 497
 498         f->header->boot_id = boot_id;
 499
 500         r = journal_file_set_online(f);
 501
 502         /* Sync the online state to disk */
 503         (void) fsync(f->fd);
 504
 505         /* We likely just created a new file, also sync the directory this file is located in. */
 506         (void) fsync_directory_of_file(f->fd);
 507
 508         return r;
 509 }
 510
 511 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
 512         const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
 513                 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
 514         const char *type = compatible ? "compatible" : "incompatible";
 515         uint32_t flags;
 516
 517         flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
 518
 519         if (flags & ~supported) {
 520                 if (flags & ~any)
 521                         log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
 522                                   f->path, type, flags & ~any);
 523                 flags = (flags & any) & ~supported;
 524                 if (flags) {
 525                         const char* strv[3];
 526                         unsigned n = 0;
 527                         _cleanup_free_ char *t = NULL;
 528
 529                         if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
 530                                 strv[n++] = "sealed";
 531                         if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
 532                                 strv[n++] = "xz-compressed";
 533                         if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
 534                                 strv[n++] = "lz4-compressed";
 535                         strv[n] = NULL;
 536                         assert(n < ELEMENTSOF(strv));
 537
 538                         t = strv_join((char**) strv, ", ");
 539                         log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
 540                                   f->path, type, n > 1 ? "flags" : "flag", strnull(t));
 541                 }
 542                 return true;
 543         }
 544
 545         return false;
 546 }
 547
 548 static int journal_file_verify_header(JournalFile *f) {
 549         assert(f);
 550         assert(f->header);
 551
 552         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
 553                 return -EBADMSG;
 554
 555         /* In both read and write mode we refuse to open files with incompatible
 556          * flags we don't know. */
 557         if (warn_wrong_flags(f, false))
 558                 return -EPROTONOSUPPORT;
 559
 560         /* When open for writing we refuse to open files with compatible flags, too. */
 561         if (f->writable && warn_wrong_flags(f, true))
 562                 return -EPROTONOSUPPORT;
 563
 564         if (f->header->state >= _STATE_MAX)
 565                 return -EBADMSG;
 566
 567         /* The first addition was n_data, so check that we are at least this large */
 568         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
 569                 return -EBADMSG;
 570
 571         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
 572                 return -EBADMSG;
 573
 574         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
 575                 return -ENODATA;
 576
 577         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
 578                 return -ENODATA;
 579
 580         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
 581             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
 582             !VALID64(le64toh(f->header->tail_object_offset)) ||
 583             !VALID64(le64toh(f->header->entry_array_offset)))
 584                 return -ENODATA;
 585
 586         if (f->writable) {
 587                 sd_id128_t machine_id;
 588                 uint8_t state;
 589                 int r;
 590
 591                 r = sd_id128_get_machine(&machine_id);
 592                 if (r < 0)
 593                         return r;
 594
 595                 if (!sd_id128_equal(machine_id, f->header->machine_id))
 596                         return -EHOSTDOWN;
 597
 598                 state = f->header->state;
 599
 600                 if (state == STATE_ARCHIVED)
 601                         return -ESHUTDOWN; /* Already archived */
 602                 else if (state == STATE_ONLINE) {
 603                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
 604                         return -EBUSY;
 605                 } else if (state != STATE_OFFLINE) {
 606                         log_debug("Journal file %s has unknown state %i.", f->path, state);
 607                         return -EBUSY;
 608                 }
 609
 610                 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
 611                  * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
 612                  * bisection. */
 613                 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
 614                         log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
 615                         return -ETXTBSY;
 616                 }
 617         }
 618
 619         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
 620         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
 621
 622         f->seal = JOURNAL_HEADER_SEALED(f->header);
 623
 624         return 0;
 625 }
 626
 627 static int journal_file_fstat(JournalFile *f) {
 628         assert(f);
 629         assert(f->fd >= 0);
 630
 631         if (fstat(f->fd, &f->last_stat) < 0)
 632                 return -errno;
 633
 634         f->last_stat_usec = now(CLOCK_MONOTONIC);
 635
 636         /* Refuse appending to files that are already deleted */
 637         if (f->last_stat.st_nlink <= 0)
 638                 return -EIDRM;
 639
 640         return 0;
 641 }
 642
 643 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
 644         uint64_t old_size, new_size;
 645         int r;
 646
 647         assert(f);
 648         assert(f->header);
 649
 650         /* We assume that this file is not sparse, and we know that
 651          * for sure, since we always call posix_fallocate()
 652          * ourselves */
 653
 654         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 655                 return -EIO;
 656
 657         old_size =
 658                 le64toh(f->header->header_size) +
 659                 le64toh(f->header->arena_size);
 660
 661         new_size = PAGE_ALIGN(offset + size);
 662         if (new_size < le64toh(f->header->header_size))
 663                 new_size = le64toh(f->header->header_size);
 664
 665         if (new_size <= old_size) {
 666
 667                 /* We already pre-allocated enough space, but before
 668                  * we write to it, let's check with fstat() if the
 669                  * file got deleted, in order make sure we don't throw
 670                  * away the data immediately. Don't check fstat() for
 671                  * all writes though, but only once ever 10s. */
 672
 673                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
 674                         return 0;
 675
 676                 return journal_file_fstat(f);
 677         }
 678
 679         /* Allocate more space. */
 680
 681         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 682                 return -E2BIG;
 683
 684         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
 685                 struct statvfs svfs;
 686
 687                 if (fstatvfs(f->fd, &svfs) >= 0) {
 688                         uint64_t available;
 689
 690                         available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
 691
 692                         if (new_size - old_size > available)
 693                                 return -E2BIG;
 694                 }
 695         }
 696
 697         /* Increase by larger blocks at once */
 698         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
 699         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 700                 new_size = f->metrics.max_size;
 701
 702         /* Note that the glibc fallocate() fallback is very
 703            inefficient, hence we try to minimize the allocation area
 704            as we can. */
 705         r = posix_fallocate(f->fd, old_size, new_size - old_size);
 706         if (r != 0)
 707                 return -r;
 708
 709         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
 710
 711         return journal_file_fstat(f);
 712 }
 713
 714 static unsigned type_to_context(ObjectType type) {
 715         /* One context for each type, plus one catch-all for the rest */
 716         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
 717         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
 718         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
 719 }
 720
 721 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
 722         int r;
 723
 724         assert(f);
 725         assert(ret);
 726
 727         if (size <= 0)
 728                 return -EINVAL;
 729
 730         /* Avoid SIGBUS on invalid accesses */
 731         if (offset + size > (uint64_t) f->last_stat.st_size) {
 732                 /* Hmm, out of range? Let's refresh the fstat() data
 733                  * first, before we trust that check. */
 734
 735                 r = journal_file_fstat(f);
 736                 if (r < 0)
 737                         return r;
 738
 739                 if (offset + size > (uint64_t) f->last_stat.st_size)
 740                         return -EADDRNOTAVAIL;
 741         }
 742
 743         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
 744 }
 745
 746 static uint64_t minimum_header_size(Object *o) {
 747
 748         static const uint64_t table[] = {
 749                 [OBJECT_DATA] = sizeof(DataObject),
 750                 [OBJECT_FIELD] = sizeof(FieldObject),
 751                 [OBJECT_ENTRY] = sizeof(EntryObject),
 752                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
 753                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
 754                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
 755                 [OBJECT_TAG] = sizeof(TagObject),
 756         };
 757
 758         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
 759                 return sizeof(ObjectHeader);
 760
 761         return table[o->object.type];
 762 }
 763
 764 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
 765         int r;
 766         void *t;
 767         Object *o;
 768         uint64_t s;
 769
 770         assert(f);
 771         assert(ret);
 772
 773         /* Objects may only be located at multiple of 64 bit */
 774         if (!VALID64(offset)) {
 775                 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
 776                 return -EBADMSG;
 777         }
 778
 779         /* Object may not be located in the file header */
 780         if (offset < le64toh(f->header->header_size)) {
 781                 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
 782                 return -EBADMSG;
 783         }
 784
 785         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
 786         if (r < 0)
 787                 return r;
 788
 789         o = (Object*) t;
 790         s = le64toh(o->object.size);
 791
 792         if (s == 0) {
 793                 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
 794                 return -EBADMSG;
 795         }
 796         if (s < sizeof(ObjectHeader)) {
 797                 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
 798                 return -EBADMSG;
 799         }
 800
 801         if (o->object.type <= OBJECT_UNUSED) {
 802                 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
 803                 return -EBADMSG;
 804         }
 805
 806         if (s < minimum_header_size(o)) {
 807                 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
 808                 return -EBADMSG;
 809         }
 810
 811         if (type > OBJECT_UNUSED && o->object.type != type) {
 812                 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
 813                 return -EBADMSG;
 814         }
 815
 816         if (s > sizeof(ObjectHeader)) {
 817                 r = journal_file_move_to(f, type, false, offset, s, &t);
 818                 if (r < 0)
 819                         return r;
 820
 821                 o = (Object*) t;
 822         }
 823
 824         *ret = o;
 825         return 0;
 826 }
 827
 828 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
 829         uint64_t r;
 830
 831         assert(f);
 832         assert(f->header);
 833
 834         r = le64toh(f->header->tail_entry_seqnum) + 1;
 835
 836         if (seqnum) {
 837                 /* If an external seqnum counter was passed, we update
 838                  * both the local and the external one, and set it to
 839                  * the maximum of both */
 840
 841                 if (*seqnum + 1 > r)
 842                         r = *seqnum + 1;
 843
 844                 *seqnum = r;
 845         }
 846
 847         f->header->tail_entry_seqnum = htole64(r);
 848
 849         if (f->header->head_entry_seqnum == 0)
 850                 f->header->head_entry_seqnum = htole64(r);
 851
 852         return r;
 853 }
 854
 855 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
 856         int r;
 857         uint64_t p;
 858         Object *tail, *o;
 859         void *t;
 860
 861         assert(f);
 862         assert(f->header);
 863         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
 864         assert(size >= sizeof(ObjectHeader));
 865         assert(offset);
 866         assert(ret);
 867
 868         r = journal_file_set_online(f);
 869         if (r < 0)
 870                 return r;
 871
 872         p = le64toh(f->header->tail_object_offset);
 873         if (p == 0)
 874                 p = le64toh(f->header->header_size);
 875         else {
 876                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
 877                 if (r < 0)
 878                         return r;
 879
 880                 p += ALIGN64(le64toh(tail->object.size));
 881         }
 882
 883         r = journal_file_allocate(f, p, size);
 884         if (r < 0)
 885                 return r;
 886
 887         r = journal_file_move_to(f, type, false, p, size, &t);
 888         if (r < 0)
 889                 return r;
 890
 891         o = (Object*) t;
 892
 893         zero(o->object);
 894         o->object.type = type;
 895         o->object.size = htole64(size);
 896
 897         f->header->tail_object_offset = htole64(p);
 898         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
 899
 900         *ret = o;
 901         *offset = p;
 902
 903         return 0;
 904 }
 905
 906 static int journal_file_setup_data_hash_table(JournalFile *f) {
 907         uint64_t s, p;
 908         Object *o;
 909         int r;
 910
 911         assert(f);
 912         assert(f->header);
 913
 914         /* We estimate that we need 1 hash table entry per 768 bytes
 915            of journal file and we want to make sure we never get
 916            beyond 75% fill level. Calculate the hash table size for
 917            the maximum file size based on these metrics. */
 918
 919         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
 920         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
 921                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
 922
 923         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
 924
 925         r = journal_file_append_object(f,
 926                                        OBJECT_DATA_HASH_TABLE,
 927                                        offsetof(Object, hash_table.items) + s,
 928                                        &o, &p);
 929         if (r < 0)
 930                 return r;
 931
 932         memzero(o->hash_table.items, s);
 933
 934         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 935         f->header->data_hash_table_size = htole64(s);
 936
 937         return 0;
 938 }
 939
 940 static int journal_file_setup_field_hash_table(JournalFile *f) {
 941         uint64_t s, p;
 942         Object *o;
 943         int r;
 944
 945         assert(f);
 946         assert(f->header);
 947
 948         /* We use a fixed size hash table for the fields as this
 949          * number should grow very slowly only */
 950
 951         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
 952         r = journal_file_append_object(f,
 953                                        OBJECT_FIELD_HASH_TABLE,
 954                                        offsetof(Object, hash_table.items) + s,
 955                                        &o, &p);
 956         if (r < 0)
 957                 return r;
 958
 959         memzero(o->hash_table.items, s);
 960
 961         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 962         f->header->field_hash_table_size = htole64(s);
 963
 964         return 0;
 965 }
 966
 967 int journal_file_map_data_hash_table(JournalFile *f) {
 968         uint64_t s, p;
 969         void *t;
 970         int r;
 971
 972         assert(f);
 973         assert(f->header);
 974
 975         if (f->data_hash_table)
 976                 return 0;
 977
 978         p = le64toh(f->header->data_hash_table_offset);
 979         s = le64toh(f->header->data_hash_table_size);
 980
 981         r = journal_file_move_to(f,
 982                                  OBJECT_DATA_HASH_TABLE,
 983                                  true,
 984                                  p, s,
 985                                  &t);
 986         if (r < 0)
 987                 return r;
 988
 989         f->data_hash_table = t;
 990         return 0;
 991 }
 992
 993 int journal_file_map_field_hash_table(JournalFile *f) {
 994         uint64_t s, p;
 995         void *t;
 996         int r;
 997
 998         assert(f);
 999         assert(f->header);
1000
1001         if (f->field_hash_table)
1002                 return 0;
1003
1004         p = le64toh(f->header->field_hash_table_offset);
1005         s = le64toh(f->header->field_hash_table_size);
1006
1007         r = journal_file_move_to(f,
1008                                  OBJECT_FIELD_HASH_TABLE,
1009                                  true,
1010                                  p, s,
1011                                  &t);
1012         if (r < 0)
1013                 return r;
1014
1015         f->field_hash_table = t;
1016         return 0;
1017 }
1018
1019 static int journal_file_link_field(
1020                 JournalFile *f,
1021                 Object *o,
1022                 uint64_t offset,
1023                 uint64_t hash) {
1024
1025         uint64_t p, h, m;
1026         int r;
1027
1028         assert(f);
1029         assert(f->header);
1030         assert(f->field_hash_table);
1031         assert(o);
1032         assert(offset > 0);
1033
1034         if (o->object.type != OBJECT_FIELD)
1035                 return -EINVAL;
1036
1037         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1038         if (m <= 0)
1039                 return -EBADMSG;
1040
1041         /* This might alter the window we are looking at */
1042         o->field.next_hash_offset = o->field.head_data_offset = 0;
1043
1044         h = hash % m;
1045         p = le64toh(f->field_hash_table[h].tail_hash_offset);
1046         if (p == 0)
1047                 f->field_hash_table[h].head_hash_offset = htole64(offset);
1048         else {
1049                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1050                 if (r < 0)
1051                         return r;
1052
1053                 o->field.next_hash_offset = htole64(offset);
1054         }
1055
1056         f->field_hash_table[h].tail_hash_offset = htole64(offset);
1057
1058         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1059                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1060
1061         return 0;
1062 }
1063
1064 static int journal_file_link_data(
1065                 JournalFile *f,
1066                 Object *o,
1067                 uint64_t offset,
1068                 uint64_t hash) {
1069
1070         uint64_t p, h, m;
1071         int r;
1072
1073         assert(f);
1074         assert(f->header);
1075         assert(f->data_hash_table);
1076         assert(o);
1077         assert(offset > 0);
1078
1079         if (o->object.type != OBJECT_DATA)
1080                 return -EINVAL;
1081
1082         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1083         if (m <= 0)
1084                 return -EBADMSG;
1085
1086         /* This might alter the window we are looking at */
1087         o->data.next_hash_offset = o->data.next_field_offset = 0;
1088         o->data.entry_offset = o->data.entry_array_offset = 0;
1089         o->data.n_entries = 0;
1090
1091         h = hash % m;
1092         p = le64toh(f->data_hash_table[h].tail_hash_offset);
1093         if (p == 0)
1094                 /* Only entry in the hash table is easy */
1095                 f->data_hash_table[h].head_hash_offset = htole64(offset);
1096         else {
1097                 /* Move back to the previous data object, to patch in
1098                  * pointer */
1099
1100                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1101                 if (r < 0)
1102                         return r;
1103
1104                 o->data.next_hash_offset = htole64(offset);
1105         }
1106
1107         f->data_hash_table[h].tail_hash_offset = htole64(offset);
1108
1109         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1110                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1111
1112         return 0;
1113 }
1114
1115 int journal_file_find_field_object_with_hash(
1116                 JournalFile *f,
1117                 const void *field, uint64_t size, uint64_t hash,
1118                 Object **ret, uint64_t *offset) {
1119
1120         uint64_t p, osize, h, m;
1121         int r;
1122
1123         assert(f);
1124         assert(f->header);
1125         assert(field && size > 0);
1126
1127         /* If the field hash table is empty, we can't find anything */
1128         if (le64toh(f->header->field_hash_table_size) <= 0)
1129                 return 0;
1130
1131         /* Map the field hash table, if it isn't mapped yet. */
1132         r = journal_file_map_field_hash_table(f);
1133         if (r < 0)
1134                 return r;
1135
1136         osize = offsetof(Object, field.payload) + size;
1137
1138         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1139         if (m <= 0)
1140                 return -EBADMSG;
1141
1142         h = hash % m;
1143         p = le64toh(f->field_hash_table[h].head_hash_offset);
1144
1145         while (p > 0) {
1146                 Object *o;
1147
1148                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1149                 if (r < 0)
1150                         return r;
1151
1152                 if (le64toh(o->field.hash) == hash &&
1153                     le64toh(o->object.size) == osize &&
1154                     memcmp(o->field.payload, field, size) == 0) {
1155
1156                         if (ret)
1157                                 *ret = o;
1158                         if (offset)
1159                                 *offset = p;
1160
1161                         return 1;
1162                 }
1163
1164                 p = le64toh(o->field.next_hash_offset);
1165         }
1166
1167         return 0;
1168 }
1169
1170 int journal_file_find_field_object(
1171                 JournalFile *f,
1172                 const void *field, uint64_t size,
1173                 Object **ret, uint64_t *offset) {
1174
1175         uint64_t hash;
1176
1177         assert(f);
1178         assert(field && size > 0);
1179
1180         hash = hash64(field, size);
1181
1182         return journal_file_find_field_object_with_hash(f,
1183                                                         field, size, hash,
1184                                                         ret, offset);
1185 }
1186
1187 int journal_file_find_data_object_with_hash(
1188                 JournalFile *f,
1189                 const void *data, uint64_t size, uint64_t hash,
1190                 Object **ret, uint64_t *offset) {
1191
1192         uint64_t p, osize, h, m;
1193         int r;
1194
1195         assert(f);
1196         assert(f->header);
1197         assert(data || size == 0);
1198
1199         /* If there's no data hash table, then there's no entry. */
1200         if (le64toh(f->header->data_hash_table_size) <= 0)
1201                 return 0;
1202
1203         /* Map the data hash table, if it isn't mapped yet. */
1204         r = journal_file_map_data_hash_table(f);
1205         if (r < 0)
1206                 return r;
1207
1208         osize = offsetof(Object, data.payload) + size;
1209
1210         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1211         if (m <= 0)
1212                 return -EBADMSG;
1213
1214         h = hash % m;
1215         p = le64toh(f->data_hash_table[h].head_hash_offset);
1216
1217         while (p > 0) {
1218                 Object *o;
1219
1220                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1221                 if (r < 0)
1222                         return r;
1223
1224                 if (le64toh(o->data.hash) != hash)
1225                         goto next;
1226
1227                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
1228 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1229                         uint64_t l;
1230                         size_t rsize = 0;
1231
1232                         l = le64toh(o->object.size);
1233                         if (l <= offsetof(Object, data.payload))
1234                                 return -EBADMSG;
1235
1236                         l -= offsetof(Object, data.payload);
1237
1238                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1239                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1240                         if (r < 0)
1241                                 return r;
1242
1243                         if (rsize == size &&
1244                             memcmp(f->compress_buffer, data, size) == 0) {
1245
1246                                 if (ret)
1247                                         *ret = o;
1248
1249                                 if (offset)
1250                                         *offset = p;
1251
1252                                 return 1;
1253                         }
1254 #else
1255                         return -EPROTONOSUPPORT;
1256 #endif
1257                 } else if (le64toh(o->object.size) == osize &&
1258                            memcmp(o->data.payload, data, size) == 0) {
1259
1260                         if (ret)
1261                                 *ret = o;
1262
1263                         if (offset)
1264                                 *offset = p;
1265
1266                         return 1;
1267                 }
1268
1269         next:
1270                 p = le64toh(o->data.next_hash_offset);
1271         }
1272
1273         return 0;
1274 }
1275
1276 int journal_file_find_data_object(
1277                 JournalFile *f,
1278                 const void *data, uint64_t size,
1279                 Object **ret, uint64_t *offset) {
1280
1281         uint64_t hash;
1282
1283         assert(f);
1284         assert(data || size == 0);
1285
1286         hash = hash64(data, size);
1287
1288         return journal_file_find_data_object_with_hash(f,
1289                                                        data, size, hash,
1290                                                        ret, offset);
1291 }
1292
1293 static int journal_file_append_field(
1294                 JournalFile *f,
1295                 const void *field, uint64_t size,
1296                 Object **ret, uint64_t *offset) {
1297
1298         uint64_t hash, p;
1299         uint64_t osize;
1300         Object *o;
1301         int r;
1302
1303         assert(f);
1304         assert(field && size > 0);
1305
1306         hash = hash64(field, size);
1307
1308         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1309         if (r < 0)
1310                 return r;
1311         else if (r > 0) {
1312
1313                 if (ret)
1314                         *ret = o;
1315
1316                 if (offset)
1317                         *offset = p;
1318
1319                 return 0;
1320         }
1321
1322         osize = offsetof(Object, field.payload) + size;
1323         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1324         if (r < 0)
1325                 return r;
1326
1327         o->field.hash = htole64(hash);
1328         memcpy(o->field.payload, field, size);
1329
1330         r = journal_file_link_field(f, o, p, hash);
1331         if (r < 0)
1332                 return r;
1333
1334         /* The linking might have altered the window, so let's
1335          * refresh our pointer */
1336         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1337         if (r < 0)
1338                 return r;
1339
1340 #ifdef HAVE_GCRYPT
1341         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1342         if (r < 0)
1343                 return r;
1344 #endif
1345
1346         if (ret)
1347                 *ret = o;
1348
1349         if (offset)
1350                 *offset = p;
1351
1352         return 0;
1353 }
1354
1355 static int journal_file_append_data(
1356                 JournalFile *f,
1357                 const void *data, uint64_t size,
1358                 Object **ret, uint64_t *offset) {
1359
1360         uint64_t hash, p;
1361         uint64_t osize;
1362         Object *o;
1363         int r, compression = 0;
1364         const void *eq;
1365
1366         assert(f);
1367         assert(data || size == 0);
1368
1369         hash = hash64(data, size);
1370
1371         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1372         if (r < 0)
1373                 return r;
1374         if (r > 0) {
1375
1376                 if (ret)
1377                         *ret = o;
1378
1379                 if (offset)
1380                         *offset = p;
1381
1382                 return 0;
1383         }
1384
1385         osize = offsetof(Object, data.payload) + size;
1386         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1387         if (r < 0)
1388                 return r;
1389
1390         o->data.hash = htole64(hash);
1391
1392 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1393         if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1394                 size_t rsize = 0;
1395
1396                 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1397
1398                 if (compression >= 0) {
1399                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1400                         o->object.flags |= compression;
1401
1402                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1403                                   size, rsize, object_compressed_to_string(compression));
1404                 } else
1405                         /* Compression didn't work, we don't really care why, let's continue without compression */
1406                         compression = 0;
1407         }
1408 #endif
1409
1410         if (compression == 0)
1411                 memcpy_safe(o->data.payload, data, size);
1412
1413         r = journal_file_link_data(f, o, p, hash);
1414         if (r < 0)
1415                 return r;
1416
1417 #ifdef HAVE_GCRYPT
1418         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1419         if (r < 0)
1420                 return r;
1421 #endif
1422
1423         /* The linking might have altered the window, so let's
1424          * refresh our pointer */
1425         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1426         if (r < 0)
1427                 return r;
1428
1429         if (!data)
1430                 eq = NULL;
1431         else
1432                 eq = memchr(data, '=', size);
1433         if (eq && eq > data) {
1434                 Object *fo = NULL;
1435                 uint64_t fp;
1436
1437                 /* Create field object ... */
1438                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1439                 if (r < 0)
1440                         return r;
1441
1442                 /* ... and link it in. */
1443                 o->data.next_field_offset = fo->field.head_data_offset;
1444                 fo->field.head_data_offset = le64toh(p);
1445         }
1446
1447         if (ret)
1448                 *ret = o;
1449
1450         if (offset)
1451                 *offset = p;
1452
1453         return 0;
1454 }
1455
1456 uint64_t journal_file_entry_n_items(Object *o) {
1457         assert(o);
1458
1459         if (o->object.type != OBJECT_ENTRY)
1460                 return 0;
1461
1462         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1463 }
1464
1465 uint64_t journal_file_entry_array_n_items(Object *o) {
1466         assert(o);
1467
1468         if (o->object.type != OBJECT_ENTRY_ARRAY)
1469                 return 0;
1470
1471         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1472 }
1473
1474 uint64_t journal_file_hash_table_n_items(Object *o) {
1475         assert(o);
1476
1477         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1478             o->object.type != OBJECT_FIELD_HASH_TABLE)
1479                 return 0;
1480
1481         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1482 }
1483
1484 static int link_entry_into_array(JournalFile *f,
1485                                  le64_t *first,
1486                                  le64_t *idx,
1487                                  uint64_t p) {
1488         int r;
1489         uint64_t n = 0, ap = 0, q, i, a, hidx;
1490         Object *o;
1491
1492         assert(f);
1493         assert(f->header);
1494         assert(first);
1495         assert(idx);
1496         assert(p > 0);
1497
1498         a = le64toh(*first);
1499         i = hidx = le64toh(*idx);
1500         while (a > 0) {
1501
1502                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1503                 if (r < 0)
1504                         return r;
1505
1506                 n = journal_file_entry_array_n_items(o);
1507                 if (i < n) {
1508                         o->entry_array.items[i] = htole64(p);
1509                         *idx = htole64(hidx + 1);
1510                         return 0;
1511                 }
1512
1513                 i -= n;
1514                 ap = a;
1515                 a = le64toh(o->entry_array.next_entry_array_offset);
1516         }
1517
1518         if (hidx > n)
1519                 n = (hidx+1) * 2;
1520         else
1521                 n = n * 2;
1522
1523         if (n < 4)
1524                 n = 4;
1525
1526         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1527                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1528                                        &o, &q);
1529         if (r < 0)
1530                 return r;
1531
1532 #ifdef HAVE_GCRYPT
1533         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1534         if (r < 0)
1535                 return r;
1536 #endif
1537
1538         o->entry_array.items[i] = htole64(p);
1539
1540         if (ap == 0)
1541                 *first = htole64(q);
1542         else {
1543                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1544                 if (r < 0)
1545                         return r;
1546
1547                 o->entry_array.next_entry_array_offset = htole64(q);
1548         }
1549
1550         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1551                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1552
1553         *idx = htole64(hidx + 1);
1554
1555         return 0;
1556 }
1557
1558 static int link_entry_into_array_plus_one(JournalFile *f,
1559                                           le64_t *extra,
1560                                           le64_t *first,
1561                                           le64_t *idx,
1562                                           uint64_t p) {
1563
1564         int r;
1565
1566         assert(f);
1567         assert(extra);
1568         assert(first);
1569         assert(idx);
1570         assert(p > 0);
1571
1572         if (*idx == 0)
1573                 *extra = htole64(p);
1574         else {
1575                 le64_t i;
1576
1577                 i = htole64(le64toh(*idx) - 1);
1578                 r = link_entry_into_array(f, first, &i, p);
1579                 if (r < 0)
1580                         return r;
1581         }
1582
1583         *idx = htole64(le64toh(*idx) + 1);
1584         return 0;
1585 }
1586
1587 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1588         uint64_t p;
1589         int r;
1590         assert(f);
1591         assert(o);
1592         assert(offset > 0);
1593
1594         p = le64toh(o->entry.items[i].object_offset);
1595         if (p == 0)
1596                 return -EINVAL;
1597
1598         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1599         if (r < 0)
1600                 return r;
1601
1602         return link_entry_into_array_plus_one(f,
1603                                               &o->data.entry_offset,
1604                                               &o->data.entry_array_offset,
1605                                               &o->data.n_entries,
1606                                               offset);
1607 }
1608
1609 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1610         uint64_t n, i;
1611         int r;
1612
1613         assert(f);
1614         assert(f->header);
1615         assert(o);
1616         assert(offset > 0);
1617
1618         if (o->object.type != OBJECT_ENTRY)
1619                 return -EINVAL;
1620
1621         __sync_synchronize();
1622
1623         /* Link up the entry itself */
1624         r = link_entry_into_array(f,
1625                                   &f->header->entry_array_offset,
1626                                   &f->header->n_entries,
1627                                   offset);
1628         if (r < 0)
1629                 return r;
1630
1631         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1632
1633         if (f->header->head_entry_realtime == 0)
1634                 f->header->head_entry_realtime = o->entry.realtime;
1635
1636         f->header->tail_entry_realtime = o->entry.realtime;
1637         f->header->tail_entry_monotonic = o->entry.monotonic;
1638
1639         f->tail_entry_monotonic_valid = true;
1640
1641         /* Link up the items */
1642         n = journal_file_entry_n_items(o);
1643         for (i = 0; i < n; i++) {
1644                 r = journal_file_link_entry_item(f, o, offset, i);
1645                 if (r < 0)
1646                         return r;
1647         }
1648
1649         return 0;
1650 }
1651
1652 static int journal_file_append_entry_internal(
1653                 JournalFile *f,
1654                 const dual_timestamp *ts,
1655                 uint64_t xor_hash,
1656                 const EntryItem items[], unsigned n_items,
1657                 uint64_t *seqnum,
1658                 Object **ret, uint64_t *offset) {
1659         uint64_t np;
1660         uint64_t osize;
1661         Object *o;
1662         int r;
1663
1664         assert(f);
1665         assert(f->header);
1666         assert(items || n_items == 0);
1667         assert(ts);
1668
1669         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1670
1671         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1672         if (r < 0)
1673                 return r;
1674
1675         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1676         memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
1677         o->entry.realtime = htole64(ts->realtime);
1678         o->entry.monotonic = htole64(ts->monotonic);
1679         o->entry.xor_hash = htole64(xor_hash);
1680         o->entry.boot_id = f->header->boot_id;
1681
1682 #ifdef HAVE_GCRYPT
1683         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1684         if (r < 0)
1685                 return r;
1686 #endif
1687
1688         r = journal_file_link_entry(f, o, np);
1689         if (r < 0)
1690                 return r;
1691
1692         if (ret)
1693                 *ret = o;
1694
1695         if (offset)
1696                 *offset = np;
1697
1698         return 0;
1699 }
1700
1701 void journal_file_post_change(JournalFile *f) {
1702         assert(f);
1703
1704         /* inotify() does not receive IN_MODIFY events from file
1705          * accesses done via mmap(). After each access we hence
1706          * trigger IN_MODIFY by truncating the journal file to its
1707          * current size which triggers IN_MODIFY. */
1708
1709         __sync_synchronize();
1710
1711         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1712                 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1713 }
1714
1715 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1716         assert(userdata);
1717
1718         journal_file_post_change(userdata);
1719
1720         return 1;
1721 }
1722
1723 static void schedule_post_change(JournalFile *f) {
1724         sd_event_source *timer;
1725         int enabled, r;
1726         uint64_t now;
1727
1728         assert(f);
1729         assert(f->post_change_timer);
1730
1731         timer = f->post_change_timer;
1732
1733         r = sd_event_source_get_enabled(timer, &enabled);
1734         if (r < 0) {
1735                 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1736                 goto fail;
1737         }
1738
1739         if (enabled == SD_EVENT_ONESHOT)
1740                 return;
1741
1742         r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1743         if (r < 0) {
1744                 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1745                 goto fail;
1746         }
1747
1748         r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1749         if (r < 0) {
1750                 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1751                 goto fail;
1752         }
1753
1754         r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1755         if (r < 0) {
1756                 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1757                 goto fail;
1758         }
1759
1760         return;
1761
1762 fail:
1763         /* On failure, let's simply post the change immediately. */
1764         journal_file_post_change(f);
1765 }
1766
1767 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1768 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1769         _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1770         int r;
1771
1772         assert(f);
1773         assert_return(!f->post_change_timer, -EINVAL);
1774         assert(e);
1775         assert(t);
1776
1777         r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1778         if (r < 0)
1779                 return r;
1780
1781         r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1782         if (r < 0)
1783                 return r;
1784
1785         f->post_change_timer = timer;
1786         timer = NULL;
1787         f->post_change_timer_period = t;
1788
1789         return r;
1790 }
1791
1792 static int entry_item_cmp(const void *_a, const void *_b) {
1793         const EntryItem *a = _a, *b = _b;
1794
1795         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1796                 return -1;
1797         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1798                 return 1;
1799         return 0;
1800 }
1801
1802 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1803         unsigned i;
1804         EntryItem *items;
1805         int r;
1806         uint64_t xor_hash = 0;
1807         struct dual_timestamp _ts;
1808
1809         assert(f);
1810         assert(f->header);
1811         assert(iovec || n_iovec == 0);
1812
1813         if (!ts) {
1814                 dual_timestamp_get(&_ts);
1815                 ts = &_ts;
1816         }
1817
1818 #ifdef HAVE_GCRYPT
1819         r = journal_file_maybe_append_tag(f, ts->realtime);
1820         if (r < 0)
1821                 return r;
1822 #endif
1823
1824         /* alloca() can't take 0, hence let's allocate at least one */
1825         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1826
1827         for (i = 0; i < n_iovec; i++) {
1828                 uint64_t p;
1829                 Object *o;
1830
1831                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1832                 if (r < 0)
1833                         return r;
1834
1835                 xor_hash ^= le64toh(o->data.hash);
1836                 items[i].object_offset = htole64(p);
1837                 items[i].hash = o->data.hash;
1838         }
1839
1840         /* Order by the position on disk, in order to improve seek
1841          * times for rotating media. */
1842         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1843
1844         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1845
1846         /* If the memory mapping triggered a SIGBUS then we return an
1847          * IO error and ignore the error code passed down to us, since
1848          * it is very likely just an effect of a nullified replacement
1849          * mapping page */
1850
1851         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1852                 r = -EIO;
1853
1854         if (f->post_change_timer)
1855                 schedule_post_change(f);
1856         else
1857                 journal_file_post_change(f);
1858
1859         return r;
1860 }
1861
1862 typedef struct ChainCacheItem {
1863         uint64_t first; /* the array at the beginning of the chain */
1864         uint64_t array; /* the cached array */
1865         uint64_t begin; /* the first item in the cached array */
1866         uint64_t total; /* the total number of items in all arrays before this one in the chain */
1867         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1868 } ChainCacheItem;
1869
1870 static void chain_cache_put(
1871                 OrderedHashmap *h,
1872                 ChainCacheItem *ci,
1873                 uint64_t first,
1874                 uint64_t array,
1875                 uint64_t begin,
1876                 uint64_t total,
1877                 uint64_t last_index) {
1878
1879         if (!ci) {
1880                 /* If the chain item to cache for this chain is the
1881                  * first one it's not worth caching anything */
1882                 if (array == first)
1883                         return;
1884
1885                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1886                         ci = ordered_hashmap_steal_first(h);
1887                         assert(ci);
1888                 } else {
1889                         ci = new(ChainCacheItem, 1);
1890                         if (!ci)
1891                                 return;
1892                 }
1893
1894                 ci->first = first;
1895
1896                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1897                         free(ci);
1898                         return;
1899                 }
1900         } else
1901                 assert(ci->first == first);
1902
1903         ci->array = array;
1904         ci->begin = begin;
1905         ci->total = total;
1906         ci->last_index = last_index;
1907 }
1908
1909 static int generic_array_get(
1910                 JournalFile *f,
1911                 uint64_t first,
1912                 uint64_t i,
1913                 Object **ret, uint64_t *offset) {
1914
1915         Object *o;
1916         uint64_t p = 0, a, t = 0;
1917         int r;
1918         ChainCacheItem *ci;
1919
1920         assert(f);
1921
1922         a = first;
1923
1924         /* Try the chain cache first */
1925         ci = ordered_hashmap_get(f->chain_cache, &first);
1926         if (ci && i > ci->total) {
1927                 a = ci->array;
1928                 i -= ci->total;
1929                 t = ci->total;
1930         }
1931
1932         while (a > 0) {
1933                 uint64_t k;
1934
1935                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1936                 if (r < 0)
1937                         return r;
1938
1939                 k = journal_file_entry_array_n_items(o);
1940                 if (i < k) {
1941                         p = le64toh(o->entry_array.items[i]);
1942                         goto found;
1943                 }
1944
1945                 i -= k;
1946                 t += k;
1947                 a = le64toh(o->entry_array.next_entry_array_offset);
1948         }
1949
1950         return 0;
1951
1952 found:
1953         /* Let's cache this item for the next invocation */
1954         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1955
1956         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1957         if (r < 0)
1958                 return r;
1959
1960         if (ret)
1961                 *ret = o;
1962
1963         if (offset)
1964                 *offset = p;
1965
1966         return 1;
1967 }
1968
1969 static int generic_array_get_plus_one(
1970                 JournalFile *f,
1971                 uint64_t extra,
1972                 uint64_t first,
1973                 uint64_t i,
1974                 Object **ret, uint64_t *offset) {
1975
1976         Object *o;
1977
1978         assert(f);
1979
1980         if (i == 0) {
1981                 int r;
1982
1983                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1984                 if (r < 0)
1985                         return r;
1986
1987                 if (ret)
1988                         *ret = o;
1989
1990                 if (offset)
1991                         *offset = extra;
1992
1993                 return 1;
1994         }
1995
1996         return generic_array_get(f, first, i-1, ret, offset);
1997 }
1998
1999 enum {
2000         TEST_FOUND,
2001         TEST_LEFT,
2002         TEST_RIGHT
2003 };
2004
2005 static int generic_array_bisect(
2006                 JournalFile *f,
2007                 uint64_t first,
2008                 uint64_t n,
2009                 uint64_t needle,
2010                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2011                 direction_t direction,
2012                 Object **ret,
2013                 uint64_t *offset,
2014                 uint64_t *idx) {
2015
2016         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
2017         bool subtract_one = false;
2018         Object *o, *array = NULL;
2019         int r;
2020         ChainCacheItem *ci;
2021
2022         assert(f);
2023         assert(test_object);
2024
2025         /* Start with the first array in the chain */
2026         a = first;
2027
2028         ci = ordered_hashmap_get(f->chain_cache, &first);
2029         if (ci && n > ci->total) {
2030                 /* Ah, we have iterated this bisection array chain
2031                  * previously! Let's see if we can skip ahead in the
2032                  * chain, as far as the last time. But we can't jump
2033                  * backwards in the chain, so let's check that
2034                  * first. */
2035
2036                 r = test_object(f, ci->begin, needle);
2037                 if (r < 0)
2038                         return r;
2039
2040                 if (r == TEST_LEFT) {
2041                         /* OK, what we are looking for is right of the
2042                          * begin of this EntryArray, so let's jump
2043                          * straight to previously cached array in the
2044                          * chain */
2045
2046                         a = ci->array;
2047                         n -= ci->total;
2048                         t = ci->total;
2049                         last_index = ci->last_index;
2050                 }
2051         }
2052
2053         while (a > 0) {
2054                 uint64_t left, right, k, lp;
2055
2056                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
2057                 if (r < 0)
2058                         return r;
2059
2060                 k = journal_file_entry_array_n_items(array);
2061                 right = MIN(k, n);
2062                 if (right <= 0)
2063                         return 0;
2064
2065                 i = right - 1;
2066                 lp = p = le64toh(array->entry_array.items[i]);
2067                 if (p <= 0)
2068                         r = -EBADMSG;
2069                 else
2070                         r = test_object(f, p, needle);
2071                 if (r == -EBADMSG) {
2072                         log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2073                         n = i;
2074                         continue;
2075                 }
2076                 if (r < 0)
2077                         return r;
2078
2079                 if (r == TEST_FOUND)
2080                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2081
2082                 if (r == TEST_RIGHT) {
2083                         left = 0;
2084                         right -= 1;
2085
2086                         if (last_index != (uint64_t) -1) {
2087                                 assert(last_index <= right);
2088
2089                                 /* If we cached the last index we
2090                                  * looked at, let's try to not to jump
2091                                  * too wildly around and see if we can
2092                                  * limit the range to look at early to
2093                                  * the immediate neighbors of the last
2094                                  * index we looked at. */
2095
2096                                 if (last_index > 0) {
2097                                         uint64_t x = last_index - 1;
2098
2099                                         p = le64toh(array->entry_array.items[x]);
2100                                         if (p <= 0)
2101                                                 return -EBADMSG;
2102
2103                                         r = test_object(f, p, needle);
2104                                         if (r < 0)
2105                                                 return r;
2106
2107                                         if (r == TEST_FOUND)
2108                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2109
2110                                         if (r == TEST_RIGHT)
2111                                                 right = x;
2112                                         else
2113                                                 left = x + 1;
2114                                 }
2115
2116                                 if (last_index < right) {
2117                                         uint64_t y = last_index + 1;
2118
2119                                         p = le64toh(array->entry_array.items[y]);
2120                                         if (p <= 0)
2121                                                 return -EBADMSG;
2122
2123                                         r = test_object(f, p, needle);
2124                                         if (r < 0)
2125                                                 return r;
2126
2127                                         if (r == TEST_FOUND)
2128                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2129
2130                                         if (r == TEST_RIGHT)
2131                                                 right = y;
2132                                         else
2133                                                 left = y + 1;
2134                                 }
2135                         }
2136
2137                         for (;;) {
2138                                 if (left == right) {
2139                                         if (direction == DIRECTION_UP)
2140                                                 subtract_one = true;
2141
2142                                         i = left;
2143                                         goto found;
2144                                 }
2145
2146                                 assert(left < right);
2147                                 i = (left + right) / 2;
2148
2149                                 p = le64toh(array->entry_array.items[i]);
2150                                 if (p <= 0)
2151                                         r = -EBADMSG;
2152                                 else
2153                                         r = test_object(f, p, needle);
2154                                 if (r == -EBADMSG) {
2155                                         log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2156                                         right = n = i;
2157                                         continue;
2158                                 }
2159                                 if (r < 0)
2160                                         return r;
2161
2162                                 if (r == TEST_FOUND)
2163                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2164
2165                                 if (r == TEST_RIGHT)
2166                                         right = i;
2167                                 else
2168                                         left = i + 1;
2169                         }
2170                 }
2171
2172                 if (k >= n) {
2173                         if (direction == DIRECTION_UP) {
2174                                 i = n;
2175                                 subtract_one = true;
2176                                 goto found;
2177                         }
2178
2179                         return 0;
2180                 }
2181
2182                 last_p = lp;
2183
2184                 n -= k;
2185                 t += k;
2186                 last_index = (uint64_t) -1;
2187                 a = le64toh(array->entry_array.next_entry_array_offset);
2188         }
2189
2190         return 0;
2191
2192 found:
2193         if (subtract_one && t == 0 && i == 0)
2194                 return 0;
2195
2196         /* Let's cache this item for the next invocation */
2197         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
2198
2199         if (subtract_one && i == 0)
2200                 p = last_p;
2201         else if (subtract_one)
2202                 p = le64toh(array->entry_array.items[i-1]);
2203         else
2204                 p = le64toh(array->entry_array.items[i]);
2205
2206         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2207         if (r < 0)
2208                 return r;
2209
2210         if (ret)
2211                 *ret = o;
2212
2213         if (offset)
2214                 *offset = p;
2215
2216         if (idx)
2217                 *idx = t + i + (subtract_one ? -1 : 0);
2218
2219         return 1;
2220 }
2221
2222 static int generic_array_bisect_plus_one(
2223                 JournalFile *f,
2224                 uint64_t extra,
2225                 uint64_t first,
2226                 uint64_t n,
2227                 uint64_t needle,
2228                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2229                 direction_t direction,
2230                 Object **ret,
2231                 uint64_t *offset,
2232                 uint64_t *idx) {
2233
2234         int r;
2235         bool step_back = false;
2236         Object *o;
2237
2238         assert(f);
2239         assert(test_object);
2240
2241         if (n <= 0)
2242                 return 0;
2243
2244         /* This bisects the array in object 'first', but first checks
2245          * an extra  */
2246         r = test_object(f, extra, needle);
2247         if (r < 0)
2248                 return r;
2249
2250         if (r == TEST_FOUND)
2251                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2252
2253         /* if we are looking with DIRECTION_UP then we need to first
2254            see if in the actual array there is a matching entry, and
2255            return the last one of that. But if there isn't any we need
2256            to return this one. Hence remember this, and return it
2257            below. */
2258         if (r == TEST_LEFT)
2259                 step_back = direction == DIRECTION_UP;
2260
2261         if (r == TEST_RIGHT) {
2262                 if (direction == DIRECTION_DOWN)
2263                         goto found;
2264                 else
2265                         return 0;
2266         }
2267
2268         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2269
2270         if (r == 0 && step_back)
2271                 goto found;
2272
2273         if (r > 0 && idx)
2274                 (*idx)++;
2275
2276         return r;
2277
2278 found:
2279         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2280         if (r < 0)
2281                 return r;
2282
2283         if (ret)
2284                 *ret = o;
2285
2286         if (offset)
2287                 *offset = extra;
2288
2289         if (idx)
2290                 *idx = 0;
2291
2292         return 1;
2293 }
2294
2295 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2296         assert(f);
2297         assert(p > 0);
2298
2299         if (p == needle)
2300                 return TEST_FOUND;
2301         else if (p < needle)
2302                 return TEST_LEFT;
2303         else
2304                 return TEST_RIGHT;
2305 }
2306
2307 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2308         Object *o;
2309         int r;
2310
2311         assert(f);
2312         assert(p > 0);
2313
2314         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2315         if (r < 0)
2316                 return r;
2317
2318         if (le64toh(o->entry.seqnum) == needle)
2319                 return TEST_FOUND;
2320         else if (le64toh(o->entry.seqnum) < needle)
2321                 return TEST_LEFT;
2322         else
2323                 return TEST_RIGHT;
2324 }
2325
2326 int journal_file_move_to_entry_by_seqnum(
2327                 JournalFile *f,
2328                 uint64_t seqnum,
2329                 direction_t direction,
2330                 Object **ret,
2331                 uint64_t *offset) {
2332         assert(f);
2333         assert(f->header);
2334
2335         return generic_array_bisect(f,
2336                                     le64toh(f->header->entry_array_offset),
2337                                     le64toh(f->header->n_entries),
2338                                     seqnum,
2339                                     test_object_seqnum,
2340                                     direction,
2341                                     ret, offset, NULL);
2342 }
2343
2344 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2345         Object *o;
2346         int r;
2347
2348         assert(f);
2349         assert(p > 0);
2350
2351         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2352         if (r < 0)
2353                 return r;
2354
2355         if (le64toh(o->entry.realtime) == needle)
2356                 return TEST_FOUND;
2357         else if (le64toh(o->entry.realtime) < needle)
2358                 return TEST_LEFT;
2359         else
2360                 return TEST_RIGHT;
2361 }
2362
2363 int journal_file_move_to_entry_by_realtime(
2364                 JournalFile *f,
2365                 uint64_t realtime,
2366                 direction_t direction,
2367                 Object **ret,
2368                 uint64_t *offset) {
2369         assert(f);
2370         assert(f->header);
2371
2372         return generic_array_bisect(f,
2373                                     le64toh(f->header->entry_array_offset),
2374                                     le64toh(f->header->n_entries),
2375                                     realtime,
2376                                     test_object_realtime,
2377                                     direction,
2378                                     ret, offset, NULL);
2379 }
2380
2381 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2382         Object *o;
2383         int r;
2384
2385         assert(f);
2386         assert(p > 0);
2387
2388         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2389         if (r < 0)
2390                 return r;
2391
2392         if (le64toh(o->entry.monotonic) == needle)
2393                 return TEST_FOUND;
2394         else if (le64toh(o->entry.monotonic) < needle)
2395                 return TEST_LEFT;
2396         else
2397                 return TEST_RIGHT;
2398 }
2399
2400 static int find_data_object_by_boot_id(
2401                 JournalFile *f,
2402                 sd_id128_t boot_id,
2403                 Object **o,
2404                 uint64_t *b) {
2405
2406         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2407
2408         sd_id128_to_string(boot_id, t + 9);
2409         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2410 }
2411
2412 int journal_file_move_to_entry_by_monotonic(
2413                 JournalFile *f,
2414                 sd_id128_t boot_id,
2415                 uint64_t monotonic,
2416                 direction_t direction,
2417                 Object **ret,
2418                 uint64_t *offset) {
2419
2420         Object *o;
2421         int r;
2422
2423         assert(f);
2424
2425         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2426         if (r < 0)
2427                 return r;
2428         if (r == 0)
2429                 return -ENOENT;
2430
2431         return generic_array_bisect_plus_one(f,
2432                                              le64toh(o->data.entry_offset),
2433                                              le64toh(o->data.entry_array_offset),
2434                                              le64toh(o->data.n_entries),
2435                                              monotonic,
2436                                              test_object_monotonic,
2437                                              direction,
2438                                              ret, offset, NULL);
2439 }
2440
2441 void journal_file_reset_location(JournalFile *f) {
2442         f->location_type = LOCATION_HEAD;
2443         f->current_offset = 0;
2444         f->current_seqnum = 0;
2445         f->current_realtime = 0;
2446         f->current_monotonic = 0;
2447         zero(f->current_boot_id);
2448         f->current_xor_hash = 0;
2449 }
2450
2451 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2452         f->location_type = LOCATION_SEEK;
2453         f->current_offset = offset;
2454         f->current_seqnum = le64toh(o->entry.seqnum);
2455         f->current_realtime = le64toh(o->entry.realtime);
2456         f->current_monotonic = le64toh(o->entry.monotonic);
2457         f->current_boot_id = o->entry.boot_id;
2458         f->current_xor_hash = le64toh(o->entry.xor_hash);
2459 }
2460
2461 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2462         assert(af);
2463         assert(af->header);
2464         assert(bf);
2465         assert(bf->header);
2466         assert(af->location_type == LOCATION_SEEK);
2467         assert(bf->location_type == LOCATION_SEEK);
2468
2469         /* If contents and timestamps match, these entries are
2470          * identical, even if the seqnum does not match */
2471         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2472             af->current_monotonic == bf->current_monotonic &&
2473             af->current_realtime == bf->current_realtime &&
2474             af->current_xor_hash == bf->current_xor_hash)
2475                 return 0;
2476
2477         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2478
2479                 /* If this is from the same seqnum source, compare
2480                  * seqnums */
2481                 if (af->current_seqnum < bf->current_seqnum)
2482                         return -1;
2483                 if (af->current_seqnum > bf->current_seqnum)
2484                         return 1;
2485
2486                 /* Wow! This is weird, different data but the same
2487                  * seqnums? Something is borked, but let's make the
2488                  * best of it and compare by time. */
2489         }
2490
2491         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2492
2493                 /* If the boot id matches, compare monotonic time */
2494                 if (af->current_monotonic < bf->current_monotonic)
2495                         return -1;
2496                 if (af->current_monotonic > bf->current_monotonic)
2497                         return 1;
2498         }
2499
2500         /* Otherwise, compare UTC time */
2501         if (af->current_realtime < bf->current_realtime)
2502                 return -1;
2503         if (af->current_realtime > bf->current_realtime)
2504                 return 1;
2505
2506         /* Finally, compare by contents */
2507         if (af->current_xor_hash < bf->current_xor_hash)
2508                 return -1;
2509         if (af->current_xor_hash > bf->current_xor_hash)
2510                 return 1;
2511
2512         return 0;
2513 }
2514
2515 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2516
2517         /* Increase or decrease the specified index, in the right direction. */
2518
2519         if (direction == DIRECTION_DOWN) {
2520                 if (*i >= n - 1)
2521                         return 0;
2522
2523                 (*i) ++;
2524         } else {
2525                 if (*i <= 0)
2526                         return 0;
2527
2528                 (*i) --;
2529         }
2530
2531         return 1;
2532 }
2533
2534 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2535
2536         /* Consider it an error if any of the two offsets is uninitialized */
2537         if (old_offset == 0 || new_offset == 0)
2538                 return false;
2539
2540         /* If we go down, the new offset must be larger than the old one. */
2541         return direction == DIRECTION_DOWN ?
2542                 new_offset > old_offset  :
2543                 new_offset < old_offset;
2544 }
2545
2546 int journal_file_next_entry(
2547                 JournalFile *f,
2548                 uint64_t p,
2549                 direction_t direction,
2550                 Object **ret, uint64_t *offset) {
2551
2552         uint64_t i, n, ofs;
2553         int r;
2554
2555         assert(f);
2556         assert(f->header);
2557
2558         n = le64toh(f->header->n_entries);
2559         if (n <= 0)
2560                 return 0;
2561
2562         if (p == 0)
2563                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2564         else {
2565                 r = generic_array_bisect(f,
2566                                          le64toh(f->header->entry_array_offset),
2567                                          le64toh(f->header->n_entries),
2568                                          p,
2569                                          test_object_offset,
2570                                          DIRECTION_DOWN,
2571                                          NULL, NULL,
2572                                          &i);
2573                 if (r <= 0)
2574                         return r;
2575
2576                 r = bump_array_index(&i, direction, n);
2577                 if (r <= 0)
2578                         return r;
2579         }
2580
2581         /* And jump to it */
2582         for (;;) {
2583                 r = generic_array_get(f,
2584                                       le64toh(f->header->entry_array_offset),
2585                                       i,
2586                                       ret, &ofs);
2587                 if (r > 0)
2588                         break;
2589                 if (r != -EBADMSG)
2590                         return r;
2591
2592                 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2593                  * the next one might work for us instead. */
2594                 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2595
2596                 r = bump_array_index(&i, direction, n);
2597                 if (r <= 0)
2598                         return r;
2599         }
2600
2601         /* Ensure our array is properly ordered. */
2602         if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2603                 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
2604                 return -EBADMSG;
2605         }
2606
2607         if (offset)
2608                 *offset = ofs;
2609
2610         return 1;
2611 }
2612
2613 int journal_file_next_entry_for_data(
2614                 JournalFile *f,
2615                 Object *o, uint64_t p,
2616                 uint64_t data_offset,
2617                 direction_t direction,
2618                 Object **ret, uint64_t *offset) {
2619
2620         uint64_t i, n, ofs;
2621         Object *d;
2622         int r;
2623
2624         assert(f);
2625         assert(p > 0 || !o);
2626
2627         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2628         if (r < 0)
2629                 return r;
2630
2631         n = le64toh(d->data.n_entries);
2632         if (n <= 0)
2633                 return n;
2634
2635         if (!o)
2636                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2637         else {
2638                 if (o->object.type != OBJECT_ENTRY)
2639                         return -EINVAL;
2640
2641                 r = generic_array_bisect_plus_one(f,
2642                                                   le64toh(d->data.entry_offset),
2643                                                   le64toh(d->data.entry_array_offset),
2644                                                   le64toh(d->data.n_entries),
2645                                                   p,
2646                                                   test_object_offset,
2647                                                   DIRECTION_DOWN,
2648                                                   NULL, NULL,
2649                                                   &i);
2650
2651                 if (r <= 0)
2652                         return r;
2653
2654                 r = bump_array_index(&i, direction, n);
2655                 if (r <= 0)
2656                         return r;
2657         }
2658
2659         for (;;) {
2660                 r = generic_array_get_plus_one(f,
2661                                                le64toh(d->data.entry_offset),
2662                                                le64toh(d->data.entry_array_offset),
2663                                                i,
2664                                                ret, &ofs);
2665                 if (r > 0)
2666                         break;
2667                 if (r != -EBADMSG)
2668                         return r;
2669
2670                 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2671
2672                 r = bump_array_index(&i, direction, n);
2673                 if (r <= 0)
2674                         return r;
2675         }
2676
2677         /* Ensure our array is properly ordered. */
2678         if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2679                 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2680                 return -EBADMSG;
2681         }
2682
2683         if (offset)
2684                 *offset = ofs;
2685
2686         return 1;
2687 }
2688
2689 int journal_file_move_to_entry_by_offset_for_data(
2690                 JournalFile *f,
2691                 uint64_t data_offset,
2692                 uint64_t p,
2693                 direction_t direction,
2694                 Object **ret, uint64_t *offset) {
2695
2696         int r;
2697         Object *d;
2698
2699         assert(f);
2700
2701         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2702         if (r < 0)
2703                 return r;
2704
2705         return generic_array_bisect_plus_one(f,
2706                                              le64toh(d->data.entry_offset),
2707                                              le64toh(d->data.entry_array_offset),
2708                                              le64toh(d->data.n_entries),
2709                                              p,
2710                                              test_object_offset,
2711                                              direction,
2712                                              ret, offset, NULL);
2713 }
2714
2715 int journal_file_move_to_entry_by_monotonic_for_data(
2716                 JournalFile *f,
2717                 uint64_t data_offset,
2718                 sd_id128_t boot_id,
2719                 uint64_t monotonic,
2720                 direction_t direction,
2721                 Object **ret, uint64_t *offset) {
2722
2723         Object *o, *d;
2724         int r;
2725         uint64_t b, z;
2726
2727         assert(f);
2728
2729         /* First, seek by time */
2730         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2731         if (r < 0)
2732                 return r;
2733         if (r == 0)
2734                 return -ENOENT;
2735
2736         r = generic_array_bisect_plus_one(f,
2737                                           le64toh(o->data.entry_offset),
2738                                           le64toh(o->data.entry_array_offset),
2739                                           le64toh(o->data.n_entries),
2740                                           monotonic,
2741                                           test_object_monotonic,
2742                                           direction,
2743                                           NULL, &z, NULL);
2744         if (r <= 0)
2745                 return r;
2746
2747         /* And now, continue seeking until we find an entry that
2748          * exists in both bisection arrays */
2749
2750         for (;;) {
2751                 Object *qo;
2752                 uint64_t p, q;
2753
2754                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2755                 if (r < 0)
2756                         return r;
2757
2758                 r = generic_array_bisect_plus_one(f,
2759                                                   le64toh(d->data.entry_offset),
2760                                                   le64toh(d->data.entry_array_offset),
2761                                                   le64toh(d->data.n_entries),
2762                                                   z,
2763                                                   test_object_offset,
2764                                                   direction,
2765                                                   NULL, &p, NULL);
2766                 if (r <= 0)
2767                         return r;
2768
2769                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2770                 if (r < 0)
2771                         return r;
2772
2773                 r = generic_array_bisect_plus_one(f,
2774                                                   le64toh(o->data.entry_offset),
2775                                                   le64toh(o->data.entry_array_offset),
2776                                                   le64toh(o->data.n_entries),
2777                                                   p,
2778                                                   test_object_offset,
2779                                                   direction,
2780                                                   &qo, &q, NULL);
2781
2782                 if (r <= 0)
2783                         return r;
2784
2785                 if (p == q) {
2786                         if (ret)
2787                                 *ret = qo;
2788                         if (offset)
2789                                 *offset = q;
2790
2791                         return 1;
2792                 }
2793
2794                 z = q;
2795         }
2796 }
2797
2798 int journal_file_move_to_entry_by_seqnum_for_data(
2799                 JournalFile *f,
2800                 uint64_t data_offset,
2801                 uint64_t seqnum,
2802                 direction_t direction,
2803                 Object **ret, uint64_t *offset) {
2804
2805         Object *d;
2806         int r;
2807
2808         assert(f);
2809
2810         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2811         if (r < 0)
2812                 return r;
2813
2814         return generic_array_bisect_plus_one(f,
2815                                              le64toh(d->data.entry_offset),
2816                                              le64toh(d->data.entry_array_offset),
2817                                              le64toh(d->data.n_entries),
2818                                              seqnum,
2819                                              test_object_seqnum,
2820                                              direction,
2821                                              ret, offset, NULL);
2822 }
2823
2824 int journal_file_move_to_entry_by_realtime_for_data(
2825                 JournalFile *f,
2826                 uint64_t data_offset,
2827                 uint64_t realtime,
2828                 direction_t direction,
2829                 Object **ret, uint64_t *offset) {
2830
2831         Object *d;
2832         int r;
2833
2834         assert(f);
2835
2836         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2837         if (r < 0)
2838                 return r;
2839
2840         return generic_array_bisect_plus_one(f,
2841                                              le64toh(d->data.entry_offset),
2842                                              le64toh(d->data.entry_array_offset),
2843                                              le64toh(d->data.n_entries),
2844                                              realtime,
2845                                              test_object_realtime,
2846                                              direction,
2847                                              ret, offset, NULL);
2848 }
2849
2850 void journal_file_dump(JournalFile *f) {
2851         Object *o;
2852         int r;
2853         uint64_t p;
2854
2855         assert(f);
2856         assert(f->header);
2857
2858         journal_file_print_header(f);
2859
2860         p = le64toh(f->header->header_size);
2861         while (p != 0) {
2862                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2863                 if (r < 0)
2864                         goto fail;
2865
2866                 switch (o->object.type) {
2867
2868                 case OBJECT_UNUSED:
2869                         printf("Type: OBJECT_UNUSED\n");
2870                         break;
2871
2872                 case OBJECT_DATA:
2873                         printf("Type: OBJECT_DATA\n");
2874                         break;
2875
2876                 case OBJECT_FIELD:
2877                         printf("Type: OBJECT_FIELD\n");
2878                         break;
2879
2880                 case OBJECT_ENTRY:
2881                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2882                                le64toh(o->entry.seqnum),
2883                                le64toh(o->entry.monotonic),
2884                                le64toh(o->entry.realtime));
2885                         break;
2886
2887                 case OBJECT_FIELD_HASH_TABLE:
2888                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2889                         break;
2890
2891                 case OBJECT_DATA_HASH_TABLE:
2892                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2893                         break;
2894
2895                 case OBJECT_ENTRY_ARRAY:
2896                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2897                         break;
2898
2899                 case OBJECT_TAG:
2900                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2901                                le64toh(o->tag.seqnum),
2902                                le64toh(o->tag.epoch));
2903                         break;
2904
2905                 default:
2906                         printf("Type: unknown (%i)\n", o->object.type);
2907                         break;
2908                 }
2909
2910                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2911                         printf("Flags: %s\n",
2912                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2913
2914                 if (p == le64toh(f->header->tail_object_offset))
2915                         p = 0;
2916                 else
2917                         p = p + ALIGN64(le64toh(o->object.size));
2918         }
2919
2920         return;
2921 fail:
2922         log_error("File corrupt");
2923 }
2924
2925 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2926         const char *x;
2927
2928         x = format_timestamp(buf, l, t);
2929         if (x)
2930                 return x;
2931         return " --- ";
2932 }
2933
2934 void journal_file_print_header(JournalFile *f) {
2935         char a[33], b[33], c[33], d[33];
2936         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2937         struct stat st;
2938         char bytes[FORMAT_BYTES_MAX];
2939
2940         assert(f);
2941         assert(f->header);
2942
2943         printf("File Path: %s\n"
2944                "File ID: %s\n"
2945                "Machine ID: %s\n"
2946                "Boot ID: %s\n"
2947                "Sequential Number ID: %s\n"
2948                "State: %s\n"
2949                "Compatible Flags:%s%s\n"
2950                "Incompatible Flags:%s%s%s\n"
2951                "Header size: %"PRIu64"\n"
2952                "Arena size: %"PRIu64"\n"
2953                "Data Hash Table Size: %"PRIu64"\n"
2954                "Field Hash Table Size: %"PRIu64"\n"
2955                "Rotate Suggested: %s\n"
2956                "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
2957                "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
2958                "Head Realtime Timestamp: %s (%"PRIx64")\n"
2959                "Tail Realtime Timestamp: %s (%"PRIx64")\n"
2960                "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
2961                "Objects: %"PRIu64"\n"
2962                "Entry Objects: %"PRIu64"\n",
2963                f->path,
2964                sd_id128_to_string(f->header->file_id, a),
2965                sd_id128_to_string(f->header->machine_id, b),
2966                sd_id128_to_string(f->header->boot_id, c),
2967                sd_id128_to_string(f->header->seqnum_id, d),
2968                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2969                f->header->state == STATE_ONLINE ? "ONLINE" :
2970                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2971                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2972                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2973                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2974                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2975                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2976                le64toh(f->header->header_size),
2977                le64toh(f->header->arena_size),
2978                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2979                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2980                yes_no(journal_file_rotate_suggested(f, 0)),
2981                le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
2982                le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
2983                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
2984                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
2985                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
2986                le64toh(f->header->n_objects),
2987                le64toh(f->header->n_entries));
2988
2989         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2990                 printf("Data Objects: %"PRIu64"\n"
2991                        "Data Hash Table Fill: %.1f%%\n",
2992                        le64toh(f->header->n_data),
2993                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2994
2995         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2996                 printf("Field Objects: %"PRIu64"\n"
2997                        "Field Hash Table Fill: %.1f%%\n",
2998                        le64toh(f->header->n_fields),
2999                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3000
3001         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3002                 printf("Tag Objects: %"PRIu64"\n",
3003                        le64toh(f->header->n_tags));
3004         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3005                 printf("Entry Array Objects: %"PRIu64"\n",
3006                        le64toh(f->header->n_entry_arrays));
3007
3008         if (fstat(f->fd, &st) >= 0)
3009                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
3010 }
3011
3012 static int journal_file_warn_btrfs(JournalFile *f) {
3013         unsigned attrs;
3014         int r;
3015
3016         assert(f);
3017
3018         /* Before we write anything, check if the COW logic is turned
3019          * off on btrfs. Given our write pattern that is quite
3020          * unfriendly to COW file systems this should greatly improve
3021          * performance on COW file systems, such as btrfs, at the
3022          * expense of data integrity features (which shouldn't be too
3023          * bad, given that we do our own checksumming). */
3024
3025         r = btrfs_is_filesystem(f->fd);
3026         if (r < 0)
3027                 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3028         if (!r)
3029                 return 0;
3030
3031         r = read_attr_fd(f->fd, &attrs);
3032         if (r < 0)
3033                 return log_warning_errno(r, "Failed to read file attributes: %m");
3034
3035         if (attrs & FS_NOCOW_FL) {
3036                 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3037                 return 0;
3038         }
3039
3040         log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3041                    "This is likely to slow down journal access substantially, please consider turning "
3042                    "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3043
3044         return 1;
3045 }
3046
3047 int journal_file_open(
3048                 int fd,
3049                 const char *fname,
3050                 int flags,
3051                 mode_t mode,
3052                 bool compress,
3053                 bool seal,
3054                 JournalMetrics *metrics,
3055                 MMapCache *mmap_cache,
3056                 Set *deferred_closes,
3057                 JournalFile *template,
3058                 JournalFile **ret) {
3059
3060         bool newly_created = false;
3061         JournalFile *f;
3062         void *h;
3063         int r;
3064
3065         assert(ret);
3066         assert(fd >= 0 || fname);
3067
3068         if ((flags & O_ACCMODE) != O_RDONLY &&
3069             (flags & O_ACCMODE) != O_RDWR)
3070                 return -EINVAL;
3071
3072         if (fname) {
3073                 if (!endswith(fname, ".journal") &&
3074                     !endswith(fname, ".journal~"))
3075                         return -EINVAL;
3076         }
3077
3078         f = new0(JournalFile, 1);
3079         if (!f)
3080                 return -ENOMEM;
3081
3082         f->fd = fd;
3083         f->mode = mode;
3084
3085         f->flags = flags;
3086         f->prot = prot_from_flags(flags);
3087         f->writable = (flags & O_ACCMODE) != O_RDONLY;
3088 #if defined(HAVE_LZ4)
3089         f->compress_lz4 = compress;
3090 #elif defined(HAVE_XZ)
3091         f->compress_xz = compress;
3092 #endif
3093 #ifdef HAVE_GCRYPT
3094         f->seal = seal;
3095 #endif
3096
3097         if (mmap_cache)
3098                 f->mmap = mmap_cache_ref(mmap_cache);
3099         else {
3100                 f->mmap = mmap_cache_new();
3101                 if (!f->mmap) {
3102                         r = -ENOMEM;
3103                         goto fail;
3104                 }
3105         }
3106
3107         if (fname) {
3108                 f->path = strdup(fname);
3109                 if (!f->path) {
3110                         r = -ENOMEM;
3111                         goto fail;
3112                 }
3113         } else {
3114                 /* If we don't know the path, fill in something explanatory and vaguely useful */
3115                 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3116                         r = -ENOMEM;
3117                         goto fail;
3118                 }
3119         }
3120
3121         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3122         if (!f->chain_cache) {
3123                 r = -ENOMEM;
3124                 goto fail;
3125         }
3126
3127         if (f->fd < 0) {
3128                 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
3129                 if (f->fd < 0) {
3130                         r = -errno;
3131                         goto fail;
3132                 }
3133
3134                 /* fds we opened here by us should also be closed by us. */
3135                 f->close_fd = true;
3136         }
3137
3138         r = journal_file_fstat(f);
3139         if (r < 0)
3140                 goto fail;
3141
3142         if (f->last_stat.st_size == 0 && f->writable) {
3143
3144                 (void) journal_file_warn_btrfs(f);
3145
3146                 /* Let's attach the creation time to the journal file,
3147                  * so that the vacuuming code knows the age of this
3148                  * file even if the file might end up corrupted one
3149                  * day... Ideally we'd just use the creation time many
3150                  * file systems maintain for each file, but there is
3151                  * currently no usable API to query this, hence let's
3152                  * emulate this via extended attributes. If extended
3153                  * attributes are not supported we'll just skip this,
3154                  * and rely solely on mtime/atime/ctime of the file. */
3155
3156                 fd_setcrtime(f->fd, 0);
3157
3158 #ifdef HAVE_GCRYPT
3159                 /* Try to load the FSPRG state, and if we can't, then
3160                  * just don't do sealing */
3161                 if (f->seal) {
3162                         r = journal_file_fss_load(f);
3163                         if (r < 0)
3164                                 f->seal = false;
3165                 }
3166 #endif
3167
3168                 r = journal_file_init_header(f, template);
3169                 if (r < 0)
3170                         goto fail;
3171
3172                 r = journal_file_fstat(f);
3173                 if (r < 0)
3174                         goto fail;
3175
3176                 newly_created = true;
3177         }
3178
3179         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3180                 r = -ENODATA;
3181                 goto fail;
3182         }
3183
3184         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
3185         if (r < 0)
3186                 goto fail;
3187
3188         f->header = h;
3189
3190         if (!newly_created) {
3191                 if (deferred_closes)
3192                         journal_file_close_set(deferred_closes);
3193
3194                 r = journal_file_verify_header(f);
3195                 if (r < 0)
3196                         goto fail;
3197         }
3198
3199 #ifdef HAVE_GCRYPT
3200         if (!newly_created && f->writable) {
3201                 r = journal_file_fss_load(f);
3202                 if (r < 0)
3203                         goto fail;
3204         }
3205 #endif
3206
3207         if (f->writable) {
3208                 if (metrics) {
3209                         journal_default_metrics(metrics, f->fd);
3210                         f->metrics = *metrics;
3211                 } else if (template)
3212                         f->metrics = template->metrics;
3213
3214                 r = journal_file_refresh_header(f);
3215                 if (r < 0)
3216                         goto fail;
3217         }
3218
3219 #ifdef HAVE_GCRYPT
3220         r = journal_file_hmac_setup(f);
3221         if (r < 0)
3222                 goto fail;
3223 #endif
3224
3225         if (newly_created) {
3226                 r = journal_file_setup_field_hash_table(f);
3227                 if (r < 0)
3228                         goto fail;
3229
3230                 r = journal_file_setup_data_hash_table(f);
3231                 if (r < 0)
3232                         goto fail;
3233
3234 #ifdef HAVE_GCRYPT
3235                 r = journal_file_append_first_tag(f);
3236                 if (r < 0)
3237                         goto fail;
3238 #endif
3239         }
3240
3241         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
3242                 r = -EIO;
3243                 goto fail;
3244         }
3245
3246         if (template && template->post_change_timer) {
3247                 r = journal_file_enable_post_change_timer(
3248                                 f,
3249                                 sd_event_source_get_event(template->post_change_timer),
3250                                 template->post_change_timer_period);
3251
3252                 if (r < 0)
3253                         goto fail;
3254         }
3255
3256         /* The file is opened now successfully, thus we take possession of any passed in fd. */
3257         f->close_fd = true;
3258
3259         *ret = f;
3260         return 0;
3261
3262 fail:
3263         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
3264                 r = -EIO;
3265
3266         (void) journal_file_close(f);
3267
3268         return r;
3269 }
3270
3271 int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
3272         _cleanup_free_ char *p = NULL;
3273         size_t l;
3274         JournalFile *old_file, *new_file = NULL;
3275         int r;
3276
3277         assert(f);
3278         assert(*f);
3279
3280         old_file = *f;
3281
3282         if (!old_file->writable)
3283                 return -EINVAL;
3284
3285         /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3286          * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3287         if (path_startswith(old_file->path, "/proc/self/fd"))
3288                 return -EINVAL;
3289
3290         if (!endswith(old_file->path, ".journal"))
3291                 return -EINVAL;
3292
3293         l = strlen(old_file->path);
3294         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3295                      (int) l - 8, old_file->path,
3296                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3297                      le64toh((*f)->header->head_entry_seqnum),
3298                      le64toh((*f)->header->head_entry_realtime));
3299         if (r < 0)
3300                 return -ENOMEM;
3301
3302         /* Try to rename the file to the archived version. If the file
3303          * already was deleted, we'll get ENOENT, let's ignore that
3304          * case. */
3305         r = rename(old_file->path, p);
3306         if (r < 0 && errno != ENOENT)
3307                 return -errno;
3308
3309         /* Sync the rename to disk */
3310         (void) fsync_directory_of_file(old_file->fd);
3311
3312         /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3313          * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3314          * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3315          * would result in the rotated journal never getting fsync() called before closing.
3316          * Now we simply queue the archive state by setting an archive bit, leaving the state
3317          * as STATE_ONLINE so proper offlining occurs. */
3318         old_file->archive = true;
3319
3320         /* Currently, btrfs is not very good with out write patterns
3321          * and fragments heavily. Let's defrag our journal files when
3322          * we archive them */
3323         old_file->defrag_on_close = true;
3324
3325         r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
3326
3327         if (deferred_closes &&
3328             set_put(deferred_closes, old_file) >= 0)
3329                 (void) journal_file_set_offline(old_file, false);
3330         else
3331                 (void) journal_file_close(old_file);
3332
3333         *f = new_file;
3334         return r;
3335 }
3336
3337 int journal_file_open_reliably(
3338                 const char *fname,
3339                 int flags,
3340                 mode_t mode,
3341                 bool compress,
3342                 bool seal,
3343                 JournalMetrics *metrics,
3344                 MMapCache *mmap_cache,
3345                 Set *deferred_closes,
3346                 JournalFile *template,
3347                 JournalFile **ret) {
3348
3349         int r;
3350         size_t l;
3351         _cleanup_free_ char *p = NULL;
3352
3353         r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
3354         if (!IN_SET(r,
3355                     -EBADMSG,           /* Corrupted */
3356                     -ENODATA,           /* Truncated */
3357                     -EHOSTDOWN,         /* Other machine */
3358                     -EPROTONOSUPPORT,   /* Incompatible feature */
3359                     -EBUSY,             /* Unclean shutdown */
3360                     -ESHUTDOWN,         /* Already archived */
3361                     -EIO,               /* IO error, including SIGBUS on mmap */
3362                     -EIDRM,             /* File has been deleted */
3363                     -ETXTBSY))          /* File is from the future */
3364                 return r;
3365
3366         if ((flags & O_ACCMODE) == O_RDONLY)
3367                 return r;
3368
3369         if (!(flags & O_CREAT))
3370                 return r;
3371
3372         if (!endswith(fname, ".journal"))
3373                 return r;
3374
3375         /* The file is corrupted. Rotate it away and try it again (but only once) */
3376
3377         l = strlen(fname);
3378         if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
3379                      (int) l - 8, fname,
3380                      now(CLOCK_REALTIME),
3381                      random_u64()) < 0)
3382                 return -ENOMEM;
3383
3384         if (rename(fname, p) < 0)
3385                 return -errno;
3386
3387         /* btrfs doesn't cope well with our write pattern and
3388          * fragments heavily. Let's defrag all files we rotate */
3389
3390         (void) chattr_path(p, 0, FS_NOCOW_FL);
3391         (void) btrfs_defrag(p);
3392
3393         log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3394
3395         return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
3396 }
3397
3398 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3399         uint64_t i, n;
3400         uint64_t q, xor_hash = 0;
3401         int r;
3402         EntryItem *items;
3403         dual_timestamp ts;
3404
3405         assert(from);
3406         assert(to);
3407         assert(o);
3408         assert(p);
3409
3410         if (!to->writable)
3411                 return -EPERM;
3412
3413         ts.monotonic = le64toh(o->entry.monotonic);
3414         ts.realtime = le64toh(o->entry.realtime);
3415
3416         n = journal_file_entry_n_items(o);
3417         /* alloca() can't take 0, hence let's allocate at least one */
3418         items = alloca(sizeof(EntryItem) * MAX(1u, n));
3419
3420         for (i = 0; i < n; i++) {
3421                 uint64_t l, h;
3422                 le64_t le_hash;
3423                 size_t t;
3424                 void *data;
3425                 Object *u;
3426
3427                 q = le64toh(o->entry.items[i].object_offset);
3428                 le_hash = o->entry.items[i].hash;
3429
3430                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3431                 if (r < 0)
3432                         return r;
3433
3434                 if (le_hash != o->data.hash)
3435                         return -EBADMSG;
3436
3437                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3438                 t = (size_t) l;
3439
3440                 /* We hit the limit on 32bit machines */
3441                 if ((uint64_t) t != l)
3442                         return -E2BIG;
3443
3444                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3445 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
3446                         size_t rsize = 0;
3447
3448                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3449                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3450                         if (r < 0)
3451                                 return r;
3452
3453                         data = from->compress_buffer;
3454                         l = rsize;
3455 #else
3456                         return -EPROTONOSUPPORT;
3457 #endif
3458                 } else
3459                         data = o->data.payload;
3460
3461                 r = journal_file_append_data(to, data, l, &u, &h);
3462                 if (r < 0)
3463                         return r;
3464
3465                 xor_hash ^= le64toh(u->data.hash);
3466                 items[i].object_offset = htole64(h);
3467                 items[i].hash = u->data.hash;
3468
3469                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3470                 if (r < 0)
3471                         return r;
3472         }
3473
3474         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3475
3476         if (mmap_cache_got_sigbus(to->mmap, to->fd))
3477                 return -EIO;
3478
3479         return r;
3480 }
3481
3482 void journal_reset_metrics(JournalMetrics *m) {
3483         assert(m);
3484
3485         /* Set everything to "pick automatic values". */
3486
3487         *m = (JournalMetrics) {
3488                 .min_use = (uint64_t) -1,
3489                 .max_use = (uint64_t) -1,
3490                 .min_size = (uint64_t) -1,
3491                 .max_size = (uint64_t) -1,
3492                 .keep_free = (uint64_t) -1,
3493                 .n_max_files = (uint64_t) -1,
3494         };
3495 }
3496
3497 void journal_default_metrics(JournalMetrics *m, int fd) {
3498         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3499         struct statvfs ss;
3500         uint64_t fs_size;
3501
3502         assert(m);
3503         assert(fd >= 0);
3504
3505         if (fstatvfs(fd, &ss) >= 0)
3506                 fs_size = ss.f_frsize * ss.f_blocks;
3507         else {
3508                 log_debug_errno(errno, "Failed to detremine disk size: %m");
3509                 fs_size = 0;
3510         }
3511
3512         if (m->max_use == (uint64_t) -1) {
3513
3514                 if (fs_size > 0) {
3515                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3516
3517                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
3518                                 m->max_use = DEFAULT_MAX_USE_UPPER;
3519
3520                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
3521                                 m->max_use = DEFAULT_MAX_USE_LOWER;
3522                 } else
3523                         m->max_use = DEFAULT_MAX_USE_LOWER;
3524         } else {
3525                 m->max_use = PAGE_ALIGN(m->max_use);
3526
3527                 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3528                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3529         }
3530
3531         if (m->min_use == (uint64_t) -1)
3532                 m->min_use = DEFAULT_MIN_USE;
3533
3534         if (m->min_use > m->max_use)
3535                 m->min_use = m->max_use;
3536
3537         if (m->max_size == (uint64_t) -1) {
3538                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3539
3540                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3541                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
3542         } else
3543                 m->max_size = PAGE_ALIGN(m->max_size);
3544
3545         if (m->max_size != 0) {
3546                 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3547                         m->max_size = JOURNAL_FILE_SIZE_MIN;
3548
3549                 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3550                         m->max_use = m->max_size*2;
3551         }
3552
3553         if (m->min_size == (uint64_t) -1)
3554                 m->min_size = JOURNAL_FILE_SIZE_MIN;
3555         else {
3556                 m->min_size = PAGE_ALIGN(m->min_size);
3557
3558                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3559                         m->min_size = JOURNAL_FILE_SIZE_MIN;
3560
3561                 if (m->max_size != 0 && m->min_size > m->max_size)
3562                         m->max_size = m->min_size;
3563         }
3564
3565         if (m->keep_free == (uint64_t) -1) {
3566
3567                 if (fs_size > 0) {
3568                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3569
3570                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3571                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3572
3573                 } else
3574                         m->keep_free = DEFAULT_KEEP_FREE;
3575         }
3576
3577         if (m->n_max_files == (uint64_t) -1)
3578                 m->n_max_files = DEFAULT_N_MAX_FILES;
3579
3580         log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3581                   format_bytes(a, sizeof(a), m->min_use),
3582                   format_bytes(b, sizeof(b), m->max_use),
3583                   format_bytes(c, sizeof(c), m->max_size),
3584                   format_bytes(d, sizeof(d), m->min_size),
3585                   format_bytes(e, sizeof(e), m->keep_free),
3586                   m->n_max_files);
3587 }
3588
3589 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3590         assert(f);
3591         assert(f->header);
3592         assert(from || to);
3593
3594         if (from) {
3595                 if (f->header->head_entry_realtime == 0)
3596                         return -ENOENT;
3597
3598                 *from = le64toh(f->header->head_entry_realtime);
3599         }
3600
3601         if (to) {
3602                 if (f->header->tail_entry_realtime == 0)
3603                         return -ENOENT;
3604
3605                 *to = le64toh(f->header->tail_entry_realtime);
3606         }
3607
3608         return 1;
3609 }
3610
3611 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3612         Object *o;
3613         uint64_t p;
3614         int r;
3615
3616         assert(f);
3617         assert(from || to);
3618
3619         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3620         if (r <= 0)
3621                 return r;
3622
3623         if (le64toh(o->data.n_entries) <= 0)
3624                 return 0;
3625
3626         if (from) {
3627                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3628                 if (r < 0)
3629                         return r;
3630
3631                 *from = le64toh(o->entry.monotonic);
3632         }
3633
3634         if (to) {
3635                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3636                 if (r < 0)
3637                         return r;
3638
3639                 r = generic_array_get_plus_one(f,
3640                                                le64toh(o->data.entry_offset),
3641                                                le64toh(o->data.entry_array_offset),
3642                                                le64toh(o->data.n_entries)-1,
3643                                                &o, NULL);
3644                 if (r <= 0)
3645                         return r;
3646
3647                 *to = le64toh(o->entry.monotonic);
3648         }
3649
3650         return 1;
3651 }
3652
3653 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3654         assert(f);
3655         assert(f->header);
3656
3657         /* If we gained new header fields we gained new features,
3658          * hence suggest a rotation */
3659         if (le64toh(f->header->header_size) < sizeof(Header)) {
3660                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3661                 return true;
3662         }
3663
3664         /* Let's check if the hash tables grew over a certain fill
3665          * level (75%, borrowing this value from Java's hash table
3666          * implementation), and if so suggest a rotation. To calculate
3667          * the fill level we need the n_data field, which only exists
3668          * in newer versions. */
3669
3670         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3671                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3672                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3673                                   f->path,
3674                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3675                                   le64toh(f->header->n_data),
3676                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3677                                   (unsigned long long) f->last_stat.st_size,
3678                                   f->last_stat.st_size / le64toh(f->header->n_data));
3679                         return true;
3680                 }
3681
3682         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3683                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3684                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3685                                   f->path,
3686                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3687                                   le64toh(f->header->n_fields),
3688                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3689                         return true;
3690                 }
3691
3692         /* Are the data objects properly indexed by field objects? */
3693         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3694             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3695             le64toh(f->header->n_data) > 0 &&
3696             le64toh(f->header->n_fields) == 0)
3697                 return true;
3698
3699         if (max_file_usec > 0) {
3700                 usec_t t, h;
3701
3702                 h = le64toh(f->header->head_entry_realtime);
3703                 t = now(CLOCK_REALTIME);
3704
3705                 if (h > 0 && t > h + max_file_usec)
3706                         return true;
3707         }
3708
3709         return false;
3710 }