src/journal/journal-file.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2011 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <errno.h>
  23 #include <fcntl.h>
  24 #include <linux/fs.h>
  25 #include <stddef.h>
  26 #include <sys/mman.h>
  27 #include <sys/statvfs.h>
  28 #include <sys/uio.h>
  29 #include <unistd.h>
  30
  31 #include "btrfs-util.h"
  32 #include "compress.h"
  33 #include "journal-authenticate.h"
  34 #include "journal-def.h"
  35 #include "lookup3.h"
  36 #include "random-util.h"
  37 #include "string-util.h"
  38 #include "journal-file.h"
  39
/* Default sizes, in bytes, of the data and field hash tables: room for
 * 2047 and 333 HashItem slots respectively */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* NOTE(review): presumably the payload size (in bytes) from which on
 * compression is attempted -- confirm at the use site */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header; we pick one above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
  84
  85 static int journal_file_set_online(JournalFile *f) {
  86         assert(f);
  87
  88         if (!f->writable)
  89                 return -EPERM;
  90
  91         if (!(f->fd >= 0 && f->header))
  92                 return -EINVAL;
  93
  94         if (mmap_cache_got_sigbus(f->mmap, f->fd))
  95                 return -EIO;
  96
  97         switch(f->header->state) {
  98                 case STATE_ONLINE:
  99                         return 0;
 100
 101                 case STATE_OFFLINE:
 102                         f->header->state = STATE_ONLINE;
 103                         fsync(f->fd);
 104                         return 0;
 105
 106                 default:
 107                         return -EINVAL;
 108         }
 109 }
 110
 111 int journal_file_set_offline(JournalFile *f) {
 112         assert(f);
 113
 114         if (!f->writable)
 115                 return -EPERM;
 116
 117         if (!(f->fd >= 0 && f->header))
 118                 return -EINVAL;
 119
 120         if (f->header->state != STATE_ONLINE)
 121                 return 0;
 122
 123         fsync(f->fd);
 124
 125         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 126                 return -EIO;
 127
 128         f->header->state = STATE_OFFLINE;
 129
 130         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 131                 return -EIO;
 132
 133         fsync(f->fd);
 134
 135         return 0;
 136 }
 137
 138 JournalFile* journal_file_close(JournalFile *f) {
 139         assert(f);
 140
 141 #ifdef HAVE_GCRYPT
 142         /* Write the final tag */
 143         if (f->seal && f->writable)
 144                 journal_file_append_tag(f);
 145 #endif
 146
 147         journal_file_set_offline(f);
 148
 149         if (f->mmap && f->fd >= 0)
 150                 mmap_cache_close_fd(f->mmap, f->fd);
 151
 152         if (f->fd >= 0 && f->defrag_on_close) {
 153
 154                 /* Be friendly to btrfs: turn COW back on again now,
 155                  * and defragment the file. We won't write to the file
 156                  * ever again, hence remove all fragmentation, and
 157                  * reenable all the good bits COW usually provides
 158                  * (such as data checksumming). */
 159
 160                 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
 161                 (void) btrfs_defrag_fd(f->fd);
 162         }
 163
 164         safe_close(f->fd);
 165         free(f->path);
 166
 167         if (f->mmap)
 168                 mmap_cache_unref(f->mmap);
 169
 170         ordered_hashmap_free_free(f->chain_cache);
 171
 172 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 173         free(f->compress_buffer);
 174 #endif
 175
 176 #ifdef HAVE_GCRYPT
 177         if (f->fss_file)
 178                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
 179         else
 180                 free(f->fsprg_state);
 181
 182         free(f->fsprg_seed);
 183
 184         if (f->hmac)
 185                 gcry_md_close(f->hmac);
 186 #endif
 187
 188         free(f);
 189         return NULL;
 190 }
 191
 192 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
 193         Header h = {};
 194         ssize_t k;
 195         int r;
 196
 197         assert(f);
 198
 199         memcpy(h.signature, HEADER_SIGNATURE, 8);
 200         h.header_size = htole64(ALIGN64(sizeof(h)));
 201
 202         h.incompatible_flags |= htole32(
 203                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
 204                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
 205
 206         h.compatible_flags = htole32(
 207                 f->seal * HEADER_COMPATIBLE_SEALED);
 208
 209         r = sd_id128_randomize(&h.file_id);
 210         if (r < 0)
 211                 return r;
 212
 213         if (template) {
 214                 h.seqnum_id = template->header->seqnum_id;
 215                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
 216         } else
 217                 h.seqnum_id = h.file_id;
 218
 219         k = pwrite(f->fd, &h, sizeof(h), 0);
 220         if (k < 0)
 221                 return -errno;
 222
 223         if (k != sizeof(h))
 224                 return -EIO;
 225
 226         return 0;
 227 }
 228
 229 static int journal_file_refresh_header(JournalFile *f) {
 230         sd_id128_t boot_id;
 231         int r;
 232
 233         assert(f);
 234
 235         r = sd_id128_get_machine(&f->header->machine_id);
 236         if (r < 0)
 237                 return r;
 238
 239         r = sd_id128_get_boot(&boot_id);
 240         if (r < 0)
 241                 return r;
 242
 243         if (sd_id128_equal(boot_id, f->header->boot_id))
 244                 f->tail_entry_monotonic_valid = true;
 245
 246         f->header->boot_id = boot_id;
 247
 248         r = journal_file_set_online(f);
 249
 250         /* Sync the online state to disk */
 251         fsync(f->fd);
 252
 253         return r;
 254 }
 255
 256 static int journal_file_verify_header(JournalFile *f) {
 257         uint32_t flags;
 258
 259         assert(f);
 260
 261         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
 262                 return -EBADMSG;
 263
 264         /* In both read and write mode we refuse to open files with
 265          * incompatible flags we don't know */
 266         flags = le32toh(f->header->incompatible_flags);
 267         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
 268                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
 269                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
 270                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
 271                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
 272                 if (flags)
 273                         log_debug("Journal file %s uses incompatible flags %"PRIx32
 274                                   " disabled at compilation time.", f->path, flags);
 275                 return -EPROTONOSUPPORT;
 276         }
 277
 278         /* When open for writing we refuse to open files with
 279          * compatible flags, too */
 280         flags = le32toh(f->header->compatible_flags);
 281         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
 282                 if (flags & ~HEADER_COMPATIBLE_ANY)
 283                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
 284                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
 285                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
 286                 if (flags)
 287                         log_debug("Journal file %s uses compatible flags %"PRIx32
 288                                   " disabled at compilation time.", f->path, flags);
 289                 return -EPROTONOSUPPORT;
 290         }
 291
 292         if (f->header->state >= _STATE_MAX)
 293                 return -EBADMSG;
 294
 295         /* The first addition was n_data, so check that we are at least this large */
 296         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
 297                 return -EBADMSG;
 298
 299         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
 300                 return -EBADMSG;
 301
 302         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
 303                 return -ENODATA;
 304
 305         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
 306                 return -ENODATA;
 307
 308         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
 309             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
 310             !VALID64(le64toh(f->header->tail_object_offset)) ||
 311             !VALID64(le64toh(f->header->entry_array_offset)))
 312                 return -ENODATA;
 313
 314         if (f->writable) {
 315                 uint8_t state;
 316                 sd_id128_t machine_id;
 317                 int r;
 318
 319                 r = sd_id128_get_machine(&machine_id);
 320                 if (r < 0)
 321                         return r;
 322
 323                 if (!sd_id128_equal(machine_id, f->header->machine_id))
 324                         return -EHOSTDOWN;
 325
 326                 state = f->header->state;
 327
 328                 if (state == STATE_ONLINE) {
 329                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
 330                         return -EBUSY;
 331                 } else if (state == STATE_ARCHIVED)
 332                         return -ESHUTDOWN;
 333                 else if (state != STATE_OFFLINE) {
 334                         log_debug("Journal file %s has unknown state %i.", f->path, state);
 335                         return -EBUSY;
 336                 }
 337         }
 338
 339         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
 340         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
 341
 342         f->seal = JOURNAL_HEADER_SEALED(f->header);
 343
 344         return 0;
 345 }
 346
 347 static int journal_file_fstat(JournalFile *f) {
 348         assert(f);
 349         assert(f->fd >= 0);
 350
 351         if (fstat(f->fd, &f->last_stat) < 0)
 352                 return -errno;
 353
 354         f->last_stat_usec = now(CLOCK_MONOTONIC);
 355
 356         /* Refuse appending to files that are already deleted */
 357         if (f->last_stat.st_nlink <= 0)
 358                 return -EIDRM;
 359
 360         return 0;
 361 }
 362
 363 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
 364         uint64_t old_size, new_size;
 365         int r;
 366
 367         assert(f);
 368
 369         /* We assume that this file is not sparse, and we know that
 370          * for sure, since we always call posix_fallocate()
 371          * ourselves */
 372
 373         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 374                 return -EIO;
 375
 376         old_size =
 377                 le64toh(f->header->header_size) +
 378                 le64toh(f->header->arena_size);
 379
 380         new_size = PAGE_ALIGN(offset + size);
 381         if (new_size < le64toh(f->header->header_size))
 382                 new_size = le64toh(f->header->header_size);
 383
 384         if (new_size <= old_size) {
 385
 386                 /* We already pre-allocated enough space, but before
 387                  * we write to it, let's check with fstat() if the
 388                  * file got deleted, in order make sure we don't throw
 389                  * away the data immediately. Don't check fstat() for
 390                  * all writes though, but only once ever 10s. */
 391
 392                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
 393                         return 0;
 394
 395                 return journal_file_fstat(f);
 396         }
 397
 398         /* Allocate more space. */
 399
 400         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 401                 return -E2BIG;
 402
 403         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
 404                 struct statvfs svfs;
 405
 406                 if (fstatvfs(f->fd, &svfs) >= 0) {
 407                         uint64_t available;
 408
 409                         available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
 410
 411                         if (new_size - old_size > available)
 412                                 return -E2BIG;
 413                 }
 414         }
 415
 416         /* Increase by larger blocks at once */
 417         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
 418         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 419                 new_size = f->metrics.max_size;
 420
 421         /* Note that the glibc fallocate() fallback is very
 422            inefficient, hence we try to minimize the allocation area
 423            as we can. */
 424         r = posix_fallocate(f->fd, old_size, new_size - old_size);
 425         if (r != 0)
 426                 return -r;
 427
 428         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
 429
 430         return journal_file_fstat(f);
 431 }
 432
 433 static unsigned type_to_context(ObjectType type) {
 434         /* One context for each type, plus one catch-all for the rest */
 435         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
 436         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
 437         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
 438 }
 439
 440 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
 441         int r;
 442
 443         assert(f);
 444         assert(ret);
 445
 446         if (size <= 0)
 447                 return -EINVAL;
 448
 449         /* Avoid SIGBUS on invalid accesses */
 450         if (offset + size > (uint64_t) f->last_stat.st_size) {
 451                 /* Hmm, out of range? Let's refresh the fstat() data
 452                  * first, before we trust that check. */
 453
 454                 r = journal_file_fstat(f);
 455                 if (r < 0)
 456                         return r;
 457
 458                 if (offset + size > (uint64_t) f->last_stat.st_size)
 459                         return -EADDRNOTAVAIL;
 460         }
 461
 462         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
 463 }
 464
 465 static uint64_t minimum_header_size(Object *o) {
 466
 467         static const uint64_t table[] = {
 468                 [OBJECT_DATA] = sizeof(DataObject),
 469                 [OBJECT_FIELD] = sizeof(FieldObject),
 470                 [OBJECT_ENTRY] = sizeof(EntryObject),
 471                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
 472                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
 473                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
 474                 [OBJECT_TAG] = sizeof(TagObject),
 475         };
 476
 477         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
 478                 return sizeof(ObjectHeader);
 479
 480         return table[o->object.type];
 481 }
 482
 483 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
 484         int r;
 485         void *t;
 486         Object *o;
 487         uint64_t s;
 488
 489         assert(f);
 490         assert(ret);
 491
 492         /* Objects may only be located at multiple of 64 bit */
 493         if (!VALID64(offset))
 494                 return -EFAULT;
 495
 496         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
 497         if (r < 0)
 498                 return r;
 499
 500         o = (Object*) t;
 501         s = le64toh(o->object.size);
 502
 503         if (s < sizeof(ObjectHeader))
 504                 return -EBADMSG;
 505
 506         if (o->object.type <= OBJECT_UNUSED)
 507                 return -EBADMSG;
 508
 509         if (s < minimum_header_size(o))
 510                 return -EBADMSG;
 511
 512         if (type > OBJECT_UNUSED && o->object.type != type)
 513                 return -EBADMSG;
 514
 515         if (s > sizeof(ObjectHeader)) {
 516                 r = journal_file_move_to(f, type, false, offset, s, &t);
 517                 if (r < 0)
 518                         return r;
 519
 520                 o = (Object*) t;
 521         }
 522
 523         *ret = o;
 524         return 0;
 525 }
 526
 527 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
 528         uint64_t r;
 529
 530         assert(f);
 531
 532         r = le64toh(f->header->tail_entry_seqnum) + 1;
 533
 534         if (seqnum) {
 535                 /* If an external seqnum counter was passed, we update
 536                  * both the local and the external one, and set it to
 537                  * the maximum of both */
 538
 539                 if (*seqnum + 1 > r)
 540                         r = *seqnum + 1;
 541
 542                 *seqnum = r;
 543         }
 544
 545         f->header->tail_entry_seqnum = htole64(r);
 546
 547         if (f->header->head_entry_seqnum == 0)
 548                 f->header->head_entry_seqnum = htole64(r);
 549
 550         return r;
 551 }
 552
 553 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
 554         int r;
 555         uint64_t p;
 556         Object *tail, *o;
 557         void *t;
 558
 559         assert(f);
 560         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
 561         assert(size >= sizeof(ObjectHeader));
 562         assert(offset);
 563         assert(ret);
 564
 565         r = journal_file_set_online(f);
 566         if (r < 0)
 567                 return r;
 568
 569         p = le64toh(f->header->tail_object_offset);
 570         if (p == 0)
 571                 p = le64toh(f->header->header_size);
 572         else {
 573                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
 574                 if (r < 0)
 575                         return r;
 576
 577                 p += ALIGN64(le64toh(tail->object.size));
 578         }
 579
 580         r = journal_file_allocate(f, p, size);
 581         if (r < 0)
 582                 return r;
 583
 584         r = journal_file_move_to(f, type, false, p, size, &t);
 585         if (r < 0)
 586                 return r;
 587
 588         o = (Object*) t;
 589
 590         zero(o->object);
 591         o->object.type = type;
 592         o->object.size = htole64(size);
 593
 594         f->header->tail_object_offset = htole64(p);
 595         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
 596
 597         *ret = o;
 598         *offset = p;
 599
 600         return 0;
 601 }
 602
 603 static int journal_file_setup_data_hash_table(JournalFile *f) {
 604         uint64_t s, p;
 605         Object *o;
 606         int r;
 607
 608         assert(f);
 609
 610         /* We estimate that we need 1 hash table entry per 768 bytes
 611            of journal file and we want to make sure we never get
 612            beyond 75% fill level. Calculate the hash table size for
 613            the maximum file size based on these metrics. */
 614
 615         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
 616         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
 617                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
 618
 619         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
 620
 621         r = journal_file_append_object(f,
 622                                        OBJECT_DATA_HASH_TABLE,
 623                                        offsetof(Object, hash_table.items) + s,
 624                                        &o, &p);
 625         if (r < 0)
 626                 return r;
 627
 628         memzero(o->hash_table.items, s);
 629
 630         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 631         f->header->data_hash_table_size = htole64(s);
 632
 633         return 0;
 634 }
 635
 636 static int journal_file_setup_field_hash_table(JournalFile *f) {
 637         uint64_t s, p;
 638         Object *o;
 639         int r;
 640
 641         assert(f);
 642
 643         /* We use a fixed size hash table for the fields as this
 644          * number should grow very slowly only */
 645
 646         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
 647         r = journal_file_append_object(f,
 648                                        OBJECT_FIELD_HASH_TABLE,
 649                                        offsetof(Object, hash_table.items) + s,
 650                                        &o, &p);
 651         if (r < 0)
 652                 return r;
 653
 654         memzero(o->hash_table.items, s);
 655
 656         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 657         f->header->field_hash_table_size = htole64(s);
 658
 659         return 0;
 660 }
 661
 662 int journal_file_map_data_hash_table(JournalFile *f) {
 663         uint64_t s, p;
 664         void *t;
 665         int r;
 666
 667         assert(f);
 668
 669         if (f->data_hash_table)
 670                 return 0;
 671
 672         p = le64toh(f->header->data_hash_table_offset);
 673         s = le64toh(f->header->data_hash_table_size);
 674
 675         r = journal_file_move_to(f,
 676                                  OBJECT_DATA_HASH_TABLE,
 677                                  true,
 678                                  p, s,
 679                                  &t);
 680         if (r < 0)
 681                 return r;
 682
 683         f->data_hash_table = t;
 684         return 0;
 685 }
 686
 687 int journal_file_map_field_hash_table(JournalFile *f) {
 688         uint64_t s, p;
 689         void *t;
 690         int r;
 691
 692         assert(f);
 693
 694         if (f->field_hash_table)
 695                 return 0;
 696
 697         p = le64toh(f->header->field_hash_table_offset);
 698         s = le64toh(f->header->field_hash_table_size);
 699
 700         r = journal_file_move_to(f,
 701                                  OBJECT_FIELD_HASH_TABLE,
 702                                  true,
 703                                  p, s,
 704                                  &t);
 705         if (r < 0)
 706                 return r;
 707
 708         f->field_hash_table = t;
 709         return 0;
 710 }
 711
 712 static int journal_file_link_field(
 713                 JournalFile *f,
 714                 Object *o,
 715                 uint64_t offset,
 716                 uint64_t hash) {
 717
 718         uint64_t p, h, m;
 719         int r;
 720
 721         assert(f);
 722         assert(o);
 723         assert(offset > 0);
 724
 725         if (o->object.type != OBJECT_FIELD)
 726                 return -EINVAL;
 727
 728         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 729         if (m <= 0)
 730                 return -EBADMSG;
 731
 732         /* This might alter the window we are looking at */
 733         o->field.next_hash_offset = o->field.head_data_offset = 0;
 734
 735         h = hash % m;
 736         p = le64toh(f->field_hash_table[h].tail_hash_offset);
 737         if (p == 0)
 738                 f->field_hash_table[h].head_hash_offset = htole64(offset);
 739         else {
 740                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 741                 if (r < 0)
 742                         return r;
 743
 744                 o->field.next_hash_offset = htole64(offset);
 745         }
 746
 747         f->field_hash_table[h].tail_hash_offset = htole64(offset);
 748
 749         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
 750                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
 751
 752         return 0;
 753 }
 754
 755 static int journal_file_link_data(
 756                 JournalFile *f,
 757                 Object *o,
 758                 uint64_t offset,
 759                 uint64_t hash) {
 760
 761         uint64_t p, h, m;
 762         int r;
 763
 764         assert(f);
 765         assert(o);
 766         assert(offset > 0);
 767
 768         if (o->object.type != OBJECT_DATA)
 769                 return -EINVAL;
 770
 771         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 772         if (m <= 0)
 773                 return -EBADMSG;
 774
 775         /* This might alter the window we are looking at */
 776         o->data.next_hash_offset = o->data.next_field_offset = 0;
 777         o->data.entry_offset = o->data.entry_array_offset = 0;
 778         o->data.n_entries = 0;
 779
 780         h = hash % m;
 781         p = le64toh(f->data_hash_table[h].tail_hash_offset);
 782         if (p == 0)
 783                 /* Only entry in the hash table is easy */
 784                 f->data_hash_table[h].head_hash_offset = htole64(offset);
 785         else {
 786                 /* Move back to the previous data object, to patch in
 787                  * pointer */
 788
 789                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 790                 if (r < 0)
 791                         return r;
 792
 793                 o->data.next_hash_offset = htole64(offset);
 794         }
 795
 796         f->data_hash_table[h].tail_hash_offset = htole64(offset);
 797
 798         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
 799                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
 800
 801         return 0;
 802 }
 803
 804 int journal_file_find_field_object_with_hash(
 805                 JournalFile *f,
 806                 const void *field, uint64_t size, uint64_t hash,
 807                 Object **ret, uint64_t *offset) {
 808
 809         uint64_t p, osize, h, m;
 810         int r;
 811
 812         assert(f);
 813         assert(field && size > 0);
 814
 815         /* If the field hash table is empty, we can't find anything */
 816         if (le64toh(f->header->field_hash_table_size) <= 0)
 817                 return 0;
 818
 819         /* Map the field hash table, if it isn't mapped yet. */
 820         r = journal_file_map_field_hash_table(f);
 821         if (r < 0)
 822                 return r;
 823
 824         osize = offsetof(Object, field.payload) + size;
 825
 826         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 827         if (m <= 0)
 828                 return -EBADMSG;
 829
 830         h = hash % m;
 831         p = le64toh(f->field_hash_table[h].head_hash_offset);
 832
 833         while (p > 0) {
 834                 Object *o;
 835
 836                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 837                 if (r < 0)
 838                         return r;
 839
 840                 if (le64toh(o->field.hash) == hash &&
 841                     le64toh(o->object.size) == osize &&
 842                     memcmp(o->field.payload, field, size) == 0) {
 843
 844                         if (ret)
 845                                 *ret = o;
 846                         if (offset)
 847                                 *offset = p;
 848
 849                         return 1;
 850                 }
 851
 852                 p = le64toh(o->field.next_hash_offset);
 853         }
 854
 855         return 0;
 856 }
 857
 858 int journal_file_find_field_object(
 859                 JournalFile *f,
 860                 const void *field, uint64_t size,
 861                 Object **ret, uint64_t *offset) {
 862
 863         uint64_t hash;
 864
 865         assert(f);
 866         assert(field && size > 0);
 867
 868         hash = hash64(field, size);
 869
 870         return journal_file_find_field_object_with_hash(f,
 871                                                         field, size, hash,
 872                                                         ret, offset);
 873 }
 874
 875 int journal_file_find_data_object_with_hash(
 876                 JournalFile *f,
 877                 const void *data, uint64_t size, uint64_t hash,
 878                 Object **ret, uint64_t *offset) {
 879
 880         uint64_t p, osize, h, m;
 881         int r;
 882
 883         assert(f);
 884         assert(data || size == 0);
 885
 886         /* If there's no data hash table, then there's no entry. */
 887         if (le64toh(f->header->data_hash_table_size) <= 0)
 888                 return 0;
 889
 890         /* Map the data hash table, if it isn't mapped yet. */
 891         r = journal_file_map_data_hash_table(f);
 892         if (r < 0)
 893                 return r;
 894
 895         osize = offsetof(Object, data.payload) + size;
 896
 897         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 898         if (m <= 0)
 899                 return -EBADMSG;
 900
 901         h = hash % m;
 902         p = le64toh(f->data_hash_table[h].head_hash_offset);
 903
 904         while (p > 0) {
 905                 Object *o;
 906
 907                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 908                 if (r < 0)
 909                         return r;
 910
 911                 if (le64toh(o->data.hash) != hash)
 912                         goto next;
 913
 914                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
 915 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 916                         uint64_t l;
 917                         size_t rsize = 0;
 918
 919                         l = le64toh(o->object.size);
 920                         if (l <= offsetof(Object, data.payload))
 921                                 return -EBADMSG;
 922
 923                         l -= offsetof(Object, data.payload);
 924
 925                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
 926                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
 927                         if (r < 0)
 928                                 return r;
 929
 930                         if (rsize == size &&
 931                             memcmp(f->compress_buffer, data, size) == 0) {
 932
 933                                 if (ret)
 934                                         *ret = o;
 935
 936                                 if (offset)
 937                                         *offset = p;
 938
 939                                 return 1;
 940                         }
 941 #else
 942                         return -EPROTONOSUPPORT;
 943 #endif
 944                 } else if (le64toh(o->object.size) == osize &&
 945                            memcmp(o->data.payload, data, size) == 0) {
 946
 947                         if (ret)
 948                                 *ret = o;
 949
 950                         if (offset)
 951                                 *offset = p;
 952
 953                         return 1;
 954                 }
 955
 956         next:
 957                 p = le64toh(o->data.next_hash_offset);
 958         }
 959
 960         return 0;
 961 }
 962
 963 int journal_file_find_data_object(
 964                 JournalFile *f,
 965                 const void *data, uint64_t size,
 966                 Object **ret, uint64_t *offset) {
 967
 968         uint64_t hash;
 969
 970         assert(f);
 971         assert(data || size == 0);
 972
 973         hash = hash64(data, size);
 974
 975         return journal_file_find_data_object_with_hash(f,
 976                                                        data, size, hash,
 977                                                        ret, offset);
 978 }
 979
 980 static int journal_file_append_field(
 981                 JournalFile *f,
 982                 const void *field, uint64_t size,
 983                 Object **ret, uint64_t *offset) {
 984
 985         uint64_t hash, p;
 986         uint64_t osize;
 987         Object *o;
 988         int r;
 989
 990         assert(f);
 991         assert(field && size > 0);
 992
 993         hash = hash64(field, size);
 994
 995         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
 996         if (r < 0)
 997                 return r;
 998         else if (r > 0) {
 999
1000                 if (ret)
1001                         *ret = o;
1002
1003                 if (offset)
1004                         *offset = p;
1005
1006                 return 0;
1007         }
1008
1009         osize = offsetof(Object, field.payload) + size;
1010         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1011         if (r < 0)
1012                 return r;
1013
1014         o->field.hash = htole64(hash);
1015         memcpy(o->field.payload, field, size);
1016
1017         r = journal_file_link_field(f, o, p, hash);
1018         if (r < 0)
1019                 return r;
1020
1021         /* The linking might have altered the window, so let's
1022          * refresh our pointer */
1023         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1024         if (r < 0)
1025                 return r;
1026
1027 #ifdef HAVE_GCRYPT
1028         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1029         if (r < 0)
1030                 return r;
1031 #endif
1032
1033         if (ret)
1034                 *ret = o;
1035
1036         if (offset)
1037                 *offset = p;
1038
1039         return 0;
1040 }
1041
1042 static int journal_file_append_data(
1043                 JournalFile *f,
1044                 const void *data, uint64_t size,
1045                 Object **ret, uint64_t *offset) {
1046
1047         uint64_t hash, p;
1048         uint64_t osize;
1049         Object *o;
1050         int r, compression = 0;
1051         const void *eq;
1052
1053         assert(f);
1054         assert(data || size == 0);
1055
1056         hash = hash64(data, size);
1057
1058         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1059         if (r < 0)
1060                 return r;
1061         else if (r > 0) {
1062
1063                 if (ret)
1064                         *ret = o;
1065
1066                 if (offset)
1067                         *offset = p;
1068
1069                 return 0;
1070         }
1071
1072         osize = offsetof(Object, data.payload) + size;
1073         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1074         if (r < 0)
1075                 return r;
1076
1077         o->data.hash = htole64(hash);
1078
1079 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1080         if (f->compress_xz &&
1081             size >= COMPRESSION_SIZE_THRESHOLD) {
1082                 size_t rsize = 0;
1083
1084                 compression = compress_blob(data, size, o->data.payload, &rsize);
1085
1086                 if (compression) {
1087                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1088                         o->object.flags |= compression;
1089
1090                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1091                                   size, rsize, object_compressed_to_string(compression));
1092                 }
1093         }
1094 #endif
1095
1096         if (!compression && size > 0)
1097                 memcpy(o->data.payload, data, size);
1098
1099         r = journal_file_link_data(f, o, p, hash);
1100         if (r < 0)
1101                 return r;
1102
1103         /* The linking might have altered the window, so let's
1104          * refresh our pointer */
1105         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1106         if (r < 0)
1107                 return r;
1108
1109         if (!data)
1110                 eq = NULL;
1111         else
1112                 eq = memchr(data, '=', size);
1113         if (eq && eq > data) {
1114                 Object *fo = NULL;
1115                 uint64_t fp;
1116
1117                 /* Create field object ... */
1118                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1119                 if (r < 0)
1120                         return r;
1121
1122                 /* ... and link it in. */
1123                 o->data.next_field_offset = fo->field.head_data_offset;
1124                 fo->field.head_data_offset = le64toh(p);
1125         }
1126
1127 #ifdef HAVE_GCRYPT
1128         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1129         if (r < 0)
1130                 return r;
1131 #endif
1132
1133         if (ret)
1134                 *ret = o;
1135
1136         if (offset)
1137                 *offset = p;
1138
1139         return 0;
1140 }
1141
1142 uint64_t journal_file_entry_n_items(Object *o) {
1143         assert(o);
1144
1145         if (o->object.type != OBJECT_ENTRY)
1146                 return 0;
1147
1148         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1149 }
1150
1151 uint64_t journal_file_entry_array_n_items(Object *o) {
1152         assert(o);
1153
1154         if (o->object.type != OBJECT_ENTRY_ARRAY)
1155                 return 0;
1156
1157         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1158 }
1159
1160 uint64_t journal_file_hash_table_n_items(Object *o) {
1161         assert(o);
1162
1163         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1164             o->object.type != OBJECT_FIELD_HASH_TABLE)
1165                 return 0;
1166
1167         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1168 }
1169
1170 static int link_entry_into_array(JournalFile *f,
1171                                  le64_t *first,
1172                                  le64_t *idx,
1173                                  uint64_t p) {
1174         int r;
1175         uint64_t n = 0, ap = 0, q, i, a, hidx;
1176         Object *o;
1177
1178         assert(f);
1179         assert(first);
1180         assert(idx);
1181         assert(p > 0);
1182
1183         a = le64toh(*first);
1184         i = hidx = le64toh(*idx);
1185         while (a > 0) {
1186
1187                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1188                 if (r < 0)
1189                         return r;
1190
1191                 n = journal_file_entry_array_n_items(o);
1192                 if (i < n) {
1193                         o->entry_array.items[i] = htole64(p);
1194                         *idx = htole64(hidx + 1);
1195                         return 0;
1196                 }
1197
1198                 i -= n;
1199                 ap = a;
1200                 a = le64toh(o->entry_array.next_entry_array_offset);
1201         }
1202
1203         if (hidx > n)
1204                 n = (hidx+1) * 2;
1205         else
1206                 n = n * 2;
1207
1208         if (n < 4)
1209                 n = 4;
1210
1211         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1212                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1213                                        &o, &q);
1214         if (r < 0)
1215                 return r;
1216
1217 #ifdef HAVE_GCRYPT
1218         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1219         if (r < 0)
1220                 return r;
1221 #endif
1222
1223         o->entry_array.items[i] = htole64(p);
1224
1225         if (ap == 0)
1226                 *first = htole64(q);
1227         else {
1228                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1229                 if (r < 0)
1230                         return r;
1231
1232                 o->entry_array.next_entry_array_offset = htole64(q);
1233         }
1234
1235         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1236                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1237
1238         *idx = htole64(hidx + 1);
1239
1240         return 0;
1241 }
1242
1243 static int link_entry_into_array_plus_one(JournalFile *f,
1244                                           le64_t *extra,
1245                                           le64_t *first,
1246                                           le64_t *idx,
1247                                           uint64_t p) {
1248
1249         int r;
1250
1251         assert(f);
1252         assert(extra);
1253         assert(first);
1254         assert(idx);
1255         assert(p > 0);
1256
1257         if (*idx == 0)
1258                 *extra = htole64(p);
1259         else {
1260                 le64_t i;
1261
1262                 i = htole64(le64toh(*idx) - 1);
1263                 r = link_entry_into_array(f, first, &i, p);
1264                 if (r < 0)
1265                         return r;
1266         }
1267
1268         *idx = htole64(le64toh(*idx) + 1);
1269         return 0;
1270 }
1271
1272 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1273         uint64_t p;
1274         int r;
1275         assert(f);
1276         assert(o);
1277         assert(offset > 0);
1278
1279         p = le64toh(o->entry.items[i].object_offset);
1280         if (p == 0)
1281                 return -EINVAL;
1282
1283         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1284         if (r < 0)
1285                 return r;
1286
1287         return link_entry_into_array_plus_one(f,
1288                                               &o->data.entry_offset,
1289                                               &o->data.entry_array_offset,
1290                                               &o->data.n_entries,
1291                                               offset);
1292 }
1293
1294 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1295         uint64_t n, i;
1296         int r;
1297
1298         assert(f);
1299         assert(o);
1300         assert(offset > 0);
1301
1302         if (o->object.type != OBJECT_ENTRY)
1303                 return -EINVAL;
1304
1305         __sync_synchronize();
1306
1307         /* Link up the entry itself */
1308         r = link_entry_into_array(f,
1309                                   &f->header->entry_array_offset,
1310                                   &f->header->n_entries,
1311                                   offset);
1312         if (r < 0)
1313                 return r;
1314
1315         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1316
1317         if (f->header->head_entry_realtime == 0)
1318                 f->header->head_entry_realtime = o->entry.realtime;
1319
1320         f->header->tail_entry_realtime = o->entry.realtime;
1321         f->header->tail_entry_monotonic = o->entry.monotonic;
1322
1323         f->tail_entry_monotonic_valid = true;
1324
1325         /* Link up the items */
1326         n = journal_file_entry_n_items(o);
1327         for (i = 0; i < n; i++) {
1328                 r = journal_file_link_entry_item(f, o, offset, i);
1329                 if (r < 0)
1330                         return r;
1331         }
1332
1333         return 0;
1334 }
1335
1336 static int journal_file_append_entry_internal(
1337                 JournalFile *f,
1338                 const dual_timestamp *ts,
1339                 uint64_t xor_hash,
1340                 const EntryItem items[], unsigned n_items,
1341                 uint64_t *seqnum,
1342                 Object **ret, uint64_t *offset) {
1343         uint64_t np;
1344         uint64_t osize;
1345         Object *o;
1346         int r;
1347
1348         assert(f);
1349         assert(items || n_items == 0);
1350         assert(ts);
1351
1352         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1353
1354         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1355         if (r < 0)
1356                 return r;
1357
1358         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1359         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1360         o->entry.realtime = htole64(ts->realtime);
1361         o->entry.monotonic = htole64(ts->monotonic);
1362         o->entry.xor_hash = htole64(xor_hash);
1363         o->entry.boot_id = f->header->boot_id;
1364
1365 #ifdef HAVE_GCRYPT
1366         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1367         if (r < 0)
1368                 return r;
1369 #endif
1370
1371         r = journal_file_link_entry(f, o, np);
1372         if (r < 0)
1373                 return r;
1374
1375         if (ret)
1376                 *ret = o;
1377
1378         if (offset)
1379                 *offset = np;
1380
1381         return 0;
1382 }
1383
1384 void journal_file_post_change(JournalFile *f) {
1385         assert(f);
1386
1387         /* inotify() does not receive IN_MODIFY events from file
1388          * accesses done via mmap(). After each access we hence
1389          * trigger IN_MODIFY by truncating the journal file to its
1390          * current size which triggers IN_MODIFY. */
1391
1392         __sync_synchronize();
1393
1394         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1395                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1396 }
1397
1398 static int entry_item_cmp(const void *_a, const void *_b) {
1399         const EntryItem *a = _a, *b = _b;
1400
1401         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1402                 return -1;
1403         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1404                 return 1;
1405         return 0;
1406 }
1407
1408 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1409         unsigned i;
1410         EntryItem *items;
1411         int r;
1412         uint64_t xor_hash = 0;
1413         struct dual_timestamp _ts;
1414
1415         assert(f);
1416         assert(iovec || n_iovec == 0);
1417
1418         if (!ts) {
1419                 dual_timestamp_get(&_ts);
1420                 ts = &_ts;
1421         }
1422
1423         if (f->tail_entry_monotonic_valid &&
1424             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1425                 return -EINVAL;
1426
1427 #ifdef HAVE_GCRYPT
1428         r = journal_file_maybe_append_tag(f, ts->realtime);
1429         if (r < 0)
1430                 return r;
1431 #endif
1432
1433         /* alloca() can't take 0, hence let's allocate at least one */
1434         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1435
1436         for (i = 0; i < n_iovec; i++) {
1437                 uint64_t p;
1438                 Object *o;
1439
1440                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1441                 if (r < 0)
1442                         return r;
1443
1444                 xor_hash ^= le64toh(o->data.hash);
1445                 items[i].object_offset = htole64(p);
1446                 items[i].hash = o->data.hash;
1447         }
1448
1449         /* Order by the position on disk, in order to improve seek
1450          * times for rotating media. */
1451         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1452
1453         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1454
1455         /* If the memory mapping triggered a SIGBUS then we return an
1456          * IO error and ignore the error code passed down to us, since
1457          * it is very likely just an effect of a nullified replacement
1458          * mapping page */
1459
1460         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1461                 r = -EIO;
1462
1463         journal_file_post_change(f);
1464
1465         return r;
1466 }
1467
/* One cached position within a chain of entry arrays. */
typedef struct ChainCacheItem ChainCacheItem;

struct ChainCacheItem {
        /* Array at the beginning of the chain */
        uint64_t first;

        /* The array within the chain that is cached */
        uint64_t array;

        /* First item stored in the cached array */
        uint64_t begin;

        /* Number of items in all arrays of the chain that precede the
         * cached one */
        uint64_t total;

        /* Index we looked at last, used to optimize locality when
         * bisecting */
        uint64_t last_index;
};
1475
1476 static void chain_cache_put(
1477                 OrderedHashmap *h,
1478                 ChainCacheItem *ci,
1479                 uint64_t first,
1480                 uint64_t array,
1481                 uint64_t begin,
1482                 uint64_t total,
1483                 uint64_t last_index) {
1484
1485         if (!ci) {
1486                 /* If the chain item to cache for this chain is the
1487                  * first one it's not worth caching anything */
1488                 if (array == first)
1489                         return;
1490
1491                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1492                         ci = ordered_hashmap_steal_first(h);
1493                         assert(ci);
1494                 } else {
1495                         ci = new(ChainCacheItem, 1);
1496                         if (!ci)
1497                                 return;
1498                 }
1499
1500                 ci->first = first;
1501
1502                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1503                         free(ci);
1504                         return;
1505                 }
1506         } else
1507                 assert(ci->first == first);
1508
1509         ci->array = array;
1510         ci->begin = begin;
1511         ci->total = total;
1512         ci->last_index = last_index;
1513 }
1514
1515 static int generic_array_get(
1516                 JournalFile *f,
1517                 uint64_t first,
1518                 uint64_t i,
1519                 Object **ret, uint64_t *offset) {
1520
1521         Object *o;
1522         uint64_t p = 0, a, t = 0;
1523         int r;
1524         ChainCacheItem *ci;
1525
1526         assert(f);
1527
1528         a = first;
1529
1530         /* Try the chain cache first */
1531         ci = ordered_hashmap_get(f->chain_cache, &first);
1532         if (ci && i > ci->total) {
1533                 a = ci->array;
1534                 i -= ci->total;
1535                 t = ci->total;
1536         }
1537
1538         while (a > 0) {
1539                 uint64_t k;
1540
1541                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1542                 if (r < 0)
1543                         return r;
1544
1545                 k = journal_file_entry_array_n_items(o);
1546                 if (i < k) {
1547                         p = le64toh(o->entry_array.items[i]);
1548                         goto found;
1549                 }
1550
1551                 i -= k;
1552                 t += k;
1553                 a = le64toh(o->entry_array.next_entry_array_offset);
1554         }
1555
1556         return 0;
1557
1558 found:
1559         /* Let's cache this item for the next invocation */
1560         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1561
1562         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1563         if (r < 0)
1564                 return r;
1565
1566         if (ret)
1567                 *ret = o;
1568
1569         if (offset)
1570                 *offset = p;
1571
1572         return 1;
1573 }
1574
1575 static int generic_array_get_plus_one(
1576                 JournalFile *f,
1577                 uint64_t extra,
1578                 uint64_t first,
1579                 uint64_t i,
1580                 Object **ret, uint64_t *offset) {
1581
1582         Object *o;
1583
1584         assert(f);
1585
1586         if (i == 0) {
1587                 int r;
1588
1589                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1590                 if (r < 0)
1591                         return r;
1592
1593                 if (ret)
1594                         *ret = o;
1595
1596                 if (offset)
1597                         *offset = extra;
1598
1599                 return 1;
1600         }
1601
1602         return generic_array_get(f, first, i-1, ret, offset);
1603 }
1604
/* Results of the test_object() callbacks used when bisecting */
enum {
        TEST_FOUND = 0,  /* the tested object matches the needle */
        TEST_LEFT  = 1,  /* the tested object lies left of the needle */
        TEST_RIGHT = 2   /* the tested object lies right of the needle */
};
1610
1611 static int generic_array_bisect(
1612                 JournalFile *f,
1613                 uint64_t first,
1614                 uint64_t n,
1615                 uint64_t needle,
1616                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1617                 direction_t direction,
1618                 Object **ret,
1619                 uint64_t *offset,
1620                 uint64_t *idx) {
1621
1622         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1623         bool subtract_one = false;
1624         Object *o, *array = NULL;
1625         int r;
1626         ChainCacheItem *ci;
1627
1628         assert(f);
1629         assert(test_object);
1630
1631         /* Start with the first array in the chain */
1632         a = first;
1633
1634         ci = ordered_hashmap_get(f->chain_cache, &first);
1635         if (ci && n > ci->total) {
1636                 /* Ah, we have iterated this bisection array chain
1637                  * previously! Let's see if we can skip ahead in the
1638                  * chain, as far as the last time. But we can't jump
1639                  * backwards in the chain, so let's check that
1640                  * first. */
1641
1642                 r = test_object(f, ci->begin, needle);
1643                 if (r < 0)
1644                         return r;
1645
1646                 if (r == TEST_LEFT) {
1647                         /* OK, what we are looking for is right of the
1648                          * begin of this EntryArray, so let's jump
1649                          * straight to previously cached array in the
1650                          * chain */
1651
1652                         a = ci->array;
1653                         n -= ci->total;
1654                         t = ci->total;
1655                         last_index = ci->last_index;
1656                 }
1657         }
1658
1659         while (a > 0) {
1660                 uint64_t left, right, k, lp;
1661
1662                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1663                 if (r < 0)
1664                         return r;
1665
1666                 k = journal_file_entry_array_n_items(array);
1667                 right = MIN(k, n);
1668                 if (right <= 0)
1669                         return 0;
1670
1671                 i = right - 1;
1672                 lp = p = le64toh(array->entry_array.items[i]);
1673                 if (p <= 0)
1674                         return -EBADMSG;
1675
1676                 r = test_object(f, p, needle);
1677                 if (r < 0)
1678                         return r;
1679
1680                 if (r == TEST_FOUND)
1681                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1682
1683                 if (r == TEST_RIGHT) {
1684                         left = 0;
1685                         right -= 1;
1686
1687                         if (last_index != (uint64_t) -1) {
1688                                 assert(last_index <= right);
1689
1690                                 /* If we cached the last index we
1691                                  * looked at, let's try to not to jump
1692                                  * too wildly around and see if we can
1693                                  * limit the range to look at early to
1694                                  * the immediate neighbors of the last
1695                                  * index we looked at. */
1696
1697                                 if (last_index > 0) {
1698                                         uint64_t x = last_index - 1;
1699
1700                                         p = le64toh(array->entry_array.items[x]);
1701                                         if (p <= 0)
1702                                                 return -EBADMSG;
1703
1704                                         r = test_object(f, p, needle);
1705                                         if (r < 0)
1706                                                 return r;
1707
1708                                         if (r == TEST_FOUND)
1709                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1710
1711                                         if (r == TEST_RIGHT)
1712                                                 right = x;
1713                                         else
1714                                                 left = x + 1;
1715                                 }
1716
1717                                 if (last_index < right) {
1718                                         uint64_t y = last_index + 1;
1719
1720                                         p = le64toh(array->entry_array.items[y]);
1721                                         if (p <= 0)
1722                                                 return -EBADMSG;
1723
1724                                         r = test_object(f, p, needle);
1725                                         if (r < 0)
1726                                                 return r;
1727
1728                                         if (r == TEST_FOUND)
1729                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1730
1731                                         if (r == TEST_RIGHT)
1732                                                 right = y;
1733                                         else
1734                                                 left = y + 1;
1735                                 }
1736                         }
1737
1738                         for (;;) {
1739                                 if (left == right) {
1740                                         if (direction == DIRECTION_UP)
1741                                                 subtract_one = true;
1742
1743                                         i = left;
1744                                         goto found;
1745                                 }
1746
1747                                 assert(left < right);
1748                                 i = (left + right) / 2;
1749
1750                                 p = le64toh(array->entry_array.items[i]);
1751                                 if (p <= 0)
1752                                         return -EBADMSG;
1753
1754                                 r = test_object(f, p, needle);
1755                                 if (r < 0)
1756                                         return r;
1757
1758                                 if (r == TEST_FOUND)
1759                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1760
1761                                 if (r == TEST_RIGHT)
1762                                         right = i;
1763                                 else
1764                                         left = i + 1;
1765                         }
1766                 }
1767
1768                 if (k >= n) {
1769                         if (direction == DIRECTION_UP) {
1770                                 i = n;
1771                                 subtract_one = true;
1772                                 goto found;
1773                         }
1774
1775                         return 0;
1776                 }
1777
1778                 last_p = lp;
1779
1780                 n -= k;
1781                 t += k;
1782                 last_index = (uint64_t) -1;
1783                 a = le64toh(array->entry_array.next_entry_array_offset);
1784         }
1785
1786         return 0;
1787
1788 found:
1789         if (subtract_one && t == 0 && i == 0)
1790                 return 0;
1791
1792         /* Let's cache this item for the next invocation */
1793         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1794
1795         if (subtract_one && i == 0)
1796                 p = last_p;
1797         else if (subtract_one)
1798                 p = le64toh(array->entry_array.items[i-1]);
1799         else
1800                 p = le64toh(array->entry_array.items[i]);
1801
1802         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1803         if (r < 0)
1804                 return r;
1805
1806         if (ret)
1807                 *ret = o;
1808
1809         if (offset)
1810                 *offset = p;
1811
1812         if (idx)
1813                 *idx = t + i + (subtract_one ? -1 : 0);
1814
1815         return 1;
1816 }
1817
1818 static int generic_array_bisect_plus_one(
1819                 JournalFile *f,
1820                 uint64_t extra,
1821                 uint64_t first,
1822                 uint64_t n,
1823                 uint64_t needle,
1824                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1825                 direction_t direction,
1826                 Object **ret,
1827                 uint64_t *offset,
1828                 uint64_t *idx) {
1829
1830         int r;
1831         bool step_back = false;
1832         Object *o;
1833
1834         assert(f);
1835         assert(test_object);
1836
1837         if (n <= 0)
1838                 return 0;
1839
1840         /* This bisects the array in object 'first', but first checks
1841          * an extra  */
1842         r = test_object(f, extra, needle);
1843         if (r < 0)
1844                 return r;
1845
1846         if (r == TEST_FOUND)
1847                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1848
1849         /* if we are looking with DIRECTION_UP then we need to first
1850            see if in the actual array there is a matching entry, and
1851            return the last one of that. But if there isn't any we need
1852            to return this one. Hence remember this, and return it
1853            below. */
1854         if (r == TEST_LEFT)
1855                 step_back = direction == DIRECTION_UP;
1856
1857         if (r == TEST_RIGHT) {
1858                 if (direction == DIRECTION_DOWN)
1859                         goto found;
1860                 else
1861                         return 0;
1862         }
1863
1864         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1865
1866         if (r == 0 && step_back)
1867                 goto found;
1868
1869         if (r > 0 && idx)
1870                 (*idx) ++;
1871
1872         return r;
1873
1874 found:
1875         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1876         if (r < 0)
1877                 return r;
1878
1879         if (ret)
1880                 *ret = o;
1881
1882         if (offset)
1883                 *offset = extra;
1884
1885         if (idx)
1886                 *idx = 0;
1887
1888         return 1;
1889 }
1890
1891 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1892         assert(f);
1893         assert(p > 0);
1894
1895         if (p == needle)
1896                 return TEST_FOUND;
1897         else if (p < needle)
1898                 return TEST_LEFT;
1899         else
1900                 return TEST_RIGHT;
1901 }
1902
1903 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1904         Object *o;
1905         int r;
1906
1907         assert(f);
1908         assert(p > 0);
1909
1910         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1911         if (r < 0)
1912                 return r;
1913
1914         if (le64toh(o->entry.seqnum) == needle)
1915                 return TEST_FOUND;
1916         else if (le64toh(o->entry.seqnum) < needle)
1917                 return TEST_LEFT;
1918         else
1919                 return TEST_RIGHT;
1920 }
1921
1922 int journal_file_move_to_entry_by_seqnum(
1923                 JournalFile *f,
1924                 uint64_t seqnum,
1925                 direction_t direction,
1926                 Object **ret,
1927                 uint64_t *offset) {
1928
1929         return generic_array_bisect(f,
1930                                     le64toh(f->header->entry_array_offset),
1931                                     le64toh(f->header->n_entries),
1932                                     seqnum,
1933                                     test_object_seqnum,
1934                                     direction,
1935                                     ret, offset, NULL);
1936 }
1937
1938 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1939         Object *o;
1940         int r;
1941
1942         assert(f);
1943         assert(p > 0);
1944
1945         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1946         if (r < 0)
1947                 return r;
1948
1949         if (le64toh(o->entry.realtime) == needle)
1950                 return TEST_FOUND;
1951         else if (le64toh(o->entry.realtime) < needle)
1952                 return TEST_LEFT;
1953         else
1954                 return TEST_RIGHT;
1955 }
1956
1957 int journal_file_move_to_entry_by_realtime(
1958                 JournalFile *f,
1959                 uint64_t realtime,
1960                 direction_t direction,
1961                 Object **ret,
1962                 uint64_t *offset) {
1963
1964         return generic_array_bisect(f,
1965                                     le64toh(f->header->entry_array_offset),
1966                                     le64toh(f->header->n_entries),
1967                                     realtime,
1968                                     test_object_realtime,
1969                                     direction,
1970                                     ret, offset, NULL);
1971 }
1972
1973 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1974         Object *o;
1975         int r;
1976
1977         assert(f);
1978         assert(p > 0);
1979
1980         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1981         if (r < 0)
1982                 return r;
1983
1984         if (le64toh(o->entry.monotonic) == needle)
1985                 return TEST_FOUND;
1986         else if (le64toh(o->entry.monotonic) < needle)
1987                 return TEST_LEFT;
1988         else
1989                 return TEST_RIGHT;
1990 }
1991
1992 static int find_data_object_by_boot_id(
1993                 JournalFile *f,
1994                 sd_id128_t boot_id,
1995                 Object **o,
1996                 uint64_t *b) {
1997
1998         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1999
2000         sd_id128_to_string(boot_id, t + 9);
2001         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2002 }
2003
2004 int journal_file_move_to_entry_by_monotonic(
2005                 JournalFile *f,
2006                 sd_id128_t boot_id,
2007                 uint64_t monotonic,
2008                 direction_t direction,
2009                 Object **ret,
2010                 uint64_t *offset) {
2011
2012         Object *o;
2013         int r;
2014
2015         assert(f);
2016
2017         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2018         if (r < 0)
2019                 return r;
2020         if (r == 0)
2021                 return -ENOENT;
2022
2023         return generic_array_bisect_plus_one(f,
2024                                              le64toh(o->data.entry_offset),
2025                                              le64toh(o->data.entry_array_offset),
2026                                              le64toh(o->data.n_entries),
2027                                              monotonic,
2028                                              test_object_monotonic,
2029                                              direction,
2030                                              ret, offset, NULL);
2031 }
2032
2033 void journal_file_reset_location(JournalFile *f) {
2034         f->location_type = LOCATION_HEAD;
2035         f->current_offset = 0;
2036         f->current_seqnum = 0;
2037         f->current_realtime = 0;
2038         f->current_monotonic = 0;
2039         zero(f->current_boot_id);
2040         f->current_xor_hash = 0;
2041 }
2042
2043 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2044         f->location_type = LOCATION_SEEK;
2045         f->current_offset = offset;
2046         f->current_seqnum = le64toh(o->entry.seqnum);
2047         f->current_realtime = le64toh(o->entry.realtime);
2048         f->current_monotonic = le64toh(o->entry.monotonic);
2049         f->current_boot_id = o->entry.boot_id;
2050         f->current_xor_hash = le64toh(o->entry.xor_hash);
2051 }
2052
2053 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2054         assert(af);
2055         assert(bf);
2056         assert(af->location_type == LOCATION_SEEK);
2057         assert(bf->location_type == LOCATION_SEEK);
2058
2059         /* If contents and timestamps match, these entries are
2060          * identical, even if the seqnum does not match */
2061         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2062             af->current_monotonic == bf->current_monotonic &&
2063             af->current_realtime == bf->current_realtime &&
2064             af->current_xor_hash == bf->current_xor_hash)
2065                 return 0;
2066
2067         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2068
2069                 /* If this is from the same seqnum source, compare
2070                  * seqnums */
2071                 if (af->current_seqnum < bf->current_seqnum)
2072                         return -1;
2073                 if (af->current_seqnum > bf->current_seqnum)
2074                         return 1;
2075
2076                 /* Wow! This is weird, different data but the same
2077                  * seqnums? Something is borked, but let's make the
2078                  * best of it and compare by time. */
2079         }
2080
2081         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2082
2083                 /* If the boot id matches, compare monotonic time */
2084                 if (af->current_monotonic < bf->current_monotonic)
2085                         return -1;
2086                 if (af->current_monotonic > bf->current_monotonic)
2087                         return 1;
2088         }
2089
2090         /* Otherwise, compare UTC time */
2091         if (af->current_realtime < bf->current_realtime)
2092                 return -1;
2093         if (af->current_realtime > bf->current_realtime)
2094                 return 1;
2095
2096         /* Finally, compare by contents */
2097         if (af->current_xor_hash < bf->current_xor_hash)
2098                 return -1;
2099         if (af->current_xor_hash > bf->current_xor_hash)
2100                 return 1;
2101
2102         return 0;
2103 }
2104
2105 int journal_file_next_entry(
2106                 JournalFile *f,
2107                 uint64_t p,
2108                 direction_t direction,
2109                 Object **ret, uint64_t *offset) {
2110
2111         uint64_t i, n, ofs;
2112         int r;
2113
2114         assert(f);
2115
2116         n = le64toh(f->header->n_entries);
2117         if (n <= 0)
2118                 return 0;
2119
2120         if (p == 0)
2121                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2122         else {
2123                 r = generic_array_bisect(f,
2124                                          le64toh(f->header->entry_array_offset),
2125                                          le64toh(f->header->n_entries),
2126                                          p,
2127                                          test_object_offset,
2128                                          DIRECTION_DOWN,
2129                                          NULL, NULL,
2130                                          &i);
2131                 if (r <= 0)
2132                         return r;
2133
2134                 if (direction == DIRECTION_DOWN) {
2135                         if (i >= n - 1)
2136                                 return 0;
2137
2138                         i++;
2139                 } else {
2140                         if (i <= 0)
2141                                 return 0;
2142
2143                         i--;
2144                 }
2145         }
2146
2147         /* And jump to it */
2148         r = generic_array_get(f,
2149                               le64toh(f->header->entry_array_offset),
2150                               i,
2151                               ret, &ofs);
2152         if (r <= 0)
2153                 return r;
2154
2155         if (p > 0 &&
2156             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2157                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2158                           f->path, i);
2159                 return -EBADMSG;
2160         }
2161
2162         if (offset)
2163                 *offset = ofs;
2164
2165         return 1;
2166 }
2167
2168 int journal_file_next_entry_for_data(
2169                 JournalFile *f,
2170                 Object *o, uint64_t p,
2171                 uint64_t data_offset,
2172                 direction_t direction,
2173                 Object **ret, uint64_t *offset) {
2174
2175         uint64_t n, i;
2176         int r;
2177         Object *d;
2178
2179         assert(f);
2180         assert(p > 0 || !o);
2181
2182         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2183         if (r < 0)
2184                 return r;
2185
2186         n = le64toh(d->data.n_entries);
2187         if (n <= 0)
2188                 return n;
2189
2190         if (!o)
2191                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2192         else {
2193                 if (o->object.type != OBJECT_ENTRY)
2194                         return -EINVAL;
2195
2196                 r = generic_array_bisect_plus_one(f,
2197                                                   le64toh(d->data.entry_offset),
2198                                                   le64toh(d->data.entry_array_offset),
2199                                                   le64toh(d->data.n_entries),
2200                                                   p,
2201                                                   test_object_offset,
2202                                                   DIRECTION_DOWN,
2203                                                   NULL, NULL,
2204                                                   &i);
2205
2206                 if (r <= 0)
2207                         return r;
2208
2209                 if (direction == DIRECTION_DOWN) {
2210                         if (i >= n - 1)
2211                                 return 0;
2212
2213                         i++;
2214                 } else {
2215                         if (i <= 0)
2216                                 return 0;
2217
2218                         i--;
2219                 }
2220
2221         }
2222
2223         return generic_array_get_plus_one(f,
2224                                           le64toh(d->data.entry_offset),
2225                                           le64toh(d->data.entry_array_offset),
2226                                           i,
2227                                           ret, offset);
2228 }
2229
2230 int journal_file_move_to_entry_by_offset_for_data(
2231                 JournalFile *f,
2232                 uint64_t data_offset,
2233                 uint64_t p,
2234                 direction_t direction,
2235                 Object **ret, uint64_t *offset) {
2236
2237         int r;
2238         Object *d;
2239
2240         assert(f);
2241
2242         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2243         if (r < 0)
2244                 return r;
2245
2246         return generic_array_bisect_plus_one(f,
2247                                              le64toh(d->data.entry_offset),
2248                                              le64toh(d->data.entry_array_offset),
2249                                              le64toh(d->data.n_entries),
2250                                              p,
2251                                              test_object_offset,
2252                                              direction,
2253                                              ret, offset, NULL);
2254 }
2255
2256 int journal_file_move_to_entry_by_monotonic_for_data(
2257                 JournalFile *f,
2258                 uint64_t data_offset,
2259                 sd_id128_t boot_id,
2260                 uint64_t monotonic,
2261                 direction_t direction,
2262                 Object **ret, uint64_t *offset) {
2263
2264         Object *o, *d;
2265         int r;
2266         uint64_t b, z;
2267
2268         assert(f);
2269
2270         /* First, seek by time */
2271         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2272         if (r < 0)
2273                 return r;
2274         if (r == 0)
2275                 return -ENOENT;
2276
2277         r = generic_array_bisect_plus_one(f,
2278                                           le64toh(o->data.entry_offset),
2279                                           le64toh(o->data.entry_array_offset),
2280                                           le64toh(o->data.n_entries),
2281                                           monotonic,
2282                                           test_object_monotonic,
2283                                           direction,
2284                                           NULL, &z, NULL);
2285         if (r <= 0)
2286                 return r;
2287
2288         /* And now, continue seeking until we find an entry that
2289          * exists in both bisection arrays */
2290
2291         for (;;) {
2292                 Object *qo;
2293                 uint64_t p, q;
2294
2295                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2296                 if (r < 0)
2297                         return r;
2298
2299                 r = generic_array_bisect_plus_one(f,
2300                                                   le64toh(d->data.entry_offset),
2301                                                   le64toh(d->data.entry_array_offset),
2302                                                   le64toh(d->data.n_entries),
2303                                                   z,
2304                                                   test_object_offset,
2305                                                   direction,
2306                                                   NULL, &p, NULL);
2307                 if (r <= 0)
2308                         return r;
2309
2310                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2311                 if (r < 0)
2312                         return r;
2313
2314                 r = generic_array_bisect_plus_one(f,
2315                                                   le64toh(o->data.entry_offset),
2316                                                   le64toh(o->data.entry_array_offset),
2317                                                   le64toh(o->data.n_entries),
2318                                                   p,
2319                                                   test_object_offset,
2320                                                   direction,
2321                                                   &qo, &q, NULL);
2322
2323                 if (r <= 0)
2324                         return r;
2325
2326                 if (p == q) {
2327                         if (ret)
2328                                 *ret = qo;
2329                         if (offset)
2330                                 *offset = q;
2331
2332                         return 1;
2333                 }
2334
2335                 z = q;
2336         }
2337 }
2338
2339 int journal_file_move_to_entry_by_seqnum_for_data(
2340                 JournalFile *f,
2341                 uint64_t data_offset,
2342                 uint64_t seqnum,
2343                 direction_t direction,
2344                 Object **ret, uint64_t *offset) {
2345
2346         Object *d;
2347         int r;
2348
2349         assert(f);
2350
2351         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2352         if (r < 0)
2353                 return r;
2354
2355         return generic_array_bisect_plus_one(f,
2356                                              le64toh(d->data.entry_offset),
2357                                              le64toh(d->data.entry_array_offset),
2358                                              le64toh(d->data.n_entries),
2359                                              seqnum,
2360                                              test_object_seqnum,
2361                                              direction,
2362                                              ret, offset, NULL);
2363 }
2364
2365 int journal_file_move_to_entry_by_realtime_for_data(
2366                 JournalFile *f,
2367                 uint64_t data_offset,
2368                 uint64_t realtime,
2369                 direction_t direction,
2370                 Object **ret, uint64_t *offset) {
2371
2372         Object *d;
2373         int r;
2374
2375         assert(f);
2376
2377         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2378         if (r < 0)
2379                 return r;
2380
2381         return generic_array_bisect_plus_one(f,
2382                                              le64toh(d->data.entry_offset),
2383                                              le64toh(d->data.entry_array_offset),
2384                                              le64toh(d->data.n_entries),
2385                                              realtime,
2386                                              test_object_realtime,
2387                                              direction,
2388                                              ret, offset, NULL);
2389 }
2390
2391 void journal_file_dump(JournalFile *f) {
2392         Object *o;
2393         int r;
2394         uint64_t p;
2395
2396         assert(f);
2397
2398         journal_file_print_header(f);
2399
2400         p = le64toh(f->header->header_size);
2401         while (p != 0) {
2402                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2403                 if (r < 0)
2404                         goto fail;
2405
2406                 switch (o->object.type) {
2407
2408                 case OBJECT_UNUSED:
2409                         printf("Type: OBJECT_UNUSED\n");
2410                         break;
2411
2412                 case OBJECT_DATA:
2413                         printf("Type: OBJECT_DATA\n");
2414                         break;
2415
2416                 case OBJECT_FIELD:
2417                         printf("Type: OBJECT_FIELD\n");
2418                         break;
2419
2420                 case OBJECT_ENTRY:
2421                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2422                                le64toh(o->entry.seqnum),
2423                                le64toh(o->entry.monotonic),
2424                                le64toh(o->entry.realtime));
2425                         break;
2426
2427                 case OBJECT_FIELD_HASH_TABLE:
2428                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2429                         break;
2430
2431                 case OBJECT_DATA_HASH_TABLE:
2432                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2433                         break;
2434
2435                 case OBJECT_ENTRY_ARRAY:
2436                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2437                         break;
2438
2439                 case OBJECT_TAG:
2440                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2441                                le64toh(o->tag.seqnum),
2442                                le64toh(o->tag.epoch));
2443                         break;
2444
2445                 default:
2446                         printf("Type: unknown (%i)\n", o->object.type);
2447                         break;
2448                 }
2449
2450                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2451                         printf("Flags: %s\n",
2452                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2453
2454                 if (p == le64toh(f->header->tail_object_offset))
2455                         p = 0;
2456                 else
2457                         p = p + ALIGN64(le64toh(o->object.size));
2458         }
2459
2460         return;
2461 fail:
2462         log_error("File corrupt");
2463 }
2464
2465 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2466         const char *x;
2467
2468         x = format_timestamp(buf, l, t);
2469         if (x)
2470                 return x;
2471         return " --- ";
2472 }
2473
/* Prints a human readable summary of the journal file header to stdout:
 * path, the various IDs, state, flags, sizes, head/tail sequence numbers
 * and timestamps, and object counters. Counters that not every header
 * carries are only printed if JOURNAL_HEADER_CONTAINS() says the field
 * is there. The last line shows the disk usage as derived from fstat(),
 * and is skipped if fstat() fails. */
void journal_file_print_header(JournalFile *f) {
        /* Scratch buffers for the four IDs printed below (presumably 32
         * hex digits plus NUL each), for the formatted timestamps and
         * for the formatted disk usage. */
        char a[33], b[33], c[33], d[33];
        char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
        struct stat st;
        char bytes[FORMAT_BYTES_MAX];

        assert(f);

        /* Note that the arguments below have to be kept in the very
         * same order as the conversion specifiers in the format
         * string. */
        printf("File Path: %s\n"
               "File ID: %s\n"
               "Machine ID: %s\n"
               "Boot ID: %s\n"
               "Sequential Number ID: %s\n"
               "State: %s\n"
               "Compatible Flags:%s%s\n"
               "Incompatible Flags:%s%s%s\n"
               "Header size: %"PRIu64"\n"
               "Arena size: %"PRIu64"\n"
               "Data Hash Table Size: %"PRIu64"\n"
               "Field Hash Table Size: %"PRIu64"\n"
               "Rotate Suggested: %s\n"
               "Head Sequential Number: %"PRIu64"\n"
               "Tail Sequential Number: %"PRIu64"\n"
               "Head Realtime Timestamp: %s\n"
               "Tail Realtime Timestamp: %s\n"
               "Tail Monotonic Timestamp: %s\n"
               "Objects: %"PRIu64"\n"
               "Entry Objects: %"PRIu64"\n",
               f->path,
               sd_id128_to_string(f->header->file_id, a),
               sd_id128_to_string(f->header->machine_id, b),
               sd_id128_to_string(f->header->boot_id, c),
               sd_id128_to_string(f->header->seqnum_id, d),
               f->header->state == STATE_OFFLINE ? "OFFLINE" :
               f->header->state == STATE_ONLINE ? "ONLINE" :
               f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
               /* Compatible flags: known ones by name, " ???" if any
                * bit outside of HEADER_COMPATIBLE_ANY is set */
               JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
               (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
               /* Incompatible flags: same scheme, checked against
                * HEADER_INCOMPATIBLE_ANY */
               JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
               JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
               (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
               le64toh(f->header->header_size),
               le64toh(f->header->arena_size),
               /* The header stores the hash table sizes in bytes, we
                * show them as number of HashItem slots */
               le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
               le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
               yes_no(journal_file_rotate_suggested(f, 0)),
               le64toh(f->header->head_entry_seqnum),
               le64toh(f->header->tail_entry_seqnum),
               format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
               format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
               format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
               le64toh(f->header->n_objects),
               le64toh(f->header->n_entries));

        /* The fill level is the number of objects relative to the
         * number of slots in the respective hash table, in percent */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                printf("Data Objects: %"PRIu64"\n"
                       "Data Hash Table Fill: %.1f%%\n",
                       le64toh(f->header->n_data),
                       100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                printf("Field Objects: %"PRIu64"\n"
                       "Field Hash Table Fill: %.1f%%\n",
                       le64toh(f->header->n_fields),
                       100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
                printf("Tag Objects: %"PRIu64"\n",
                       le64toh(f->header->n_tags));
        if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                printf("Entry Array Objects: %"PRIu64"\n",
                       le64toh(f->header->n_entry_arrays));

        /* st_blocks is multiplied by 512 to turn it into bytes. If
         * fstat() fails the line is simply left out. */
        if (fstat(f->fd, &st) >= 0)
                printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
}
2550
2551 static int journal_file_warn_btrfs(JournalFile *f) {
2552         unsigned attrs;
2553         int r;
2554
2555         assert(f);
2556
2557         /* Before we write anything, check if the COW logic is turned
2558          * off on btrfs. Given our write pattern that is quite
2559          * unfriendly to COW file systems this should greatly improve
2560          * performance on COW file systems, such as btrfs, at the
2561          * expense of data integrity features (which shouldn't be too
2562          * bad, given that we do our own checksumming). */
2563
2564         r = btrfs_is_filesystem(f->fd);
2565         if (r < 0)
2566                 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2567         if (!r)
2568                 return 0;
2569
2570         r = read_attr_fd(f->fd, &attrs);
2571         if (r < 0)
2572                 return log_warning_errno(r, "Failed to read file attributes: %m");
2573
2574         if (attrs & FS_NOCOW_FL) {
2575                 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2576                 return 0;
2577         }
2578
2579         log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2580                    "This is likely to slow down journal access substantially, please consider turning "
2581                    "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2582
2583         return 1;
2584 }
2585
2586 int journal_file_open(
2587                 const char *fname,
2588                 int flags,
2589                 mode_t mode,
2590                 bool compress,
2591                 bool seal,
2592                 JournalMetrics *metrics,
2593                 MMapCache *mmap_cache,
2594                 JournalFile *template,
2595                 JournalFile **ret) {
2596
2597         bool newly_created = false;
2598         JournalFile *f;
2599         void *h;
2600         int r;
2601
2602         assert(fname);
2603         assert(ret);
2604
2605         if ((flags & O_ACCMODE) != O_RDONLY &&
2606             (flags & O_ACCMODE) != O_RDWR)
2607                 return -EINVAL;
2608
2609         if (!endswith(fname, ".journal") &&
2610             !endswith(fname, ".journal~"))
2611                 return -EINVAL;
2612
2613         f = new0(JournalFile, 1);
2614         if (!f)
2615                 return -ENOMEM;
2616
2617         f->fd = -1;
2618         f->mode = mode;
2619
2620         f->flags = flags;
2621         f->prot = prot_from_flags(flags);
2622         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2623 #if defined(HAVE_LZ4)
2624         f->compress_lz4 = compress;
2625 #elif defined(HAVE_XZ)
2626         f->compress_xz = compress;
2627 #endif
2628 #ifdef HAVE_GCRYPT
2629         f->seal = seal;
2630 #endif
2631
2632         if (mmap_cache)
2633                 f->mmap = mmap_cache_ref(mmap_cache);
2634         else {
2635                 f->mmap = mmap_cache_new();
2636                 if (!f->mmap) {
2637                         r = -ENOMEM;
2638                         goto fail;
2639                 }
2640         }
2641
2642         f->path = strdup(fname);
2643         if (!f->path) {
2644                 r = -ENOMEM;
2645                 goto fail;
2646         }
2647
2648         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2649         if (!f->chain_cache) {
2650                 r = -ENOMEM;
2651                 goto fail;
2652         }
2653
2654         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2655         if (f->fd < 0) {
2656                 r = -errno;
2657                 goto fail;
2658         }
2659
2660         r = journal_file_fstat(f);
2661         if (r < 0)
2662                 goto fail;
2663
2664         if (f->last_stat.st_size == 0 && f->writable) {
2665
2666                 (void) journal_file_warn_btrfs(f);
2667
2668                 /* Let's attach the creation time to the journal file,
2669                  * so that the vacuuming code knows the age of this
2670                  * file even if the file might end up corrupted one
2671                  * day... Ideally we'd just use the creation time many
2672                  * file systems maintain for each file, but there is
2673                  * currently no usable API to query this, hence let's
2674                  * emulate this via extended attributes. If extended
2675                  * attributes are not supported we'll just skip this,
2676                  * and rely solely on mtime/atime/ctime of the file. */
2677
2678                 fd_setcrtime(f->fd, 0);
2679
2680 #ifdef HAVE_GCRYPT
2681                 /* Try to load the FSPRG state, and if we can't, then
2682                  * just don't do sealing */
2683                 if (f->seal) {
2684                         r = journal_file_fss_load(f);
2685                         if (r < 0)
2686                                 f->seal = false;
2687                 }
2688 #endif
2689
2690                 r = journal_file_init_header(f, template);
2691                 if (r < 0)
2692                         goto fail;
2693
2694                 r = journal_file_fstat(f);
2695                 if (r < 0)
2696                         goto fail;
2697
2698                 newly_created = true;
2699         }
2700
2701         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2702                 r = -EIO;
2703                 goto fail;
2704         }
2705
2706         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2707         if (r < 0)
2708                 goto fail;
2709
2710         f->header = h;
2711
2712         if (!newly_created) {
2713                 r = journal_file_verify_header(f);
2714                 if (r < 0)
2715                         goto fail;
2716         }
2717
2718 #ifdef HAVE_GCRYPT
2719         if (!newly_created && f->writable) {
2720                 r = journal_file_fss_load(f);
2721                 if (r < 0)
2722                         goto fail;
2723         }
2724 #endif
2725
2726         if (f->writable) {
2727                 if (metrics) {
2728                         journal_default_metrics(metrics, f->fd);
2729                         f->metrics = *metrics;
2730                 } else if (template)
2731                         f->metrics = template->metrics;
2732
2733                 r = journal_file_refresh_header(f);
2734                 if (r < 0)
2735                         goto fail;
2736         }
2737
2738 #ifdef HAVE_GCRYPT
2739         r = journal_file_hmac_setup(f);
2740         if (r < 0)
2741                 goto fail;
2742 #endif
2743
2744         if (newly_created) {
2745                 r = journal_file_setup_field_hash_table(f);
2746                 if (r < 0)
2747                         goto fail;
2748
2749                 r = journal_file_setup_data_hash_table(f);
2750                 if (r < 0)
2751                         goto fail;
2752
2753 #ifdef HAVE_GCRYPT
2754                 r = journal_file_append_first_tag(f);
2755                 if (r < 0)
2756                         goto fail;
2757 #endif
2758         }
2759
2760         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2761                 r = -EIO;
2762                 goto fail;
2763         }
2764
2765         *ret = f;
2766         return 0;
2767
2768 fail:
2769         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2770                 r = -EIO;
2771
2772         journal_file_close(f);
2773
2774         return r;
2775 }
2776
2777 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2778         _cleanup_free_ char *p = NULL;
2779         size_t l;
2780         JournalFile *old_file, *new_file = NULL;
2781         int r;
2782
2783         assert(f);
2784         assert(*f);
2785
2786         old_file = *f;
2787
2788         if (!old_file->writable)
2789                 return -EINVAL;
2790
2791         if (!endswith(old_file->path, ".journal"))
2792                 return -EINVAL;
2793
2794         l = strlen(old_file->path);
2795         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2796                      (int) l - 8, old_file->path,
2797                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2798                      le64toh((*f)->header->head_entry_seqnum),
2799                      le64toh((*f)->header->head_entry_realtime));
2800         if (r < 0)
2801                 return -ENOMEM;
2802
2803         /* Try to rename the file to the archived version. If the file
2804          * already was deleted, we'll get ENOENT, let's ignore that
2805          * case. */
2806         r = rename(old_file->path, p);
2807         if (r < 0 && errno != ENOENT)
2808                 return -errno;
2809
2810         old_file->header->state = STATE_ARCHIVED;
2811
2812         /* Currently, btrfs is not very good with out write patterns
2813          * and fragments heavily. Let's defrag our journal files when
2814          * we archive them */
2815         old_file->defrag_on_close = true;
2816
2817         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2818         journal_file_close(old_file);
2819
2820         *f = new_file;
2821         return r;
2822 }
2823
2824 int journal_file_open_reliably(
2825                 const char *fname,
2826                 int flags,
2827                 mode_t mode,
2828                 bool compress,
2829                 bool seal,
2830                 JournalMetrics *metrics,
2831                 MMapCache *mmap_cache,
2832                 JournalFile *template,
2833                 JournalFile **ret) {
2834
2835         int r;
2836         size_t l;
2837         _cleanup_free_ char *p = NULL;
2838
2839         r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2840         if (!IN_SET(r,
2841                     -EBADMSG,           /* corrupted */
2842                     -ENODATA,           /* truncated */
2843                     -EHOSTDOWN,         /* other machine */
2844                     -EPROTONOSUPPORT,   /* incompatible feature */
2845                     -EBUSY,             /* unclean shutdown */
2846                     -ESHUTDOWN,         /* already archived */
2847                     -EIO,               /* IO error, including SIGBUS on mmap */
2848                     -EIDRM              /* File has been deleted */))
2849                 return r;
2850
2851         if ((flags & O_ACCMODE) == O_RDONLY)
2852                 return r;
2853
2854         if (!(flags & O_CREAT))
2855                 return r;
2856
2857         if (!endswith(fname, ".journal"))
2858                 return r;
2859
2860         /* The file is corrupted. Rotate it away and try it again (but only once) */
2861
2862         l = strlen(fname);
2863         if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2864                      (int) l - 8, fname,
2865                      now(CLOCK_REALTIME),
2866                      random_u64()) < 0)
2867                 return -ENOMEM;
2868
2869         if (rename(fname, p) < 0)
2870                 return -errno;
2871
2872         /* btrfs doesn't cope well with our write pattern and
2873          * fragments heavily. Let's defrag all files we rotate */
2874
2875         (void) chattr_path(p, false, FS_NOCOW_FL);
2876         (void) btrfs_defrag(p);
2877
2878         log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2879
2880         return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2881 }
2882
2883 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2884         uint64_t i, n;
2885         uint64_t q, xor_hash = 0;
2886         int r;
2887         EntryItem *items;
2888         dual_timestamp ts;
2889
2890         assert(from);
2891         assert(to);
2892         assert(o);
2893         assert(p);
2894
2895         if (!to->writable)
2896                 return -EPERM;
2897
2898         ts.monotonic = le64toh(o->entry.monotonic);
2899         ts.realtime = le64toh(o->entry.realtime);
2900
2901         n = journal_file_entry_n_items(o);
2902         /* alloca() can't take 0, hence let's allocate at least one */
2903         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2904
2905         for (i = 0; i < n; i++) {
2906                 uint64_t l, h;
2907                 le64_t le_hash;
2908                 size_t t;
2909                 void *data;
2910                 Object *u;
2911
2912                 q = le64toh(o->entry.items[i].object_offset);
2913                 le_hash = o->entry.items[i].hash;
2914
2915                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2916                 if (r < 0)
2917                         return r;
2918
2919                 if (le_hash != o->data.hash)
2920                         return -EBADMSG;
2921
2922                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2923                 t = (size_t) l;
2924
2925                 /* We hit the limit on 32bit machines */
2926                 if ((uint64_t) t != l)
2927                         return -E2BIG;
2928
2929                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2930 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2931                         size_t rsize = 0;
2932
2933                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2934                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2935                         if (r < 0)
2936                                 return r;
2937
2938                         data = from->compress_buffer;
2939                         l = rsize;
2940 #else
2941                         return -EPROTONOSUPPORT;
2942 #endif
2943                 } else
2944                         data = o->data.payload;
2945
2946                 r = journal_file_append_data(to, data, l, &u, &h);
2947                 if (r < 0)
2948                         return r;
2949
2950                 xor_hash ^= le64toh(u->data.hash);
2951                 items[i].object_offset = htole64(h);
2952                 items[i].hash = u->data.hash;
2953
2954                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2955                 if (r < 0)
2956                         return r;
2957         }
2958
2959         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2960
2961         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2962                 return -EIO;
2963
2964         return r;
2965 }
2966
2967 void journal_reset_metrics(JournalMetrics *m) {
2968         assert(m);
2969
2970         /* Set everything to "pick automatic values". */
2971
2972         *m = (JournalMetrics) {
2973                 .min_use = (uint64_t) -1,
2974                 .max_use = (uint64_t) -1,
2975                 .min_size = (uint64_t) -1,
2976                 .max_size = (uint64_t) -1,
2977                 .keep_free = (uint64_t) -1,
2978                 .n_max_files = (uint64_t) -1,
2979         };
2980 }
2981
2982 void journal_default_metrics(JournalMetrics *m, int fd) {
2983         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
2984         struct statvfs ss;
2985         uint64_t fs_size;
2986
2987         assert(m);
2988         assert(fd >= 0);
2989
2990         if (fstatvfs(fd, &ss) >= 0)
2991                 fs_size = ss.f_frsize * ss.f_blocks;
2992         else {
2993                 log_debug_errno(errno, "Failed to detremine disk size: %m");
2994                 fs_size = 0;
2995         }
2996
2997         if (m->max_use == (uint64_t) -1) {
2998
2999                 if (fs_size > 0) {
3000                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3001
3002                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
3003                                 m->max_use = DEFAULT_MAX_USE_UPPER;
3004
3005                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
3006                                 m->max_use = DEFAULT_MAX_USE_LOWER;
3007                 } else
3008                         m->max_use = DEFAULT_MAX_USE_LOWER;
3009         } else {
3010                 m->max_use = PAGE_ALIGN(m->max_use);
3011
3012                 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3013                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3014         }
3015
3016         if (m->min_use == (uint64_t) -1)
3017                 m->min_use = DEFAULT_MIN_USE;
3018
3019         if (m->min_use > m->max_use)
3020                 m->min_use = m->max_use;
3021
3022         if (m->max_size == (uint64_t) -1) {
3023                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3024
3025                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3026                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
3027         } else
3028                 m->max_size = PAGE_ALIGN(m->max_size);
3029
3030         if (m->max_size != 0) {
3031                 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3032                         m->max_size = JOURNAL_FILE_SIZE_MIN;
3033
3034                 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3035                         m->max_use = m->max_size*2;
3036         }
3037
3038         if (m->min_size == (uint64_t) -1)
3039                 m->min_size = JOURNAL_FILE_SIZE_MIN;
3040         else {
3041                 m->min_size = PAGE_ALIGN(m->min_size);
3042
3043                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3044                         m->min_size = JOURNAL_FILE_SIZE_MIN;
3045
3046                 if (m->max_size != 0 && m->min_size > m->max_size)
3047                         m->max_size = m->min_size;
3048         }
3049
3050         if (m->keep_free == (uint64_t) -1) {
3051
3052                 if (fs_size > 0) {
3053                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3054
3055                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3056                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3057
3058                 } else
3059                         m->keep_free = DEFAULT_KEEP_FREE;
3060         }
3061
3062         if (m->n_max_files == (uint64_t) -1)
3063                 m->n_max_files = DEFAULT_N_MAX_FILES;
3064
3065         log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3066                   format_bytes(a, sizeof(a), m->min_use),
3067                   format_bytes(b, sizeof(b), m->max_use),
3068                   format_bytes(c, sizeof(c), m->max_size),
3069                   format_bytes(d, sizeof(d), m->min_size),
3070                   format_bytes(e, sizeof(e), m->keep_free),
3071                   m->n_max_files);
3072 }
3073
3074 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3075         assert(f);
3076         assert(from || to);
3077
3078         if (from) {
3079                 if (f->header->head_entry_realtime == 0)
3080                         return -ENOENT;
3081
3082                 *from = le64toh(f->header->head_entry_realtime);
3083         }
3084
3085         if (to) {
3086                 if (f->header->tail_entry_realtime == 0)
3087                         return -ENOENT;
3088
3089                 *to = le64toh(f->header->tail_entry_realtime);
3090         }
3091
3092         return 1;
3093 }
3094
3095 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3096         Object *o;
3097         uint64_t p;
3098         int r;
3099
3100         assert(f);
3101         assert(from || to);
3102
3103         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3104         if (r <= 0)
3105                 return r;
3106
3107         if (le64toh(o->data.n_entries) <= 0)
3108                 return 0;
3109
3110         if (from) {
3111                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3112                 if (r < 0)
3113                         return r;
3114
3115                 *from = le64toh(o->entry.monotonic);
3116         }
3117
3118         if (to) {
3119                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3120                 if (r < 0)
3121                         return r;
3122
3123                 r = generic_array_get_plus_one(f,
3124                                                le64toh(o->data.entry_offset),
3125                                                le64toh(o->data.entry_array_offset),
3126                                                le64toh(o->data.n_entries)-1,
3127                                                &o, NULL);
3128                 if (r <= 0)
3129                         return r;
3130
3131                 *to = le64toh(o->entry.monotonic);
3132         }
3133
3134         return 1;
3135 }
3136
3137 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3138         assert(f);
3139
3140         /* If we gained new header fields we gained new features,
3141          * hence suggest a rotation */
3142         if (le64toh(f->header->header_size) < sizeof(Header)) {
3143                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3144                 return true;
3145         }
3146
3147         /* Let's check if the hash tables grew over a certain fill
3148          * level (75%, borrowing this value from Java's hash table
3149          * implementation), and if so suggest a rotation. To calculate
3150          * the fill level we need the n_data field, which only exists
3151          * in newer versions. */
3152
3153         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3154                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3155                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3156                                   f->path,
3157                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3158                                   le64toh(f->header->n_data),
3159                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3160                                   (unsigned long long) f->last_stat.st_size,
3161                                   f->last_stat.st_size / le64toh(f->header->n_data));
3162                         return true;
3163                 }
3164
3165         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3166                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3167                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3168                                   f->path,
3169                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3170                                   le64toh(f->header->n_fields),
3171                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3172                         return true;
3173                 }
3174
3175         /* Are the data objects properly indexed by field objects? */
3176         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3177             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3178             le64toh(f->header->n_data) > 0 &&
3179             le64toh(f->header->n_fields) == 0)
3180                 return true;
3181
3182         if (max_file_usec > 0) {
3183                 usec_t t, h;
3184
3185                 h = le64toh(f->header->head_entry_realtime);
3186                 t = now(CLOCK_REALTIME);
3187
3188                 if (h > 0 && t > h + max_file_usec)
3189                         return true;
3190         }
3191
3192         return false;
3193 }