src/journal/journal-file.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2011 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <errno.h>
  23 #include <fcntl.h>
  24 #include <linux/fs.h>
  25 #include <stddef.h>
  26 #include <sys/mman.h>
  27 #include <sys/statvfs.h>
  28 #include <sys/uio.h>
  29 #include <unistd.h>
  30
  31 #include "btrfs-util.h"
  32 #include "chattr-util.h"
  33 #include "compress.h"
  34 #include "fd-util.h"
  35 #include "journal-authenticate.h"
  36 #include "journal-def.h"
  37 #include "journal-file.h"
  38 #include "lookup3.h"
  39 #include "parse-util.h"
  40 #include "random-util.h"
  41 #include "string-util.h"
  42 #include "xattr-util.h"
  43
  44 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))   /* in bytes; lower bound for the data hash table */
  45 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))   /* in bytes; fixed size of the field hash table */
  46
  47 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
  48
  49 /* This is the minimum journal file size */
  50 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
  51
  52 /* These are the lower and upper bounds if we deduce the max_use value
  53  * from the file system size */
  54 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
  55 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
  56
  57 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
  58 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */
  59
  60 /* This is the upper bound if we deduce max_size from max_use */
  61 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
  62
  63 /* This is the upper bound if we deduce the keep_free value from the
  64  * file system size */
  65 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
  66
  67 /* This is the keep_free value when we can't determine the system
  68  * size */
  69 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */
  70
  71 /* This is the default maximum number of journal files to keep around. */
  72 #define DEFAULT_N_MAX_FILES (100)
  73
  74 /* n_data was the first entry we added after the initial file format design */
  75 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
  76
  77 /* How many entries to keep in the entry array chain cache at max */
  78 #define CHAIN_CACHE_MAX 20
  79
  80 /* How much to increase the journal file size at once each time we allocate something new. */
  81 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8 MiB */
  82
  83 /* Reread fstat() of the file for detecting deletions at least this often */
  84 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
  85
  86 /* The mmap context to use for the header: we pick the one just above the last defined object type */
  87 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
  88
  89 static int journal_file_set_online(JournalFile *f) {
  90         assert(f);
  91
  92         if (!f->writable)
  93                 return -EPERM;
  94
  95         if (!(f->fd >= 0 && f->header))
  96                 return -EINVAL;
  97
  98         if (mmap_cache_got_sigbus(f->mmap, f->fd))
  99                 return -EIO;
 100
 101         switch(f->header->state) {
 102                 case STATE_ONLINE:
 103                         return 0;
 104
 105                 case STATE_OFFLINE:
 106                         f->header->state = STATE_ONLINE;
 107                         fsync(f->fd);
 108                         return 0;
 109
 110                 default:
 111                         return -EINVAL;
 112         }
 113 }
 114
 115 int journal_file_set_offline(JournalFile *f) {
 116         assert(f);
 117
 118         if (!f->writable)
 119                 return -EPERM;
 120
 121         if (!(f->fd >= 0 && f->header))
 122                 return -EINVAL;
 123
 124         if (f->header->state != STATE_ONLINE)
 125                 return 0;
 126
 127         fsync(f->fd);
 128
 129         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 130                 return -EIO;
 131
 132         f->header->state = STATE_OFFLINE;
 133
 134         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 135                 return -EIO;
 136
 137         fsync(f->fd);
 138
 139         return 0;
 140 }
 141
 142 JournalFile* journal_file_close(JournalFile *f) {
 143         assert(f);
 144
 145 #ifdef HAVE_GCRYPT
 146         /* Write the final tag */
 147         if (f->seal && f->writable)
 148                 journal_file_append_tag(f);
 149 #endif
 150
 151         journal_file_set_offline(f);
 152
 153         if (f->mmap && f->fd >= 0)
 154                 mmap_cache_close_fd(f->mmap, f->fd);
 155
 156         if (f->fd >= 0 && f->defrag_on_close) {
 157
 158                 /* Be friendly to btrfs: turn COW back on again now,
 159                  * and defragment the file. We won't write to the file
 160                  * ever again, hence remove all fragmentation, and
 161                  * reenable all the good bits COW usually provides
 162                  * (such as data checksumming). */
 163
 164                 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
 165                 (void) btrfs_defrag_fd(f->fd);
 166         }
 167
 168         safe_close(f->fd);
 169         free(f->path);
 170
 171         if (f->mmap)
 172                 mmap_cache_unref(f->mmap);
 173
 174         ordered_hashmap_free_free(f->chain_cache);
 175
 176 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 177         free(f->compress_buffer);
 178 #endif
 179
 180 #ifdef HAVE_GCRYPT
 181         if (f->fss_file)
 182                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
 183         else
 184                 free(f->fsprg_state);
 185
 186         free(f->fsprg_seed);
 187
 188         if (f->hmac)
 189                 gcry_md_close(f->hmac);
 190 #endif
 191
 192         free(f);
 193         return NULL;
 194 }
 195
 196 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
 197         Header h = {};
 198         ssize_t k;
 199         int r;
 200
 201         assert(f);
 202
 203         memcpy(h.signature, HEADER_SIGNATURE, 8);
 204         h.header_size = htole64(ALIGN64(sizeof(h)));
 205
 206         h.incompatible_flags |= htole32(
 207                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
 208                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
 209
 210         h.compatible_flags = htole32(
 211                 f->seal * HEADER_COMPATIBLE_SEALED);
 212
 213         r = sd_id128_randomize(&h.file_id);
 214         if (r < 0)
 215                 return r;
 216
 217         if (template) {
 218                 h.seqnum_id = template->header->seqnum_id;
 219                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
 220         } else
 221                 h.seqnum_id = h.file_id;
 222
 223         k = pwrite(f->fd, &h, sizeof(h), 0);
 224         if (k < 0)
 225                 return -errno;
 226
 227         if (k != sizeof(h))
 228                 return -EIO;
 229
 230         return 0;
 231 }
 232
 233 static int journal_file_refresh_header(JournalFile *f) {
 234         sd_id128_t boot_id;
 235         int r;
 236
 237         assert(f);
 238
 239         r = sd_id128_get_machine(&f->header->machine_id);
 240         if (r < 0)
 241                 return r;
 242
 243         r = sd_id128_get_boot(&boot_id);
 244         if (r < 0)
 245                 return r;
 246
 247         if (sd_id128_equal(boot_id, f->header->boot_id))
 248                 f->tail_entry_monotonic_valid = true;
 249
 250         f->header->boot_id = boot_id;
 251
 252         r = journal_file_set_online(f);
 253
 254         /* Sync the online state to disk */
 255         fsync(f->fd);
 256
 257         return r;
 258 }
 259
 260 static int journal_file_verify_header(JournalFile *f) {
 261         uint32_t flags;
 262
 263         assert(f);
 264
 265         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
 266                 return -EBADMSG;
 267
 268         /* In both read and write mode we refuse to open files with
 269          * incompatible flags we don't know */
 270         flags = le32toh(f->header->incompatible_flags);
 271         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
 272                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
 273                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
 274                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
 275                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
 276                 if (flags)
 277                         log_debug("Journal file %s uses incompatible flags %"PRIx32
 278                                   " disabled at compilation time.", f->path, flags);
 279                 return -EPROTONOSUPPORT;
 280         }
 281
 282         /* When open for writing we refuse to open files with
 283          * compatible flags, too */
 284         flags = le32toh(f->header->compatible_flags);
 285         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
 286                 if (flags & ~HEADER_COMPATIBLE_ANY)
 287                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
 288                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
 289                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
 290                 if (flags)
 291                         log_debug("Journal file %s uses compatible flags %"PRIx32
 292                                   " disabled at compilation time.", f->path, flags);
 293                 return -EPROTONOSUPPORT;
 294         }
 295
 296         if (f->header->state >= _STATE_MAX)
 297                 return -EBADMSG;
 298
 299         /* The first addition was n_data, so check that we are at least this large */
 300         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
 301                 return -EBADMSG;
 302
 303         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
 304                 return -EBADMSG;
 305
 306         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
 307                 return -ENODATA;
 308
 309         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
 310                 return -ENODATA;
 311
 312         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
 313             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
 314             !VALID64(le64toh(f->header->tail_object_offset)) ||
 315             !VALID64(le64toh(f->header->entry_array_offset)))
 316                 return -ENODATA;
 317
 318         if (f->writable) {
 319                 uint8_t state;
 320                 sd_id128_t machine_id;
 321                 int r;
 322
 323                 r = sd_id128_get_machine(&machine_id);
 324                 if (r < 0)
 325                         return r;
 326
 327                 if (!sd_id128_equal(machine_id, f->header->machine_id))
 328                         return -EHOSTDOWN;
 329
 330                 state = f->header->state;
 331
 332                 if (state == STATE_ONLINE) {
 333                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
 334                         return -EBUSY;
 335                 } else if (state == STATE_ARCHIVED)
 336                         return -ESHUTDOWN;
 337                 else if (state != STATE_OFFLINE) {
 338                         log_debug("Journal file %s has unknown state %i.", f->path, state);
 339                         return -EBUSY;
 340                 }
 341         }
 342
 343         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
 344         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
 345
 346         f->seal = JOURNAL_HEADER_SEALED(f->header);
 347
 348         return 0;
 349 }
 350
 351 static int journal_file_fstat(JournalFile *f) {
 352         assert(f);
 353         assert(f->fd >= 0);
 354
 355         if (fstat(f->fd, &f->last_stat) < 0)
 356                 return -errno;
 357
 358         f->last_stat_usec = now(CLOCK_MONOTONIC);
 359
 360         /* Refuse appending to files that are already deleted */
 361         if (f->last_stat.st_nlink <= 0)
 362                 return -EIDRM;
 363
 364         return 0;
 365 }
 366
 367 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
 368         uint64_t old_size, new_size;
 369         int r;
 370
 371         assert(f);
 372
 373         /* We assume that this file is not sparse, and we know that
 374          * for sure, since we always call posix_fallocate()
 375          * ourselves */
 376
 377         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 378                 return -EIO;
 379
 380         old_size =
 381                 le64toh(f->header->header_size) +
 382                 le64toh(f->header->arena_size);
 383
 384         new_size = PAGE_ALIGN(offset + size);
 385         if (new_size < le64toh(f->header->header_size))
 386                 new_size = le64toh(f->header->header_size);
 387
 388         if (new_size <= old_size) {
 389
 390                 /* We already pre-allocated enough space, but before
 391                  * we write to it, let's check with fstat() if the
 392                  * file got deleted, in order make sure we don't throw
 393                  * away the data immediately. Don't check fstat() for
 394                  * all writes though, but only once ever 10s. */
 395
 396                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
 397                         return 0;
 398
 399                 return journal_file_fstat(f);
 400         }
 401
 402         /* Allocate more space. */
 403
 404         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 405                 return -E2BIG;
 406
 407         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
 408                 struct statvfs svfs;
 409
 410                 if (fstatvfs(f->fd, &svfs) >= 0) {
 411                         uint64_t available;
 412
 413                         available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
 414
 415                         if (new_size - old_size > available)
 416                                 return -E2BIG;
 417                 }
 418         }
 419
 420         /* Increase by larger blocks at once */
 421         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
 422         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 423                 new_size = f->metrics.max_size;
 424
 425         /* Note that the glibc fallocate() fallback is very
 426            inefficient, hence we try to minimize the allocation area
 427            as we can. */
 428         r = posix_fallocate(f->fd, old_size, new_size - old_size);
 429         if (r != 0)
 430                 return -r;
 431
 432         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
 433
 434         return journal_file_fstat(f);
 435 }
 436
 437 static unsigned type_to_context(ObjectType type) {
 438         /* One context for each type, plus one catch-all for the rest */
 439         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
 440         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
 441         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
 442 }
 443
 444 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
 445         int r;
 446
 447         assert(f);
 448         assert(ret);
 449
 450         if (size <= 0)
 451                 return -EINVAL;
 452
 453         /* Avoid SIGBUS on invalid accesses */
 454         if (offset + size > (uint64_t) f->last_stat.st_size) {
 455                 /* Hmm, out of range? Let's refresh the fstat() data
 456                  * first, before we trust that check. */
 457
 458                 r = journal_file_fstat(f);
 459                 if (r < 0)
 460                         return r;
 461
 462                 if (offset + size > (uint64_t) f->last_stat.st_size)
 463                         return -EADDRNOTAVAIL;
 464         }
 465
 466         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
 467 }
 468
 469 static uint64_t minimum_header_size(Object *o) {
 470
 471         static const uint64_t table[] = {
 472                 [OBJECT_DATA] = sizeof(DataObject),
 473                 [OBJECT_FIELD] = sizeof(FieldObject),
 474                 [OBJECT_ENTRY] = sizeof(EntryObject),
 475                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
 476                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
 477                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
 478                 [OBJECT_TAG] = sizeof(TagObject),
 479         };
 480
 481         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
 482                 return sizeof(ObjectHeader);
 483
 484         return table[o->object.type];
 485 }
 486
 487 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
 488         int r;
 489         void *t;
 490         Object *o;
 491         uint64_t s;
 492
 493         assert(f);
 494         assert(ret);
 495
 496         /* Objects may only be located at multiple of 64 bit */
 497         if (!VALID64(offset))
 498                 return -EFAULT;
 499
 500         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
 501         if (r < 0)
 502                 return r;
 503
 504         o = (Object*) t;
 505         s = le64toh(o->object.size);
 506
 507         if (s < sizeof(ObjectHeader))
 508                 return -EBADMSG;
 509
 510         if (o->object.type <= OBJECT_UNUSED)
 511                 return -EBADMSG;
 512
 513         if (s < minimum_header_size(o))
 514                 return -EBADMSG;
 515
 516         if (type > OBJECT_UNUSED && o->object.type != type)
 517                 return -EBADMSG;
 518
 519         if (s > sizeof(ObjectHeader)) {
 520                 r = journal_file_move_to(f, type, false, offset, s, &t);
 521                 if (r < 0)
 522                         return r;
 523
 524                 o = (Object*) t;
 525         }
 526
 527         *ret = o;
 528         return 0;
 529 }
 530
 531 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
 532         uint64_t r;
 533
 534         assert(f);
 535
 536         r = le64toh(f->header->tail_entry_seqnum) + 1;
 537
 538         if (seqnum) {
 539                 /* If an external seqnum counter was passed, we update
 540                  * both the local and the external one, and set it to
 541                  * the maximum of both */
 542
 543                 if (*seqnum + 1 > r)
 544                         r = *seqnum + 1;
 545
 546                 *seqnum = r;
 547         }
 548
 549         f->header->tail_entry_seqnum = htole64(r);
 550
 551         if (f->header->head_entry_seqnum == 0)
 552                 f->header->head_entry_seqnum = htole64(r);
 553
 554         return r;
 555 }
 556
 557 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
 558         int r;
 559         uint64_t p;
 560         Object *tail, *o;
 561         void *t;
 562
 563         assert(f);
 564         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
 565         assert(size >= sizeof(ObjectHeader));
 566         assert(offset);
 567         assert(ret);
 568
 569         r = journal_file_set_online(f);
 570         if (r < 0)
 571                 return r;
 572
 573         p = le64toh(f->header->tail_object_offset);
 574         if (p == 0)
 575                 p = le64toh(f->header->header_size);
 576         else {
 577                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
 578                 if (r < 0)
 579                         return r;
 580
 581                 p += ALIGN64(le64toh(tail->object.size));
 582         }
 583
 584         r = journal_file_allocate(f, p, size);
 585         if (r < 0)
 586                 return r;
 587
 588         r = journal_file_move_to(f, type, false, p, size, &t);
 589         if (r < 0)
 590                 return r;
 591
 592         o = (Object*) t;
 593
 594         zero(o->object);
 595         o->object.type = type;
 596         o->object.size = htole64(size);
 597
 598         f->header->tail_object_offset = htole64(p);
 599         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
 600
 601         *ret = o;
 602         *offset = p;
 603
 604         return 0;
 605 }
 606
 607 static int journal_file_setup_data_hash_table(JournalFile *f) {
 608         uint64_t s, p;
 609         Object *o;
 610         int r;
 611
 612         assert(f);
 613
 614         /* We estimate that we need 1 hash table entry per 768 bytes
 615            of journal file and we want to make sure we never get
 616            beyond 75% fill level. Calculate the hash table size for
 617            the maximum file size based on these metrics. */
 618
 619         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
 620         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
 621                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
 622
 623         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
 624
 625         r = journal_file_append_object(f,
 626                                        OBJECT_DATA_HASH_TABLE,
 627                                        offsetof(Object, hash_table.items) + s,
 628                                        &o, &p);
 629         if (r < 0)
 630                 return r;
 631
 632         memzero(o->hash_table.items, s);
 633
 634         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 635         f->header->data_hash_table_size = htole64(s);
 636
 637         return 0;
 638 }
 639
 640 static int journal_file_setup_field_hash_table(JournalFile *f) {
 641         uint64_t s, p;
 642         Object *o;
 643         int r;
 644
 645         assert(f);
 646
 647         /* We use a fixed size hash table for the fields as this
 648          * number should grow very slowly only */
 649
 650         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
 651         r = journal_file_append_object(f,
 652                                        OBJECT_FIELD_HASH_TABLE,
 653                                        offsetof(Object, hash_table.items) + s,
 654                                        &o, &p);
 655         if (r < 0)
 656                 return r;
 657
 658         memzero(o->hash_table.items, s);
 659
 660         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 661         f->header->field_hash_table_size = htole64(s);
 662
 663         return 0;
 664 }
 665
 666 int journal_file_map_data_hash_table(JournalFile *f) {
 667         uint64_t s, p;
 668         void *t;
 669         int r;
 670
 671         assert(f);
 672
 673         if (f->data_hash_table)
 674                 return 0;
 675
 676         p = le64toh(f->header->data_hash_table_offset);
 677         s = le64toh(f->header->data_hash_table_size);
 678
 679         r = journal_file_move_to(f,
 680                                  OBJECT_DATA_HASH_TABLE,
 681                                  true,
 682                                  p, s,
 683                                  &t);
 684         if (r < 0)
 685                 return r;
 686
 687         f->data_hash_table = t;
 688         return 0;
 689 }
 690
 691 int journal_file_map_field_hash_table(JournalFile *f) {
 692         uint64_t s, p;
 693         void *t;
 694         int r;
 695
 696         assert(f);
 697
 698         if (f->field_hash_table)
 699                 return 0;
 700
 701         p = le64toh(f->header->field_hash_table_offset);
 702         s = le64toh(f->header->field_hash_table_size);
 703
 704         r = journal_file_move_to(f,
 705                                  OBJECT_FIELD_HASH_TABLE,
 706                                  true,
 707                                  p, s,
 708                                  &t);
 709         if (r < 0)
 710                 return r;
 711
 712         f->field_hash_table = t;
 713         return 0;
 714 }
 715
 716 static int journal_file_link_field(
 717                 JournalFile *f,
 718                 Object *o,
 719                 uint64_t offset,
 720                 uint64_t hash) {
 721
 722         uint64_t p, h, m;
 723         int r;
 724
 725         assert(f);
 726         assert(o);
 727         assert(offset > 0);
 728
 729         if (o->object.type != OBJECT_FIELD)
 730                 return -EINVAL;
 731
 732         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 733         if (m <= 0)
 734                 return -EBADMSG;
 735
 736         /* This might alter the window we are looking at */
 737         o->field.next_hash_offset = o->field.head_data_offset = 0;
 738
 739         h = hash % m;
 740         p = le64toh(f->field_hash_table[h].tail_hash_offset);
 741         if (p == 0)
 742                 f->field_hash_table[h].head_hash_offset = htole64(offset);
 743         else {
 744                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 745                 if (r < 0)
 746                         return r;
 747
 748                 o->field.next_hash_offset = htole64(offset);
 749         }
 750
 751         f->field_hash_table[h].tail_hash_offset = htole64(offset);
 752
 753         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
 754                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
 755
 756         return 0;
 757 }
 758
 759 static int journal_file_link_data(
 760                 JournalFile *f,
 761                 Object *o,
 762                 uint64_t offset,
 763                 uint64_t hash) {
 764
 765         uint64_t p, h, m;
 766         int r;
 767
 768         assert(f);
 769         assert(o);
 770         assert(offset > 0);
 771
 772         if (o->object.type != OBJECT_DATA)
 773                 return -EINVAL;
 774
 775         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 776         if (m <= 0)
 777                 return -EBADMSG;
 778
 779         /* This might alter the window we are looking at */
 780         o->data.next_hash_offset = o->data.next_field_offset = 0;
 781         o->data.entry_offset = o->data.entry_array_offset = 0;
 782         o->data.n_entries = 0;
 783
 784         h = hash % m;
 785         p = le64toh(f->data_hash_table[h].tail_hash_offset);
 786         if (p == 0)
 787                 /* Only entry in the hash table is easy */
 788                 f->data_hash_table[h].head_hash_offset = htole64(offset);
 789         else {
 790                 /* Move back to the previous data object, to patch in
 791                  * pointer */
 792
 793                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 794                 if (r < 0)
 795                         return r;
 796
 797                 o->data.next_hash_offset = htole64(offset);
 798         }
 799
 800         f->data_hash_table[h].tail_hash_offset = htole64(offset);
 801
 802         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
 803                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
 804
 805         return 0;
 806 }
 807
 808 int journal_file_find_field_object_with_hash(
 809                 JournalFile *f,
 810                 const void *field, uint64_t size, uint64_t hash,
 811                 Object **ret, uint64_t *offset) {
 812
 813         uint64_t p, osize, h, m;
 814         int r;
 815
 816         assert(f);
 817         assert(field && size > 0);
 818
 819         /* If the field hash table is empty, we can't find anything */
 820         if (le64toh(f->header->field_hash_table_size) <= 0)
 821                 return 0;
 822
 823         /* Map the field hash table, if it isn't mapped yet. */
 824         r = journal_file_map_field_hash_table(f);
 825         if (r < 0)
 826                 return r;
 827
 828         osize = offsetof(Object, field.payload) + size;
 829
 830         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 831         if (m <= 0)
 832                 return -EBADMSG;
 833
 834         h = hash % m;
 835         p = le64toh(f->field_hash_table[h].head_hash_offset);
 836
 837         while (p > 0) {
 838                 Object *o;
 839
 840                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 841                 if (r < 0)
 842                         return r;
 843
 844                 if (le64toh(o->field.hash) == hash &&
 845                     le64toh(o->object.size) == osize &&
 846                     memcmp(o->field.payload, field, size) == 0) {
 847
 848                         if (ret)
 849                                 *ret = o;
 850                         if (offset)
 851                                 *offset = p;
 852
 853                         return 1;
 854                 }
 855
 856                 p = le64toh(o->field.next_hash_offset);
 857         }
 858
 859         return 0;
 860 }
 861
 862 int journal_file_find_field_object(
 863                 JournalFile *f,
 864                 const void *field, uint64_t size,
 865                 Object **ret, uint64_t *offset) {
 866
 867         uint64_t hash;
 868
 869         assert(f);
 870         assert(field && size > 0);
 871
 872         hash = hash64(field, size);
 873
 874         return journal_file_find_field_object_with_hash(f,
 875                                                         field, size, hash,
 876                                                         ret, offset);
 877 }
 878
 879 int journal_file_find_data_object_with_hash(
 880                 JournalFile *f,
 881                 const void *data, uint64_t size, uint64_t hash,
 882                 Object **ret, uint64_t *offset) {
 883
 884         uint64_t p, osize, h, m;
 885         int r;
 886
 887         assert(f);
 888         assert(data || size == 0);
 889
 890         /* If there's no data hash table, then there's no entry. */
 891         if (le64toh(f->header->data_hash_table_size) <= 0)
 892                 return 0;
 893
 894         /* Map the data hash table, if it isn't mapped yet. */
 895         r = journal_file_map_data_hash_table(f);
 896         if (r < 0)
 897                 return r;
 898
 899         osize = offsetof(Object, data.payload) + size;
 900
 901         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 902         if (m <= 0)
 903                 return -EBADMSG;
 904
 905         h = hash % m;
 906         p = le64toh(f->data_hash_table[h].head_hash_offset);
 907
 908         while (p > 0) {
 909                 Object *o;
 910
 911                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 912                 if (r < 0)
 913                         return r;
 914
 915                 if (le64toh(o->data.hash) != hash)
 916                         goto next;
 917
 918                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
 919 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 920                         uint64_t l;
 921                         size_t rsize = 0;
 922
 923                         l = le64toh(o->object.size);
 924                         if (l <= offsetof(Object, data.payload))
 925                                 return -EBADMSG;
 926
 927                         l -= offsetof(Object, data.payload);
 928
 929                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
 930                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
 931                         if (r < 0)
 932                                 return r;
 933
 934                         if (rsize == size &&
 935                             memcmp(f->compress_buffer, data, size) == 0) {
 936
 937                                 if (ret)
 938                                         *ret = o;
 939
 940                                 if (offset)
 941                                         *offset = p;
 942
 943                                 return 1;
 944                         }
 945 #else
 946                         return -EPROTONOSUPPORT;
 947 #endif
 948                 } else if (le64toh(o->object.size) == osize &&
 949                            memcmp(o->data.payload, data, size) == 0) {
 950
 951                         if (ret)
 952                                 *ret = o;
 953
 954                         if (offset)
 955                                 *offset = p;
 956
 957                         return 1;
 958                 }
 959
 960         next:
 961                 p = le64toh(o->data.next_hash_offset);
 962         }
 963
 964         return 0;
 965 }
 966
 967 int journal_file_find_data_object(
 968                 JournalFile *f,
 969                 const void *data, uint64_t size,
 970                 Object **ret, uint64_t *offset) {
 971
 972         uint64_t hash;
 973
 974         assert(f);
 975         assert(data || size == 0);
 976
 977         hash = hash64(data, size);
 978
 979         return journal_file_find_data_object_with_hash(f,
 980                                                        data, size, hash,
 981                                                        ret, offset);
 982 }
 983
 984 static int journal_file_append_field(
 985                 JournalFile *f,
 986                 const void *field, uint64_t size,
 987                 Object **ret, uint64_t *offset) {
 988
 989         uint64_t hash, p;
 990         uint64_t osize;
 991         Object *o;
 992         int r;
 993
 994         assert(f);
 995         assert(field && size > 0);
 996
 997         hash = hash64(field, size);
 998
 999         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1000         if (r < 0)
1001                 return r;
1002         else if (r > 0) {
1003
1004                 if (ret)
1005                         *ret = o;
1006
1007                 if (offset)
1008                         *offset = p;
1009
1010                 return 0;
1011         }
1012
1013         osize = offsetof(Object, field.payload) + size;
1014         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1015         if (r < 0)
1016                 return r;
1017
1018         o->field.hash = htole64(hash);
1019         memcpy(o->field.payload, field, size);
1020
1021         r = journal_file_link_field(f, o, p, hash);
1022         if (r < 0)
1023                 return r;
1024
1025         /* The linking might have altered the window, so let's
1026          * refresh our pointer */
1027         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1028         if (r < 0)
1029                 return r;
1030
1031 #ifdef HAVE_GCRYPT
1032         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1033         if (r < 0)
1034                 return r;
1035 #endif
1036
1037         if (ret)
1038                 *ret = o;
1039
1040         if (offset)
1041                 *offset = p;
1042
1043         return 0;
1044 }
1045
1046 static int journal_file_append_data(
1047                 JournalFile *f,
1048                 const void *data, uint64_t size,
1049                 Object **ret, uint64_t *offset) {
1050
1051         uint64_t hash, p;
1052         uint64_t osize;
1053         Object *o;
1054         int r, compression = 0;
1055         const void *eq;
1056
1057         assert(f);
1058         assert(data || size == 0);
1059
1060         hash = hash64(data, size);
1061
1062         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1063         if (r < 0)
1064                 return r;
1065         if (r > 0) {
1066
1067                 if (ret)
1068                         *ret = o;
1069
1070                 if (offset)
1071                         *offset = p;
1072
1073                 return 0;
1074         }
1075
1076         osize = offsetof(Object, data.payload) + size;
1077         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1078         if (r < 0)
1079                 return r;
1080
1081         o->data.hash = htole64(hash);
1082
1083 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1084         if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1085                 size_t rsize = 0;
1086
1087                 compression = compress_blob(data, size, o->data.payload, &rsize);
1088
1089                 if (compression >= 0) {
1090                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1091                         o->object.flags |= compression;
1092
1093                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1094                                   size, rsize, object_compressed_to_string(compression));
1095                 } else
1096                         /* Compression didn't work, we don't really care why, let's continue without compression */
1097                         compression = 0;
1098         }
1099 #endif
1100
1101         if (compression == 0 && size > 0)
1102                 memcpy(o->data.payload, data, size);
1103
1104         r = journal_file_link_data(f, o, p, hash);
1105         if (r < 0)
1106                 return r;
1107
1108         /* The linking might have altered the window, so let's
1109          * refresh our pointer */
1110         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1111         if (r < 0)
1112                 return r;
1113
1114         if (!data)
1115                 eq = NULL;
1116         else
1117                 eq = memchr(data, '=', size);
1118         if (eq && eq > data) {
1119                 Object *fo = NULL;
1120                 uint64_t fp;
1121
1122                 /* Create field object ... */
1123                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1124                 if (r < 0)
1125                         return r;
1126
1127                 /* ... and link it in. */
1128                 o->data.next_field_offset = fo->field.head_data_offset;
1129                 fo->field.head_data_offset = le64toh(p);
1130         }
1131
1132 #ifdef HAVE_GCRYPT
1133         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1134         if (r < 0)
1135                 return r;
1136 #endif
1137
1138         if (ret)
1139                 *ret = o;
1140
1141         if (offset)
1142                 *offset = p;
1143
1144         return 0;
1145 }
1146
1147 uint64_t journal_file_entry_n_items(Object *o) {
1148         assert(o);
1149
1150         if (o->object.type != OBJECT_ENTRY)
1151                 return 0;
1152
1153         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1154 }
1155
1156 uint64_t journal_file_entry_array_n_items(Object *o) {
1157         assert(o);
1158
1159         if (o->object.type != OBJECT_ENTRY_ARRAY)
1160                 return 0;
1161
1162         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1163 }
1164
1165 uint64_t journal_file_hash_table_n_items(Object *o) {
1166         assert(o);
1167
1168         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1169             o->object.type != OBJECT_FIELD_HASH_TABLE)
1170                 return 0;
1171
1172         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1173 }
1174
1175 static int link_entry_into_array(JournalFile *f,
1176                                  le64_t *first,
1177                                  le64_t *idx,
1178                                  uint64_t p) {
1179         int r;
1180         uint64_t n = 0, ap = 0, q, i, a, hidx;
1181         Object *o;
1182
1183         assert(f);
1184         assert(first);
1185         assert(idx);
1186         assert(p > 0);
1187
1188         a = le64toh(*first);
1189         i = hidx = le64toh(*idx);
1190         while (a > 0) {
1191
1192                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1193                 if (r < 0)
1194                         return r;
1195
1196                 n = journal_file_entry_array_n_items(o);
1197                 if (i < n) {
1198                         o->entry_array.items[i] = htole64(p);
1199                         *idx = htole64(hidx + 1);
1200                         return 0;
1201                 }
1202
1203                 i -= n;
1204                 ap = a;
1205                 a = le64toh(o->entry_array.next_entry_array_offset);
1206         }
1207
1208         if (hidx > n)
1209                 n = (hidx+1) * 2;
1210         else
1211                 n = n * 2;
1212
1213         if (n < 4)
1214                 n = 4;
1215
1216         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1217                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1218                                        &o, &q);
1219         if (r < 0)
1220                 return r;
1221
1222 #ifdef HAVE_GCRYPT
1223         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1224         if (r < 0)
1225                 return r;
1226 #endif
1227
1228         o->entry_array.items[i] = htole64(p);
1229
1230         if (ap == 0)
1231                 *first = htole64(q);
1232         else {
1233                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1234                 if (r < 0)
1235                         return r;
1236
1237                 o->entry_array.next_entry_array_offset = htole64(q);
1238         }
1239
1240         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1241                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1242
1243         *idx = htole64(hidx + 1);
1244
1245         return 0;
1246 }
1247
1248 static int link_entry_into_array_plus_one(JournalFile *f,
1249                                           le64_t *extra,
1250                                           le64_t *first,
1251                                           le64_t *idx,
1252                                           uint64_t p) {
1253
1254         int r;
1255
1256         assert(f);
1257         assert(extra);
1258         assert(first);
1259         assert(idx);
1260         assert(p > 0);
1261
1262         if (*idx == 0)
1263                 *extra = htole64(p);
1264         else {
1265                 le64_t i;
1266
1267                 i = htole64(le64toh(*idx) - 1);
1268                 r = link_entry_into_array(f, first, &i, p);
1269                 if (r < 0)
1270                         return r;
1271         }
1272
1273         *idx = htole64(le64toh(*idx) + 1);
1274         return 0;
1275 }
1276
1277 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1278         uint64_t p;
1279         int r;
1280         assert(f);
1281         assert(o);
1282         assert(offset > 0);
1283
1284         p = le64toh(o->entry.items[i].object_offset);
1285         if (p == 0)
1286                 return -EINVAL;
1287
1288         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1289         if (r < 0)
1290                 return r;
1291
1292         return link_entry_into_array_plus_one(f,
1293                                               &o->data.entry_offset,
1294                                               &o->data.entry_array_offset,
1295                                               &o->data.n_entries,
1296                                               offset);
1297 }
1298
1299 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1300         uint64_t n, i;
1301         int r;
1302
1303         assert(f);
1304         assert(o);
1305         assert(offset > 0);
1306
1307         if (o->object.type != OBJECT_ENTRY)
1308                 return -EINVAL;
1309
1310         __sync_synchronize();
1311
1312         /* Link up the entry itself */
1313         r = link_entry_into_array(f,
1314                                   &f->header->entry_array_offset,
1315                                   &f->header->n_entries,
1316                                   offset);
1317         if (r < 0)
1318                 return r;
1319
1320         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1321
1322         if (f->header->head_entry_realtime == 0)
1323                 f->header->head_entry_realtime = o->entry.realtime;
1324
1325         f->header->tail_entry_realtime = o->entry.realtime;
1326         f->header->tail_entry_monotonic = o->entry.monotonic;
1327
1328         f->tail_entry_monotonic_valid = true;
1329
1330         /* Link up the items */
1331         n = journal_file_entry_n_items(o);
1332         for (i = 0; i < n; i++) {
1333                 r = journal_file_link_entry_item(f, o, offset, i);
1334                 if (r < 0)
1335                         return r;
1336         }
1337
1338         return 0;
1339 }
1340
1341 static int journal_file_append_entry_internal(
1342                 JournalFile *f,
1343                 const dual_timestamp *ts,
1344                 uint64_t xor_hash,
1345                 const EntryItem items[], unsigned n_items,
1346                 uint64_t *seqnum,
1347                 Object **ret, uint64_t *offset) {
1348         uint64_t np;
1349         uint64_t osize;
1350         Object *o;
1351         int r;
1352
1353         assert(f);
1354         assert(items || n_items == 0);
1355         assert(ts);
1356
1357         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1358
1359         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1360         if (r < 0)
1361                 return r;
1362
1363         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1364         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1365         o->entry.realtime = htole64(ts->realtime);
1366         o->entry.monotonic = htole64(ts->monotonic);
1367         o->entry.xor_hash = htole64(xor_hash);
1368         o->entry.boot_id = f->header->boot_id;
1369
1370 #ifdef HAVE_GCRYPT
1371         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1372         if (r < 0)
1373                 return r;
1374 #endif
1375
1376         r = journal_file_link_entry(f, o, np);
1377         if (r < 0)
1378                 return r;
1379
1380         if (ret)
1381                 *ret = o;
1382
1383         if (offset)
1384                 *offset = np;
1385
1386         return 0;
1387 }
1388
1389 void journal_file_post_change(JournalFile *f) {
1390         assert(f);
1391
1392         /* inotify() does not receive IN_MODIFY events from file
1393          * accesses done via mmap(). After each access we hence
1394          * trigger IN_MODIFY by truncating the journal file to its
1395          * current size which triggers IN_MODIFY. */
1396
1397         __sync_synchronize();
1398
1399         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1400                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1401 }
1402
1403 static int entry_item_cmp(const void *_a, const void *_b) {
1404         const EntryItem *a = _a, *b = _b;
1405
1406         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1407                 return -1;
1408         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1409                 return 1;
1410         return 0;
1411 }
1412
1413 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1414         unsigned i;
1415         EntryItem *items;
1416         int r;
1417         uint64_t xor_hash = 0;
1418         struct dual_timestamp _ts;
1419
1420         assert(f);
1421         assert(iovec || n_iovec == 0);
1422
1423         if (!ts) {
1424                 dual_timestamp_get(&_ts);
1425                 ts = &_ts;
1426         }
1427
1428         if (f->tail_entry_monotonic_valid &&
1429             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1430                 return -EINVAL;
1431
1432 #ifdef HAVE_GCRYPT
1433         r = journal_file_maybe_append_tag(f, ts->realtime);
1434         if (r < 0)
1435                 return r;
1436 #endif
1437
1438         /* alloca() can't take 0, hence let's allocate at least one */
1439         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1440
1441         for (i = 0; i < n_iovec; i++) {
1442                 uint64_t p;
1443                 Object *o;
1444
1445                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1446                 if (r < 0)
1447                         return r;
1448
1449                 xor_hash ^= le64toh(o->data.hash);
1450                 items[i].object_offset = htole64(p);
1451                 items[i].hash = o->data.hash;
1452         }
1453
1454         /* Order by the position on disk, in order to improve seek
1455          * times for rotating media. */
1456         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1457
1458         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1459
1460         /* If the memory mapping triggered a SIGBUS then we return an
1461          * IO error and ignore the error code passed down to us, since
1462          * it is very likely just an effect of a nullified replacement
1463          * mapping page */
1464
1465         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1466                 r = -EIO;
1467
1468         journal_file_post_change(f);
1469
1470         return r;
1471 }
1472
/* One entry of the per-file chain cache: remembers how far into a
 * chain of entry array objects a previous lookup got, so that a later
 * lookup on the same chain can skip ahead instead of walking it from
 * the start. Entries are stored in an OrderedHashmap keyed by the
 * 'first' member. All members are in host byte order. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1480
1481 static void chain_cache_put(
1482                 OrderedHashmap *h,
1483                 ChainCacheItem *ci,
1484                 uint64_t first,
1485                 uint64_t array,
1486                 uint64_t begin,
1487                 uint64_t total,
1488                 uint64_t last_index) {
1489
1490         if (!ci) {
1491                 /* If the chain item to cache for this chain is the
1492                  * first one it's not worth caching anything */
1493                 if (array == first)
1494                         return;
1495
1496                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1497                         ci = ordered_hashmap_steal_first(h);
1498                         assert(ci);
1499                 } else {
1500                         ci = new(ChainCacheItem, 1);
1501                         if (!ci)
1502                                 return;
1503                 }
1504
1505                 ci->first = first;
1506
1507                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1508                         free(ci);
1509                         return;
1510                 }
1511         } else
1512                 assert(ci->first == first);
1513
1514         ci->array = array;
1515         ci->begin = begin;
1516         ci->total = total;
1517         ci->last_index = last_index;
1518 }
1519
1520 static int generic_array_get(
1521                 JournalFile *f,
1522                 uint64_t first,
1523                 uint64_t i,
1524                 Object **ret, uint64_t *offset) {
1525
1526         Object *o;
1527         uint64_t p = 0, a, t = 0;
1528         int r;
1529         ChainCacheItem *ci;
1530
1531         assert(f);
1532
1533         a = first;
1534
1535         /* Try the chain cache first */
1536         ci = ordered_hashmap_get(f->chain_cache, &first);
1537         if (ci && i > ci->total) {
1538                 a = ci->array;
1539                 i -= ci->total;
1540                 t = ci->total;
1541         }
1542
1543         while (a > 0) {
1544                 uint64_t k;
1545
1546                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1547                 if (r < 0)
1548                         return r;
1549
1550                 k = journal_file_entry_array_n_items(o);
1551                 if (i < k) {
1552                         p = le64toh(o->entry_array.items[i]);
1553                         goto found;
1554                 }
1555
1556                 i -= k;
1557                 t += k;
1558                 a = le64toh(o->entry_array.next_entry_array_offset);
1559         }
1560
1561         return 0;
1562
1563 found:
1564         /* Let's cache this item for the next invocation */
1565         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1566
1567         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1568         if (r < 0)
1569                 return r;
1570
1571         if (ret)
1572                 *ret = o;
1573
1574         if (offset)
1575                 *offset = p;
1576
1577         return 1;
1578 }
1579
1580 static int generic_array_get_plus_one(
1581                 JournalFile *f,
1582                 uint64_t extra,
1583                 uint64_t first,
1584                 uint64_t i,
1585                 Object **ret, uint64_t *offset) {
1586
1587         Object *o;
1588
1589         assert(f);
1590
1591         if (i == 0) {
1592                 int r;
1593
1594                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1595                 if (r < 0)
1596                         return r;
1597
1598                 if (ret)
1599                         *ret = o;
1600
1601                 if (offset)
1602                         *offset = extra;
1603
1604                 return 1;
1605         }
1606
1607         return generic_array_get(f, first, i-1, ret, offset);
1608 }
1609
/* Results a test_object() callback passed to the bisection helpers
 * may return (besides a negative error). */
enum {
        TEST_FOUND, /* callback reports a match; the bisection code folds this into TEST_RIGHT or TEST_LEFT depending on the search direction */
        TEST_LEFT,  /* the tested object lies left of what we are looking for, i.e. continue searching to its right */
        TEST_RIGHT  /* the tested object lies right of what we are looking for, i.e. continue searching at or left of it */
};
1615
/* Bisects the chain of entry array objects starting at file offset
 * 'first', considering at most the first 'n' items of the chain, for
 * the entry matching 'needle'.
 *
 * test_object() is invoked with the offset of a candidate entry and
 * the needle, and returns TEST_FOUND, TEST_LEFT or TEST_RIGHT, or a
 * negative error which is passed on to the caller. A TEST_FOUND
 * result is treated like TEST_RIGHT when searching with
 * DIRECTION_DOWN and like TEST_LEFT otherwise.
 *
 * Returns 1 if an entry was located, in which case the entry object
 * is stored in *ret, its file offset in *offset and its index within
 * the chain in *idx (each only if non-NULL). Returns 0 if there is no
 * suitable entry, -EBADMSG if an array slot that should be populated
 * contains offset 0, or another negative error from the helpers. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *offset,
                uint64_t *idx) {

        /* a: offset of the array currently looked at; t: number of
         * items in the arrays before it; i: index within the current
         * array; last_p: offset stored in the last considered slot of
         * the previous array; last_index: index hint taken from the
         * chain cache, or (uint64_t) -1 if there is none. */
        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                /* Only look at the slots of this array that are
                 * within the remaining 'n' items */
                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Test the last considered item of this array first,
                 * to decide whether the result is in this array or in
                 * a later one */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        return -EBADMSG;

                r = test_object(f, p, needle);
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        /* The result is within this array: bisect
                         * the range [left, right] */
                        left = 0;
                        right -= 1;

                        if (last_index != (uint64_t) -1) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        /* Regular bisection of what is left of the range */
                        for (;;) {
                                if (left == right) {
                                        /* Range collapsed to a single
                                         * index. When searching upwards
                                         * the item before it is used
                                         * (see 'found' below). */
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        return -EBADMSG;

                                r = test_object(f, p, needle);
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* The last considered item of this array tested as
                 * TEST_LEFT. If this array covers all remaining items
                 * there is nothing further to look at. */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                /* Use the very last considered item
                                 * (index n-1, via subtract_one) */
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Remember the last item of this array, in case the
                 * result turns out to be the item right before the
                 * first item of the next array */
                last_p = lp;

                /* Advance to the next array in the chain; the cached
                 * index hint only applies to the array it was
                 * recorded for */
                n -= k;
                t += k;
                last_index = (uint64_t) -1;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* Stepping back from the very first item of the chain: there
         * is nothing before it */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);

        /* Pick the resulting entry offset. If we need to step back
         * from index 0 of this array, the result is the last item of
         * the previous array, which was remembered in last_p. */
        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        /* Index within the whole chain: items in earlier arrays plus
         * the index within this one */
        if (idx)
                *idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
1822
1823 static int generic_array_bisect_plus_one(
1824                 JournalFile *f,
1825                 uint64_t extra,
1826                 uint64_t first,
1827                 uint64_t n,
1828                 uint64_t needle,
1829                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1830                 direction_t direction,
1831                 Object **ret,
1832                 uint64_t *offset,
1833                 uint64_t *idx) {
1834
1835         int r;
1836         bool step_back = false;
1837         Object *o;
1838
1839         assert(f);
1840         assert(test_object);
1841
1842         if (n <= 0)
1843                 return 0;
1844
1845         /* This bisects the array in object 'first', but first checks
1846          * an extra  */
1847         r = test_object(f, extra, needle);
1848         if (r < 0)
1849                 return r;
1850
1851         if (r == TEST_FOUND)
1852                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1853
1854         /* if we are looking with DIRECTION_UP then we need to first
1855            see if in the actual array there is a matching entry, and
1856            return the last one of that. But if there isn't any we need
1857            to return this one. Hence remember this, and return it
1858            below. */
1859         if (r == TEST_LEFT)
1860                 step_back = direction == DIRECTION_UP;
1861
1862         if (r == TEST_RIGHT) {
1863                 if (direction == DIRECTION_DOWN)
1864                         goto found;
1865                 else
1866                         return 0;
1867         }
1868
1869         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1870
1871         if (r == 0 && step_back)
1872                 goto found;
1873
1874         if (r > 0 && idx)
1875                 (*idx) ++;
1876
1877         return r;
1878
1879 found:
1880         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1881         if (r < 0)
1882                 return r;
1883
1884         if (ret)
1885                 *ret = o;
1886
1887         if (offset)
1888                 *offset = extra;
1889
1890         if (idx)
1891                 *idx = 0;
1892
1893         return 1;
1894 }
1895
1896 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1897         assert(f);
1898         assert(p > 0);
1899
1900         if (p == needle)
1901                 return TEST_FOUND;
1902         else if (p < needle)
1903                 return TEST_LEFT;
1904         else
1905                 return TEST_RIGHT;
1906 }
1907
1908 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1909         Object *o;
1910         int r;
1911
1912         assert(f);
1913         assert(p > 0);
1914
1915         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1916         if (r < 0)
1917                 return r;
1918
1919         if (le64toh(o->entry.seqnum) == needle)
1920                 return TEST_FOUND;
1921         else if (le64toh(o->entry.seqnum) < needle)
1922                 return TEST_LEFT;
1923         else
1924                 return TEST_RIGHT;
1925 }
1926
1927 int journal_file_move_to_entry_by_seqnum(
1928                 JournalFile *f,
1929                 uint64_t seqnum,
1930                 direction_t direction,
1931                 Object **ret,
1932                 uint64_t *offset) {
1933
1934         return generic_array_bisect(f,
1935                                     le64toh(f->header->entry_array_offset),
1936                                     le64toh(f->header->n_entries),
1937                                     seqnum,
1938                                     test_object_seqnum,
1939                                     direction,
1940                                     ret, offset, NULL);
1941 }
1942
1943 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1944         Object *o;
1945         int r;
1946
1947         assert(f);
1948         assert(p > 0);
1949
1950         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1951         if (r < 0)
1952                 return r;
1953
1954         if (le64toh(o->entry.realtime) == needle)
1955                 return TEST_FOUND;
1956         else if (le64toh(o->entry.realtime) < needle)
1957                 return TEST_LEFT;
1958         else
1959                 return TEST_RIGHT;
1960 }
1961
1962 int journal_file_move_to_entry_by_realtime(
1963                 JournalFile *f,
1964                 uint64_t realtime,
1965                 direction_t direction,
1966                 Object **ret,
1967                 uint64_t *offset) {
1968
1969         return generic_array_bisect(f,
1970                                     le64toh(f->header->entry_array_offset),
1971                                     le64toh(f->header->n_entries),
1972                                     realtime,
1973                                     test_object_realtime,
1974                                     direction,
1975                                     ret, offset, NULL);
1976 }
1977
1978 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1979         Object *o;
1980         int r;
1981
1982         assert(f);
1983         assert(p > 0);
1984
1985         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1986         if (r < 0)
1987                 return r;
1988
1989         if (le64toh(o->entry.monotonic) == needle)
1990                 return TEST_FOUND;
1991         else if (le64toh(o->entry.monotonic) < needle)
1992                 return TEST_LEFT;
1993         else
1994                 return TEST_RIGHT;
1995 }
1996
1997 static int find_data_object_by_boot_id(
1998                 JournalFile *f,
1999                 sd_id128_t boot_id,
2000                 Object **o,
2001                 uint64_t *b) {
2002
2003         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2004
2005         sd_id128_to_string(boot_id, t + 9);
2006         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2007 }
2008
2009 int journal_file_move_to_entry_by_monotonic(
2010                 JournalFile *f,
2011                 sd_id128_t boot_id,
2012                 uint64_t monotonic,
2013                 direction_t direction,
2014                 Object **ret,
2015                 uint64_t *offset) {
2016
2017         Object *o;
2018         int r;
2019
2020         assert(f);
2021
2022         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2023         if (r < 0)
2024                 return r;
2025         if (r == 0)
2026                 return -ENOENT;
2027
2028         return generic_array_bisect_plus_one(f,
2029                                              le64toh(o->data.entry_offset),
2030                                              le64toh(o->data.entry_array_offset),
2031                                              le64toh(o->data.n_entries),
2032                                              monotonic,
2033                                              test_object_monotonic,
2034                                              direction,
2035                                              ret, offset, NULL);
2036 }
2037
2038 void journal_file_reset_location(JournalFile *f) {
2039         f->location_type = LOCATION_HEAD;
2040         f->current_offset = 0;
2041         f->current_seqnum = 0;
2042         f->current_realtime = 0;
2043         f->current_monotonic = 0;
2044         zero(f->current_boot_id);
2045         f->current_xor_hash = 0;
2046 }
2047
2048 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2049         f->location_type = LOCATION_SEEK;
2050         f->current_offset = offset;
2051         f->current_seqnum = le64toh(o->entry.seqnum);
2052         f->current_realtime = le64toh(o->entry.realtime);
2053         f->current_monotonic = le64toh(o->entry.monotonic);
2054         f->current_boot_id = o->entry.boot_id;
2055         f->current_xor_hash = le64toh(o->entry.xor_hash);
2056 }
2057
2058 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2059         assert(af);
2060         assert(bf);
2061         assert(af->location_type == LOCATION_SEEK);
2062         assert(bf->location_type == LOCATION_SEEK);
2063
2064         /* If contents and timestamps match, these entries are
2065          * identical, even if the seqnum does not match */
2066         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2067             af->current_monotonic == bf->current_monotonic &&
2068             af->current_realtime == bf->current_realtime &&
2069             af->current_xor_hash == bf->current_xor_hash)
2070                 return 0;
2071
2072         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2073
2074                 /* If this is from the same seqnum source, compare
2075                  * seqnums */
2076                 if (af->current_seqnum < bf->current_seqnum)
2077                         return -1;
2078                 if (af->current_seqnum > bf->current_seqnum)
2079                         return 1;
2080
2081                 /* Wow! This is weird, different data but the same
2082                  * seqnums? Something is borked, but let's make the
2083                  * best of it and compare by time. */
2084         }
2085
2086         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2087
2088                 /* If the boot id matches, compare monotonic time */
2089                 if (af->current_monotonic < bf->current_monotonic)
2090                         return -1;
2091                 if (af->current_monotonic > bf->current_monotonic)
2092                         return 1;
2093         }
2094
2095         /* Otherwise, compare UTC time */
2096         if (af->current_realtime < bf->current_realtime)
2097                 return -1;
2098         if (af->current_realtime > bf->current_realtime)
2099                 return 1;
2100
2101         /* Finally, compare by contents */
2102         if (af->current_xor_hash < bf->current_xor_hash)
2103                 return -1;
2104         if (af->current_xor_hash > bf->current_xor_hash)
2105                 return 1;
2106
2107         return 0;
2108 }
2109
2110 int journal_file_next_entry(
2111                 JournalFile *f,
2112                 uint64_t p,
2113                 direction_t direction,
2114                 Object **ret, uint64_t *offset) {
2115
2116         uint64_t i, n, ofs;
2117         int r;
2118
2119         assert(f);
2120
2121         n = le64toh(f->header->n_entries);
2122         if (n <= 0)
2123                 return 0;
2124
2125         if (p == 0)
2126                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2127         else {
2128                 r = generic_array_bisect(f,
2129                                          le64toh(f->header->entry_array_offset),
2130                                          le64toh(f->header->n_entries),
2131                                          p,
2132                                          test_object_offset,
2133                                          DIRECTION_DOWN,
2134                                          NULL, NULL,
2135                                          &i);
2136                 if (r <= 0)
2137                         return r;
2138
2139                 if (direction == DIRECTION_DOWN) {
2140                         if (i >= n - 1)
2141                                 return 0;
2142
2143                         i++;
2144                 } else {
2145                         if (i <= 0)
2146                                 return 0;
2147
2148                         i--;
2149                 }
2150         }
2151
2152         /* And jump to it */
2153         r = generic_array_get(f,
2154                               le64toh(f->header->entry_array_offset),
2155                               i,
2156                               ret, &ofs);
2157         if (r <= 0)
2158                 return r;
2159
2160         if (p > 0 &&
2161             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2162                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2163                           f->path, i);
2164                 return -EBADMSG;
2165         }
2166
2167         if (offset)
2168                 *offset = ofs;
2169
2170         return 1;
2171 }
2172
2173 int journal_file_next_entry_for_data(
2174                 JournalFile *f,
2175                 Object *o, uint64_t p,
2176                 uint64_t data_offset,
2177                 direction_t direction,
2178                 Object **ret, uint64_t *offset) {
2179
2180         uint64_t n, i;
2181         int r;
2182         Object *d;
2183
2184         assert(f);
2185         assert(p > 0 || !o);
2186
2187         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2188         if (r < 0)
2189                 return r;
2190
2191         n = le64toh(d->data.n_entries);
2192         if (n <= 0)
2193                 return n;
2194
2195         if (!o)
2196                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2197         else {
2198                 if (o->object.type != OBJECT_ENTRY)
2199                         return -EINVAL;
2200
2201                 r = generic_array_bisect_plus_one(f,
2202                                                   le64toh(d->data.entry_offset),
2203                                                   le64toh(d->data.entry_array_offset),
2204                                                   le64toh(d->data.n_entries),
2205                                                   p,
2206                                                   test_object_offset,
2207                                                   DIRECTION_DOWN,
2208                                                   NULL, NULL,
2209                                                   &i);
2210
2211                 if (r <= 0)
2212                         return r;
2213
2214                 if (direction == DIRECTION_DOWN) {
2215                         if (i >= n - 1)
2216                                 return 0;
2217
2218                         i++;
2219                 } else {
2220                         if (i <= 0)
2221                                 return 0;
2222
2223                         i--;
2224                 }
2225
2226         }
2227
2228         return generic_array_get_plus_one(f,
2229                                           le64toh(d->data.entry_offset),
2230                                           le64toh(d->data.entry_array_offset),
2231                                           i,
2232                                           ret, offset);
2233 }
2234
2235 int journal_file_move_to_entry_by_offset_for_data(
2236                 JournalFile *f,
2237                 uint64_t data_offset,
2238                 uint64_t p,
2239                 direction_t direction,
2240                 Object **ret, uint64_t *offset) {
2241
2242         int r;
2243         Object *d;
2244
2245         assert(f);
2246
2247         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2248         if (r < 0)
2249                 return r;
2250
2251         return generic_array_bisect_plus_one(f,
2252                                              le64toh(d->data.entry_offset),
2253                                              le64toh(d->data.entry_array_offset),
2254                                              le64toh(d->data.n_entries),
2255                                              p,
2256                                              test_object_offset,
2257                                              direction,
2258                                              ret, offset, NULL);
2259 }
2260
2261 int journal_file_move_to_entry_by_monotonic_for_data(
2262                 JournalFile *f,
2263                 uint64_t data_offset,
2264                 sd_id128_t boot_id,
2265                 uint64_t monotonic,
2266                 direction_t direction,
2267                 Object **ret, uint64_t *offset) {
2268
2269         Object *o, *d;
2270         int r;
2271         uint64_t b, z;
2272
2273         assert(f);
2274
2275         /* First, seek by time */
2276         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2277         if (r < 0)
2278                 return r;
2279         if (r == 0)
2280                 return -ENOENT;
2281
2282         r = generic_array_bisect_plus_one(f,
2283                                           le64toh(o->data.entry_offset),
2284                                           le64toh(o->data.entry_array_offset),
2285                                           le64toh(o->data.n_entries),
2286                                           monotonic,
2287                                           test_object_monotonic,
2288                                           direction,
2289                                           NULL, &z, NULL);
2290         if (r <= 0)
2291                 return r;
2292
2293         /* And now, continue seeking until we find an entry that
2294          * exists in both bisection arrays */
2295
2296         for (;;) {
2297                 Object *qo;
2298                 uint64_t p, q;
2299
2300                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2301                 if (r < 0)
2302                         return r;
2303
2304                 r = generic_array_bisect_plus_one(f,
2305                                                   le64toh(d->data.entry_offset),
2306                                                   le64toh(d->data.entry_array_offset),
2307                                                   le64toh(d->data.n_entries),
2308                                                   z,
2309                                                   test_object_offset,
2310                                                   direction,
2311                                                   NULL, &p, NULL);
2312                 if (r <= 0)
2313                         return r;
2314
2315                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2316                 if (r < 0)
2317                         return r;
2318
2319                 r = generic_array_bisect_plus_one(f,
2320                                                   le64toh(o->data.entry_offset),
2321                                                   le64toh(o->data.entry_array_offset),
2322                                                   le64toh(o->data.n_entries),
2323                                                   p,
2324                                                   test_object_offset,
2325                                                   direction,
2326                                                   &qo, &q, NULL);
2327
2328                 if (r <= 0)
2329                         return r;
2330
2331                 if (p == q) {
2332                         if (ret)
2333                                 *ret = qo;
2334                         if (offset)
2335                                 *offset = q;
2336
2337                         return 1;
2338                 }
2339
2340                 z = q;
2341         }
2342 }
2343
2344 int journal_file_move_to_entry_by_seqnum_for_data(
2345                 JournalFile *f,
2346                 uint64_t data_offset,
2347                 uint64_t seqnum,
2348                 direction_t direction,
2349                 Object **ret, uint64_t *offset) {
2350
2351         Object *d;
2352         int r;
2353
2354         assert(f);
2355
2356         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2357         if (r < 0)
2358                 return r;
2359
2360         return generic_array_bisect_plus_one(f,
2361                                              le64toh(d->data.entry_offset),
2362                                              le64toh(d->data.entry_array_offset),
2363                                              le64toh(d->data.n_entries),
2364                                              seqnum,
2365                                              test_object_seqnum,
2366                                              direction,
2367                                              ret, offset, NULL);
2368 }
2369
2370 int journal_file_move_to_entry_by_realtime_for_data(
2371                 JournalFile *f,
2372                 uint64_t data_offset,
2373                 uint64_t realtime,
2374                 direction_t direction,
2375                 Object **ret, uint64_t *offset) {
2376
2377         Object *d;
2378         int r;
2379
2380         assert(f);
2381
2382         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2383         if (r < 0)
2384                 return r;
2385
2386         return generic_array_bisect_plus_one(f,
2387                                              le64toh(d->data.entry_offset),
2388                                              le64toh(d->data.entry_array_offset),
2389                                              le64toh(d->data.n_entries),
2390                                              realtime,
2391                                              test_object_realtime,
2392                                              direction,
2393                                              ret, offset, NULL);
2394 }
2395
2396 void journal_file_dump(JournalFile *f) {
2397         Object *o;
2398         int r;
2399         uint64_t p;
2400
2401         assert(f);
2402
2403         journal_file_print_header(f);
2404
2405         p = le64toh(f->header->header_size);
2406         while (p != 0) {
2407                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2408                 if (r < 0)
2409                         goto fail;
2410
2411                 switch (o->object.type) {
2412
2413                 case OBJECT_UNUSED:
2414                         printf("Type: OBJECT_UNUSED\n");
2415                         break;
2416
2417                 case OBJECT_DATA:
2418                         printf("Type: OBJECT_DATA\n");
2419                         break;
2420
2421                 case OBJECT_FIELD:
2422                         printf("Type: OBJECT_FIELD\n");
2423                         break;
2424
2425                 case OBJECT_ENTRY:
2426                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2427                                le64toh(o->entry.seqnum),
2428                                le64toh(o->entry.monotonic),
2429                                le64toh(o->entry.realtime));
2430                         break;
2431
2432                 case OBJECT_FIELD_HASH_TABLE:
2433                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2434                         break;
2435
2436                 case OBJECT_DATA_HASH_TABLE:
2437                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2438                         break;
2439
2440                 case OBJECT_ENTRY_ARRAY:
2441                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2442                         break;
2443
2444                 case OBJECT_TAG:
2445                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2446                                le64toh(o->tag.seqnum),
2447                                le64toh(o->tag.epoch));
2448                         break;
2449
2450                 default:
2451                         printf("Type: unknown (%i)\n", o->object.type);
2452                         break;
2453                 }
2454
2455                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2456                         printf("Flags: %s\n",
2457                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2458
2459                 if (p == le64toh(f->header->tail_object_offset))
2460                         p = 0;
2461                 else
2462                         p = p + ALIGN64(le64toh(o->object.size));
2463         }
2464
2465         return;
2466 fail:
2467         log_error("File corrupt");
2468 }
2469
2470 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2471         const char *x;
2472
2473         x = format_timestamp(buf, l, t);
2474         if (x)
2475                 return x;
2476         return " --- ";
2477 }
2478
2479 void journal_file_print_header(JournalFile *f) {
2480         char a[33], b[33], c[33], d[33];
2481         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2482         struct stat st;
2483         char bytes[FORMAT_BYTES_MAX];
2484
2485         assert(f);
2486
2487         printf("File Path: %s\n"
2488                "File ID: %s\n"
2489                "Machine ID: %s\n"
2490                "Boot ID: %s\n"
2491                "Sequential Number ID: %s\n"
2492                "State: %s\n"
2493                "Compatible Flags:%s%s\n"
2494                "Incompatible Flags:%s%s%s\n"
2495                "Header size: %"PRIu64"\n"
2496                "Arena size: %"PRIu64"\n"
2497                "Data Hash Table Size: %"PRIu64"\n"
2498                "Field Hash Table Size: %"PRIu64"\n"
2499                "Rotate Suggested: %s\n"
2500                "Head Sequential Number: %"PRIu64"\n"
2501                "Tail Sequential Number: %"PRIu64"\n"
2502                "Head Realtime Timestamp: %s\n"
2503                "Tail Realtime Timestamp: %s\n"
2504                "Tail Monotonic Timestamp: %s\n"
2505                "Objects: %"PRIu64"\n"
2506                "Entry Objects: %"PRIu64"\n",
2507                f->path,
2508                sd_id128_to_string(f->header->file_id, a),
2509                sd_id128_to_string(f->header->machine_id, b),
2510                sd_id128_to_string(f->header->boot_id, c),
2511                sd_id128_to_string(f->header->seqnum_id, d),
2512                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2513                f->header->state == STATE_ONLINE ? "ONLINE" :
2514                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2515                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2516                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2517                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2518                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2519                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2520                le64toh(f->header->header_size),
2521                le64toh(f->header->arena_size),
2522                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2523                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2524                yes_no(journal_file_rotate_suggested(f, 0)),
2525                le64toh(f->header->head_entry_seqnum),
2526                le64toh(f->header->tail_entry_seqnum),
2527                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2528                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2529                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2530                le64toh(f->header->n_objects),
2531                le64toh(f->header->n_entries));
2532
2533         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2534                 printf("Data Objects: %"PRIu64"\n"
2535                        "Data Hash Table Fill: %.1f%%\n",
2536                        le64toh(f->header->n_data),
2537                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2538
2539         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2540                 printf("Field Objects: %"PRIu64"\n"
2541                        "Field Hash Table Fill: %.1f%%\n",
2542                        le64toh(f->header->n_fields),
2543                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2544
2545         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2546                 printf("Tag Objects: %"PRIu64"\n",
2547                        le64toh(f->header->n_tags));
2548         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2549                 printf("Entry Array Objects: %"PRIu64"\n",
2550                        le64toh(f->header->n_entry_arrays));
2551
2552         if (fstat(f->fd, &st) >= 0)
2553                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2554 }
2555
2556 static int journal_file_warn_btrfs(JournalFile *f) {
2557         unsigned attrs;
2558         int r;
2559
2560         assert(f);
2561
2562         /* Before we write anything, check if the COW logic is turned
2563          * off on btrfs. Given our write pattern that is quite
2564          * unfriendly to COW file systems this should greatly improve
2565          * performance on COW file systems, such as btrfs, at the
2566          * expense of data integrity features (which shouldn't be too
2567          * bad, given that we do our own checksumming). */
2568
2569         r = btrfs_is_filesystem(f->fd);
2570         if (r < 0)
2571                 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2572         if (!r)
2573                 return 0;
2574
2575         r = read_attr_fd(f->fd, &attrs);
2576         if (r < 0)
2577                 return log_warning_errno(r, "Failed to read file attributes: %m");
2578
2579         if (attrs & FS_NOCOW_FL) {
2580                 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2581                 return 0;
2582         }
2583
2584         log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2585                    "This is likely to slow down journal access substantially, please consider turning "
2586                    "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2587
2588         return 1;
2589 }
2590
2591 int journal_file_open(
2592                 const char *fname,
2593                 int flags,
2594                 mode_t mode,
2595                 bool compress,
2596                 bool seal,
2597                 JournalMetrics *metrics,
2598                 MMapCache *mmap_cache,
2599                 JournalFile *template,
2600                 JournalFile **ret) {
2601
2602         bool newly_created = false;
2603         JournalFile *f;
2604         void *h;
2605         int r;
2606
2607         assert(fname);
2608         assert(ret);
2609
2610         if ((flags & O_ACCMODE) != O_RDONLY &&
2611             (flags & O_ACCMODE) != O_RDWR)
2612                 return -EINVAL;
2613
2614         if (!endswith(fname, ".journal") &&
2615             !endswith(fname, ".journal~"))
2616                 return -EINVAL;
2617
2618         f = new0(JournalFile, 1);
2619         if (!f)
2620                 return -ENOMEM;
2621
2622         f->fd = -1;
2623         f->mode = mode;
2624
2625         f->flags = flags;
2626         f->prot = prot_from_flags(flags);
2627         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2628 #if defined(HAVE_LZ4)
2629         f->compress_lz4 = compress;
2630 #elif defined(HAVE_XZ)
2631         f->compress_xz = compress;
2632 #endif
2633 #ifdef HAVE_GCRYPT
2634         f->seal = seal;
2635 #endif
2636
2637         if (mmap_cache)
2638                 f->mmap = mmap_cache_ref(mmap_cache);
2639         else {
2640                 f->mmap = mmap_cache_new();
2641                 if (!f->mmap) {
2642                         r = -ENOMEM;
2643                         goto fail;
2644                 }
2645         }
2646
2647         f->path = strdup(fname);
2648         if (!f->path) {
2649                 r = -ENOMEM;
2650                 goto fail;
2651         }
2652
2653         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2654         if (!f->chain_cache) {
2655                 r = -ENOMEM;
2656                 goto fail;
2657         }
2658
2659         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2660         if (f->fd < 0) {
2661                 r = -errno;
2662                 goto fail;
2663         }
2664
2665         r = journal_file_fstat(f);
2666         if (r < 0)
2667                 goto fail;
2668
2669         if (f->last_stat.st_size == 0 && f->writable) {
2670
2671                 (void) journal_file_warn_btrfs(f);
2672
2673                 /* Let's attach the creation time to the journal file,
2674                  * so that the vacuuming code knows the age of this
2675                  * file even if the file might end up corrupted one
2676                  * day... Ideally we'd just use the creation time many
2677                  * file systems maintain for each file, but there is
2678                  * currently no usable API to query this, hence let's
2679                  * emulate this via extended attributes. If extended
2680                  * attributes are not supported we'll just skip this,
2681                  * and rely solely on mtime/atime/ctime of the file. */
2682
2683                 fd_setcrtime(f->fd, 0);
2684
2685 #ifdef HAVE_GCRYPT
2686                 /* Try to load the FSPRG state, and if we can't, then
2687                  * just don't do sealing */
2688                 if (f->seal) {
2689                         r = journal_file_fss_load(f);
2690                         if (r < 0)
2691                                 f->seal = false;
2692                 }
2693 #endif
2694
2695                 r = journal_file_init_header(f, template);
2696                 if (r < 0)
2697                         goto fail;
2698
2699                 r = journal_file_fstat(f);
2700                 if (r < 0)
2701                         goto fail;
2702
2703                 newly_created = true;
2704         }
2705
2706         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2707                 r = -EIO;
2708                 goto fail;
2709         }
2710
2711         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2712         if (r < 0)
2713                 goto fail;
2714
2715         f->header = h;
2716
2717         if (!newly_created) {
2718                 r = journal_file_verify_header(f);
2719                 if (r < 0)
2720                         goto fail;
2721         }
2722
2723 #ifdef HAVE_GCRYPT
2724         if (!newly_created && f->writable) {
2725                 r = journal_file_fss_load(f);
2726                 if (r < 0)
2727                         goto fail;
2728         }
2729 #endif
2730
2731         if (f->writable) {
2732                 if (metrics) {
2733                         journal_default_metrics(metrics, f->fd);
2734                         f->metrics = *metrics;
2735                 } else if (template)
2736                         f->metrics = template->metrics;
2737
2738                 r = journal_file_refresh_header(f);
2739                 if (r < 0)
2740                         goto fail;
2741         }
2742
2743 #ifdef HAVE_GCRYPT
2744         r = journal_file_hmac_setup(f);
2745         if (r < 0)
2746                 goto fail;
2747 #endif
2748
2749         if (newly_created) {
2750                 r = journal_file_setup_field_hash_table(f);
2751                 if (r < 0)
2752                         goto fail;
2753
2754                 r = journal_file_setup_data_hash_table(f);
2755                 if (r < 0)
2756                         goto fail;
2757
2758 #ifdef HAVE_GCRYPT
2759                 r = journal_file_append_first_tag(f);
2760                 if (r < 0)
2761                         goto fail;
2762 #endif
2763         }
2764
2765         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2766                 r = -EIO;
2767                 goto fail;
2768         }
2769
2770         *ret = f;
2771         return 0;
2772
2773 fail:
2774         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2775                 r = -EIO;
2776
2777         journal_file_close(f);
2778
2779         return r;
2780 }
2781
2782 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2783         _cleanup_free_ char *p = NULL;
2784         size_t l;
2785         JournalFile *old_file, *new_file = NULL;
2786         int r;
2787
2788         assert(f);
2789         assert(*f);
2790
2791         old_file = *f;
2792
2793         if (!old_file->writable)
2794                 return -EINVAL;
2795
2796         if (!endswith(old_file->path, ".journal"))
2797                 return -EINVAL;
2798
2799         l = strlen(old_file->path);
2800         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2801                      (int) l - 8, old_file->path,
2802                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2803                      le64toh((*f)->header->head_entry_seqnum),
2804                      le64toh((*f)->header->head_entry_realtime));
2805         if (r < 0)
2806                 return -ENOMEM;
2807
2808         /* Try to rename the file to the archived version. If the file
2809          * already was deleted, we'll get ENOENT, let's ignore that
2810          * case. */
2811         r = rename(old_file->path, p);
2812         if (r < 0 && errno != ENOENT)
2813                 return -errno;
2814
2815         old_file->header->state = STATE_ARCHIVED;
2816
2817         /* Currently, btrfs is not very good with out write patterns
2818          * and fragments heavily. Let's defrag our journal files when
2819          * we archive them */
2820         old_file->defrag_on_close = true;
2821
2822         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2823         journal_file_close(old_file);
2824
2825         *f = new_file;
2826         return r;
2827 }
2828
2829 int journal_file_open_reliably(
2830                 const char *fname,
2831                 int flags,
2832                 mode_t mode,
2833                 bool compress,
2834                 bool seal,
2835                 JournalMetrics *metrics,
2836                 MMapCache *mmap_cache,
2837                 JournalFile *template,
2838                 JournalFile **ret) {
2839
2840         int r;
2841         size_t l;
2842         _cleanup_free_ char *p = NULL;
2843
2844         r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2845         if (!IN_SET(r,
2846                     -EBADMSG,           /* corrupted */
2847                     -ENODATA,           /* truncated */
2848                     -EHOSTDOWN,         /* other machine */
2849                     -EPROTONOSUPPORT,   /* incompatible feature */
2850                     -EBUSY,             /* unclean shutdown */
2851                     -ESHUTDOWN,         /* already archived */
2852                     -EIO,               /* IO error, including SIGBUS on mmap */
2853                     -EIDRM              /* File has been deleted */))
2854                 return r;
2855
2856         if ((flags & O_ACCMODE) == O_RDONLY)
2857                 return r;
2858
2859         if (!(flags & O_CREAT))
2860                 return r;
2861
2862         if (!endswith(fname, ".journal"))
2863                 return r;
2864
2865         /* The file is corrupted. Rotate it away and try it again (but only once) */
2866
2867         l = strlen(fname);
2868         if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2869                      (int) l - 8, fname,
2870                      now(CLOCK_REALTIME),
2871                      random_u64()) < 0)
2872                 return -ENOMEM;
2873
2874         if (rename(fname, p) < 0)
2875                 return -errno;
2876
2877         /* btrfs doesn't cope well with our write pattern and
2878          * fragments heavily. Let's defrag all files we rotate */
2879
2880         (void) chattr_path(p, false, FS_NOCOW_FL);
2881         (void) btrfs_defrag(p);
2882
2883         log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2884
2885         return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2886 }
2887
2888 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2889         uint64_t i, n;
2890         uint64_t q, xor_hash = 0;
2891         int r;
2892         EntryItem *items;
2893         dual_timestamp ts;
2894
2895         assert(from);
2896         assert(to);
2897         assert(o);
2898         assert(p);
2899
2900         if (!to->writable)
2901                 return -EPERM;
2902
2903         ts.monotonic = le64toh(o->entry.monotonic);
2904         ts.realtime = le64toh(o->entry.realtime);
2905
2906         n = journal_file_entry_n_items(o);
2907         /* alloca() can't take 0, hence let's allocate at least one */
2908         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2909
2910         for (i = 0; i < n; i++) {
2911                 uint64_t l, h;
2912                 le64_t le_hash;
2913                 size_t t;
2914                 void *data;
2915                 Object *u;
2916
2917                 q = le64toh(o->entry.items[i].object_offset);
2918                 le_hash = o->entry.items[i].hash;
2919
2920                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2921                 if (r < 0)
2922                         return r;
2923
2924                 if (le_hash != o->data.hash)
2925                         return -EBADMSG;
2926
2927                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2928                 t = (size_t) l;
2929
2930                 /* We hit the limit on 32bit machines */
2931                 if ((uint64_t) t != l)
2932                         return -E2BIG;
2933
2934                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2935 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2936                         size_t rsize = 0;
2937
2938                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2939                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2940                         if (r < 0)
2941                                 return r;
2942
2943                         data = from->compress_buffer;
2944                         l = rsize;
2945 #else
2946                         return -EPROTONOSUPPORT;
2947 #endif
2948                 } else
2949                         data = o->data.payload;
2950
2951                 r = journal_file_append_data(to, data, l, &u, &h);
2952                 if (r < 0)
2953                         return r;
2954
2955                 xor_hash ^= le64toh(u->data.hash);
2956                 items[i].object_offset = htole64(h);
2957                 items[i].hash = u->data.hash;
2958
2959                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2960                 if (r < 0)
2961                         return r;
2962         }
2963
2964         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2965
2966         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2967                 return -EIO;
2968
2969         return r;
2970 }
2971
2972 void journal_reset_metrics(JournalMetrics *m) {
2973         assert(m);
2974
2975         /* Set everything to "pick automatic values". */
2976
2977         *m = (JournalMetrics) {
2978                 .min_use = (uint64_t) -1,
2979                 .max_use = (uint64_t) -1,
2980                 .min_size = (uint64_t) -1,
2981                 .max_size = (uint64_t) -1,
2982                 .keep_free = (uint64_t) -1,
2983                 .n_max_files = (uint64_t) -1,
2984         };
2985 }
2986
2987 void journal_default_metrics(JournalMetrics *m, int fd) {
2988         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
2989         struct statvfs ss;
2990         uint64_t fs_size;
2991
2992         assert(m);
2993         assert(fd >= 0);
2994
2995         if (fstatvfs(fd, &ss) >= 0)
2996                 fs_size = ss.f_frsize * ss.f_blocks;
2997         else {
2998                 log_debug_errno(errno, "Failed to detremine disk size: %m");
2999                 fs_size = 0;
3000         }
3001
3002         if (m->max_use == (uint64_t) -1) {
3003
3004                 if (fs_size > 0) {
3005                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3006
3007                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
3008                                 m->max_use = DEFAULT_MAX_USE_UPPER;
3009
3010                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
3011                                 m->max_use = DEFAULT_MAX_USE_LOWER;
3012                 } else
3013                         m->max_use = DEFAULT_MAX_USE_LOWER;
3014         } else {
3015                 m->max_use = PAGE_ALIGN(m->max_use);
3016
3017                 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3018                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3019         }
3020
3021         if (m->min_use == (uint64_t) -1)
3022                 m->min_use = DEFAULT_MIN_USE;
3023
3024         if (m->min_use > m->max_use)
3025                 m->min_use = m->max_use;
3026
3027         if (m->max_size == (uint64_t) -1) {
3028                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3029
3030                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3031                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
3032         } else
3033                 m->max_size = PAGE_ALIGN(m->max_size);
3034
3035         if (m->max_size != 0) {
3036                 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3037                         m->max_size = JOURNAL_FILE_SIZE_MIN;
3038
3039                 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3040                         m->max_use = m->max_size*2;
3041         }
3042
3043         if (m->min_size == (uint64_t) -1)
3044                 m->min_size = JOURNAL_FILE_SIZE_MIN;
3045         else {
3046                 m->min_size = PAGE_ALIGN(m->min_size);
3047
3048                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3049                         m->min_size = JOURNAL_FILE_SIZE_MIN;
3050
3051                 if (m->max_size != 0 && m->min_size > m->max_size)
3052                         m->max_size = m->min_size;
3053         }
3054
3055         if (m->keep_free == (uint64_t) -1) {
3056
3057                 if (fs_size > 0) {
3058                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3059
3060                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3061                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3062
3063                 } else
3064                         m->keep_free = DEFAULT_KEEP_FREE;
3065         }
3066
3067         if (m->n_max_files == (uint64_t) -1)
3068                 m->n_max_files = DEFAULT_N_MAX_FILES;
3069
3070         log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3071                   format_bytes(a, sizeof(a), m->min_use),
3072                   format_bytes(b, sizeof(b), m->max_use),
3073                   format_bytes(c, sizeof(c), m->max_size),
3074                   format_bytes(d, sizeof(d), m->min_size),
3075                   format_bytes(e, sizeof(e), m->keep_free),
3076                   m->n_max_files);
3077 }
3078
3079 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3080         assert(f);
3081         assert(from || to);
3082
3083         if (from) {
3084                 if (f->header->head_entry_realtime == 0)
3085                         return -ENOENT;
3086
3087                 *from = le64toh(f->header->head_entry_realtime);
3088         }
3089
3090         if (to) {
3091                 if (f->header->tail_entry_realtime == 0)
3092                         return -ENOENT;
3093
3094                 *to = le64toh(f->header->tail_entry_realtime);
3095         }
3096
3097         return 1;
3098 }
3099
3100 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3101         Object *o;
3102         uint64_t p;
3103         int r;
3104
3105         assert(f);
3106         assert(from || to);
3107
3108         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3109         if (r <= 0)
3110                 return r;
3111
3112         if (le64toh(o->data.n_entries) <= 0)
3113                 return 0;
3114
3115         if (from) {
3116                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3117                 if (r < 0)
3118                         return r;
3119
3120                 *from = le64toh(o->entry.monotonic);
3121         }
3122
3123         if (to) {
3124                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3125                 if (r < 0)
3126                         return r;
3127
3128                 r = generic_array_get_plus_one(f,
3129                                                le64toh(o->data.entry_offset),
3130                                                le64toh(o->data.entry_array_offset),
3131                                                le64toh(o->data.n_entries)-1,
3132                                                &o, NULL);
3133                 if (r <= 0)
3134                         return r;
3135
3136                 *to = le64toh(o->entry.monotonic);
3137         }
3138
3139         return 1;
3140 }
3141
3142 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3143         assert(f);
3144
3145         /* If we gained new header fields we gained new features,
3146          * hence suggest a rotation */
3147         if (le64toh(f->header->header_size) < sizeof(Header)) {
3148                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3149                 return true;
3150         }
3151
3152         /* Let's check if the hash tables grew over a certain fill
3153          * level (75%, borrowing this value from Java's hash table
3154          * implementation), and if so suggest a rotation. To calculate
3155          * the fill level we need the n_data field, which only exists
3156          * in newer versions. */
3157
3158         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3159                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3160                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3161                                   f->path,
3162                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3163                                   le64toh(f->header->n_data),
3164                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3165                                   (unsigned long long) f->last_stat.st_size,
3166                                   f->last_stat.st_size / le64toh(f->header->n_data));
3167                         return true;
3168                 }
3169
3170         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3171                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3172                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3173                                   f->path,
3174                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3175                                   le64toh(f->header->n_fields),
3176                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3177                         return true;
3178                 }
3179
3180         /* Are the data objects properly indexed by field objects? */
3181         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3182             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3183             le64toh(f->header->n_data) > 0 &&
3184             le64toh(f->header->n_fields) == 0)
3185                 return true;
3186
3187         if (max_file_usec > 0) {
3188                 usec_t t, h;
3189
3190                 h = le64toh(f->header->head_entry_realtime);
3191                 t = now(CLOCK_REALTIME);
3192
3193                 if (h > 0 && t > h + max_file_usec)
3194                         return true;
3195         }
3196
3197         return false;
3198 }