src/journal/journal-file.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2011 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <sys/mman.h>
  23 #include <errno.h>
  24 #include <sys/uio.h>
  25 #include <unistd.h>
  26 #include <sys/statvfs.h>
  27 #include <fcntl.h>
  28 #include <stddef.h>
  29 #include <linux/fs.h>
  30
  31 #include "btrfs-util.h"
  32 #include "journal-def.h"
  33 #include "journal-file.h"
  34 #include "journal-authenticate.h"
  35 #include "lookup3.h"
  36 #include "compress.h"
  37 #include "random-util.h"
  38
  39 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
  40 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
  41
  42 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
  43
  44 /* This is the minimum journal file size */
  45 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
  46
  47 /* These are the lower and upper bounds if we deduce the max_use value
  48  * from the file system size */
  49 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
  50 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
  51
  52 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
  53 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */
  54
  55 /* This is the upper bound if we deduce max_size from max_use */
  56 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
  57
  58 /* This is the upper bound if we deduce the keep_free value from the
  59  * file system size */
  60 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
  61
  62 /* This is the keep_free value when we can't determine the system
  63  * size */
  64 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
  65
  66 /* This is the default maximum number of journal files to keep around. */
  67 #define DEFAULT_N_MAX_FILES (100)
  68
  69 /* n_data was the first entry we added after the initial file format design */
  70 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
  71
  72 /* How many entries to keep in the entry array chain cache at max */
  73 #define CHAIN_CACHE_MAX 20
  74
  75 /* How much to increase the journal file size at once each time we allocate something new. */
  76 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
  77
  78 /* Reread fstat() of the file for detecting deletions at least this often */
  79 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
  80
  81 /* The mmap context to use for the header: we pick the one right above the last defined object type */
  82 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
  83
  84 static int journal_file_set_online(JournalFile *f) {
  85         assert(f);
  86
  87         if (!f->writable)
  88                 return -EPERM;
  89
  90         if (!(f->fd >= 0 && f->header))
  91                 return -EINVAL;
  92
  93         if (mmap_cache_got_sigbus(f->mmap, f->fd))
  94                 return -EIO;
  95
  96         switch(f->header->state) {
  97                 case STATE_ONLINE:
  98                         return 0;
  99
 100                 case STATE_OFFLINE:
 101                         f->header->state = STATE_ONLINE;
 102                         fsync(f->fd);
 103                         return 0;
 104
 105                 default:
 106                         return -EINVAL;
 107         }
 108 }
 109
 110 int journal_file_set_offline(JournalFile *f) {
 111         assert(f);
 112
 113         if (!f->writable)
 114                 return -EPERM;
 115
 116         if (!(f->fd >= 0 && f->header))
 117                 return -EINVAL;
 118
 119         if (f->header->state != STATE_ONLINE)
 120                 return 0;
 121
 122         fsync(f->fd);
 123
 124         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 125                 return -EIO;
 126
 127         f->header->state = STATE_OFFLINE;
 128
 129         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 130                 return -EIO;
 131
 132         fsync(f->fd);
 133
 134         return 0;
 135 }
 136
 137 JournalFile* journal_file_close(JournalFile *f) {
 138         assert(f);
 139
 140 #ifdef HAVE_GCRYPT
 141         /* Write the final tag */
 142         if (f->seal && f->writable)
 143                 journal_file_append_tag(f);
 144 #endif
 145
 146         journal_file_set_offline(f);
 147
 148         if (f->mmap && f->fd >= 0)
 149                 mmap_cache_close_fd(f->mmap, f->fd);
 150
 151         if (f->fd >= 0 && f->defrag_on_close) {
 152
 153                 /* Be friendly to btrfs: turn COW back on again now,
 154                  * and defragment the file. We won't write to the file
 155                  * ever again, hence remove all fragmentation, and
 156                  * reenable all the good bits COW usually provides
 157                  * (such as data checksumming). */
 158
 159                 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
 160                 (void) btrfs_defrag_fd(f->fd);
 161         }
 162
 163         safe_close(f->fd);
 164         free(f->path);
 165
 166         if (f->mmap)
 167                 mmap_cache_unref(f->mmap);
 168
 169         ordered_hashmap_free_free(f->chain_cache);
 170
 171 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 172         free(f->compress_buffer);
 173 #endif
 174
 175 #ifdef HAVE_GCRYPT
 176         if (f->fss_file)
 177                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
 178         else
 179                 free(f->fsprg_state);
 180
 181         free(f->fsprg_seed);
 182
 183         if (f->hmac)
 184                 gcry_md_close(f->hmac);
 185 #endif
 186
 187         free(f);
 188         return NULL;
 189 }
 190
 191 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
 192         Header h = {};
 193         ssize_t k;
 194         int r;
 195
 196         assert(f);
 197
 198         memcpy(h.signature, HEADER_SIGNATURE, 8);
 199         h.header_size = htole64(ALIGN64(sizeof(h)));
 200
 201         h.incompatible_flags |= htole32(
 202                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
 203                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
 204
 205         h.compatible_flags = htole32(
 206                 f->seal * HEADER_COMPATIBLE_SEALED);
 207
 208         r = sd_id128_randomize(&h.file_id);
 209         if (r < 0)
 210                 return r;
 211
 212         if (template) {
 213                 h.seqnum_id = template->header->seqnum_id;
 214                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
 215         } else
 216                 h.seqnum_id = h.file_id;
 217
 218         k = pwrite(f->fd, &h, sizeof(h), 0);
 219         if (k < 0)
 220                 return -errno;
 221
 222         if (k != sizeof(h))
 223                 return -EIO;
 224
 225         return 0;
 226 }
 227
 228 static int journal_file_refresh_header(JournalFile *f) {
 229         sd_id128_t boot_id;
 230         int r;
 231
 232         assert(f);
 233
 234         r = sd_id128_get_machine(&f->header->machine_id);
 235         if (r < 0)
 236                 return r;
 237
 238         r = sd_id128_get_boot(&boot_id);
 239         if (r < 0)
 240                 return r;
 241
 242         if (sd_id128_equal(boot_id, f->header->boot_id))
 243                 f->tail_entry_monotonic_valid = true;
 244
 245         f->header->boot_id = boot_id;
 246
 247         r = journal_file_set_online(f);
 248
 249         /* Sync the online state to disk */
 250         fsync(f->fd);
 251
 252         return r;
 253 }
 254
 255 static int journal_file_verify_header(JournalFile *f) {
 256         uint32_t flags;
 257
 258         assert(f);
 259
 260         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
 261                 return -EBADMSG;
 262
 263         /* In both read and write mode we refuse to open files with
 264          * incompatible flags we don't know */
 265         flags = le32toh(f->header->incompatible_flags);
 266         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
 267                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
 268                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
 269                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
 270                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
 271                 if (flags)
 272                         log_debug("Journal file %s uses incompatible flags %"PRIx32
 273                                   " disabled at compilation time.", f->path, flags);
 274                 return -EPROTONOSUPPORT;
 275         }
 276
 277         /* When open for writing we refuse to open files with
 278          * compatible flags, too */
 279         flags = le32toh(f->header->compatible_flags);
 280         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
 281                 if (flags & ~HEADER_COMPATIBLE_ANY)
 282                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
 283                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
 284                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
 285                 if (flags)
 286                         log_debug("Journal file %s uses compatible flags %"PRIx32
 287                                   " disabled at compilation time.", f->path, flags);
 288                 return -EPROTONOSUPPORT;
 289         }
 290
 291         if (f->header->state >= _STATE_MAX)
 292                 return -EBADMSG;
 293
 294         /* The first addition was n_data, so check that we are at least this large */
 295         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
 296                 return -EBADMSG;
 297
 298         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
 299                 return -EBADMSG;
 300
 301         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
 302                 return -ENODATA;
 303
 304         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
 305                 return -ENODATA;
 306
 307         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
 308             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
 309             !VALID64(le64toh(f->header->tail_object_offset)) ||
 310             !VALID64(le64toh(f->header->entry_array_offset)))
 311                 return -ENODATA;
 312
 313         if (f->writable) {
 314                 uint8_t state;
 315                 sd_id128_t machine_id;
 316                 int r;
 317
 318                 r = sd_id128_get_machine(&machine_id);
 319                 if (r < 0)
 320                         return r;
 321
 322                 if (!sd_id128_equal(machine_id, f->header->machine_id))
 323                         return -EHOSTDOWN;
 324
 325                 state = f->header->state;
 326
 327                 if (state == STATE_ONLINE) {
 328                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
 329                         return -EBUSY;
 330                 } else if (state == STATE_ARCHIVED)
 331                         return -ESHUTDOWN;
 332                 else if (state != STATE_OFFLINE) {
 333                         log_debug("Journal file %s has unknown state %i.", f->path, state);
 334                         return -EBUSY;
 335                 }
 336         }
 337
 338         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
 339         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
 340
 341         f->seal = JOURNAL_HEADER_SEALED(f->header);
 342
 343         return 0;
 344 }
 345
 346 static int journal_file_fstat(JournalFile *f) {
 347         assert(f);
 348         assert(f->fd >= 0);
 349
 350         if (fstat(f->fd, &f->last_stat) < 0)
 351                 return -errno;
 352
 353         f->last_stat_usec = now(CLOCK_MONOTONIC);
 354
 355         /* Refuse appending to files that are already deleted */
 356         if (f->last_stat.st_nlink <= 0)
 357                 return -EIDRM;
 358
 359         return 0;
 360 }
 361
 362 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
 363         uint64_t old_size, new_size;
 364         int r;
 365
 366         assert(f);
 367
 368         /* We assume that this file is not sparse, and we know that
 369          * for sure, since we always call posix_fallocate()
 370          * ourselves */
 371
 372         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 373                 return -EIO;
 374
 375         old_size =
 376                 le64toh(f->header->header_size) +
 377                 le64toh(f->header->arena_size);
 378
 379         new_size = PAGE_ALIGN(offset + size);
 380         if (new_size < le64toh(f->header->header_size))
 381                 new_size = le64toh(f->header->header_size);
 382
 383         if (new_size <= old_size) {
 384
 385                 /* We already pre-allocated enough space, but before
 386                  * we write to it, let's check with fstat() if the
 387                  * file got deleted, in order make sure we don't throw
 388                  * away the data immediately. Don't check fstat() for
 389                  * all writes though, but only once ever 10s. */
 390
 391                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
 392                         return 0;
 393
 394                 return journal_file_fstat(f);
 395         }
 396
 397         /* Allocate more space. */
 398
 399         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 400                 return -E2BIG;
 401
 402         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
 403                 struct statvfs svfs;
 404
 405                 if (fstatvfs(f->fd, &svfs) >= 0) {
 406                         uint64_t available;
 407
 408                         available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
 409
 410                         if (new_size - old_size > available)
 411                                 return -E2BIG;
 412                 }
 413         }
 414
 415         /* Increase by larger blocks at once */
 416         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
 417         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 418                 new_size = f->metrics.max_size;
 419
 420         /* Note that the glibc fallocate() fallback is very
 421            inefficient, hence we try to minimize the allocation area
 422            as we can. */
 423         r = posix_fallocate(f->fd, old_size, new_size - old_size);
 424         if (r != 0)
 425                 return -r;
 426
 427         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
 428
 429         return journal_file_fstat(f);
 430 }
 431
 432 static unsigned type_to_context(ObjectType type) {
 433         /* One context for each type, plus one catch-all for the rest */
 434         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
 435         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
 436         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
 437 }
 438
 439 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
 440         int r;
 441
 442         assert(f);
 443         assert(ret);
 444
 445         if (size <= 0)
 446                 return -EINVAL;
 447
 448         /* Avoid SIGBUS on invalid accesses */
 449         if (offset + size > (uint64_t) f->last_stat.st_size) {
 450                 /* Hmm, out of range? Let's refresh the fstat() data
 451                  * first, before we trust that check. */
 452
 453                 r = journal_file_fstat(f);
 454                 if (r < 0)
 455                         return r;
 456
 457                 if (offset + size > (uint64_t) f->last_stat.st_size)
 458                         return -EADDRNOTAVAIL;
 459         }
 460
 461         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
 462 }
 463
 464 static uint64_t minimum_header_size(Object *o) {
 465
 466         static const uint64_t table[] = {
 467                 [OBJECT_DATA] = sizeof(DataObject),
 468                 [OBJECT_FIELD] = sizeof(FieldObject),
 469                 [OBJECT_ENTRY] = sizeof(EntryObject),
 470                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
 471                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
 472                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
 473                 [OBJECT_TAG] = sizeof(TagObject),
 474         };
 475
 476         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
 477                 return sizeof(ObjectHeader);
 478
 479         return table[o->object.type];
 480 }
 481
 482 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
 483         int r;
 484         void *t;
 485         Object *o;
 486         uint64_t s;
 487
 488         assert(f);
 489         assert(ret);
 490
 491         /* Objects may only be located at multiple of 64 bit */
 492         if (!VALID64(offset))
 493                 return -EFAULT;
 494
 495         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
 496         if (r < 0)
 497                 return r;
 498
 499         o = (Object*) t;
 500         s = le64toh(o->object.size);
 501
 502         if (s < sizeof(ObjectHeader))
 503                 return -EBADMSG;
 504
 505         if (o->object.type <= OBJECT_UNUSED)
 506                 return -EBADMSG;
 507
 508         if (s < minimum_header_size(o))
 509                 return -EBADMSG;
 510
 511         if (type > OBJECT_UNUSED && o->object.type != type)
 512                 return -EBADMSG;
 513
 514         if (s > sizeof(ObjectHeader)) {
 515                 r = journal_file_move_to(f, type, false, offset, s, &t);
 516                 if (r < 0)
 517                         return r;
 518
 519                 o = (Object*) t;
 520         }
 521
 522         *ret = o;
 523         return 0;
 524 }
 525
 526 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
 527         uint64_t r;
 528
 529         assert(f);
 530
 531         r = le64toh(f->header->tail_entry_seqnum) + 1;
 532
 533         if (seqnum) {
 534                 /* If an external seqnum counter was passed, we update
 535                  * both the local and the external one, and set it to
 536                  * the maximum of both */
 537
 538                 if (*seqnum + 1 > r)
 539                         r = *seqnum + 1;
 540
 541                 *seqnum = r;
 542         }
 543
 544         f->header->tail_entry_seqnum = htole64(r);
 545
 546         if (f->header->head_entry_seqnum == 0)
 547                 f->header->head_entry_seqnum = htole64(r);
 548
 549         return r;
 550 }
 551
 552 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
 553         int r;
 554         uint64_t p;
 555         Object *tail, *o;
 556         void *t;
 557
 558         assert(f);
 559         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
 560         assert(size >= sizeof(ObjectHeader));
 561         assert(offset);
 562         assert(ret);
 563
 564         r = journal_file_set_online(f);
 565         if (r < 0)
 566                 return r;
 567
 568         p = le64toh(f->header->tail_object_offset);
 569         if (p == 0)
 570                 p = le64toh(f->header->header_size);
 571         else {
 572                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
 573                 if (r < 0)
 574                         return r;
 575
 576                 p += ALIGN64(le64toh(tail->object.size));
 577         }
 578
 579         r = journal_file_allocate(f, p, size);
 580         if (r < 0)
 581                 return r;
 582
 583         r = journal_file_move_to(f, type, false, p, size, &t);
 584         if (r < 0)
 585                 return r;
 586
 587         o = (Object*) t;
 588
 589         zero(o->object);
 590         o->object.type = type;
 591         o->object.size = htole64(size);
 592
 593         f->header->tail_object_offset = htole64(p);
 594         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
 595
 596         *ret = o;
 597         *offset = p;
 598
 599         return 0;
 600 }
 601
 602 static int journal_file_setup_data_hash_table(JournalFile *f) {
 603         uint64_t s, p;
 604         Object *o;
 605         int r;
 606
 607         assert(f);
 608
 609         /* We estimate that we need 1 hash table entry per 768 bytes
 610            of journal file and we want to make sure we never get
 611            beyond 75% fill level. Calculate the hash table size for
 612            the maximum file size based on these metrics. */
 613
 614         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
 615         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
 616                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
 617
 618         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
 619
 620         r = journal_file_append_object(f,
 621                                        OBJECT_DATA_HASH_TABLE,
 622                                        offsetof(Object, hash_table.items) + s,
 623                                        &o, &p);
 624         if (r < 0)
 625                 return r;
 626
 627         memzero(o->hash_table.items, s);
 628
 629         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 630         f->header->data_hash_table_size = htole64(s);
 631
 632         return 0;
 633 }
 634
 635 static int journal_file_setup_field_hash_table(JournalFile *f) {
 636         uint64_t s, p;
 637         Object *o;
 638         int r;
 639
 640         assert(f);
 641
 642         /* We use a fixed size hash table for the fields as this
 643          * number should grow very slowly only */
 644
 645         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
 646         r = journal_file_append_object(f,
 647                                        OBJECT_FIELD_HASH_TABLE,
 648                                        offsetof(Object, hash_table.items) + s,
 649                                        &o, &p);
 650         if (r < 0)
 651                 return r;
 652
 653         memzero(o->hash_table.items, s);
 654
 655         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 656         f->header->field_hash_table_size = htole64(s);
 657
 658         return 0;
 659 }
 660
 661 int journal_file_map_data_hash_table(JournalFile *f) {
 662         uint64_t s, p;
 663         void *t;
 664         int r;
 665
 666         assert(f);
 667
 668         if (f->data_hash_table)
 669                 return 0;
 670
 671         p = le64toh(f->header->data_hash_table_offset);
 672         s = le64toh(f->header->data_hash_table_size);
 673
 674         r = journal_file_move_to(f,
 675                                  OBJECT_DATA_HASH_TABLE,
 676                                  true,
 677                                  p, s,
 678                                  &t);
 679         if (r < 0)
 680                 return r;
 681
 682         f->data_hash_table = t;
 683         return 0;
 684 }
 685
 686 int journal_file_map_field_hash_table(JournalFile *f) {
 687         uint64_t s, p;
 688         void *t;
 689         int r;
 690
 691         assert(f);
 692
 693         if (f->field_hash_table)
 694                 return 0;
 695
 696         p = le64toh(f->header->field_hash_table_offset);
 697         s = le64toh(f->header->field_hash_table_size);
 698
 699         r = journal_file_move_to(f,
 700                                  OBJECT_FIELD_HASH_TABLE,
 701                                  true,
 702                                  p, s,
 703                                  &t);
 704         if (r < 0)
 705                 return r;
 706
 707         f->field_hash_table = t;
 708         return 0;
 709 }
 710
 711 static int journal_file_link_field(
 712                 JournalFile *f,
 713                 Object *o,
 714                 uint64_t offset,
 715                 uint64_t hash) {
 716
 717         uint64_t p, h, m;
 718         int r;
 719
 720         assert(f);
 721         assert(o);
 722         assert(offset > 0);
 723
 724         if (o->object.type != OBJECT_FIELD)
 725                 return -EINVAL;
 726
 727         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 728         if (m <= 0)
 729                 return -EBADMSG;
 730
 731         /* This might alter the window we are looking at */
 732         o->field.next_hash_offset = o->field.head_data_offset = 0;
 733
 734         h = hash % m;
 735         p = le64toh(f->field_hash_table[h].tail_hash_offset);
 736         if (p == 0)
 737                 f->field_hash_table[h].head_hash_offset = htole64(offset);
 738         else {
 739                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 740                 if (r < 0)
 741                         return r;
 742
 743                 o->field.next_hash_offset = htole64(offset);
 744         }
 745
 746         f->field_hash_table[h].tail_hash_offset = htole64(offset);
 747
 748         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
 749                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
 750
 751         return 0;
 752 }
 753
 754 static int journal_file_link_data(
 755                 JournalFile *f,
 756                 Object *o,
 757                 uint64_t offset,
 758                 uint64_t hash) {
 759
 760         uint64_t p, h, m;
 761         int r;
 762
 763         assert(f);
 764         assert(o);
 765         assert(offset > 0);
 766
 767         if (o->object.type != OBJECT_DATA)
 768                 return -EINVAL;
 769
 770         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 771         if (m <= 0)
 772                 return -EBADMSG;
 773
 774         /* This might alter the window we are looking at */
 775         o->data.next_hash_offset = o->data.next_field_offset = 0;
 776         o->data.entry_offset = o->data.entry_array_offset = 0;
 777         o->data.n_entries = 0;
 778
 779         h = hash % m;
 780         p = le64toh(f->data_hash_table[h].tail_hash_offset);
 781         if (p == 0)
 782                 /* Only entry in the hash table is easy */
 783                 f->data_hash_table[h].head_hash_offset = htole64(offset);
 784         else {
 785                 /* Move back to the previous data object, to patch in
 786                  * pointer */
 787
 788                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 789                 if (r < 0)
 790                         return r;
 791
 792                 o->data.next_hash_offset = htole64(offset);
 793         }
 794
 795         f->data_hash_table[h].tail_hash_offset = htole64(offset);
 796
 797         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
 798                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
 799
 800         return 0;
 801 }
 802
 803 int journal_file_find_field_object_with_hash(
 804                 JournalFile *f,
 805                 const void *field, uint64_t size, uint64_t hash,
 806                 Object **ret, uint64_t *offset) {
 807
 808         uint64_t p, osize, h, m;
 809         int r;
 810
 811         assert(f);
 812         assert(field && size > 0);
 813
 814         /* If the field hash table is empty, we can't find anything */
 815         if (le64toh(f->header->field_hash_table_size) <= 0)
 816                 return 0;
 817
 818         /* Map the field hash table, if it isn't mapped yet. */
 819         r = journal_file_map_field_hash_table(f);
 820         if (r < 0)
 821                 return r;
 822
 823         osize = offsetof(Object, field.payload) + size;
 824
 825         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 826         if (m <= 0)
 827                 return -EBADMSG;
 828
 829         h = hash % m;
 830         p = le64toh(f->field_hash_table[h].head_hash_offset);
 831
 832         while (p > 0) {
 833                 Object *o;
 834
 835                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 836                 if (r < 0)
 837                         return r;
 838
 839                 if (le64toh(o->field.hash) == hash &&
 840                     le64toh(o->object.size) == osize &&
 841                     memcmp(o->field.payload, field, size) == 0) {
 842
 843                         if (ret)
 844                                 *ret = o;
 845                         if (offset)
 846                                 *offset = p;
 847
 848                         return 1;
 849                 }
 850
 851                 p = le64toh(o->field.next_hash_offset);
 852         }
 853
 854         return 0;
 855 }
 856
 857 int journal_file_find_field_object(
 858                 JournalFile *f,
 859                 const void *field, uint64_t size,
 860                 Object **ret, uint64_t *offset) {
 861
 862         uint64_t hash;
 863
 864         assert(f);
 865         assert(field && size > 0);
 866
 867         hash = hash64(field, size);
 868
 869         return journal_file_find_field_object_with_hash(f,
 870                                                         field, size, hash,
 871                                                         ret, offset);
 872 }
 873
 874 int journal_file_find_data_object_with_hash(
 875                 JournalFile *f,
 876                 const void *data, uint64_t size, uint64_t hash,
 877                 Object **ret, uint64_t *offset) {
 878
 879         uint64_t p, osize, h, m;
 880         int r;
 881
 882         assert(f);
 883         assert(data || size == 0);
 884
 885         /* If there's no data hash table, then there's no entry. */
 886         if (le64toh(f->header->data_hash_table_size) <= 0)
 887                 return 0;
 888
 889         /* Map the data hash table, if it isn't mapped yet. */
 890         r = journal_file_map_data_hash_table(f);
 891         if (r < 0)
 892                 return r;
 893
 894         osize = offsetof(Object, data.payload) + size;
 895
 896         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 897         if (m <= 0)
 898                 return -EBADMSG;
 899
 900         h = hash % m;
 901         p = le64toh(f->data_hash_table[h].head_hash_offset);
 902
 903         while (p > 0) {
 904                 Object *o;
 905
 906                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 907                 if (r < 0)
 908                         return r;
 909
 910                 if (le64toh(o->data.hash) != hash)
 911                         goto next;
 912
 913                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
 914 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 915                         uint64_t l;
 916                         size_t rsize = 0;
 917
 918                         l = le64toh(o->object.size);
 919                         if (l <= offsetof(Object, data.payload))
 920                                 return -EBADMSG;
 921
 922                         l -= offsetof(Object, data.payload);
 923
 924                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
 925                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
 926                         if (r < 0)
 927                                 return r;
 928
 929                         if (rsize == size &&
 930                             memcmp(f->compress_buffer, data, size) == 0) {
 931
 932                                 if (ret)
 933                                         *ret = o;
 934
 935                                 if (offset)
 936                                         *offset = p;
 937
 938                                 return 1;
 939                         }
 940 #else
 941                         return -EPROTONOSUPPORT;
 942 #endif
 943                 } else if (le64toh(o->object.size) == osize &&
 944                            memcmp(o->data.payload, data, size) == 0) {
 945
 946                         if (ret)
 947                                 *ret = o;
 948
 949                         if (offset)
 950                                 *offset = p;
 951
 952                         return 1;
 953                 }
 954
 955         next:
 956                 p = le64toh(o->data.next_hash_offset);
 957         }
 958
 959         return 0;
 960 }
 961
 962 int journal_file_find_data_object(
 963                 JournalFile *f,
 964                 const void *data, uint64_t size,
 965                 Object **ret, uint64_t *offset) {
 966
 967         uint64_t hash;
 968
 969         assert(f);
 970         assert(data || size == 0);
 971
 972         hash = hash64(data, size);
 973
 974         return journal_file_find_data_object_with_hash(f,
 975                                                        data, size, hash,
 976                                                        ret, offset);
 977 }
 978
 979 static int journal_file_append_field(
 980                 JournalFile *f,
 981                 const void *field, uint64_t size,
 982                 Object **ret, uint64_t *offset) {
 983
 984         uint64_t hash, p;
 985         uint64_t osize;
 986         Object *o;
 987         int r;
 988
 989         assert(f);
 990         assert(field && size > 0);
 991
 992         hash = hash64(field, size);
 993
 994         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
 995         if (r < 0)
 996                 return r;
 997         else if (r > 0) {
 998
 999                 if (ret)
1000                         *ret = o;
1001
1002                 if (offset)
1003                         *offset = p;
1004
1005                 return 0;
1006         }
1007
1008         osize = offsetof(Object, field.payload) + size;
1009         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1010         if (r < 0)
1011                 return r;
1012
1013         o->field.hash = htole64(hash);
1014         memcpy(o->field.payload, field, size);
1015
1016         r = journal_file_link_field(f, o, p, hash);
1017         if (r < 0)
1018                 return r;
1019
1020         /* The linking might have altered the window, so let's
1021          * refresh our pointer */
1022         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1023         if (r < 0)
1024                 return r;
1025
1026 #ifdef HAVE_GCRYPT
1027         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1028         if (r < 0)
1029                 return r;
1030 #endif
1031
1032         if (ret)
1033                 *ret = o;
1034
1035         if (offset)
1036                 *offset = p;
1037
1038         return 0;
1039 }
1040
1041 static int journal_file_append_data(
1042                 JournalFile *f,
1043                 const void *data, uint64_t size,
1044                 Object **ret, uint64_t *offset) {
1045
1046         uint64_t hash, p;
1047         uint64_t osize;
1048         Object *o;
1049         int r, compression = 0;
1050         const void *eq;
1051
1052         assert(f);
1053         assert(data || size == 0);
1054
1055         hash = hash64(data, size);
1056
1057         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1058         if (r < 0)
1059                 return r;
1060         else if (r > 0) {
1061
1062                 if (ret)
1063                         *ret = o;
1064
1065                 if (offset)
1066                         *offset = p;
1067
1068                 return 0;
1069         }
1070
1071         osize = offsetof(Object, data.payload) + size;
1072         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1073         if (r < 0)
1074                 return r;
1075
1076         o->data.hash = htole64(hash);
1077
1078 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1079         if (f->compress_xz &&
1080             size >= COMPRESSION_SIZE_THRESHOLD) {
1081                 size_t rsize = 0;
1082
1083                 compression = compress_blob(data, size, o->data.payload, &rsize);
1084
1085                 if (compression) {
1086                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1087                         o->object.flags |= compression;
1088
1089                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1090                                   size, rsize, object_compressed_to_string(compression));
1091                 }
1092         }
1093 #endif
1094
1095         if (!compression && size > 0)
1096                 memcpy(o->data.payload, data, size);
1097
1098         r = journal_file_link_data(f, o, p, hash);
1099         if (r < 0)
1100                 return r;
1101
1102         /* The linking might have altered the window, so let's
1103          * refresh our pointer */
1104         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1105         if (r < 0)
1106                 return r;
1107
1108         if (!data)
1109                 eq = NULL;
1110         else
1111                 eq = memchr(data, '=', size);
1112         if (eq && eq > data) {
1113                 Object *fo = NULL;
1114                 uint64_t fp;
1115
1116                 /* Create field object ... */
1117                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1118                 if (r < 0)
1119                         return r;
1120
1121                 /* ... and link it in. */
1122                 o->data.next_field_offset = fo->field.head_data_offset;
1123                 fo->field.head_data_offset = le64toh(p);
1124         }
1125
1126 #ifdef HAVE_GCRYPT
1127         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1128         if (r < 0)
1129                 return r;
1130 #endif
1131
1132         if (ret)
1133                 *ret = o;
1134
1135         if (offset)
1136                 *offset = p;
1137
1138         return 0;
1139 }
1140
1141 uint64_t journal_file_entry_n_items(Object *o) {
1142         assert(o);
1143
1144         if (o->object.type != OBJECT_ENTRY)
1145                 return 0;
1146
1147         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1148 }
1149
1150 uint64_t journal_file_entry_array_n_items(Object *o) {
1151         assert(o);
1152
1153         if (o->object.type != OBJECT_ENTRY_ARRAY)
1154                 return 0;
1155
1156         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1157 }
1158
1159 uint64_t journal_file_hash_table_n_items(Object *o) {
1160         assert(o);
1161
1162         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1163             o->object.type != OBJECT_FIELD_HASH_TABLE)
1164                 return 0;
1165
1166         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1167 }
1168
1169 static int link_entry_into_array(JournalFile *f,
1170                                  le64_t *first,
1171                                  le64_t *idx,
1172                                  uint64_t p) {
1173         int r;
1174         uint64_t n = 0, ap = 0, q, i, a, hidx;
1175         Object *o;
1176
1177         assert(f);
1178         assert(first);
1179         assert(idx);
1180         assert(p > 0);
1181
1182         a = le64toh(*first);
1183         i = hidx = le64toh(*idx);
1184         while (a > 0) {
1185
1186                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1187                 if (r < 0)
1188                         return r;
1189
1190                 n = journal_file_entry_array_n_items(o);
1191                 if (i < n) {
1192                         o->entry_array.items[i] = htole64(p);
1193                         *idx = htole64(hidx + 1);
1194                         return 0;
1195                 }
1196
1197                 i -= n;
1198                 ap = a;
1199                 a = le64toh(o->entry_array.next_entry_array_offset);
1200         }
1201
1202         if (hidx > n)
1203                 n = (hidx+1) * 2;
1204         else
1205                 n = n * 2;
1206
1207         if (n < 4)
1208                 n = 4;
1209
1210         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1211                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1212                                        &o, &q);
1213         if (r < 0)
1214                 return r;
1215
1216 #ifdef HAVE_GCRYPT
1217         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1218         if (r < 0)
1219                 return r;
1220 #endif
1221
1222         o->entry_array.items[i] = htole64(p);
1223
1224         if (ap == 0)
1225                 *first = htole64(q);
1226         else {
1227                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1228                 if (r < 0)
1229                         return r;
1230
1231                 o->entry_array.next_entry_array_offset = htole64(q);
1232         }
1233
1234         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1235                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1236
1237         *idx = htole64(hidx + 1);
1238
1239         return 0;
1240 }
1241
1242 static int link_entry_into_array_plus_one(JournalFile *f,
1243                                           le64_t *extra,
1244                                           le64_t *first,
1245                                           le64_t *idx,
1246                                           uint64_t p) {
1247
1248         int r;
1249
1250         assert(f);
1251         assert(extra);
1252         assert(first);
1253         assert(idx);
1254         assert(p > 0);
1255
1256         if (*idx == 0)
1257                 *extra = htole64(p);
1258         else {
1259                 le64_t i;
1260
1261                 i = htole64(le64toh(*idx) - 1);
1262                 r = link_entry_into_array(f, first, &i, p);
1263                 if (r < 0)
1264                         return r;
1265         }
1266
1267         *idx = htole64(le64toh(*idx) + 1);
1268         return 0;
1269 }
1270
1271 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1272         uint64_t p;
1273         int r;
1274         assert(f);
1275         assert(o);
1276         assert(offset > 0);
1277
1278         p = le64toh(o->entry.items[i].object_offset);
1279         if (p == 0)
1280                 return -EINVAL;
1281
1282         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1283         if (r < 0)
1284                 return r;
1285
1286         return link_entry_into_array_plus_one(f,
1287                                               &o->data.entry_offset,
1288                                               &o->data.entry_array_offset,
1289                                               &o->data.n_entries,
1290                                               offset);
1291 }
1292
1293 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1294         uint64_t n, i;
1295         int r;
1296
1297         assert(f);
1298         assert(o);
1299         assert(offset > 0);
1300
1301         if (o->object.type != OBJECT_ENTRY)
1302                 return -EINVAL;
1303
1304         __sync_synchronize();
1305
1306         /* Link up the entry itself */
1307         r = link_entry_into_array(f,
1308                                   &f->header->entry_array_offset,
1309                                   &f->header->n_entries,
1310                                   offset);
1311         if (r < 0)
1312                 return r;
1313
1314         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1315
1316         if (f->header->head_entry_realtime == 0)
1317                 f->header->head_entry_realtime = o->entry.realtime;
1318
1319         f->header->tail_entry_realtime = o->entry.realtime;
1320         f->header->tail_entry_monotonic = o->entry.monotonic;
1321
1322         f->tail_entry_monotonic_valid = true;
1323
1324         /* Link up the items */
1325         n = journal_file_entry_n_items(o);
1326         for (i = 0; i < n; i++) {
1327                 r = journal_file_link_entry_item(f, o, offset, i);
1328                 if (r < 0)
1329                         return r;
1330         }
1331
1332         return 0;
1333 }
1334
1335 static int journal_file_append_entry_internal(
1336                 JournalFile *f,
1337                 const dual_timestamp *ts,
1338                 uint64_t xor_hash,
1339                 const EntryItem items[], unsigned n_items,
1340                 uint64_t *seqnum,
1341                 Object **ret, uint64_t *offset) {
1342         uint64_t np;
1343         uint64_t osize;
1344         Object *o;
1345         int r;
1346
1347         assert(f);
1348         assert(items || n_items == 0);
1349         assert(ts);
1350
1351         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1352
1353         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1354         if (r < 0)
1355                 return r;
1356
1357         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1358         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1359         o->entry.realtime = htole64(ts->realtime);
1360         o->entry.monotonic = htole64(ts->monotonic);
1361         o->entry.xor_hash = htole64(xor_hash);
1362         o->entry.boot_id = f->header->boot_id;
1363
1364 #ifdef HAVE_GCRYPT
1365         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1366         if (r < 0)
1367                 return r;
1368 #endif
1369
1370         r = journal_file_link_entry(f, o, np);
1371         if (r < 0)
1372                 return r;
1373
1374         if (ret)
1375                 *ret = o;
1376
1377         if (offset)
1378                 *offset = np;
1379
1380         return 0;
1381 }
1382
1383 void journal_file_post_change(JournalFile *f) {
1384         assert(f);
1385
1386         /* inotify() does not receive IN_MODIFY events from file
1387          * accesses done via mmap(). After each access we hence
1388          * trigger IN_MODIFY by truncating the journal file to its
1389          * current size which triggers IN_MODIFY. */
1390
1391         __sync_synchronize();
1392
1393         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1394                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1395 }
1396
1397 static int entry_item_cmp(const void *_a, const void *_b) {
1398         const EntryItem *a = _a, *b = _b;
1399
1400         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1401                 return -1;
1402         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1403                 return 1;
1404         return 0;
1405 }
1406
1407 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1408         unsigned i;
1409         EntryItem *items;
1410         int r;
1411         uint64_t xor_hash = 0;
1412         struct dual_timestamp _ts;
1413
1414         assert(f);
1415         assert(iovec || n_iovec == 0);
1416
1417         if (!ts) {
1418                 dual_timestamp_get(&_ts);
1419                 ts = &_ts;
1420         }
1421
1422         if (f->tail_entry_monotonic_valid &&
1423             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1424                 return -EINVAL;
1425
1426 #ifdef HAVE_GCRYPT
1427         r = journal_file_maybe_append_tag(f, ts->realtime);
1428         if (r < 0)
1429                 return r;
1430 #endif
1431
1432         /* alloca() can't take 0, hence let's allocate at least one */
1433         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1434
1435         for (i = 0; i < n_iovec; i++) {
1436                 uint64_t p;
1437                 Object *o;
1438
1439                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1440                 if (r < 0)
1441                         return r;
1442
1443                 xor_hash ^= le64toh(o->data.hash);
1444                 items[i].object_offset = htole64(p);
1445                 items[i].hash = o->data.hash;
1446         }
1447
1448         /* Order by the position on disk, in order to improve seek
1449          * times for rotating media. */
1450         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1451
1452         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1453
1454         /* If the memory mapping triggered a SIGBUS then we return an
1455          * IO error and ignore the error code passed down to us, since
1456          * it is very likely just an effect of a nullified replacement
1457          * mapping page */
1458
1459         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1460                 r = -EIO;
1461
1462         journal_file_post_change(f);
1463
1464         return r;
1465 }
1466
/* One cache record per entry array chain, remembering how far into the
 * chain a previous lookup got so that later lookups can skip ahead. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1474
1475 static void chain_cache_put(
1476                 OrderedHashmap *h,
1477                 ChainCacheItem *ci,
1478                 uint64_t first,
1479                 uint64_t array,
1480                 uint64_t begin,
1481                 uint64_t total,
1482                 uint64_t last_index) {
1483
1484         if (!ci) {
1485                 /* If the chain item to cache for this chain is the
1486                  * first one it's not worth caching anything */
1487                 if (array == first)
1488                         return;
1489
1490                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1491                         ci = ordered_hashmap_steal_first(h);
1492                         assert(ci);
1493                 } else {
1494                         ci = new(ChainCacheItem, 1);
1495                         if (!ci)
1496                                 return;
1497                 }
1498
1499                 ci->first = first;
1500
1501                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1502                         free(ci);
1503                         return;
1504                 }
1505         } else
1506                 assert(ci->first == first);
1507
1508         ci->array = array;
1509         ci->begin = begin;
1510         ci->total = total;
1511         ci->last_index = last_index;
1512 }
1513
1514 static int generic_array_get(
1515                 JournalFile *f,
1516                 uint64_t first,
1517                 uint64_t i,
1518                 Object **ret, uint64_t *offset) {
1519
1520         Object *o;
1521         uint64_t p = 0, a, t = 0;
1522         int r;
1523         ChainCacheItem *ci;
1524
1525         assert(f);
1526
1527         a = first;
1528
1529         /* Try the chain cache first */
1530         ci = ordered_hashmap_get(f->chain_cache, &first);
1531         if (ci && i > ci->total) {
1532                 a = ci->array;
1533                 i -= ci->total;
1534                 t = ci->total;
1535         }
1536
1537         while (a > 0) {
1538                 uint64_t k;
1539
1540                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1541                 if (r < 0)
1542                         return r;
1543
1544                 k = journal_file_entry_array_n_items(o);
1545                 if (i < k) {
1546                         p = le64toh(o->entry_array.items[i]);
1547                         goto found;
1548                 }
1549
1550                 i -= k;
1551                 t += k;
1552                 a = le64toh(o->entry_array.next_entry_array_offset);
1553         }
1554
1555         return 0;
1556
1557 found:
1558         /* Let's cache this item for the next invocation */
1559         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1560
1561         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1562         if (r < 0)
1563                 return r;
1564
1565         if (ret)
1566                 *ret = o;
1567
1568         if (offset)
1569                 *offset = p;
1570
1571         return 1;
1572 }
1573
1574 static int generic_array_get_plus_one(
1575                 JournalFile *f,
1576                 uint64_t extra,
1577                 uint64_t first,
1578                 uint64_t i,
1579                 Object **ret, uint64_t *offset) {
1580
1581         Object *o;
1582
1583         assert(f);
1584
1585         if (i == 0) {
1586                 int r;
1587
1588                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1589                 if (r < 0)
1590                         return r;
1591
1592                 if (ret)
1593                         *ret = o;
1594
1595                 if (offset)
1596                         *offset = extra;
1597
1598                 return 1;
1599         }
1600
1601         return generic_array_get(f, first, i-1, ret, offset);
1602 }
1603
/* Non-negative results of the test_object() callbacks used when
 * bisecting entry arrays (negative results are errors). */
enum {
        /* the tested entry matches the needle */
        TEST_FOUND = 0,

        /* the tested entry lies left of the position we are looking
         * for, i.e. the search has to continue to the right of it */
        TEST_LEFT = 1,

        /* the tested entry lies at or right of the position we are
         * looking for, i.e. it becomes the new right boundary */
        TEST_RIGHT = 2
};
1609
1610 static int generic_array_bisect(
1611                 JournalFile *f,
1612                 uint64_t first,
1613                 uint64_t n,
1614                 uint64_t needle,
1615                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1616                 direction_t direction,
1617                 Object **ret,
1618                 uint64_t *offset,
1619                 uint64_t *idx) {
1620
1621         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1622         bool subtract_one = false;
1623         Object *o, *array = NULL;
1624         int r;
1625         ChainCacheItem *ci;
1626
1627         assert(f);
1628         assert(test_object);
1629
1630         /* Start with the first array in the chain */
1631         a = first;
1632
1633         ci = ordered_hashmap_get(f->chain_cache, &first);
1634         if (ci && n > ci->total) {
1635                 /* Ah, we have iterated this bisection array chain
1636                  * previously! Let's see if we can skip ahead in the
1637                  * chain, as far as the last time. But we can't jump
1638                  * backwards in the chain, so let's check that
1639                  * first. */
1640
1641                 r = test_object(f, ci->begin, needle);
1642                 if (r < 0)
1643                         return r;
1644
1645                 if (r == TEST_LEFT) {
1646                         /* OK, what we are looking for is right of the
1647                          * begin of this EntryArray, so let's jump
1648                          * straight to previously cached array in the
1649                          * chain */
1650
1651                         a = ci->array;
1652                         n -= ci->total;
1653                         t = ci->total;
1654                         last_index = ci->last_index;
1655                 }
1656         }
1657
1658         while (a > 0) {
1659                 uint64_t left, right, k, lp;
1660
1661                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1662                 if (r < 0)
1663                         return r;
1664
1665                 k = journal_file_entry_array_n_items(array);
1666                 right = MIN(k, n);
1667                 if (right <= 0)
1668                         return 0;
1669
1670                 i = right - 1;
1671                 lp = p = le64toh(array->entry_array.items[i]);
1672                 if (p <= 0)
1673                         return -EBADMSG;
1674
1675                 r = test_object(f, p, needle);
1676                 if (r < 0)
1677                         return r;
1678
1679                 if (r == TEST_FOUND)
1680                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1681
1682                 if (r == TEST_RIGHT) {
1683                         left = 0;
1684                         right -= 1;
1685
1686                         if (last_index != (uint64_t) -1) {
1687                                 assert(last_index <= right);
1688
1689                                 /* If we cached the last index we
1690                                  * looked at, let's try to not to jump
1691                                  * too wildly around and see if we can
1692                                  * limit the range to look at early to
1693                                  * the immediate neighbors of the last
1694                                  * index we looked at. */
1695
1696                                 if (last_index > 0) {
1697                                         uint64_t x = last_index - 1;
1698
1699                                         p = le64toh(array->entry_array.items[x]);
1700                                         if (p <= 0)
1701                                                 return -EBADMSG;
1702
1703                                         r = test_object(f, p, needle);
1704                                         if (r < 0)
1705                                                 return r;
1706
1707                                         if (r == TEST_FOUND)
1708                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1709
1710                                         if (r == TEST_RIGHT)
1711                                                 right = x;
1712                                         else
1713                                                 left = x + 1;
1714                                 }
1715
1716                                 if (last_index < right) {
1717                                         uint64_t y = last_index + 1;
1718
1719                                         p = le64toh(array->entry_array.items[y]);
1720                                         if (p <= 0)
1721                                                 return -EBADMSG;
1722
1723                                         r = test_object(f, p, needle);
1724                                         if (r < 0)
1725                                                 return r;
1726
1727                                         if (r == TEST_FOUND)
1728                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1729
1730                                         if (r == TEST_RIGHT)
1731                                                 right = y;
1732                                         else
1733                                                 left = y + 1;
1734                                 }
1735                         }
1736
1737                         for (;;) {
1738                                 if (left == right) {
1739                                         if (direction == DIRECTION_UP)
1740                                                 subtract_one = true;
1741
1742                                         i = left;
1743                                         goto found;
1744                                 }
1745
1746                                 assert(left < right);
1747                                 i = (left + right) / 2;
1748
1749                                 p = le64toh(array->entry_array.items[i]);
1750                                 if (p <= 0)
1751                                         return -EBADMSG;
1752
1753                                 r = test_object(f, p, needle);
1754                                 if (r < 0)
1755                                         return r;
1756
1757                                 if (r == TEST_FOUND)
1758                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1759
1760                                 if (r == TEST_RIGHT)
1761                                         right = i;
1762                                 else
1763                                         left = i + 1;
1764                         }
1765                 }
1766
1767                 if (k >= n) {
1768                         if (direction == DIRECTION_UP) {
1769                                 i = n;
1770                                 subtract_one = true;
1771                                 goto found;
1772                         }
1773
1774                         return 0;
1775                 }
1776
1777                 last_p = lp;
1778
1779                 n -= k;
1780                 t += k;
1781                 last_index = (uint64_t) -1;
1782                 a = le64toh(array->entry_array.next_entry_array_offset);
1783         }
1784
1785         return 0;
1786
1787 found:
1788         if (subtract_one && t == 0 && i == 0)
1789                 return 0;
1790
1791         /* Let's cache this item for the next invocation */
1792         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1793
1794         if (subtract_one && i == 0)
1795                 p = last_p;
1796         else if (subtract_one)
1797                 p = le64toh(array->entry_array.items[i-1]);
1798         else
1799                 p = le64toh(array->entry_array.items[i]);
1800
1801         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1802         if (r < 0)
1803                 return r;
1804
1805         if (ret)
1806                 *ret = o;
1807
1808         if (offset)
1809                 *offset = p;
1810
1811         if (idx)
1812                 *idx = t + i + (subtract_one ? -1 : 0);
1813
1814         return 1;
1815 }
1816
1817 static int generic_array_bisect_plus_one(
1818                 JournalFile *f,
1819                 uint64_t extra,
1820                 uint64_t first,
1821                 uint64_t n,
1822                 uint64_t needle,
1823                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1824                 direction_t direction,
1825                 Object **ret,
1826                 uint64_t *offset,
1827                 uint64_t *idx) {
1828
1829         int r;
1830         bool step_back = false;
1831         Object *o;
1832
1833         assert(f);
1834         assert(test_object);
1835
1836         if (n <= 0)
1837                 return 0;
1838
1839         /* This bisects the array in object 'first', but first checks
1840          * an extra  */
1841         r = test_object(f, extra, needle);
1842         if (r < 0)
1843                 return r;
1844
1845         if (r == TEST_FOUND)
1846                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1847
1848         /* if we are looking with DIRECTION_UP then we need to first
1849            see if in the actual array there is a matching entry, and
1850            return the last one of that. But if there isn't any we need
1851            to return this one. Hence remember this, and return it
1852            below. */
1853         if (r == TEST_LEFT)
1854                 step_back = direction == DIRECTION_UP;
1855
1856         if (r == TEST_RIGHT) {
1857                 if (direction == DIRECTION_DOWN)
1858                         goto found;
1859                 else
1860                         return 0;
1861         }
1862
1863         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1864
1865         if (r == 0 && step_back)
1866                 goto found;
1867
1868         if (r > 0 && idx)
1869                 (*idx) ++;
1870
1871         return r;
1872
1873 found:
1874         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1875         if (r < 0)
1876                 return r;
1877
1878         if (ret)
1879                 *ret = o;
1880
1881         if (offset)
1882                 *offset = extra;
1883
1884         if (idx)
1885                 *idx = 0;
1886
1887         return 1;
1888 }
1889
1890 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1891         assert(f);
1892         assert(p > 0);
1893
1894         if (p == needle)
1895                 return TEST_FOUND;
1896         else if (p < needle)
1897                 return TEST_LEFT;
1898         else
1899                 return TEST_RIGHT;
1900 }
1901
1902 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1903         Object *o;
1904         int r;
1905
1906         assert(f);
1907         assert(p > 0);
1908
1909         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1910         if (r < 0)
1911                 return r;
1912
1913         if (le64toh(o->entry.seqnum) == needle)
1914                 return TEST_FOUND;
1915         else if (le64toh(o->entry.seqnum) < needle)
1916                 return TEST_LEFT;
1917         else
1918                 return TEST_RIGHT;
1919 }
1920
1921 int journal_file_move_to_entry_by_seqnum(
1922                 JournalFile *f,
1923                 uint64_t seqnum,
1924                 direction_t direction,
1925                 Object **ret,
1926                 uint64_t *offset) {
1927
1928         return generic_array_bisect(f,
1929                                     le64toh(f->header->entry_array_offset),
1930                                     le64toh(f->header->n_entries),
1931                                     seqnum,
1932                                     test_object_seqnum,
1933                                     direction,
1934                                     ret, offset, NULL);
1935 }
1936
1937 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1938         Object *o;
1939         int r;
1940
1941         assert(f);
1942         assert(p > 0);
1943
1944         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1945         if (r < 0)
1946                 return r;
1947
1948         if (le64toh(o->entry.realtime) == needle)
1949                 return TEST_FOUND;
1950         else if (le64toh(o->entry.realtime) < needle)
1951                 return TEST_LEFT;
1952         else
1953                 return TEST_RIGHT;
1954 }
1955
1956 int journal_file_move_to_entry_by_realtime(
1957                 JournalFile *f,
1958                 uint64_t realtime,
1959                 direction_t direction,
1960                 Object **ret,
1961                 uint64_t *offset) {
1962
1963         return generic_array_bisect(f,
1964                                     le64toh(f->header->entry_array_offset),
1965                                     le64toh(f->header->n_entries),
1966                                     realtime,
1967                                     test_object_realtime,
1968                                     direction,
1969                                     ret, offset, NULL);
1970 }
1971
1972 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1973         Object *o;
1974         int r;
1975
1976         assert(f);
1977         assert(p > 0);
1978
1979         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1980         if (r < 0)
1981                 return r;
1982
1983         if (le64toh(o->entry.monotonic) == needle)
1984                 return TEST_FOUND;
1985         else if (le64toh(o->entry.monotonic) < needle)
1986                 return TEST_LEFT;
1987         else
1988                 return TEST_RIGHT;
1989 }
1990
1991 static int find_data_object_by_boot_id(
1992                 JournalFile *f,
1993                 sd_id128_t boot_id,
1994                 Object **o,
1995                 uint64_t *b) {
1996
1997         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1998
1999         sd_id128_to_string(boot_id, t + 9);
2000         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2001 }
2002
2003 int journal_file_move_to_entry_by_monotonic(
2004                 JournalFile *f,
2005                 sd_id128_t boot_id,
2006                 uint64_t monotonic,
2007                 direction_t direction,
2008                 Object **ret,
2009                 uint64_t *offset) {
2010
2011         Object *o;
2012         int r;
2013
2014         assert(f);
2015
2016         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2017         if (r < 0)
2018                 return r;
2019         if (r == 0)
2020                 return -ENOENT;
2021
2022         return generic_array_bisect_plus_one(f,
2023                                              le64toh(o->data.entry_offset),
2024                                              le64toh(o->data.entry_array_offset),
2025                                              le64toh(o->data.n_entries),
2026                                              monotonic,
2027                                              test_object_monotonic,
2028                                              direction,
2029                                              ret, offset, NULL);
2030 }
2031
2032 void journal_file_reset_location(JournalFile *f) {
2033         f->location_type = LOCATION_HEAD;
2034         f->current_offset = 0;
2035         f->current_seqnum = 0;
2036         f->current_realtime = 0;
2037         f->current_monotonic = 0;
2038         zero(f->current_boot_id);
2039         f->current_xor_hash = 0;
2040 }
2041
2042 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2043         f->location_type = LOCATION_SEEK;
2044         f->current_offset = offset;
2045         f->current_seqnum = le64toh(o->entry.seqnum);
2046         f->current_realtime = le64toh(o->entry.realtime);
2047         f->current_monotonic = le64toh(o->entry.monotonic);
2048         f->current_boot_id = o->entry.boot_id;
2049         f->current_xor_hash = le64toh(o->entry.xor_hash);
2050 }
2051
2052 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2053         assert(af);
2054         assert(bf);
2055         assert(af->location_type == LOCATION_SEEK);
2056         assert(bf->location_type == LOCATION_SEEK);
2057
2058         /* If contents and timestamps match, these entries are
2059          * identical, even if the seqnum does not match */
2060         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2061             af->current_monotonic == bf->current_monotonic &&
2062             af->current_realtime == bf->current_realtime &&
2063             af->current_xor_hash == bf->current_xor_hash)
2064                 return 0;
2065
2066         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2067
2068                 /* If this is from the same seqnum source, compare
2069                  * seqnums */
2070                 if (af->current_seqnum < bf->current_seqnum)
2071                         return -1;
2072                 if (af->current_seqnum > bf->current_seqnum)
2073                         return 1;
2074
2075                 /* Wow! This is weird, different data but the same
2076                  * seqnums? Something is borked, but let's make the
2077                  * best of it and compare by time. */
2078         }
2079
2080         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2081
2082                 /* If the boot id matches, compare monotonic time */
2083                 if (af->current_monotonic < bf->current_monotonic)
2084                         return -1;
2085                 if (af->current_monotonic > bf->current_monotonic)
2086                         return 1;
2087         }
2088
2089         /* Otherwise, compare UTC time */
2090         if (af->current_realtime < bf->current_realtime)
2091                 return -1;
2092         if (af->current_realtime > bf->current_realtime)
2093                 return 1;
2094
2095         /* Finally, compare by contents */
2096         if (af->current_xor_hash < bf->current_xor_hash)
2097                 return -1;
2098         if (af->current_xor_hash > bf->current_xor_hash)
2099                 return 1;
2100
2101         return 0;
2102 }
2103
2104 int journal_file_next_entry(
2105                 JournalFile *f,
2106                 uint64_t p,
2107                 direction_t direction,
2108                 Object **ret, uint64_t *offset) {
2109
2110         uint64_t i, n, ofs;
2111         int r;
2112
2113         assert(f);
2114
2115         n = le64toh(f->header->n_entries);
2116         if (n <= 0)
2117                 return 0;
2118
2119         if (p == 0)
2120                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2121         else {
2122                 r = generic_array_bisect(f,
2123                                          le64toh(f->header->entry_array_offset),
2124                                          le64toh(f->header->n_entries),
2125                                          p,
2126                                          test_object_offset,
2127                                          DIRECTION_DOWN,
2128                                          NULL, NULL,
2129                                          &i);
2130                 if (r <= 0)
2131                         return r;
2132
2133                 if (direction == DIRECTION_DOWN) {
2134                         if (i >= n - 1)
2135                                 return 0;
2136
2137                         i++;
2138                 } else {
2139                         if (i <= 0)
2140                                 return 0;
2141
2142                         i--;
2143                 }
2144         }
2145
2146         /* And jump to it */
2147         r = generic_array_get(f,
2148                               le64toh(f->header->entry_array_offset),
2149                               i,
2150                               ret, &ofs);
2151         if (r <= 0)
2152                 return r;
2153
2154         if (p > 0 &&
2155             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2156                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2157                           f->path, i);
2158                 return -EBADMSG;
2159         }
2160
2161         if (offset)
2162                 *offset = ofs;
2163
2164         return 1;
2165 }
2166
2167 int journal_file_next_entry_for_data(
2168                 JournalFile *f,
2169                 Object *o, uint64_t p,
2170                 uint64_t data_offset,
2171                 direction_t direction,
2172                 Object **ret, uint64_t *offset) {
2173
2174         uint64_t n, i;
2175         int r;
2176         Object *d;
2177
2178         assert(f);
2179         assert(p > 0 || !o);
2180
2181         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2182         if (r < 0)
2183                 return r;
2184
2185         n = le64toh(d->data.n_entries);
2186         if (n <= 0)
2187                 return n;
2188
2189         if (!o)
2190                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2191         else {
2192                 if (o->object.type != OBJECT_ENTRY)
2193                         return -EINVAL;
2194
2195                 r = generic_array_bisect_plus_one(f,
2196                                                   le64toh(d->data.entry_offset),
2197                                                   le64toh(d->data.entry_array_offset),
2198                                                   le64toh(d->data.n_entries),
2199                                                   p,
2200                                                   test_object_offset,
2201                                                   DIRECTION_DOWN,
2202                                                   NULL, NULL,
2203                                                   &i);
2204
2205                 if (r <= 0)
2206                         return r;
2207
2208                 if (direction == DIRECTION_DOWN) {
2209                         if (i >= n - 1)
2210                                 return 0;
2211
2212                         i++;
2213                 } else {
2214                         if (i <= 0)
2215                                 return 0;
2216
2217                         i--;
2218                 }
2219
2220         }
2221
2222         return generic_array_get_plus_one(f,
2223                                           le64toh(d->data.entry_offset),
2224                                           le64toh(d->data.entry_array_offset),
2225                                           i,
2226                                           ret, offset);
2227 }
2228
2229 int journal_file_move_to_entry_by_offset_for_data(
2230                 JournalFile *f,
2231                 uint64_t data_offset,
2232                 uint64_t p,
2233                 direction_t direction,
2234                 Object **ret, uint64_t *offset) {
2235
2236         int r;
2237         Object *d;
2238
2239         assert(f);
2240
2241         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2242         if (r < 0)
2243                 return r;
2244
2245         return generic_array_bisect_plus_one(f,
2246                                              le64toh(d->data.entry_offset),
2247                                              le64toh(d->data.entry_array_offset),
2248                                              le64toh(d->data.n_entries),
2249                                              p,
2250                                              test_object_offset,
2251                                              direction,
2252                                              ret, offset, NULL);
2253 }
2254
2255 int journal_file_move_to_entry_by_monotonic_for_data(
2256                 JournalFile *f,
2257                 uint64_t data_offset,
2258                 sd_id128_t boot_id,
2259                 uint64_t monotonic,
2260                 direction_t direction,
2261                 Object **ret, uint64_t *offset) {
2262
2263         Object *o, *d;
2264         int r;
2265         uint64_t b, z;
2266
2267         assert(f);
2268
2269         /* First, seek by time */
2270         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2271         if (r < 0)
2272                 return r;
2273         if (r == 0)
2274                 return -ENOENT;
2275
2276         r = generic_array_bisect_plus_one(f,
2277                                           le64toh(o->data.entry_offset),
2278                                           le64toh(o->data.entry_array_offset),
2279                                           le64toh(o->data.n_entries),
2280                                           monotonic,
2281                                           test_object_monotonic,
2282                                           direction,
2283                                           NULL, &z, NULL);
2284         if (r <= 0)
2285                 return r;
2286
2287         /* And now, continue seeking until we find an entry that
2288          * exists in both bisection arrays */
2289
2290         for (;;) {
2291                 Object *qo;
2292                 uint64_t p, q;
2293
2294                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2295                 if (r < 0)
2296                         return r;
2297
2298                 r = generic_array_bisect_plus_one(f,
2299                                                   le64toh(d->data.entry_offset),
2300                                                   le64toh(d->data.entry_array_offset),
2301                                                   le64toh(d->data.n_entries),
2302                                                   z,
2303                                                   test_object_offset,
2304                                                   direction,
2305                                                   NULL, &p, NULL);
2306                 if (r <= 0)
2307                         return r;
2308
2309                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2310                 if (r < 0)
2311                         return r;
2312
2313                 r = generic_array_bisect_plus_one(f,
2314                                                   le64toh(o->data.entry_offset),
2315                                                   le64toh(o->data.entry_array_offset),
2316                                                   le64toh(o->data.n_entries),
2317                                                   p,
2318                                                   test_object_offset,
2319                                                   direction,
2320                                                   &qo, &q, NULL);
2321
2322                 if (r <= 0)
2323                         return r;
2324
2325                 if (p == q) {
2326                         if (ret)
2327                                 *ret = qo;
2328                         if (offset)
2329                                 *offset = q;
2330
2331                         return 1;
2332                 }
2333
2334                 z = q;
2335         }
2336 }
2337
2338 int journal_file_move_to_entry_by_seqnum_for_data(
2339                 JournalFile *f,
2340                 uint64_t data_offset,
2341                 uint64_t seqnum,
2342                 direction_t direction,
2343                 Object **ret, uint64_t *offset) {
2344
2345         Object *d;
2346         int r;
2347
2348         assert(f);
2349
2350         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2351         if (r < 0)
2352                 return r;
2353
2354         return generic_array_bisect_plus_one(f,
2355                                              le64toh(d->data.entry_offset),
2356                                              le64toh(d->data.entry_array_offset),
2357                                              le64toh(d->data.n_entries),
2358                                              seqnum,
2359                                              test_object_seqnum,
2360                                              direction,
2361                                              ret, offset, NULL);
2362 }
2363
2364 int journal_file_move_to_entry_by_realtime_for_data(
2365                 JournalFile *f,
2366                 uint64_t data_offset,
2367                 uint64_t realtime,
2368                 direction_t direction,
2369                 Object **ret, uint64_t *offset) {
2370
2371         Object *d;
2372         int r;
2373
2374         assert(f);
2375
2376         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2377         if (r < 0)
2378                 return r;
2379
2380         return generic_array_bisect_plus_one(f,
2381                                              le64toh(d->data.entry_offset),
2382                                              le64toh(d->data.entry_array_offset),
2383                                              le64toh(d->data.n_entries),
2384                                              realtime,
2385                                              test_object_realtime,
2386                                              direction,
2387                                              ret, offset, NULL);
2388 }
2389
2390 void journal_file_dump(JournalFile *f) {
2391         Object *o;
2392         int r;
2393         uint64_t p;
2394
2395         assert(f);
2396
2397         journal_file_print_header(f);
2398
2399         p = le64toh(f->header->header_size);
2400         while (p != 0) {
2401                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2402                 if (r < 0)
2403                         goto fail;
2404
2405                 switch (o->object.type) {
2406
2407                 case OBJECT_UNUSED:
2408                         printf("Type: OBJECT_UNUSED\n");
2409                         break;
2410
2411                 case OBJECT_DATA:
2412                         printf("Type: OBJECT_DATA\n");
2413                         break;
2414
2415                 case OBJECT_FIELD:
2416                         printf("Type: OBJECT_FIELD\n");
2417                         break;
2418
2419                 case OBJECT_ENTRY:
2420                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2421                                le64toh(o->entry.seqnum),
2422                                le64toh(o->entry.monotonic),
2423                                le64toh(o->entry.realtime));
2424                         break;
2425
2426                 case OBJECT_FIELD_HASH_TABLE:
2427                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2428                         break;
2429
2430                 case OBJECT_DATA_HASH_TABLE:
2431                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2432                         break;
2433
2434                 case OBJECT_ENTRY_ARRAY:
2435                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2436                         break;
2437
2438                 case OBJECT_TAG:
2439                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2440                                le64toh(o->tag.seqnum),
2441                                le64toh(o->tag.epoch));
2442                         break;
2443
2444                 default:
2445                         printf("Type: unknown (%i)\n", o->object.type);
2446                         break;
2447                 }
2448
2449                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2450                         printf("Flags: %s\n",
2451                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2452
2453                 if (p == le64toh(f->header->tail_object_offset))
2454                         p = 0;
2455                 else
2456                         p = p + ALIGN64(le64toh(o->object.size));
2457         }
2458
2459         return;
2460 fail:
2461         log_error("File corrupt");
2462 }
2463
2464 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2465         const char *x;
2466
2467         x = format_timestamp(buf, l, t);
2468         if (x)
2469                 return x;
2470         return " --- ";
2471 }
2472
2473 void journal_file_print_header(JournalFile *f) {
2474         char a[33], b[33], c[33], d[33];
2475         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2476         struct stat st;
2477         char bytes[FORMAT_BYTES_MAX];
2478
2479         assert(f);
2480
2481         printf("File Path: %s\n"
2482                "File ID: %s\n"
2483                "Machine ID: %s\n"
2484                "Boot ID: %s\n"
2485                "Sequential Number ID: %s\n"
2486                "State: %s\n"
2487                "Compatible Flags:%s%s\n"
2488                "Incompatible Flags:%s%s%s\n"
2489                "Header size: %"PRIu64"\n"
2490                "Arena size: %"PRIu64"\n"
2491                "Data Hash Table Size: %"PRIu64"\n"
2492                "Field Hash Table Size: %"PRIu64"\n"
2493                "Rotate Suggested: %s\n"
2494                "Head Sequential Number: %"PRIu64"\n"
2495                "Tail Sequential Number: %"PRIu64"\n"
2496                "Head Realtime Timestamp: %s\n"
2497                "Tail Realtime Timestamp: %s\n"
2498                "Tail Monotonic Timestamp: %s\n"
2499                "Objects: %"PRIu64"\n"
2500                "Entry Objects: %"PRIu64"\n",
2501                f->path,
2502                sd_id128_to_string(f->header->file_id, a),
2503                sd_id128_to_string(f->header->machine_id, b),
2504                sd_id128_to_string(f->header->boot_id, c),
2505                sd_id128_to_string(f->header->seqnum_id, d),
2506                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2507                f->header->state == STATE_ONLINE ? "ONLINE" :
2508                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2509                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2510                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2511                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2512                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2513                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2514                le64toh(f->header->header_size),
2515                le64toh(f->header->arena_size),
2516                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2517                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2518                yes_no(journal_file_rotate_suggested(f, 0)),
2519                le64toh(f->header->head_entry_seqnum),
2520                le64toh(f->header->tail_entry_seqnum),
2521                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2522                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2523                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2524                le64toh(f->header->n_objects),
2525                le64toh(f->header->n_entries));
2526
2527         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2528                 printf("Data Objects: %"PRIu64"\n"
2529                        "Data Hash Table Fill: %.1f%%\n",
2530                        le64toh(f->header->n_data),
2531                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2532
2533         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2534                 printf("Field Objects: %"PRIu64"\n"
2535                        "Field Hash Table Fill: %.1f%%\n",
2536                        le64toh(f->header->n_fields),
2537                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2538
2539         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2540                 printf("Tag Objects: %"PRIu64"\n",
2541                        le64toh(f->header->n_tags));
2542         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2543                 printf("Entry Array Objects: %"PRIu64"\n",
2544                        le64toh(f->header->n_entry_arrays));
2545
2546         if (fstat(f->fd, &st) >= 0)
2547                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2548 }
2549
2550 static int journal_file_warn_btrfs(JournalFile *f) {
2551         unsigned attrs;
2552         int r;
2553
2554         assert(f);
2555
2556         /* Before we write anything, check if the COW logic is turned
2557          * off on btrfs. Given our write pattern that is quite
2558          * unfriendly to COW file systems this should greatly improve
2559          * performance on COW file systems, such as btrfs, at the
2560          * expense of data integrity features (which shouldn't be too
2561          * bad, given that we do our own checksumming). */
2562
2563         r = btrfs_is_filesystem(f->fd);
2564         if (r < 0)
2565                 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2566         if (!r)
2567                 return 0;
2568
2569         r = read_attr_fd(f->fd, &attrs);
2570         if (r < 0)
2571                 return log_warning_errno(r, "Failed to read file attributes: %m");
2572
2573         if (attrs & FS_NOCOW_FL) {
2574                 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2575                 return 0;
2576         }
2577
2578         log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2579                    "This is likely to slow down journal access substantially, please consider turning "
2580                    "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2581
2582         return 1;
2583 }
2584
2585 int journal_file_open(
2586                 const char *fname,
2587                 int flags,
2588                 mode_t mode,
2589                 bool compress,
2590                 bool seal,
2591                 JournalMetrics *metrics,
2592                 MMapCache *mmap_cache,
2593                 JournalFile *template,
2594                 JournalFile **ret) {
2595
2596         bool newly_created = false;
2597         JournalFile *f;
2598         void *h;
2599         int r;
2600
2601         assert(fname);
2602         assert(ret);
2603
2604         if ((flags & O_ACCMODE) != O_RDONLY &&
2605             (flags & O_ACCMODE) != O_RDWR)
2606                 return -EINVAL;
2607
2608         if (!endswith(fname, ".journal") &&
2609             !endswith(fname, ".journal~"))
2610                 return -EINVAL;
2611
2612         f = new0(JournalFile, 1);
2613         if (!f)
2614                 return -ENOMEM;
2615
2616         f->fd = -1;
2617         f->mode = mode;
2618
2619         f->flags = flags;
2620         f->prot = prot_from_flags(flags);
2621         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2622 #if defined(HAVE_LZ4)
2623         f->compress_lz4 = compress;
2624 #elif defined(HAVE_XZ)
2625         f->compress_xz = compress;
2626 #endif
2627 #ifdef HAVE_GCRYPT
2628         f->seal = seal;
2629 #endif
2630
2631         if (mmap_cache)
2632                 f->mmap = mmap_cache_ref(mmap_cache);
2633         else {
2634                 f->mmap = mmap_cache_new();
2635                 if (!f->mmap) {
2636                         r = -ENOMEM;
2637                         goto fail;
2638                 }
2639         }
2640
2641         f->path = strdup(fname);
2642         if (!f->path) {
2643                 r = -ENOMEM;
2644                 goto fail;
2645         }
2646
2647         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2648         if (!f->chain_cache) {
2649                 r = -ENOMEM;
2650                 goto fail;
2651         }
2652
2653         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2654         if (f->fd < 0) {
2655                 r = -errno;
2656                 goto fail;
2657         }
2658
2659         r = journal_file_fstat(f);
2660         if (r < 0)
2661                 goto fail;
2662
2663         if (f->last_stat.st_size == 0 && f->writable) {
2664
2665                 (void) journal_file_warn_btrfs(f);
2666
2667                 /* Let's attach the creation time to the journal file,
2668                  * so that the vacuuming code knows the age of this
2669                  * file even if the file might end up corrupted one
2670                  * day... Ideally we'd just use the creation time many
2671                  * file systems maintain for each file, but there is
2672                  * currently no usable API to query this, hence let's
2673                  * emulate this via extended attributes. If extended
2674                  * attributes are not supported we'll just skip this,
2675                  * and rely solely on mtime/atime/ctime of the file. */
2676
2677                 fd_setcrtime(f->fd, 0);
2678
2679 #ifdef HAVE_GCRYPT
2680                 /* Try to load the FSPRG state, and if we can't, then
2681                  * just don't do sealing */
2682                 if (f->seal) {
2683                         r = journal_file_fss_load(f);
2684                         if (r < 0)
2685                                 f->seal = false;
2686                 }
2687 #endif
2688
2689                 r = journal_file_init_header(f, template);
2690                 if (r < 0)
2691                         goto fail;
2692
2693                 r = journal_file_fstat(f);
2694                 if (r < 0)
2695                         goto fail;
2696
2697                 newly_created = true;
2698         }
2699
2700         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2701                 r = -EIO;
2702                 goto fail;
2703         }
2704
2705         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2706         if (r < 0)
2707                 goto fail;
2708
2709         f->header = h;
2710
2711         if (!newly_created) {
2712                 r = journal_file_verify_header(f);
2713                 if (r < 0)
2714                         goto fail;
2715         }
2716
2717 #ifdef HAVE_GCRYPT
2718         if (!newly_created && f->writable) {
2719                 r = journal_file_fss_load(f);
2720                 if (r < 0)
2721                         goto fail;
2722         }
2723 #endif
2724
2725         if (f->writable) {
2726                 if (metrics) {
2727                         journal_default_metrics(metrics, f->fd);
2728                         f->metrics = *metrics;
2729                 } else if (template)
2730                         f->metrics = template->metrics;
2731
2732                 r = journal_file_refresh_header(f);
2733                 if (r < 0)
2734                         goto fail;
2735         }
2736
2737 #ifdef HAVE_GCRYPT
2738         r = journal_file_hmac_setup(f);
2739         if (r < 0)
2740                 goto fail;
2741 #endif
2742
2743         if (newly_created) {
2744                 r = journal_file_setup_field_hash_table(f);
2745                 if (r < 0)
2746                         goto fail;
2747
2748                 r = journal_file_setup_data_hash_table(f);
2749                 if (r < 0)
2750                         goto fail;
2751
2752 #ifdef HAVE_GCRYPT
2753                 r = journal_file_append_first_tag(f);
2754                 if (r < 0)
2755                         goto fail;
2756 #endif
2757         }
2758
2759         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2760                 r = -EIO;
2761                 goto fail;
2762         }
2763
2764         *ret = f;
2765         return 0;
2766
2767 fail:
2768         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2769                 r = -EIO;
2770
2771         journal_file_close(f);
2772
2773         return r;
2774 }
2775
2776 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2777         _cleanup_free_ char *p = NULL;
2778         size_t l;
2779         JournalFile *old_file, *new_file = NULL;
2780         int r;
2781
2782         assert(f);
2783         assert(*f);
2784
2785         old_file = *f;
2786
2787         if (!old_file->writable)
2788                 return -EINVAL;
2789
2790         if (!endswith(old_file->path, ".journal"))
2791                 return -EINVAL;
2792
2793         l = strlen(old_file->path);
2794         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2795                      (int) l - 8, old_file->path,
2796                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2797                      le64toh((*f)->header->head_entry_seqnum),
2798                      le64toh((*f)->header->head_entry_realtime));
2799         if (r < 0)
2800                 return -ENOMEM;
2801
2802         /* Try to rename the file to the archived version. If the file
2803          * already was deleted, we'll get ENOENT, let's ignore that
2804          * case. */
2805         r = rename(old_file->path, p);
2806         if (r < 0 && errno != ENOENT)
2807                 return -errno;
2808
2809         old_file->header->state = STATE_ARCHIVED;
2810
2811         /* Currently, btrfs is not very good with out write patterns
2812          * and fragments heavily. Let's defrag our journal files when
2813          * we archive them */
2814         old_file->defrag_on_close = true;
2815
2816         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2817         journal_file_close(old_file);
2818
2819         *f = new_file;
2820         return r;
2821 }
2822
2823 int journal_file_open_reliably(
2824                 const char *fname,
2825                 int flags,
2826                 mode_t mode,
2827                 bool compress,
2828                 bool seal,
2829                 JournalMetrics *metrics,
2830                 MMapCache *mmap_cache,
2831                 JournalFile *template,
2832                 JournalFile **ret) {
2833
2834         int r;
2835         size_t l;
2836         _cleanup_free_ char *p = NULL;
2837
2838         r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2839         if (!IN_SET(r,
2840                     -EBADMSG,           /* corrupted */
2841                     -ENODATA,           /* truncated */
2842                     -EHOSTDOWN,         /* other machine */
2843                     -EPROTONOSUPPORT,   /* incompatible feature */
2844                     -EBUSY,             /* unclean shutdown */
2845                     -ESHUTDOWN,         /* already archived */
2846                     -EIO,               /* IO error, including SIGBUS on mmap */
2847                     -EIDRM              /* File has been deleted */))
2848                 return r;
2849
2850         if ((flags & O_ACCMODE) == O_RDONLY)
2851                 return r;
2852
2853         if (!(flags & O_CREAT))
2854                 return r;
2855
2856         if (!endswith(fname, ".journal"))
2857                 return r;
2858
2859         /* The file is corrupted. Rotate it away and try it again (but only once) */
2860
2861         l = strlen(fname);
2862         if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2863                      (int) l - 8, fname,
2864                      now(CLOCK_REALTIME),
2865                      random_u64()) < 0)
2866                 return -ENOMEM;
2867
2868         if (rename(fname, p) < 0)
2869                 return -errno;
2870
2871         /* btrfs doesn't cope well with our write pattern and
2872          * fragments heavily. Let's defrag all files we rotate */
2873
2874         (void) chattr_path(p, false, FS_NOCOW_FL);
2875         (void) btrfs_defrag(p);
2876
2877         log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2878
2879         return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2880 }
2881
2882 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2883         uint64_t i, n;
2884         uint64_t q, xor_hash = 0;
2885         int r;
2886         EntryItem *items;
2887         dual_timestamp ts;
2888
2889         assert(from);
2890         assert(to);
2891         assert(o);
2892         assert(p);
2893
2894         if (!to->writable)
2895                 return -EPERM;
2896
2897         ts.monotonic = le64toh(o->entry.monotonic);
2898         ts.realtime = le64toh(o->entry.realtime);
2899
2900         n = journal_file_entry_n_items(o);
2901         /* alloca() can't take 0, hence let's allocate at least one */
2902         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2903
2904         for (i = 0; i < n; i++) {
2905                 uint64_t l, h;
2906                 le64_t le_hash;
2907                 size_t t;
2908                 void *data;
2909                 Object *u;
2910
2911                 q = le64toh(o->entry.items[i].object_offset);
2912                 le_hash = o->entry.items[i].hash;
2913
2914                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2915                 if (r < 0)
2916                         return r;
2917
2918                 if (le_hash != o->data.hash)
2919                         return -EBADMSG;
2920
2921                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2922                 t = (size_t) l;
2923
2924                 /* We hit the limit on 32bit machines */
2925                 if ((uint64_t) t != l)
2926                         return -E2BIG;
2927
2928                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2929 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2930                         size_t rsize = 0;
2931
2932                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2933                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2934                         if (r < 0)
2935                                 return r;
2936
2937                         data = from->compress_buffer;
2938                         l = rsize;
2939 #else
2940                         return -EPROTONOSUPPORT;
2941 #endif
2942                 } else
2943                         data = o->data.payload;
2944
2945                 r = journal_file_append_data(to, data, l, &u, &h);
2946                 if (r < 0)
2947                         return r;
2948
2949                 xor_hash ^= le64toh(u->data.hash);
2950                 items[i].object_offset = htole64(h);
2951                 items[i].hash = u->data.hash;
2952
2953                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2954                 if (r < 0)
2955                         return r;
2956         }
2957
2958         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2959
2960         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2961                 return -EIO;
2962
2963         return r;
2964 }
2965
2966 void journal_reset_metrics(JournalMetrics *m) {
2967         assert(m);
2968
2969         /* Set everything to "pick automatic values". */
2970
2971         *m = (JournalMetrics) {
2972                 .min_use = (uint64_t) -1,
2973                 .max_use = (uint64_t) -1,
2974                 .min_size = (uint64_t) -1,
2975                 .max_size = (uint64_t) -1,
2976                 .keep_free = (uint64_t) -1,
2977                 .n_max_files = (uint64_t) -1,
2978         };
2979 }
2980
2981 void journal_default_metrics(JournalMetrics *m, int fd) {
2982         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
2983         struct statvfs ss;
2984         uint64_t fs_size;
2985
2986         assert(m);
2987         assert(fd >= 0);
2988
2989         if (fstatvfs(fd, &ss) >= 0)
2990                 fs_size = ss.f_frsize * ss.f_blocks;
2991         else {
2992                 log_debug_errno(errno, "Failed to detremine disk size: %m");
2993                 fs_size = 0;
2994         }
2995
2996         if (m->max_use == (uint64_t) -1) {
2997
2998                 if (fs_size > 0) {
2999                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3000
3001                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
3002                                 m->max_use = DEFAULT_MAX_USE_UPPER;
3003
3004                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
3005                                 m->max_use = DEFAULT_MAX_USE_LOWER;
3006                 } else
3007                         m->max_use = DEFAULT_MAX_USE_LOWER;
3008         } else {
3009                 m->max_use = PAGE_ALIGN(m->max_use);
3010
3011                 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3012                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3013         }
3014
3015         if (m->min_use == (uint64_t) -1)
3016                 m->min_use = DEFAULT_MIN_USE;
3017
3018         if (m->min_use > m->max_use)
3019                 m->min_use = m->max_use;
3020
3021         if (m->max_size == (uint64_t) -1) {
3022                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3023
3024                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3025                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
3026         } else
3027                 m->max_size = PAGE_ALIGN(m->max_size);
3028
3029         if (m->max_size != 0) {
3030                 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3031                         m->max_size = JOURNAL_FILE_SIZE_MIN;
3032
3033                 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3034                         m->max_use = m->max_size*2;
3035         }
3036
3037         if (m->min_size == (uint64_t) -1)
3038                 m->min_size = JOURNAL_FILE_SIZE_MIN;
3039         else {
3040                 m->min_size = PAGE_ALIGN(m->min_size);
3041
3042                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3043                         m->min_size = JOURNAL_FILE_SIZE_MIN;
3044
3045                 if (m->max_size != 0 && m->min_size > m->max_size)
3046                         m->max_size = m->min_size;
3047         }
3048
3049         if (m->keep_free == (uint64_t) -1) {
3050
3051                 if (fs_size > 0) {
3052                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3053
3054                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3055                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3056
3057                 } else
3058                         m->keep_free = DEFAULT_KEEP_FREE;
3059         }
3060
3061         if (m->n_max_files == (uint64_t) -1)
3062                 m->n_max_files = DEFAULT_N_MAX_FILES;
3063
3064         log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3065                   format_bytes(a, sizeof(a), m->min_use),
3066                   format_bytes(b, sizeof(b), m->max_use),
3067                   format_bytes(c, sizeof(c), m->max_size),
3068                   format_bytes(d, sizeof(d), m->min_size),
3069                   format_bytes(e, sizeof(e), m->keep_free),
3070                   m->n_max_files);
3071 }
3072
3073 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3074         assert(f);
3075         assert(from || to);
3076
3077         if (from) {
3078                 if (f->header->head_entry_realtime == 0)
3079                         return -ENOENT;
3080
3081                 *from = le64toh(f->header->head_entry_realtime);
3082         }
3083
3084         if (to) {
3085                 if (f->header->tail_entry_realtime == 0)
3086                         return -ENOENT;
3087
3088                 *to = le64toh(f->header->tail_entry_realtime);
3089         }
3090
3091         return 1;
3092 }
3093
3094 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3095         Object *o;
3096         uint64_t p;
3097         int r;
3098
3099         assert(f);
3100         assert(from || to);
3101
3102         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3103         if (r <= 0)
3104                 return r;
3105
3106         if (le64toh(o->data.n_entries) <= 0)
3107                 return 0;
3108
3109         if (from) {
3110                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3111                 if (r < 0)
3112                         return r;
3113
3114                 *from = le64toh(o->entry.monotonic);
3115         }
3116
3117         if (to) {
3118                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3119                 if (r < 0)
3120                         return r;
3121
3122                 r = generic_array_get_plus_one(f,
3123                                                le64toh(o->data.entry_offset),
3124                                                le64toh(o->data.entry_array_offset),
3125                                                le64toh(o->data.n_entries)-1,
3126                                                &o, NULL);
3127                 if (r <= 0)
3128                         return r;
3129
3130                 *to = le64toh(o->entry.monotonic);
3131         }
3132
3133         return 1;
3134 }
3135
3136 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3137         assert(f);
3138
3139         /* If we gained new header fields we gained new features,
3140          * hence suggest a rotation */
3141         if (le64toh(f->header->header_size) < sizeof(Header)) {
3142                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3143                 return true;
3144         }
3145
3146         /* Let's check if the hash tables grew over a certain fill
3147          * level (75%, borrowing this value from Java's hash table
3148          * implementation), and if so suggest a rotation. To calculate
3149          * the fill level we need the n_data field, which only exists
3150          * in newer versions. */
3151
3152         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3153                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3154                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3155                                   f->path,
3156                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3157                                   le64toh(f->header->n_data),
3158                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3159                                   (unsigned long long) f->last_stat.st_size,
3160                                   f->last_stat.st_size / le64toh(f->header->n_data));
3161                         return true;
3162                 }
3163
3164         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3165                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3166                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3167                                   f->path,
3168                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3169                                   le64toh(f->header->n_fields),
3170                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3171                         return true;
3172                 }
3173
3174         /* Are the data objects properly indexed by field objects? */
3175         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3176             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3177             le64toh(f->header->n_data) > 0 &&
3178             le64toh(f->header->n_fields) == 0)
3179                 return true;
3180
3181         if (max_file_usec > 0) {
3182                 usec_t t, h;
3183
3184                 h = le64toh(f->header->head_entry_realtime);
3185                 t = now(CLOCK_REALTIME);
3186
3187                 if (h > 0 && t > h + max_file_usec)
3188                         return true;
3189         }
3190
3191         return false;
3192 }