1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "btrfs-util.h"
34 #include "journal-authenticate.h"
35 #include "journal-def.h"
36 #include "journal-file.h"
38 #include "parse-util.h"
39 #include "random-util.h"
40 #include "string-util.h"
42 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
43 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
45 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
47 /* This is the minimum journal file size */
48 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
50 /* These are the lower and upper bounds if we deduce the max_use value
51 * from the file system size */
52 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
53 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
55 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
56 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
58 /* This is the upper bound if we deduce max_size from max_use */
59 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
61 /* This is the upper bound if we deduce the keep_free value from the file system size */
63 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
65 /* This is the keep_free value when we can't determine the system size */
67 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
69 /* This is the default maximum number of journal files to keep around. */
70 #define DEFAULT_N_MAX_FILES (100)
72 /* n_data was the first entry we added after the initial file format design */
73 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
75 /* How many entries to keep in the entry array chain cache at max */
76 #define CHAIN_CACHE_MAX 20
78 /* How much to increase the journal file size at once each time we allocate something new. */
79 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
81 /* Reread fstat() of the file for detecting deletions at least this often */
82 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
84 /* The mmap context to use for the header we pick as one above the last defined typed */
85 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
87 static int journal_file_set_online(JournalFile
*f
) {
93 if (!(f
->fd
>= 0 && f
->header
))
96 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
99 switch(f
->header
->state
) {
104 f
->header
->state
= STATE_ONLINE
;
113 int journal_file_set_offline(JournalFile
*f
) {
119 if (!(f
->fd
>= 0 && f
->header
))
122 if (f
->header
->state
!= STATE_ONLINE
)
127 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
130 f
->header
->state
= STATE_OFFLINE
;
132 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
140 JournalFile
* journal_file_close(JournalFile
*f
) {
144 /* Write the final tag */
145 if (f
->seal
&& f
->writable
)
146 journal_file_append_tag(f
);
149 journal_file_set_offline(f
);
151 if (f
->mmap
&& f
->fd
>= 0)
152 mmap_cache_close_fd(f
->mmap
, f
->fd
);
154 if (f
->fd
>= 0 && f
->defrag_on_close
) {
156 /* Be friendly to btrfs: turn COW back on again now,
157 * and defragment the file. We won't write to the file
158 * ever again, hence remove all fragmentation, and
159 * reenable all the good bits COW usually provides
160 * (such as data checksumming). */
162 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
163 (void) btrfs_defrag_fd(f
->fd
);
170 mmap_cache_unref(f
->mmap
);
172 ordered_hashmap_free_free(f
->chain_cache
);
174 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
175 free(f
->compress_buffer
);
180 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
182 free(f
->fsprg_state
);
187 gcry_md_close(f
->hmac
);
194 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
201 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
202 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
204 h
.incompatible_flags
|= htole32(
205 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
206 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
208 h
.compatible_flags
= htole32(
209 f
->seal
* HEADER_COMPATIBLE_SEALED
);
211 r
= sd_id128_randomize(&h
.file_id
);
216 h
.seqnum_id
= template->header
->seqnum_id
;
217 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
219 h
.seqnum_id
= h
.file_id
;
221 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
231 static int journal_file_refresh_header(JournalFile
*f
) {
237 r
= sd_id128_get_machine(&f
->header
->machine_id
);
241 r
= sd_id128_get_boot(&boot_id
);
245 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
246 f
->tail_entry_monotonic_valid
= true;
248 f
->header
->boot_id
= boot_id
;
250 r
= journal_file_set_online(f
);
252 /* Sync the online state to disk */
258 static int journal_file_verify_header(JournalFile
*f
) {
263 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
266 /* In both read and write mode we refuse to open files with
267 * incompatible flags we don't know */
268 flags
= le32toh(f
->header
->incompatible_flags
);
269 if (flags
& ~HEADER_INCOMPATIBLE_SUPPORTED
) {
270 if (flags
& ~HEADER_INCOMPATIBLE_ANY
)
271 log_debug("Journal file %s has unknown incompatible flags %"PRIx32
,
272 f
->path
, flags
& ~HEADER_INCOMPATIBLE_ANY
);
273 flags
= (flags
& HEADER_INCOMPATIBLE_ANY
) & ~HEADER_INCOMPATIBLE_SUPPORTED
;
275 log_debug("Journal file %s uses incompatible flags %"PRIx32
276 " disabled at compilation time.", f
->path
, flags
);
277 return -EPROTONOSUPPORT
;
280 /* When open for writing we refuse to open files with
281 * compatible flags, too */
282 flags
= le32toh(f
->header
->compatible_flags
);
283 if (f
->writable
&& (flags
& ~HEADER_COMPATIBLE_SUPPORTED
)) {
284 if (flags
& ~HEADER_COMPATIBLE_ANY
)
285 log_debug("Journal file %s has unknown compatible flags %"PRIx32
,
286 f
->path
, flags
& ~HEADER_COMPATIBLE_ANY
);
287 flags
= (flags
& HEADER_COMPATIBLE_ANY
) & ~HEADER_COMPATIBLE_SUPPORTED
;
289 log_debug("Journal file %s uses compatible flags %"PRIx32
290 " disabled at compilation time.", f
->path
, flags
);
291 return -EPROTONOSUPPORT
;
294 if (f
->header
->state
>= _STATE_MAX
)
297 /* The first addition was n_data, so check that we are at least this large */
298 if (le64toh(f
->header
->header_size
) < HEADER_SIZE_MIN
)
301 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
304 if ((le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)) > (uint64_t) f
->last_stat
.st_size
)
307 if (le64toh(f
->header
->tail_object_offset
) > (le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)))
310 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
311 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
312 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
313 !VALID64(le64toh(f
->header
->entry_array_offset
)))
318 sd_id128_t machine_id
;
321 r
= sd_id128_get_machine(&machine_id
);
325 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
328 state
= f
->header
->state
;
330 if (state
== STATE_ONLINE
) {
331 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
333 } else if (state
== STATE_ARCHIVED
)
335 else if (state
!= STATE_OFFLINE
) {
336 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
341 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
342 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
344 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
349 static int journal_file_fstat(JournalFile
*f
) {
353 if (fstat(f
->fd
, &f
->last_stat
) < 0)
356 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
358 /* Refuse appending to files that are already deleted */
359 if (f
->last_stat
.st_nlink
<= 0)
365 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
366 uint64_t old_size
, new_size
;
371 /* We assume that this file is not sparse, and we know that
372 * for sure, since we always call posix_fallocate()
375 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
379 le64toh(f
->header
->header_size
) +
380 le64toh(f
->header
->arena_size
);
382 new_size
= PAGE_ALIGN(offset
+ size
);
383 if (new_size
< le64toh(f
->header
->header_size
))
384 new_size
= le64toh(f
->header
->header_size
);
386 if (new_size
<= old_size
) {
388 /* We already pre-allocated enough space, but before
389 * we write to it, let's check with fstat() if the
390 * file got deleted, in order make sure we don't throw
391 * away the data immediately. Don't check fstat() for
392 * all writes though, but only once ever 10s. */
394 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
397 return journal_file_fstat(f
);
400 /* Allocate more space. */
402 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
405 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
408 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
411 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
413 if (new_size
- old_size
> available
)
418 /* Increase by larger blocks at once */
419 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
420 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
421 new_size
= f
->metrics
.max_size
;
423 /* Note that the glibc fallocate() fallback is very
424 inefficient, hence we try to minimize the allocation area
426 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
430 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
432 return journal_file_fstat(f
);
435 static unsigned type_to_context(ObjectType type
) {
436 /* One context for each type, plus one catch-all for the rest */
437 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
438 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
439 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
442 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
) {
451 /* Avoid SIGBUS on invalid accesses */
452 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
453 /* Hmm, out of range? Let's refresh the fstat() data
454 * first, before we trust that check. */
456 r
= journal_file_fstat(f
);
460 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
461 return -EADDRNOTAVAIL
;
464 return mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
467 static uint64_t minimum_header_size(Object
*o
) {
469 static const uint64_t table
[] = {
470 [OBJECT_DATA
] = sizeof(DataObject
),
471 [OBJECT_FIELD
] = sizeof(FieldObject
),
472 [OBJECT_ENTRY
] = sizeof(EntryObject
),
473 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
474 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
475 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
476 [OBJECT_TAG
] = sizeof(TagObject
),
479 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
480 return sizeof(ObjectHeader
);
482 return table
[o
->object
.type
];
485 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
494 /* Objects may only be located at multiple of 64 bit */
495 if (!VALID64(offset
))
498 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
503 s
= le64toh(o
->object
.size
);
505 if (s
< sizeof(ObjectHeader
))
508 if (o
->object
.type
<= OBJECT_UNUSED
)
511 if (s
< minimum_header_size(o
))
514 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
517 if (s
> sizeof(ObjectHeader
)) {
518 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
529 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
534 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
537 /* If an external seqnum counter was passed, we update
538 * both the local and the external one, and set it to
539 * the maximum of both */
547 f
->header
->tail_entry_seqnum
= htole64(r
);
549 if (f
->header
->head_entry_seqnum
== 0)
550 f
->header
->head_entry_seqnum
= htole64(r
);
555 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
562 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
563 assert(size
>= sizeof(ObjectHeader
));
567 r
= journal_file_set_online(f
);
571 p
= le64toh(f
->header
->tail_object_offset
);
573 p
= le64toh(f
->header
->header_size
);
575 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
579 p
+= ALIGN64(le64toh(tail
->object
.size
));
582 r
= journal_file_allocate(f
, p
, size
);
586 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
593 o
->object
.type
= type
;
594 o
->object
.size
= htole64(size
);
596 f
->header
->tail_object_offset
= htole64(p
);
597 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
605 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
612 /* We estimate that we need 1 hash table entry per 768 bytes
613 of journal file and we want to make sure we never get
614 beyond 75% fill level. Calculate the hash table size for
615 the maximum file size based on these metrics. */
617 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
618 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
619 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
621 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
623 r
= journal_file_append_object(f
,
624 OBJECT_DATA_HASH_TABLE
,
625 offsetof(Object
, hash_table
.items
) + s
,
630 memzero(o
->hash_table
.items
, s
);
632 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
633 f
->header
->data_hash_table_size
= htole64(s
);
638 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
645 /* We use a fixed size hash table for the fields as this
646 * number should grow very slowly only */
648 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
649 r
= journal_file_append_object(f
,
650 OBJECT_FIELD_HASH_TABLE
,
651 offsetof(Object
, hash_table
.items
) + s
,
656 memzero(o
->hash_table
.items
, s
);
658 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
659 f
->header
->field_hash_table_size
= htole64(s
);
664 int journal_file_map_data_hash_table(JournalFile
*f
) {
671 if (f
->data_hash_table
)
674 p
= le64toh(f
->header
->data_hash_table_offset
);
675 s
= le64toh(f
->header
->data_hash_table_size
);
677 r
= journal_file_move_to(f
,
678 OBJECT_DATA_HASH_TABLE
,
685 f
->data_hash_table
= t
;
689 int journal_file_map_field_hash_table(JournalFile
*f
) {
696 if (f
->field_hash_table
)
699 p
= le64toh(f
->header
->field_hash_table_offset
);
700 s
= le64toh(f
->header
->field_hash_table_size
);
702 r
= journal_file_move_to(f
,
703 OBJECT_FIELD_HASH_TABLE
,
710 f
->field_hash_table
= t
;
714 static int journal_file_link_field(
727 if (o
->object
.type
!= OBJECT_FIELD
)
730 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
734 /* This might alter the window we are looking at */
735 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
738 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
740 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
742 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
746 o
->field
.next_hash_offset
= htole64(offset
);
749 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
751 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
752 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
757 static int journal_file_link_data(
770 if (o
->object
.type
!= OBJECT_DATA
)
773 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
777 /* This might alter the window we are looking at */
778 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
779 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
780 o
->data
.n_entries
= 0;
783 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
785 /* Only entry in the hash table is easy */
786 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
788 /* Move back to the previous data object, to patch in
791 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
795 o
->data
.next_hash_offset
= htole64(offset
);
798 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
800 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
801 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
806 int journal_file_find_field_object_with_hash(
808 const void *field
, uint64_t size
, uint64_t hash
,
809 Object
**ret
, uint64_t *offset
) {
811 uint64_t p
, osize
, h
, m
;
815 assert(field
&& size
> 0);
817 /* If the field hash table is empty, we can't find anything */
818 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
821 /* Map the field hash table, if it isn't mapped yet. */
822 r
= journal_file_map_field_hash_table(f
);
826 osize
= offsetof(Object
, field
.payload
) + size
;
828 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
833 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
838 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
842 if (le64toh(o
->field
.hash
) == hash
&&
843 le64toh(o
->object
.size
) == osize
&&
844 memcmp(o
->field
.payload
, field
, size
) == 0) {
854 p
= le64toh(o
->field
.next_hash_offset
);
860 int journal_file_find_field_object(
862 const void *field
, uint64_t size
,
863 Object
**ret
, uint64_t *offset
) {
868 assert(field
&& size
> 0);
870 hash
= hash64(field
, size
);
872 return journal_file_find_field_object_with_hash(f
,
877 int journal_file_find_data_object_with_hash(
879 const void *data
, uint64_t size
, uint64_t hash
,
880 Object
**ret
, uint64_t *offset
) {
882 uint64_t p
, osize
, h
, m
;
886 assert(data
|| size
== 0);
888 /* If there's no data hash table, then there's no entry. */
889 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
892 /* Map the data hash table, if it isn't mapped yet. */
893 r
= journal_file_map_data_hash_table(f
);
897 osize
= offsetof(Object
, data
.payload
) + size
;
899 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
904 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
909 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
913 if (le64toh(o
->data
.hash
) != hash
)
916 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
917 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
921 l
= le64toh(o
->object
.size
);
922 if (l
<= offsetof(Object
, data
.payload
))
925 l
-= offsetof(Object
, data
.payload
);
927 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
928 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
933 memcmp(f
->compress_buffer
, data
, size
) == 0) {
944 return -EPROTONOSUPPORT
;
946 } else if (le64toh(o
->object
.size
) == osize
&&
947 memcmp(o
->data
.payload
, data
, size
) == 0) {
959 p
= le64toh(o
->data
.next_hash_offset
);
965 int journal_file_find_data_object(
967 const void *data
, uint64_t size
,
968 Object
**ret
, uint64_t *offset
) {
973 assert(data
|| size
== 0);
975 hash
= hash64(data
, size
);
977 return journal_file_find_data_object_with_hash(f
,
982 static int journal_file_append_field(
984 const void *field
, uint64_t size
,
985 Object
**ret
, uint64_t *offset
) {
993 assert(field
&& size
> 0);
995 hash
= hash64(field
, size
);
997 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1011 osize
= offsetof(Object
, field
.payload
) + size
;
1012 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1016 o
->field
.hash
= htole64(hash
);
1017 memcpy(o
->field
.payload
, field
, size
);
1019 r
= journal_file_link_field(f
, o
, p
, hash
);
1023 /* The linking might have altered the window, so let's
1024 * refresh our pointer */
1025 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1030 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1044 static int journal_file_append_data(
1046 const void *data
, uint64_t size
,
1047 Object
**ret
, uint64_t *offset
) {
1052 int r
, compression
= 0;
1056 assert(data
|| size
== 0);
1058 hash
= hash64(data
, size
);
1060 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1074 osize
= offsetof(Object
, data
.payload
) + size
;
1075 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1079 o
->data
.hash
= htole64(hash
);
1081 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1082 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1085 compression
= compress_blob(data
, size
, o
->data
.payload
, &rsize
);
1087 if (compression
>= 0) {
1088 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1089 o
->object
.flags
|= compression
;
1091 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1092 size
, rsize
, object_compressed_to_string(compression
));
1094 /* Compression didn't work, we don't really care why, let's continue without compression */
1099 if (compression
== 0 && size
> 0)
1100 memcpy(o
->data
.payload
, data
, size
);
1102 r
= journal_file_link_data(f
, o
, p
, hash
);
1106 /* The linking might have altered the window, so let's
1107 * refresh our pointer */
1108 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1115 eq
= memchr(data
, '=', size
);
1116 if (eq
&& eq
> data
) {
1120 /* Create field object ... */
1121 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1125 /* ... and link it in. */
1126 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1127 fo
->field
.head_data_offset
= le64toh(p
);
1131 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1145 uint64_t journal_file_entry_n_items(Object
*o
) {
1148 if (o
->object
.type
!= OBJECT_ENTRY
)
1151 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1154 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1157 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1160 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1163 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1166 if (o
->object
.type
!= OBJECT_DATA_HASH_TABLE
&&
1167 o
->object
.type
!= OBJECT_FIELD_HASH_TABLE
)
1170 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1173 static int link_entry_into_array(JournalFile
*f
,
1178 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1186 a
= le64toh(*first
);
1187 i
= hidx
= le64toh(*idx
);
1190 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1194 n
= journal_file_entry_array_n_items(o
);
1196 o
->entry_array
.items
[i
] = htole64(p
);
1197 *idx
= htole64(hidx
+ 1);
1203 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1214 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1215 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1221 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1226 o
->entry_array
.items
[i
] = htole64(p
);
1229 *first
= htole64(q
);
1231 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1235 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1238 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1239 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1241 *idx
= htole64(hidx
+ 1);
1246 static int link_entry_into_array_plus_one(JournalFile
*f
,
1261 *extra
= htole64(p
);
1265 i
= htole64(le64toh(*idx
) - 1);
1266 r
= link_entry_into_array(f
, first
, &i
, p
);
1271 *idx
= htole64(le64toh(*idx
) + 1);
1275 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1282 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1286 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1290 return link_entry_into_array_plus_one(f
,
1291 &o
->data
.entry_offset
,
1292 &o
->data
.entry_array_offset
,
1297 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1305 if (o
->object
.type
!= OBJECT_ENTRY
)
1308 __sync_synchronize();
1310 /* Link up the entry itself */
1311 r
= link_entry_into_array(f
,
1312 &f
->header
->entry_array_offset
,
1313 &f
->header
->n_entries
,
1318 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1320 if (f
->header
->head_entry_realtime
== 0)
1321 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1323 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1324 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1326 f
->tail_entry_monotonic_valid
= true;
1328 /* Link up the items */
1329 n
= journal_file_entry_n_items(o
);
1330 for (i
= 0; i
< n
; i
++) {
1331 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1339 static int journal_file_append_entry_internal(
1341 const dual_timestamp
*ts
,
1343 const EntryItem items
[], unsigned n_items
,
1345 Object
**ret
, uint64_t *offset
) {
1352 assert(items
|| n_items
== 0);
1355 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1357 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1361 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1362 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1363 o
->entry
.realtime
= htole64(ts
->realtime
);
1364 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1365 o
->entry
.xor_hash
= htole64(xor_hash
);
1366 o
->entry
.boot_id
= f
->header
->boot_id
;
1369 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1374 r
= journal_file_link_entry(f
, o
, np
);
1387 void journal_file_post_change(JournalFile
*f
) {
1390 /* inotify() does not receive IN_MODIFY events from file
1391 * accesses done via mmap(). After each access we hence
1392 * trigger IN_MODIFY by truncating the journal file to its
1393 * current size which triggers IN_MODIFY. */
1395 __sync_synchronize();
1397 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1398 log_error_errno(errno
, "Failed to truncate file to its own size: %m");
1401 static int entry_item_cmp(const void *_a
, const void *_b
) {
1402 const EntryItem
*a
= _a
, *b
= _b
;
1404 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1406 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1411 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1415 uint64_t xor_hash
= 0;
1416 struct dual_timestamp _ts
;
1419 assert(iovec
|| n_iovec
== 0);
1422 dual_timestamp_get(&_ts
);
1426 if (f
->tail_entry_monotonic_valid
&&
1427 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
1431 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1436 /* alloca() can't take 0, hence let's allocate at least one */
1437 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1439 for (i
= 0; i
< n_iovec
; i
++) {
1443 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1447 xor_hash
^= le64toh(o
->data
.hash
);
1448 items
[i
].object_offset
= htole64(p
);
1449 items
[i
].hash
= o
->data
.hash
;
1452 /* Order by the position on disk, in order to improve seek
1453 * times for rotating media. */
1454 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
1456 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
1458 /* If the memory mapping triggered a SIGBUS then we return an
1459 * IO error and ignore the error code passed down to us, since
1460 * it is very likely just an effect of a nullified replacement
1463 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
1466 journal_file_post_change(f
);
/* One cached position inside an entry-array chain, used to avoid
 * re-walking the chain from its head on every lookup. */
typedef struct ChainCacheItem {
        uint64_t first;      /* the array at the beginning of the chain */
        uint64_t array;      /* the cached array */
        uint64_t begin;      /* the first item in the cached array */
        uint64_t total;      /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1479 static void chain_cache_put(
1486 uint64_t last_index
) {
1489 /* If the chain item to cache for this chain is the
1490 * first one it's not worth caching anything */
1494 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
1495 ci
= ordered_hashmap_steal_first(h
);
1498 ci
= new(ChainCacheItem
, 1);
1505 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
1510 assert(ci
->first
== first
);
1515 ci
->last_index
= last_index
;
1518 static int generic_array_get(
1522 Object
**ret
, uint64_t *offset
) {
1525 uint64_t p
= 0, a
, t
= 0;
1533 /* Try the chain cache first */
1534 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1535 if (ci
&& i
> ci
->total
) {
1544 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1548 k
= journal_file_entry_array_n_items(o
);
1550 p
= le64toh(o
->entry_array
.items
[i
]);
1556 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1562 /* Let's cache this item for the next invocation */
1563 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
1565 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1578 static int generic_array_get_plus_one(
1583 Object
**ret
, uint64_t *offset
) {
1592 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1605 return generic_array_get(f
, first
, i
-1, ret
, offset
);
1614 static int generic_array_bisect(
1619 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1620 direction_t direction
,
1625 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
1626 bool subtract_one
= false;
1627 Object
*o
, *array
= NULL
;
1632 assert(test_object
);
1634 /* Start with the first array in the chain */
1637 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1638 if (ci
&& n
> ci
->total
) {
1639 /* Ah, we have iterated this bisection array chain
1640 * previously! Let's see if we can skip ahead in the
1641 * chain, as far as the last time. But we can't jump
1642 * backwards in the chain, so let's check that
1645 r
= test_object(f
, ci
->begin
, needle
);
1649 if (r
== TEST_LEFT
) {
1650 /* OK, what we are looking for is right of the
1651 * begin of this EntryArray, so let's jump
1652 * straight to previously cached array in the
1658 last_index
= ci
->last_index
;
1663 uint64_t left
, right
, k
, lp
;
1665 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1669 k
= journal_file_entry_array_n_items(array
);
1675 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1679 r
= test_object(f
, p
, needle
);
1683 if (r
== TEST_FOUND
)
1684 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1686 if (r
== TEST_RIGHT
) {
1690 if (last_index
!= (uint64_t) -1) {
1691 assert(last_index
<= right
);
1693 /* If we cached the last index we
1694 * looked at, let's try to not to jump
1695 * too wildly around and see if we can
1696 * limit the range to look at early to
1697 * the immediate neighbors of the last
1698 * index we looked at. */
1700 if (last_index
> 0) {
1701 uint64_t x
= last_index
- 1;
1703 p
= le64toh(array
->entry_array
.items
[x
]);
1707 r
= test_object(f
, p
, needle
);
1711 if (r
== TEST_FOUND
)
1712 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1714 if (r
== TEST_RIGHT
)
1720 if (last_index
< right
) {
1721 uint64_t y
= last_index
+ 1;
1723 p
= le64toh(array
->entry_array
.items
[y
]);
1727 r
= test_object(f
, p
, needle
);
1731 if (r
== TEST_FOUND
)
1732 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1734 if (r
== TEST_RIGHT
)
1742 if (left
== right
) {
1743 if (direction
== DIRECTION_UP
)
1744 subtract_one
= true;
1750 assert(left
< right
);
1751 i
= (left
+ right
) / 2;
1753 p
= le64toh(array
->entry_array
.items
[i
]);
1757 r
= test_object(f
, p
, needle
);
1761 if (r
== TEST_FOUND
)
1762 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1764 if (r
== TEST_RIGHT
)
1772 if (direction
== DIRECTION_UP
) {
1774 subtract_one
= true;
1785 last_index
= (uint64_t) -1;
1786 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1792 if (subtract_one
&& t
== 0 && i
== 0)
1795 /* Let's cache this item for the next invocation */
1796 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
1798 if (subtract_one
&& i
== 0)
1800 else if (subtract_one
)
1801 p
= le64toh(array
->entry_array
.items
[i
-1]);
1803 p
= le64toh(array
->entry_array
.items
[i
]);
1805 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1816 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
1821 static int generic_array_bisect_plus_one(
1827 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1828 direction_t direction
,
1834 bool step_back
= false;
1838 assert(test_object
);
1843 /* This bisects the array in object 'first', but first checks
1845 r
= test_object(f
, extra
, needle
);
1849 if (r
== TEST_FOUND
)
1850 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1852 /* if we are looking with DIRECTION_UP then we need to first
1853 see if in the actual array there is a matching entry, and
1854 return the last one of that. But if there isn't any we need
1855 to return this one. Hence remember this, and return it
1858 step_back
= direction
== DIRECTION_UP
;
1860 if (r
== TEST_RIGHT
) {
1861 if (direction
== DIRECTION_DOWN
)
1867 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1869 if (r
== 0 && step_back
)
1878 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1894 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1900 else if (p
< needle
)
1906 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1913 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1917 if (le64toh(o
->entry
.seqnum
) == needle
)
1919 else if (le64toh(o
->entry
.seqnum
) < needle
)
1925 int journal_file_move_to_entry_by_seqnum(
1928 direction_t direction
,
1932 return generic_array_bisect(f
,
1933 le64toh(f
->header
->entry_array_offset
),
1934 le64toh(f
->header
->n_entries
),
1941 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1948 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1952 if (le64toh(o
->entry
.realtime
) == needle
)
1954 else if (le64toh(o
->entry
.realtime
) < needle
)
1960 int journal_file_move_to_entry_by_realtime(
1963 direction_t direction
,
1967 return generic_array_bisect(f
,
1968 le64toh(f
->header
->entry_array_offset
),
1969 le64toh(f
->header
->n_entries
),
1971 test_object_realtime
,
1976 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1983 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1987 if (le64toh(o
->entry
.monotonic
) == needle
)
1989 else if (le64toh(o
->entry
.monotonic
) < needle
)
1995 static int find_data_object_by_boot_id(
2001 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2003 sd_id128_to_string(boot_id
, t
+ 9);
2004 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2007 int journal_file_move_to_entry_by_monotonic(
2011 direction_t direction
,
2020 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2026 return generic_array_bisect_plus_one(f
,
2027 le64toh(o
->data
.entry_offset
),
2028 le64toh(o
->data
.entry_array_offset
),
2029 le64toh(o
->data
.n_entries
),
2031 test_object_monotonic
,
2036 void journal_file_reset_location(JournalFile
*f
) {
2037 f
->location_type
= LOCATION_HEAD
;
2038 f
->current_offset
= 0;
2039 f
->current_seqnum
= 0;
2040 f
->current_realtime
= 0;
2041 f
->current_monotonic
= 0;
2042 zero(f
->current_boot_id
);
2043 f
->current_xor_hash
= 0;
2046 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2047 f
->location_type
= LOCATION_SEEK
;
2048 f
->current_offset
= offset
;
2049 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2050 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2051 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2052 f
->current_boot_id
= o
->entry
.boot_id
;
2053 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2056 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2059 assert(af
->location_type
== LOCATION_SEEK
);
2060 assert(bf
->location_type
== LOCATION_SEEK
);
2062 /* If contents and timestamps match, these entries are
2063 * identical, even if the seqnum does not match */
2064 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2065 af
->current_monotonic
== bf
->current_monotonic
&&
2066 af
->current_realtime
== bf
->current_realtime
&&
2067 af
->current_xor_hash
== bf
->current_xor_hash
)
2070 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2072 /* If this is from the same seqnum source, compare
2074 if (af
->current_seqnum
< bf
->current_seqnum
)
2076 if (af
->current_seqnum
> bf
->current_seqnum
)
2079 /* Wow! This is weird, different data but the same
2080 * seqnums? Something is borked, but let's make the
2081 * best of it and compare by time. */
2084 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2086 /* If the boot id matches, compare monotonic time */
2087 if (af
->current_monotonic
< bf
->current_monotonic
)
2089 if (af
->current_monotonic
> bf
->current_monotonic
)
2093 /* Otherwise, compare UTC time */
2094 if (af
->current_realtime
< bf
->current_realtime
)
2096 if (af
->current_realtime
> bf
->current_realtime
)
2099 /* Finally, compare by contents */
2100 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2102 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2108 int journal_file_next_entry(
2111 direction_t direction
,
2112 Object
**ret
, uint64_t *offset
) {
2119 n
= le64toh(f
->header
->n_entries
);
2124 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2126 r
= generic_array_bisect(f
,
2127 le64toh(f
->header
->entry_array_offset
),
2128 le64toh(f
->header
->n_entries
),
2137 if (direction
== DIRECTION_DOWN
) {
2150 /* And jump to it */
2151 r
= generic_array_get(f
,
2152 le64toh(f
->header
->entry_array_offset
),
2159 (direction
== DIRECTION_DOWN
? ofs
<= p
: ofs
>= p
)) {
2160 log_debug("%s: entry array corrupted at entry %"PRIu64
,
2171 int journal_file_next_entry_for_data(
2173 Object
*o
, uint64_t p
,
2174 uint64_t data_offset
,
2175 direction_t direction
,
2176 Object
**ret
, uint64_t *offset
) {
2183 assert(p
> 0 || !o
);
2185 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2189 n
= le64toh(d
->data
.n_entries
);
2194 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2196 if (o
->object
.type
!= OBJECT_ENTRY
)
2199 r
= generic_array_bisect_plus_one(f
,
2200 le64toh(d
->data
.entry_offset
),
2201 le64toh(d
->data
.entry_array_offset
),
2202 le64toh(d
->data
.n_entries
),
2212 if (direction
== DIRECTION_DOWN
) {
2226 return generic_array_get_plus_one(f
,
2227 le64toh(d
->data
.entry_offset
),
2228 le64toh(d
->data
.entry_array_offset
),
2233 int journal_file_move_to_entry_by_offset_for_data(
2235 uint64_t data_offset
,
2237 direction_t direction
,
2238 Object
**ret
, uint64_t *offset
) {
2245 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2249 return generic_array_bisect_plus_one(f
,
2250 le64toh(d
->data
.entry_offset
),
2251 le64toh(d
->data
.entry_array_offset
),
2252 le64toh(d
->data
.n_entries
),
2259 int journal_file_move_to_entry_by_monotonic_for_data(
2261 uint64_t data_offset
,
2264 direction_t direction
,
2265 Object
**ret
, uint64_t *offset
) {
2273 /* First, seek by time */
2274 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2280 r
= generic_array_bisect_plus_one(f
,
2281 le64toh(o
->data
.entry_offset
),
2282 le64toh(o
->data
.entry_array_offset
),
2283 le64toh(o
->data
.n_entries
),
2285 test_object_monotonic
,
2291 /* And now, continue seeking until we find an entry that
2292 * exists in both bisection arrays */
2298 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2302 r
= generic_array_bisect_plus_one(f
,
2303 le64toh(d
->data
.entry_offset
),
2304 le64toh(d
->data
.entry_array_offset
),
2305 le64toh(d
->data
.n_entries
),
2313 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2317 r
= generic_array_bisect_plus_one(f
,
2318 le64toh(o
->data
.entry_offset
),
2319 le64toh(o
->data
.entry_array_offset
),
2320 le64toh(o
->data
.n_entries
),
2342 int journal_file_move_to_entry_by_seqnum_for_data(
2344 uint64_t data_offset
,
2346 direction_t direction
,
2347 Object
**ret
, uint64_t *offset
) {
2354 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2358 return generic_array_bisect_plus_one(f
,
2359 le64toh(d
->data
.entry_offset
),
2360 le64toh(d
->data
.entry_array_offset
),
2361 le64toh(d
->data
.n_entries
),
2368 int journal_file_move_to_entry_by_realtime_for_data(
2370 uint64_t data_offset
,
2372 direction_t direction
,
2373 Object
**ret
, uint64_t *offset
) {
2380 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2384 return generic_array_bisect_plus_one(f
,
2385 le64toh(d
->data
.entry_offset
),
2386 le64toh(d
->data
.entry_array_offset
),
2387 le64toh(d
->data
.n_entries
),
2389 test_object_realtime
,
2394 void journal_file_dump(JournalFile
*f
) {
2401 journal_file_print_header(f
);
2403 p
= le64toh(f
->header
->header_size
);
2405 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
2409 switch (o
->object
.type
) {
2412 printf("Type: OBJECT_UNUSED\n");
2416 printf("Type: OBJECT_DATA\n");
2420 printf("Type: OBJECT_FIELD\n");
2424 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
2425 le64toh(o
->entry
.seqnum
),
2426 le64toh(o
->entry
.monotonic
),
2427 le64toh(o
->entry
.realtime
));
2430 case OBJECT_FIELD_HASH_TABLE
:
2431 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2434 case OBJECT_DATA_HASH_TABLE
:
2435 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2438 case OBJECT_ENTRY_ARRAY
:
2439 printf("Type: OBJECT_ENTRY_ARRAY\n");
2443 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
2444 le64toh(o
->tag
.seqnum
),
2445 le64toh(o
->tag
.epoch
));
2449 printf("Type: unknown (%i)\n", o
->object
.type
);
2453 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
2454 printf("Flags: %s\n",
2455 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
2457 if (p
== le64toh(f
->header
->tail_object_offset
))
2460 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
2465 log_error("File corrupt");
2468 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
2471 x
= format_timestamp(buf
, l
, t
);
2477 void journal_file_print_header(JournalFile
*f
) {
2478 char a
[33], b
[33], c
[33], d
[33];
2479 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
2481 char bytes
[FORMAT_BYTES_MAX
];
2485 printf("File Path: %s\n"
2489 "Sequential Number ID: %s\n"
2491 "Compatible Flags:%s%s\n"
2492 "Incompatible Flags:%s%s%s\n"
2493 "Header size: %"PRIu64
"\n"
2494 "Arena size: %"PRIu64
"\n"
2495 "Data Hash Table Size: %"PRIu64
"\n"
2496 "Field Hash Table Size: %"PRIu64
"\n"
2497 "Rotate Suggested: %s\n"
2498 "Head Sequential Number: %"PRIu64
"\n"
2499 "Tail Sequential Number: %"PRIu64
"\n"
2500 "Head Realtime Timestamp: %s\n"
2501 "Tail Realtime Timestamp: %s\n"
2502 "Tail Monotonic Timestamp: %s\n"
2503 "Objects: %"PRIu64
"\n"
2504 "Entry Objects: %"PRIu64
"\n",
2506 sd_id128_to_string(f
->header
->file_id
, a
),
2507 sd_id128_to_string(f
->header
->machine_id
, b
),
2508 sd_id128_to_string(f
->header
->boot_id
, c
),
2509 sd_id128_to_string(f
->header
->seqnum_id
, d
),
2510 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
2511 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
2512 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
2513 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
2514 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
2515 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
2516 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
2517 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
2518 le64toh(f
->header
->header_size
),
2519 le64toh(f
->header
->arena_size
),
2520 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
2521 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
2522 yes_no(journal_file_rotate_suggested(f
, 0)),
2523 le64toh(f
->header
->head_entry_seqnum
),
2524 le64toh(f
->header
->tail_entry_seqnum
),
2525 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)),
2526 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)),
2527 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
),
2528 le64toh(f
->header
->n_objects
),
2529 le64toh(f
->header
->n_entries
));
2531 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
2532 printf("Data Objects: %"PRIu64
"\n"
2533 "Data Hash Table Fill: %.1f%%\n",
2534 le64toh(f
->header
->n_data
),
2535 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
2537 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
2538 printf("Field Objects: %"PRIu64
"\n"
2539 "Field Hash Table Fill: %.1f%%\n",
2540 le64toh(f
->header
->n_fields
),
2541 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
2543 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
2544 printf("Tag Objects: %"PRIu64
"\n",
2545 le64toh(f
->header
->n_tags
));
2546 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
2547 printf("Entry Array Objects: %"PRIu64
"\n",
2548 le64toh(f
->header
->n_entry_arrays
));
2550 if (fstat(f
->fd
, &st
) >= 0)
2551 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
2554 static int journal_file_warn_btrfs(JournalFile
*f
) {
2560 /* Before we write anything, check if the COW logic is turned
2561 * off on btrfs. Given our write pattern that is quite
2562 * unfriendly to COW file systems this should greatly improve
2563 * performance on COW file systems, such as btrfs, at the
2564 * expense of data integrity features (which shouldn't be too
2565 * bad, given that we do our own checksumming). */
2567 r
= btrfs_is_filesystem(f
->fd
);
2569 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
2573 r
= read_attr_fd(f
->fd
, &attrs
);
2575 return log_warning_errno(r
, "Failed to read file attributes: %m");
2577 if (attrs
& FS_NOCOW_FL
) {
2578 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2582 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2583 "This is likely to slow down journal access substantially, please consider turning "
2584 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
2589 int journal_file_open(
2595 JournalMetrics
*metrics
,
2596 MMapCache
*mmap_cache
,
2597 JournalFile
*template,
2598 JournalFile
**ret
) {
2600 bool newly_created
= false;
2608 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
2609 (flags
& O_ACCMODE
) != O_RDWR
)
2612 if (!endswith(fname
, ".journal") &&
2613 !endswith(fname
, ".journal~"))
2616 f
= new0(JournalFile
, 1);
2624 f
->prot
= prot_from_flags(flags
);
2625 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
2626 #if defined(HAVE_LZ4)
2627 f
->compress_lz4
= compress
;
2628 #elif defined(HAVE_XZ)
2629 f
->compress_xz
= compress
;
2636 f
->mmap
= mmap_cache_ref(mmap_cache
);
2638 f
->mmap
= mmap_cache_new();
2645 f
->path
= strdup(fname
);
2651 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
2652 if (!f
->chain_cache
) {
2657 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
2663 r
= journal_file_fstat(f
);
2667 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
2669 (void) journal_file_warn_btrfs(f
);
2671 /* Let's attach the creation time to the journal file,
2672 * so that the vacuuming code knows the age of this
2673 * file even if the file might end up corrupted one
2674 * day... Ideally we'd just use the creation time many
2675 * file systems maintain for each file, but there is
2676 * currently no usable API to query this, hence let's
2677 * emulate this via extended attributes. If extended
2678 * attributes are not supported we'll just skip this,
2679 * and rely solely on mtime/atime/ctime of the file. */
2681 fd_setcrtime(f
->fd
, 0);
2684 /* Try to load the FSPRG state, and if we can't, then
2685 * just don't do sealing */
2687 r
= journal_file_fss_load(f
);
2693 r
= journal_file_init_header(f
, template);
2697 r
= journal_file_fstat(f
);
2701 newly_created
= true;
2704 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
2709 r
= mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
);
2715 if (!newly_created
) {
2716 r
= journal_file_verify_header(f
);
2722 if (!newly_created
&& f
->writable
) {
2723 r
= journal_file_fss_load(f
);
2731 journal_default_metrics(metrics
, f
->fd
);
2732 f
->metrics
= *metrics
;
2733 } else if (template)
2734 f
->metrics
= template->metrics
;
2736 r
= journal_file_refresh_header(f
);
2742 r
= journal_file_hmac_setup(f
);
2747 if (newly_created
) {
2748 r
= journal_file_setup_field_hash_table(f
);
2752 r
= journal_file_setup_data_hash_table(f
);
2757 r
= journal_file_append_first_tag(f
);
2763 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
)) {
2772 if (f
->fd
>= 0 && mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
2775 journal_file_close(f
);
2780 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
) {
2781 _cleanup_free_
char *p
= NULL
;
2783 JournalFile
*old_file
, *new_file
= NULL
;
2791 if (!old_file
->writable
)
2794 if (!endswith(old_file
->path
, ".journal"))
2797 l
= strlen(old_file
->path
);
2798 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
2799 (int) l
- 8, old_file
->path
,
2800 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
2801 le64toh((*f
)->header
->head_entry_seqnum
),
2802 le64toh((*f
)->header
->head_entry_realtime
));
2806 /* Try to rename the file to the archived version. If the file
2807 * already was deleted, we'll get ENOENT, let's ignore that
2809 r
= rename(old_file
->path
, p
);
2810 if (r
< 0 && errno
!= ENOENT
)
2813 old_file
->header
->state
= STATE_ARCHIVED
;
2815 /* Currently, btrfs is not very good with out write patterns
2816 * and fragments heavily. Let's defrag our journal files when
2817 * we archive them */
2818 old_file
->defrag_on_close
= true;
2820 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, old_file
, &new_file
);
2821 journal_file_close(old_file
);
2827 int journal_file_open_reliably(
2833 JournalMetrics
*metrics
,
2834 MMapCache
*mmap_cache
,
2835 JournalFile
*template,
2836 JournalFile
**ret
) {
2840 _cleanup_free_
char *p
= NULL
;
2842 r
= journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2844 -EBADMSG
, /* corrupted */
2845 -ENODATA
, /* truncated */
2846 -EHOSTDOWN
, /* other machine */
2847 -EPROTONOSUPPORT
, /* incompatible feature */
2848 -EBUSY
, /* unclean shutdown */
2849 -ESHUTDOWN
, /* already archived */
2850 -EIO
, /* IO error, including SIGBUS on mmap */
2851 -EIDRM
/* File has been deleted */))
2854 if ((flags
& O_ACCMODE
) == O_RDONLY
)
2857 if (!(flags
& O_CREAT
))
2860 if (!endswith(fname
, ".journal"))
2863 /* The file is corrupted. Rotate it away and try it again (but only once) */
2866 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
2868 now(CLOCK_REALTIME
),
2872 if (rename(fname
, p
) < 0)
2875 /* btrfs doesn't cope well with our write pattern and
2876 * fragments heavily. Let's defrag all files we rotate */
2878 (void) chattr_path(p
, false, FS_NOCOW_FL
);
2879 (void) btrfs_defrag(p
);
2881 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
2883 return journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2886 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
2888 uint64_t q
, xor_hash
= 0;
2901 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
2902 ts
.realtime
= le64toh(o
->entry
.realtime
);
2904 n
= journal_file_entry_n_items(o
);
2905 /* alloca() can't take 0, hence let's allocate at least one */
2906 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
2908 for (i
= 0; i
< n
; i
++) {
2915 q
= le64toh(o
->entry
.items
[i
].object_offset
);
2916 le_hash
= o
->entry
.items
[i
].hash
;
2918 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
2922 if (le_hash
!= o
->data
.hash
)
2925 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
2928 /* We hit the limit on 32bit machines */
2929 if ((uint64_t) t
!= l
)
2932 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
2933 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2936 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
2937 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
2941 data
= from
->compress_buffer
;
2944 return -EPROTONOSUPPORT
;
2947 data
= o
->data
.payload
;
2949 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
2953 xor_hash
^= le64toh(u
->data
.hash
);
2954 items
[i
].object_offset
= htole64(h
);
2955 items
[i
].hash
= u
->data
.hash
;
2957 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
2962 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
2964 if (mmap_cache_got_sigbus(to
->mmap
, to
->fd
))
2970 void journal_reset_metrics(JournalMetrics
*m
) {
2973 /* Set everything to "pick automatic values". */
2975 *m
= (JournalMetrics
) {
2976 .min_use
= (uint64_t) -1,
2977 .max_use
= (uint64_t) -1,
2978 .min_size
= (uint64_t) -1,
2979 .max_size
= (uint64_t) -1,
2980 .keep_free
= (uint64_t) -1,
2981 .n_max_files
= (uint64_t) -1,
2985 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
2986 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
2993 if (fstatvfs(fd
, &ss
) >= 0)
2994 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
2996 log_debug_errno(errno
, "Failed to detremine disk size: %m");
3000 if (m
->max_use
== (uint64_t) -1) {
3003 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3005 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3006 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3008 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3009 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3011 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3013 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3015 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3016 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3019 if (m
->min_use
== (uint64_t) -1)
3020 m
->min_use
= DEFAULT_MIN_USE
;
3022 if (m
->min_use
> m
->max_use
)
3023 m
->min_use
= m
->max_use
;
3025 if (m
->max_size
== (uint64_t) -1) {
3026 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3028 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3029 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3031 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3033 if (m
->max_size
!= 0) {
3034 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3035 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3037 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3038 m
->max_use
= m
->max_size
*2;
3041 if (m
->min_size
== (uint64_t) -1)
3042 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3044 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3046 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3047 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3049 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3050 m
->max_size
= m
->min_size
;
3053 if (m
->keep_free
== (uint64_t) -1) {
3056 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3058 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3059 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3062 m
->keep_free
= DEFAULT_KEEP_FREE
;
3065 if (m
->n_max_files
== (uint64_t) -1)
3066 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3068 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3069 format_bytes(a
, sizeof(a
), m
->min_use
),
3070 format_bytes(b
, sizeof(b
), m
->max_use
),
3071 format_bytes(c
, sizeof(c
), m
->max_size
),
3072 format_bytes(d
, sizeof(d
), m
->min_size
),
3073 format_bytes(e
, sizeof(e
), m
->keep_free
),
3077 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3082 if (f
->header
->head_entry_realtime
== 0)
3085 *from
= le64toh(f
->header
->head_entry_realtime
);
3089 if (f
->header
->tail_entry_realtime
== 0)
3092 *to
= le64toh(f
->header
->tail_entry_realtime
);
3098 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3106 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3110 if (le64toh(o
->data
.n_entries
) <= 0)
3114 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3118 *from
= le64toh(o
->entry
.monotonic
);
3122 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3126 r
= generic_array_get_plus_one(f
,
3127 le64toh(o
->data
.entry_offset
),
3128 le64toh(o
->data
.entry_array_offset
),
3129 le64toh(o
->data
.n_entries
)-1,
3134 *to
= le64toh(o
->entry
.monotonic
);
3140 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3143 /* If we gained new header fields we gained new features,
3144 * hence suggest a rotation */
3145 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3146 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3150 /* Let's check if the hash tables grew over a certain fill
3151 * level (75%, borrowing this value from Java's hash table
3152 * implementation), and if so suggest a rotation. To calculate
3153 * the fill level we need the n_data field, which only exists
3154 * in newer versions. */
3156 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3157 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3158 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3160 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3161 le64toh(f
->header
->n_data
),
3162 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3163 (unsigned long long) f
->last_stat
.st_size
,
3164 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3168 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3169 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3170 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3172 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3173 le64toh(f
->header
->n_fields
),
3174 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3178 /* Are the data objects properly indexed by field objects? */
3179 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3180 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3181 le64toh(f
->header
->n_data
) > 0 &&
3182 le64toh(f
->header
->n_fields
) == 0)
3185 if (max_file_usec
> 0) {
3188 h
= le64toh(f
->header
->head_entry_realtime
);
3189 t
= now(CLOCK_REALTIME
);
3191 if (h
> 0 && t
> h
+ max_file_usec
)