src/journal/journal-file.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2011 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <errno.h>
  23 #include <fcntl.h>
  24 #include <linux/fs.h>
  25 #include <stddef.h>
  26 #include <sys/mman.h>
  27 #include <sys/statvfs.h>
  28 #include <sys/uio.h>
  29 #include <unistd.h>
  30
  31 #include "btrfs-util.h"
  32 #include "compress.h"
  33 #include "fd-util.h"
  34 #include "journal-authenticate.h"
  35 #include "journal-def.h"
  36 #include "journal-file.h"
  37 #include "lookup3.h"
  38 #include "random-util.h"
  39 #include "string-util.h"
  40
  41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
  42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
  43
  44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
  45
  46 /* This is the minimum journal file size */
  47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
  48
  49 /* These are the lower and upper bounds if we deduce the max_use value
  50  * from the file system size */
  51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
  52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
  53
  54 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
  55 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */
  56
  57 /* This is the upper bound if we deduce max_size from max_use */
  58 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
  59
  60 /* This is the upper bound if we deduce the keep_free value from the
  61  * file system size */
  62 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
  63
  64 /* This is the keep_free value when we can't determine the system
  65  * size */
  66 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
  67
  68 /* This is the default maximum number of journal files to keep around. */
  69 #define DEFAULT_N_MAX_FILES (100)
  70
  71 /* n_data was the first entry we added after the initial file format design */
  72 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
  73
  74 /* How many entries to keep in the entry array chain cache at max */
  75 #define CHAIN_CACHE_MAX 20
  76
  77 /* How much to increase the journal file size at once each time we allocate something new. */
  78 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
  79
  80 /* Reread fstat() of the file for detecting deletions at least this often */
  81 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
  82
  83 /* The mmap context to use for the header we pick as one above the last defined typed */
  84 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
  85
  86 static int journal_file_set_online(JournalFile *f) {
  87         assert(f);
  88
  89         if (!f->writable)
  90                 return -EPERM;
  91
  92         if (!(f->fd >= 0 && f->header))
  93                 return -EINVAL;
  94
  95         if (mmap_cache_got_sigbus(f->mmap, f->fd))
  96                 return -EIO;
  97
  98         switch(f->header->state) {
  99                 case STATE_ONLINE:
 100                         return 0;
 101
 102                 case STATE_OFFLINE:
 103                         f->header->state = STATE_ONLINE;
 104                         fsync(f->fd);
 105                         return 0;
 106
 107                 default:
 108                         return -EINVAL;
 109         }
 110 }
 111
 112 int journal_file_set_offline(JournalFile *f) {
 113         assert(f);
 114
 115         if (!f->writable)
 116                 return -EPERM;
 117
 118         if (!(f->fd >= 0 && f->header))
 119                 return -EINVAL;
 120
 121         if (f->header->state != STATE_ONLINE)
 122                 return 0;
 123
 124         fsync(f->fd);
 125
 126         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 127                 return -EIO;
 128
 129         f->header->state = STATE_OFFLINE;
 130
 131         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 132                 return -EIO;
 133
 134         fsync(f->fd);
 135
 136         return 0;
 137 }
 138
 139 JournalFile* journal_file_close(JournalFile *f) {
 140         assert(f);
 141
 142 #ifdef HAVE_GCRYPT
 143         /* Write the final tag */
 144         if (f->seal && f->writable)
 145                 journal_file_append_tag(f);
 146 #endif
 147
 148         journal_file_set_offline(f);
 149
 150         if (f->mmap && f->fd >= 0)
 151                 mmap_cache_close_fd(f->mmap, f->fd);
 152
 153         if (f->fd >= 0 && f->defrag_on_close) {
 154
 155                 /* Be friendly to btrfs: turn COW back on again now,
 156                  * and defragment the file. We won't write to the file
 157                  * ever again, hence remove all fragmentation, and
 158                  * reenable all the good bits COW usually provides
 159                  * (such as data checksumming). */
 160
 161                 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
 162                 (void) btrfs_defrag_fd(f->fd);
 163         }
 164
 165         safe_close(f->fd);
 166         free(f->path);
 167
 168         if (f->mmap)
 169                 mmap_cache_unref(f->mmap);
 170
 171         ordered_hashmap_free_free(f->chain_cache);
 172
 173 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 174         free(f->compress_buffer);
 175 #endif
 176
 177 #ifdef HAVE_GCRYPT
 178         if (f->fss_file)
 179                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
 180         else
 181                 free(f->fsprg_state);
 182
 183         free(f->fsprg_seed);
 184
 185         if (f->hmac)
 186                 gcry_md_close(f->hmac);
 187 #endif
 188
 189         free(f);
 190         return NULL;
 191 }
 192
 193 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
 194         Header h = {};
 195         ssize_t k;
 196         int r;
 197
 198         assert(f);
 199
 200         memcpy(h.signature, HEADER_SIGNATURE, 8);
 201         h.header_size = htole64(ALIGN64(sizeof(h)));
 202
 203         h.incompatible_flags |= htole32(
 204                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
 205                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
 206
 207         h.compatible_flags = htole32(
 208                 f->seal * HEADER_COMPATIBLE_SEALED);
 209
 210         r = sd_id128_randomize(&h.file_id);
 211         if (r < 0)
 212                 return r;
 213
 214         if (template) {
 215                 h.seqnum_id = template->header->seqnum_id;
 216                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
 217         } else
 218                 h.seqnum_id = h.file_id;
 219
 220         k = pwrite(f->fd, &h, sizeof(h), 0);
 221         if (k < 0)
 222                 return -errno;
 223
 224         if (k != sizeof(h))
 225                 return -EIO;
 226
 227         return 0;
 228 }
 229
 230 static int journal_file_refresh_header(JournalFile *f) {
 231         sd_id128_t boot_id;
 232         int r;
 233
 234         assert(f);
 235
 236         r = sd_id128_get_machine(&f->header->machine_id);
 237         if (r < 0)
 238                 return r;
 239
 240         r = sd_id128_get_boot(&boot_id);
 241         if (r < 0)
 242                 return r;
 243
 244         if (sd_id128_equal(boot_id, f->header->boot_id))
 245                 f->tail_entry_monotonic_valid = true;
 246
 247         f->header->boot_id = boot_id;
 248
 249         r = journal_file_set_online(f);
 250
 251         /* Sync the online state to disk */
 252         fsync(f->fd);
 253
 254         return r;
 255 }
 256
 257 static int journal_file_verify_header(JournalFile *f) {
 258         uint32_t flags;
 259
 260         assert(f);
 261
 262         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
 263                 return -EBADMSG;
 264
 265         /* In both read and write mode we refuse to open files with
 266          * incompatible flags we don't know */
 267         flags = le32toh(f->header->incompatible_flags);
 268         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
 269                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
 270                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
 271                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
 272                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
 273                 if (flags)
 274                         log_debug("Journal file %s uses incompatible flags %"PRIx32
 275                                   " disabled at compilation time.", f->path, flags);
 276                 return -EPROTONOSUPPORT;
 277         }
 278
 279         /* When open for writing we refuse to open files with
 280          * compatible flags, too */
 281         flags = le32toh(f->header->compatible_flags);
 282         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
 283                 if (flags & ~HEADER_COMPATIBLE_ANY)
 284                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
 285                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
 286                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
 287                 if (flags)
 288                         log_debug("Journal file %s uses compatible flags %"PRIx32
 289                                   " disabled at compilation time.", f->path, flags);
 290                 return -EPROTONOSUPPORT;
 291         }
 292
 293         if (f->header->state >= _STATE_MAX)
 294                 return -EBADMSG;
 295
 296         /* The first addition was n_data, so check that we are at least this large */
 297         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
 298                 return -EBADMSG;
 299
 300         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
 301                 return -EBADMSG;
 302
 303         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
 304                 return -ENODATA;
 305
 306         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
 307                 return -ENODATA;
 308
 309         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
 310             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
 311             !VALID64(le64toh(f->header->tail_object_offset)) ||
 312             !VALID64(le64toh(f->header->entry_array_offset)))
 313                 return -ENODATA;
 314
 315         if (f->writable) {
 316                 uint8_t state;
 317                 sd_id128_t machine_id;
 318                 int r;
 319
 320                 r = sd_id128_get_machine(&machine_id);
 321                 if (r < 0)
 322                         return r;
 323
 324                 if (!sd_id128_equal(machine_id, f->header->machine_id))
 325                         return -EHOSTDOWN;
 326
 327                 state = f->header->state;
 328
 329                 if (state == STATE_ONLINE) {
 330                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
 331                         return -EBUSY;
 332                 } else if (state == STATE_ARCHIVED)
 333                         return -ESHUTDOWN;
 334                 else if (state != STATE_OFFLINE) {
 335                         log_debug("Journal file %s has unknown state %i.", f->path, state);
 336                         return -EBUSY;
 337                 }
 338         }
 339
 340         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
 341         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
 342
 343         f->seal = JOURNAL_HEADER_SEALED(f->header);
 344
 345         return 0;
 346 }
 347
 348 static int journal_file_fstat(JournalFile *f) {
 349         assert(f);
 350         assert(f->fd >= 0);
 351
 352         if (fstat(f->fd, &f->last_stat) < 0)
 353                 return -errno;
 354
 355         f->last_stat_usec = now(CLOCK_MONOTONIC);
 356
 357         /* Refuse appending to files that are already deleted */
 358         if (f->last_stat.st_nlink <= 0)
 359                 return -EIDRM;
 360
 361         return 0;
 362 }
 363
 364 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
 365         uint64_t old_size, new_size;
 366         int r;
 367
 368         assert(f);
 369
 370         /* We assume that this file is not sparse, and we know that
 371          * for sure, since we always call posix_fallocate()
 372          * ourselves */
 373
 374         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 375                 return -EIO;
 376
 377         old_size =
 378                 le64toh(f->header->header_size) +
 379                 le64toh(f->header->arena_size);
 380
 381         new_size = PAGE_ALIGN(offset + size);
 382         if (new_size < le64toh(f->header->header_size))
 383                 new_size = le64toh(f->header->header_size);
 384
 385         if (new_size <= old_size) {
 386
 387                 /* We already pre-allocated enough space, but before
 388                  * we write to it, let's check with fstat() if the
 389                  * file got deleted, in order make sure we don't throw
 390                  * away the data immediately. Don't check fstat() for
 391                  * all writes though, but only once ever 10s. */
 392
 393                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
 394                         return 0;
 395
 396                 return journal_file_fstat(f);
 397         }
 398
 399         /* Allocate more space. */
 400
 401         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 402                 return -E2BIG;
 403
 404         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
 405                 struct statvfs svfs;
 406
 407                 if (fstatvfs(f->fd, &svfs) >= 0) {
 408                         uint64_t available;
 409
 410                         available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
 411
 412                         if (new_size - old_size > available)
 413                                 return -E2BIG;
 414                 }
 415         }
 416
 417         /* Increase by larger blocks at once */
 418         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
 419         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 420                 new_size = f->metrics.max_size;
 421
 422         /* Note that the glibc fallocate() fallback is very
 423            inefficient, hence we try to minimize the allocation area
 424            as we can. */
 425         r = posix_fallocate(f->fd, old_size, new_size - old_size);
 426         if (r != 0)
 427                 return -r;
 428
 429         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
 430
 431         return journal_file_fstat(f);
 432 }
 433
 434 static unsigned type_to_context(ObjectType type) {
 435         /* One context for each type, plus one catch-all for the rest */
 436         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
 437         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
 438         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
 439 }
 440
 441 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
 442         int r;
 443
 444         assert(f);
 445         assert(ret);
 446
 447         if (size <= 0)
 448                 return -EINVAL;
 449
 450         /* Avoid SIGBUS on invalid accesses */
 451         if (offset + size > (uint64_t) f->last_stat.st_size) {
 452                 /* Hmm, out of range? Let's refresh the fstat() data
 453                  * first, before we trust that check. */
 454
 455                 r = journal_file_fstat(f);
 456                 if (r < 0)
 457                         return r;
 458
 459                 if (offset + size > (uint64_t) f->last_stat.st_size)
 460                         return -EADDRNOTAVAIL;
 461         }
 462
 463         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
 464 }
 465
 466 static uint64_t minimum_header_size(Object *o) {
 467
 468         static const uint64_t table[] = {
 469                 [OBJECT_DATA] = sizeof(DataObject),
 470                 [OBJECT_FIELD] = sizeof(FieldObject),
 471                 [OBJECT_ENTRY] = sizeof(EntryObject),
 472                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
 473                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
 474                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
 475                 [OBJECT_TAG] = sizeof(TagObject),
 476         };
 477
 478         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
 479                 return sizeof(ObjectHeader);
 480
 481         return table[o->object.type];
 482 }
 483
 484 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
 485         int r;
 486         void *t;
 487         Object *o;
 488         uint64_t s;
 489
 490         assert(f);
 491         assert(ret);
 492
 493         /* Objects may only be located at multiple of 64 bit */
 494         if (!VALID64(offset))
 495                 return -EFAULT;
 496
 497         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
 498         if (r < 0)
 499                 return r;
 500
 501         o = (Object*) t;
 502         s = le64toh(o->object.size);
 503
 504         if (s < sizeof(ObjectHeader))
 505                 return -EBADMSG;
 506
 507         if (o->object.type <= OBJECT_UNUSED)
 508                 return -EBADMSG;
 509
 510         if (s < minimum_header_size(o))
 511                 return -EBADMSG;
 512
 513         if (type > OBJECT_UNUSED && o->object.type != type)
 514                 return -EBADMSG;
 515
 516         if (s > sizeof(ObjectHeader)) {
 517                 r = journal_file_move_to(f, type, false, offset, s, &t);
 518                 if (r < 0)
 519                         return r;
 520
 521                 o = (Object*) t;
 522         }
 523
 524         *ret = o;
 525         return 0;
 526 }
 527
 528 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
 529         uint64_t r;
 530
 531         assert(f);
 532
 533         r = le64toh(f->header->tail_entry_seqnum) + 1;
 534
 535         if (seqnum) {
 536                 /* If an external seqnum counter was passed, we update
 537                  * both the local and the external one, and set it to
 538                  * the maximum of both */
 539
 540                 if (*seqnum + 1 > r)
 541                         r = *seqnum + 1;
 542
 543                 *seqnum = r;
 544         }
 545
 546         f->header->tail_entry_seqnum = htole64(r);
 547
 548         if (f->header->head_entry_seqnum == 0)
 549                 f->header->head_entry_seqnum = htole64(r);
 550
 551         return r;
 552 }
 553
 554 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
 555         int r;
 556         uint64_t p;
 557         Object *tail, *o;
 558         void *t;
 559
 560         assert(f);
 561         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
 562         assert(size >= sizeof(ObjectHeader));
 563         assert(offset);
 564         assert(ret);
 565
 566         r = journal_file_set_online(f);
 567         if (r < 0)
 568                 return r;
 569
 570         p = le64toh(f->header->tail_object_offset);
 571         if (p == 0)
 572                 p = le64toh(f->header->header_size);
 573         else {
 574                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
 575                 if (r < 0)
 576                         return r;
 577
 578                 p += ALIGN64(le64toh(tail->object.size));
 579         }
 580
 581         r = journal_file_allocate(f, p, size);
 582         if (r < 0)
 583                 return r;
 584
 585         r = journal_file_move_to(f, type, false, p, size, &t);
 586         if (r < 0)
 587                 return r;
 588
 589         o = (Object*) t;
 590
 591         zero(o->object);
 592         o->object.type = type;
 593         o->object.size = htole64(size);
 594
 595         f->header->tail_object_offset = htole64(p);
 596         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
 597
 598         *ret = o;
 599         *offset = p;
 600
 601         return 0;
 602 }
 603
 604 static int journal_file_setup_data_hash_table(JournalFile *f) {
 605         uint64_t s, p;
 606         Object *o;
 607         int r;
 608
 609         assert(f);
 610
 611         /* We estimate that we need 1 hash table entry per 768 bytes
 612            of journal file and we want to make sure we never get
 613            beyond 75% fill level. Calculate the hash table size for
 614            the maximum file size based on these metrics. */
 615
 616         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
 617         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
 618                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
 619
 620         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
 621
 622         r = journal_file_append_object(f,
 623                                        OBJECT_DATA_HASH_TABLE,
 624                                        offsetof(Object, hash_table.items) + s,
 625                                        &o, &p);
 626         if (r < 0)
 627                 return r;
 628
 629         memzero(o->hash_table.items, s);
 630
 631         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 632         f->header->data_hash_table_size = htole64(s);
 633
 634         return 0;
 635 }
 636
 637 static int journal_file_setup_field_hash_table(JournalFile *f) {
 638         uint64_t s, p;
 639         Object *o;
 640         int r;
 641
 642         assert(f);
 643
 644         /* We use a fixed size hash table for the fields as this
 645          * number should grow very slowly only */
 646
 647         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
 648         r = journal_file_append_object(f,
 649                                        OBJECT_FIELD_HASH_TABLE,
 650                                        offsetof(Object, hash_table.items) + s,
 651                                        &o, &p);
 652         if (r < 0)
 653                 return r;
 654
 655         memzero(o->hash_table.items, s);
 656
 657         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 658         f->header->field_hash_table_size = htole64(s);
 659
 660         return 0;
 661 }
 662
 663 int journal_file_map_data_hash_table(JournalFile *f) {
 664         uint64_t s, p;
 665         void *t;
 666         int r;
 667
 668         assert(f);
 669
 670         if (f->data_hash_table)
 671                 return 0;
 672
 673         p = le64toh(f->header->data_hash_table_offset);
 674         s = le64toh(f->header->data_hash_table_size);
 675
 676         r = journal_file_move_to(f,
 677                                  OBJECT_DATA_HASH_TABLE,
 678                                  true,
 679                                  p, s,
 680                                  &t);
 681         if (r < 0)
 682                 return r;
 683
 684         f->data_hash_table = t;
 685         return 0;
 686 }
 687
 688 int journal_file_map_field_hash_table(JournalFile *f) {
 689         uint64_t s, p;
 690         void *t;
 691         int r;
 692
 693         assert(f);
 694
 695         if (f->field_hash_table)
 696                 return 0;
 697
 698         p = le64toh(f->header->field_hash_table_offset);
 699         s = le64toh(f->header->field_hash_table_size);
 700
 701         r = journal_file_move_to(f,
 702                                  OBJECT_FIELD_HASH_TABLE,
 703                                  true,
 704                                  p, s,
 705                                  &t);
 706         if (r < 0)
 707                 return r;
 708
 709         f->field_hash_table = t;
 710         return 0;
 711 }
 712
 713 static int journal_file_link_field(
 714                 JournalFile *f,
 715                 Object *o,
 716                 uint64_t offset,
 717                 uint64_t hash) {
 718
 719         uint64_t p, h, m;
 720         int r;
 721
 722         assert(f);
 723         assert(o);
 724         assert(offset > 0);
 725
 726         if (o->object.type != OBJECT_FIELD)
 727                 return -EINVAL;
 728
 729         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 730         if (m <= 0)
 731                 return -EBADMSG;
 732
 733         /* This might alter the window we are looking at */
 734         o->field.next_hash_offset = o->field.head_data_offset = 0;
 735
 736         h = hash % m;
 737         p = le64toh(f->field_hash_table[h].tail_hash_offset);
 738         if (p == 0)
 739                 f->field_hash_table[h].head_hash_offset = htole64(offset);
 740         else {
 741                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 742                 if (r < 0)
 743                         return r;
 744
 745                 o->field.next_hash_offset = htole64(offset);
 746         }
 747
 748         f->field_hash_table[h].tail_hash_offset = htole64(offset);
 749
 750         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
 751                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
 752
 753         return 0;
 754 }
 755
 756 static int journal_file_link_data(
 757                 JournalFile *f,
 758                 Object *o,
 759                 uint64_t offset,
 760                 uint64_t hash) {
 761
 762         uint64_t p, h, m;
 763         int r;
 764
 765         assert(f);
 766         assert(o);
 767         assert(offset > 0);
 768
 769         if (o->object.type != OBJECT_DATA)
 770                 return -EINVAL;
 771
 772         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 773         if (m <= 0)
 774                 return -EBADMSG;
 775
 776         /* This might alter the window we are looking at */
 777         o->data.next_hash_offset = o->data.next_field_offset = 0;
 778         o->data.entry_offset = o->data.entry_array_offset = 0;
 779         o->data.n_entries = 0;
 780
 781         h = hash % m;
 782         p = le64toh(f->data_hash_table[h].tail_hash_offset);
 783         if (p == 0)
 784                 /* Only entry in the hash table is easy */
 785                 f->data_hash_table[h].head_hash_offset = htole64(offset);
 786         else {
 787                 /* Move back to the previous data object, to patch in
 788                  * pointer */
 789
 790                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 791                 if (r < 0)
 792                         return r;
 793
 794                 o->data.next_hash_offset = htole64(offset);
 795         }
 796
 797         f->data_hash_table[h].tail_hash_offset = htole64(offset);
 798
 799         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
 800                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
 801
 802         return 0;
 803 }
 804
 805 int journal_file_find_field_object_with_hash(
 806                 JournalFile *f,
 807                 const void *field, uint64_t size, uint64_t hash,
 808                 Object **ret, uint64_t *offset) {
 809
 810         uint64_t p, osize, h, m;
 811         int r;
 812
 813         assert(f);
 814         assert(field && size > 0);
 815
 816         /* If the field hash table is empty, we can't find anything */
 817         if (le64toh(f->header->field_hash_table_size) <= 0)
 818                 return 0;
 819
 820         /* Map the field hash table, if it isn't mapped yet. */
 821         r = journal_file_map_field_hash_table(f);
 822         if (r < 0)
 823                 return r;
 824
 825         osize = offsetof(Object, field.payload) + size;
 826
 827         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 828         if (m <= 0)
 829                 return -EBADMSG;
 830
 831         h = hash % m;
 832         p = le64toh(f->field_hash_table[h].head_hash_offset);
 833
 834         while (p > 0) {
 835                 Object *o;
 836
 837                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 838                 if (r < 0)
 839                         return r;
 840
 841                 if (le64toh(o->field.hash) == hash &&
 842                     le64toh(o->object.size) == osize &&
 843                     memcmp(o->field.payload, field, size) == 0) {
 844
 845                         if (ret)
 846                                 *ret = o;
 847                         if (offset)
 848                                 *offset = p;
 849
 850                         return 1;
 851                 }
 852
 853                 p = le64toh(o->field.next_hash_offset);
 854         }
 855
 856         return 0;
 857 }
 858
 859 int journal_file_find_field_object(
 860                 JournalFile *f,
 861                 const void *field, uint64_t size,
 862                 Object **ret, uint64_t *offset) {
 863
 864         uint64_t hash;
 865
 866         assert(f);
 867         assert(field && size > 0);
 868
 869         hash = hash64(field, size);
 870
 871         return journal_file_find_field_object_with_hash(f,
 872                                                         field, size, hash,
 873                                                         ret, offset);
 874 }
 875
 876 int journal_file_find_data_object_with_hash(
 877                 JournalFile *f,
 878                 const void *data, uint64_t size, uint64_t hash,
 879                 Object **ret, uint64_t *offset) {
 880
 881         uint64_t p, osize, h, m;
 882         int r;
 883
 884         assert(f);
 885         assert(data || size == 0);
 886
 887         /* If there's no data hash table, then there's no entry. */
 888         if (le64toh(f->header->data_hash_table_size) <= 0)
 889                 return 0;
 890
 891         /* Map the data hash table, if it isn't mapped yet. */
 892         r = journal_file_map_data_hash_table(f);
 893         if (r < 0)
 894                 return r;
 895
 896         osize = offsetof(Object, data.payload) + size;
 897
 898         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 899         if (m <= 0)
 900                 return -EBADMSG;
 901
 902         h = hash % m;
 903         p = le64toh(f->data_hash_table[h].head_hash_offset);
 904
 905         while (p > 0) {
 906                 Object *o;
 907
 908                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 909                 if (r < 0)
 910                         return r;
 911
 912                 if (le64toh(o->data.hash) != hash)
 913                         goto next;
 914
 915                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
 916 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 917                         uint64_t l;
 918                         size_t rsize = 0;
 919
 920                         l = le64toh(o->object.size);
 921                         if (l <= offsetof(Object, data.payload))
 922                                 return -EBADMSG;
 923
 924                         l -= offsetof(Object, data.payload);
 925
 926                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
 927                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
 928                         if (r < 0)
 929                                 return r;
 930
 931                         if (rsize == size &&
 932                             memcmp(f->compress_buffer, data, size) == 0) {
 933
 934                                 if (ret)
 935                                         *ret = o;
 936
 937                                 if (offset)
 938                                         *offset = p;
 939
 940                                 return 1;
 941                         }
 942 #else
 943                         return -EPROTONOSUPPORT;
 944 #endif
 945                 } else if (le64toh(o->object.size) == osize &&
 946                            memcmp(o->data.payload, data, size) == 0) {
 947
 948                         if (ret)
 949                                 *ret = o;
 950
 951                         if (offset)
 952                                 *offset = p;
 953
 954                         return 1;
 955                 }
 956
 957         next:
 958                 p = le64toh(o->data.next_hash_offset);
 959         }
 960
 961         return 0;
 962 }
 963
 964 int journal_file_find_data_object(
 965                 JournalFile *f,
 966                 const void *data, uint64_t size,
 967                 Object **ret, uint64_t *offset) {
 968
 969         uint64_t hash;
 970
 971         assert(f);
 972         assert(data || size == 0);
 973
 974         hash = hash64(data, size);
 975
 976         return journal_file_find_data_object_with_hash(f,
 977                                                        data, size, hash,
 978                                                        ret, offset);
 979 }
 980
 981 static int journal_file_append_field(
 982                 JournalFile *f,
 983                 const void *field, uint64_t size,
 984                 Object **ret, uint64_t *offset) {
 985
 986         uint64_t hash, p;
 987         uint64_t osize;
 988         Object *o;
 989         int r;
 990
 991         assert(f);
 992         assert(field && size > 0);
 993
 994         hash = hash64(field, size);
 995
 996         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
 997         if (r < 0)
 998                 return r;
 999         else if (r > 0) {
1000
1001                 if (ret)
1002                         *ret = o;
1003
1004                 if (offset)
1005                         *offset = p;
1006
1007                 return 0;
1008         }
1009
1010         osize = offsetof(Object, field.payload) + size;
1011         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1012         if (r < 0)
1013                 return r;
1014
1015         o->field.hash = htole64(hash);
1016         memcpy(o->field.payload, field, size);
1017
1018         r = journal_file_link_field(f, o, p, hash);
1019         if (r < 0)
1020                 return r;
1021
1022         /* The linking might have altered the window, so let's
1023          * refresh our pointer */
1024         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1025         if (r < 0)
1026                 return r;
1027
1028 #ifdef HAVE_GCRYPT
1029         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1030         if (r < 0)
1031                 return r;
1032 #endif
1033
1034         if (ret)
1035                 *ret = o;
1036
1037         if (offset)
1038                 *offset = p;
1039
1040         return 0;
1041 }
1042
1043 static int journal_file_append_data(
1044                 JournalFile *f,
1045                 const void *data, uint64_t size,
1046                 Object **ret, uint64_t *offset) {
1047
1048         uint64_t hash, p;
1049         uint64_t osize;
1050         Object *o;
1051         int r, compression = 0;
1052         const void *eq;
1053
1054         assert(f);
1055         assert(data || size == 0);
1056
1057         hash = hash64(data, size);
1058
1059         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1060         if (r < 0)
1061                 return r;
1062         if (r > 0) {
1063
1064                 if (ret)
1065                         *ret = o;
1066
1067                 if (offset)
1068                         *offset = p;
1069
1070                 return 0;
1071         }
1072
1073         osize = offsetof(Object, data.payload) + size;
1074         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1075         if (r < 0)
1076                 return r;
1077
1078         o->data.hash = htole64(hash);
1079
1080 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1081         if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1082                 size_t rsize = 0;
1083
1084                 compression = compress_blob(data, size, o->data.payload, &rsize);
1085
1086                 if (compression >= 0) {
1087                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1088                         o->object.flags |= compression;
1089
1090                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1091                                   size, rsize, object_compressed_to_string(compression));
1092                 } else
1093                         /* Compression didn't work, we don't really care why, let's continue without compression */
1094                         compression = 0;
1095         }
1096 #endif
1097
1098         if (compression == 0 && size > 0)
1099                 memcpy(o->data.payload, data, size);
1100
1101         r = journal_file_link_data(f, o, p, hash);
1102         if (r < 0)
1103                 return r;
1104
1105         /* The linking might have altered the window, so let's
1106          * refresh our pointer */
1107         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1108         if (r < 0)
1109                 return r;
1110
1111         if (!data)
1112                 eq = NULL;
1113         else
1114                 eq = memchr(data, '=', size);
1115         if (eq && eq > data) {
1116                 Object *fo = NULL;
1117                 uint64_t fp;
1118
1119                 /* Create field object ... */
1120                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1121                 if (r < 0)
1122                         return r;
1123
1124                 /* ... and link it in. */
1125                 o->data.next_field_offset = fo->field.head_data_offset;
1126                 fo->field.head_data_offset = le64toh(p);
1127         }
1128
1129 #ifdef HAVE_GCRYPT
1130         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1131         if (r < 0)
1132                 return r;
1133 #endif
1134
1135         if (ret)
1136                 *ret = o;
1137
1138         if (offset)
1139                 *offset = p;
1140
1141         return 0;
1142 }
1143
1144 uint64_t journal_file_entry_n_items(Object *o) {
1145         assert(o);
1146
1147         if (o->object.type != OBJECT_ENTRY)
1148                 return 0;
1149
1150         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1151 }
1152
1153 uint64_t journal_file_entry_array_n_items(Object *o) {
1154         assert(o);
1155
1156         if (o->object.type != OBJECT_ENTRY_ARRAY)
1157                 return 0;
1158
1159         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1160 }
1161
1162 uint64_t journal_file_hash_table_n_items(Object *o) {
1163         assert(o);
1164
1165         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1166             o->object.type != OBJECT_FIELD_HASH_TABLE)
1167                 return 0;
1168
1169         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1170 }
1171
1172 static int link_entry_into_array(JournalFile *f,
1173                                  le64_t *first,
1174                                  le64_t *idx,
1175                                  uint64_t p) {
1176         int r;
1177         uint64_t n = 0, ap = 0, q, i, a, hidx;
1178         Object *o;
1179
1180         assert(f);
1181         assert(first);
1182         assert(idx);
1183         assert(p > 0);
1184
1185         a = le64toh(*first);
1186         i = hidx = le64toh(*idx);
1187         while (a > 0) {
1188
1189                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1190                 if (r < 0)
1191                         return r;
1192
1193                 n = journal_file_entry_array_n_items(o);
1194                 if (i < n) {
1195                         o->entry_array.items[i] = htole64(p);
1196                         *idx = htole64(hidx + 1);
1197                         return 0;
1198                 }
1199
1200                 i -= n;
1201                 ap = a;
1202                 a = le64toh(o->entry_array.next_entry_array_offset);
1203         }
1204
1205         if (hidx > n)
1206                 n = (hidx+1) * 2;
1207         else
1208                 n = n * 2;
1209
1210         if (n < 4)
1211                 n = 4;
1212
1213         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1214                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1215                                        &o, &q);
1216         if (r < 0)
1217                 return r;
1218
1219 #ifdef HAVE_GCRYPT
1220         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1221         if (r < 0)
1222                 return r;
1223 #endif
1224
1225         o->entry_array.items[i] = htole64(p);
1226
1227         if (ap == 0)
1228                 *first = htole64(q);
1229         else {
1230                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1231                 if (r < 0)
1232                         return r;
1233
1234                 o->entry_array.next_entry_array_offset = htole64(q);
1235         }
1236
1237         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1238                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1239
1240         *idx = htole64(hidx + 1);
1241
1242         return 0;
1243 }
1244
1245 static int link_entry_into_array_plus_one(JournalFile *f,
1246                                           le64_t *extra,
1247                                           le64_t *first,
1248                                           le64_t *idx,
1249                                           uint64_t p) {
1250
1251         int r;
1252
1253         assert(f);
1254         assert(extra);
1255         assert(first);
1256         assert(idx);
1257         assert(p > 0);
1258
1259         if (*idx == 0)
1260                 *extra = htole64(p);
1261         else {
1262                 le64_t i;
1263
1264                 i = htole64(le64toh(*idx) - 1);
1265                 r = link_entry_into_array(f, first, &i, p);
1266                 if (r < 0)
1267                         return r;
1268         }
1269
1270         *idx = htole64(le64toh(*idx) + 1);
1271         return 0;
1272 }
1273
1274 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1275         uint64_t p;
1276         int r;
1277         assert(f);
1278         assert(o);
1279         assert(offset > 0);
1280
1281         p = le64toh(o->entry.items[i].object_offset);
1282         if (p == 0)
1283                 return -EINVAL;
1284
1285         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1286         if (r < 0)
1287                 return r;
1288
1289         return link_entry_into_array_plus_one(f,
1290                                               &o->data.entry_offset,
1291                                               &o->data.entry_array_offset,
1292                                               &o->data.n_entries,
1293                                               offset);
1294 }
1295
1296 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1297         uint64_t n, i;
1298         int r;
1299
1300         assert(f);
1301         assert(o);
1302         assert(offset > 0);
1303
1304         if (o->object.type != OBJECT_ENTRY)
1305                 return -EINVAL;
1306
1307         __sync_synchronize();
1308
1309         /* Link up the entry itself */
1310         r = link_entry_into_array(f,
1311                                   &f->header->entry_array_offset,
1312                                   &f->header->n_entries,
1313                                   offset);
1314         if (r < 0)
1315                 return r;
1316
1317         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1318
1319         if (f->header->head_entry_realtime == 0)
1320                 f->header->head_entry_realtime = o->entry.realtime;
1321
1322         f->header->tail_entry_realtime = o->entry.realtime;
1323         f->header->tail_entry_monotonic = o->entry.monotonic;
1324
1325         f->tail_entry_monotonic_valid = true;
1326
1327         /* Link up the items */
1328         n = journal_file_entry_n_items(o);
1329         for (i = 0; i < n; i++) {
1330                 r = journal_file_link_entry_item(f, o, offset, i);
1331                 if (r < 0)
1332                         return r;
1333         }
1334
1335         return 0;
1336 }
1337
1338 static int journal_file_append_entry_internal(
1339                 JournalFile *f,
1340                 const dual_timestamp *ts,
1341                 uint64_t xor_hash,
1342                 const EntryItem items[], unsigned n_items,
1343                 uint64_t *seqnum,
1344                 Object **ret, uint64_t *offset) {
1345         uint64_t np;
1346         uint64_t osize;
1347         Object *o;
1348         int r;
1349
1350         assert(f);
1351         assert(items || n_items == 0);
1352         assert(ts);
1353
1354         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1355
1356         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1357         if (r < 0)
1358                 return r;
1359
1360         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1361         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1362         o->entry.realtime = htole64(ts->realtime);
1363         o->entry.monotonic = htole64(ts->monotonic);
1364         o->entry.xor_hash = htole64(xor_hash);
1365         o->entry.boot_id = f->header->boot_id;
1366
1367 #ifdef HAVE_GCRYPT
1368         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1369         if (r < 0)
1370                 return r;
1371 #endif
1372
1373         r = journal_file_link_entry(f, o, np);
1374         if (r < 0)
1375                 return r;
1376
1377         if (ret)
1378                 *ret = o;
1379
1380         if (offset)
1381                 *offset = np;
1382
1383         return 0;
1384 }
1385
1386 void journal_file_post_change(JournalFile *f) {
1387         assert(f);
1388
1389         /* inotify() does not receive IN_MODIFY events from file
1390          * accesses done via mmap(). After each access we hence
1391          * trigger IN_MODIFY by truncating the journal file to its
1392          * current size which triggers IN_MODIFY. */
1393
1394         __sync_synchronize();
1395
1396         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1397                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1398 }
1399
1400 static int entry_item_cmp(const void *_a, const void *_b) {
1401         const EntryItem *a = _a, *b = _b;
1402
1403         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1404                 return -1;
1405         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1406                 return 1;
1407         return 0;
1408 }
1409
1410 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1411         unsigned i;
1412         EntryItem *items;
1413         int r;
1414         uint64_t xor_hash = 0;
1415         struct dual_timestamp _ts;
1416
1417         assert(f);
1418         assert(iovec || n_iovec == 0);
1419
1420         if (!ts) {
1421                 dual_timestamp_get(&_ts);
1422                 ts = &_ts;
1423         }
1424
1425         if (f->tail_entry_monotonic_valid &&
1426             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1427                 return -EINVAL;
1428
1429 #ifdef HAVE_GCRYPT
1430         r = journal_file_maybe_append_tag(f, ts->realtime);
1431         if (r < 0)
1432                 return r;
1433 #endif
1434
1435         /* alloca() can't take 0, hence let's allocate at least one */
1436         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1437
1438         for (i = 0; i < n_iovec; i++) {
1439                 uint64_t p;
1440                 Object *o;
1441
1442                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1443                 if (r < 0)
1444                         return r;
1445
1446                 xor_hash ^= le64toh(o->data.hash);
1447                 items[i].object_offset = htole64(p);
1448                 items[i].hash = o->data.hash;
1449         }
1450
1451         /* Order by the position on disk, in order to improve seek
1452          * times for rotating media. */
1453         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1454
1455         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1456
1457         /* If the memory mapping triggered a SIGBUS then we return an
1458          * IO error and ignore the error code passed down to us, since
1459          * it is very likely just an effect of a nullified replacement
1460          * mapping page */
1461
1462         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1463                 r = -EIO;
1464
1465         journal_file_post_change(f);
1466
1467         return r;
1468 }
1469
/* One cached position within a chain of entry arrays. The items are kept
 * in a hashmap keyed by 'first'. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1477
1478 static void chain_cache_put(
1479                 OrderedHashmap *h,
1480                 ChainCacheItem *ci,
1481                 uint64_t first,
1482                 uint64_t array,
1483                 uint64_t begin,
1484                 uint64_t total,
1485                 uint64_t last_index) {
1486
1487         if (!ci) {
1488                 /* If the chain item to cache for this chain is the
1489                  * first one it's not worth caching anything */
1490                 if (array == first)
1491                         return;
1492
1493                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1494                         ci = ordered_hashmap_steal_first(h);
1495                         assert(ci);
1496                 } else {
1497                         ci = new(ChainCacheItem, 1);
1498                         if (!ci)
1499                                 return;
1500                 }
1501
1502                 ci->first = first;
1503
1504                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1505                         free(ci);
1506                         return;
1507                 }
1508         } else
1509                 assert(ci->first == first);
1510
1511         ci->array = array;
1512         ci->begin = begin;
1513         ci->total = total;
1514         ci->last_index = last_index;
1515 }
1516
1517 static int generic_array_get(
1518                 JournalFile *f,
1519                 uint64_t first,
1520                 uint64_t i,
1521                 Object **ret, uint64_t *offset) {
1522
1523         Object *o;
1524         uint64_t p = 0, a, t = 0;
1525         int r;
1526         ChainCacheItem *ci;
1527
1528         assert(f);
1529
1530         a = first;
1531
1532         /* Try the chain cache first */
1533         ci = ordered_hashmap_get(f->chain_cache, &first);
1534         if (ci && i > ci->total) {
1535                 a = ci->array;
1536                 i -= ci->total;
1537                 t = ci->total;
1538         }
1539
1540         while (a > 0) {
1541                 uint64_t k;
1542
1543                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1544                 if (r < 0)
1545                         return r;
1546
1547                 k = journal_file_entry_array_n_items(o);
1548                 if (i < k) {
1549                         p = le64toh(o->entry_array.items[i]);
1550                         goto found;
1551                 }
1552
1553                 i -= k;
1554                 t += k;
1555                 a = le64toh(o->entry_array.next_entry_array_offset);
1556         }
1557
1558         return 0;
1559
1560 found:
1561         /* Let's cache this item for the next invocation */
1562         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1563
1564         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1565         if (r < 0)
1566                 return r;
1567
1568         if (ret)
1569                 *ret = o;
1570
1571         if (offset)
1572                 *offset = p;
1573
1574         return 1;
1575 }
1576
1577 static int generic_array_get_plus_one(
1578                 JournalFile *f,
1579                 uint64_t extra,
1580                 uint64_t first,
1581                 uint64_t i,
1582                 Object **ret, uint64_t *offset) {
1583
1584         Object *o;
1585
1586         assert(f);
1587
1588         if (i == 0) {
1589                 int r;
1590
1591                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1592                 if (r < 0)
1593                         return r;
1594
1595                 if (ret)
1596                         *ret = o;
1597
1598                 if (offset)
1599                         *offset = extra;
1600
1601                 return 1;
1602         }
1603
1604         return generic_array_get(f, first, i-1, ret, offset);
1605 }
1606
/* Non-negative results of the test_object() callbacks used for bisecting.
 * TEST_RIGHT makes the bisection continue at or before the tested item,
 * TEST_LEFT after it. TEST_FOUND is turned into one of the two by the
 * bisection code, depending on the search direction. */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2,
};
1612
1613 static int generic_array_bisect(
1614                 JournalFile *f,
1615                 uint64_t first,
1616                 uint64_t n,
1617                 uint64_t needle,
1618                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1619                 direction_t direction,
1620                 Object **ret,
1621                 uint64_t *offset,
1622                 uint64_t *idx) {
1623
1624         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1625         bool subtract_one = false;
1626         Object *o, *array = NULL;
1627         int r;
1628         ChainCacheItem *ci;
1629
1630         assert(f);
1631         assert(test_object);
1632
1633         /* Start with the first array in the chain */
1634         a = first;
1635
1636         ci = ordered_hashmap_get(f->chain_cache, &first);
1637         if (ci && n > ci->total) {
1638                 /* Ah, we have iterated this bisection array chain
1639                  * previously! Let's see if we can skip ahead in the
1640                  * chain, as far as the last time. But we can't jump
1641                  * backwards in the chain, so let's check that
1642                  * first. */
1643
1644                 r = test_object(f, ci->begin, needle);
1645                 if (r < 0)
1646                         return r;
1647
1648                 if (r == TEST_LEFT) {
1649                         /* OK, what we are looking for is right of the
1650                          * begin of this EntryArray, so let's jump
1651                          * straight to previously cached array in the
1652                          * chain */
1653
1654                         a = ci->array;
1655                         n -= ci->total;
1656                         t = ci->total;
1657                         last_index = ci->last_index;
1658                 }
1659         }
1660
1661         while (a > 0) {
1662                 uint64_t left, right, k, lp;
1663
1664                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1665                 if (r < 0)
1666                         return r;
1667
1668                 k = journal_file_entry_array_n_items(array);
1669                 right = MIN(k, n);
1670                 if (right <= 0)
1671                         return 0;
1672
1673                 i = right - 1;
1674                 lp = p = le64toh(array->entry_array.items[i]);
1675                 if (p <= 0)
1676                         return -EBADMSG;
1677
1678                 r = test_object(f, p, needle);
1679                 if (r < 0)
1680                         return r;
1681
1682                 if (r == TEST_FOUND)
1683                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1684
1685                 if (r == TEST_RIGHT) {
1686                         left = 0;
1687                         right -= 1;
1688
1689                         if (last_index != (uint64_t) -1) {
1690                                 assert(last_index <= right);
1691
1692                                 /* If we cached the last index we
1693                                  * looked at, let's try to not to jump
1694                                  * too wildly around and see if we can
1695                                  * limit the range to look at early to
1696                                  * the immediate neighbors of the last
1697                                  * index we looked at. */
1698
1699                                 if (last_index > 0) {
1700                                         uint64_t x = last_index - 1;
1701
1702                                         p = le64toh(array->entry_array.items[x]);
1703                                         if (p <= 0)
1704                                                 return -EBADMSG;
1705
1706                                         r = test_object(f, p, needle);
1707                                         if (r < 0)
1708                                                 return r;
1709
1710                                         if (r == TEST_FOUND)
1711                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1712
1713                                         if (r == TEST_RIGHT)
1714                                                 right = x;
1715                                         else
1716                                                 left = x + 1;
1717                                 }
1718
1719                                 if (last_index < right) {
1720                                         uint64_t y = last_index + 1;
1721
1722                                         p = le64toh(array->entry_array.items[y]);
1723                                         if (p <= 0)
1724                                                 return -EBADMSG;
1725
1726                                         r = test_object(f, p, needle);
1727                                         if (r < 0)
1728                                                 return r;
1729
1730                                         if (r == TEST_FOUND)
1731                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1732
1733                                         if (r == TEST_RIGHT)
1734                                                 right = y;
1735                                         else
1736                                                 left = y + 1;
1737                                 }
1738                         }
1739
1740                         for (;;) {
1741                                 if (left == right) {
1742                                         if (direction == DIRECTION_UP)
1743                                                 subtract_one = true;
1744
1745                                         i = left;
1746                                         goto found;
1747                                 }
1748
1749                                 assert(left < right);
1750                                 i = (left + right) / 2;
1751
1752                                 p = le64toh(array->entry_array.items[i]);
1753                                 if (p <= 0)
1754                                         return -EBADMSG;
1755
1756                                 r = test_object(f, p, needle);
1757                                 if (r < 0)
1758                                         return r;
1759
1760                                 if (r == TEST_FOUND)
1761                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1762
1763                                 if (r == TEST_RIGHT)
1764                                         right = i;
1765                                 else
1766                                         left = i + 1;
1767                         }
1768                 }
1769
1770                 if (k >= n) {
1771                         if (direction == DIRECTION_UP) {
1772                                 i = n;
1773                                 subtract_one = true;
1774                                 goto found;
1775                         }
1776
1777                         return 0;
1778                 }
1779
1780                 last_p = lp;
1781
1782                 n -= k;
1783                 t += k;
1784                 last_index = (uint64_t) -1;
1785                 a = le64toh(array->entry_array.next_entry_array_offset);
1786         }
1787
1788         return 0;
1789
1790 found:
1791         if (subtract_one && t == 0 && i == 0)
1792                 return 0;
1793
1794         /* Let's cache this item for the next invocation */
1795         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1796
1797         if (subtract_one && i == 0)
1798                 p = last_p;
1799         else if (subtract_one)
1800                 p = le64toh(array->entry_array.items[i-1]);
1801         else
1802                 p = le64toh(array->entry_array.items[i]);
1803
1804         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1805         if (r < 0)
1806                 return r;
1807
1808         if (ret)
1809                 *ret = o;
1810
1811         if (offset)
1812                 *offset = p;
1813
1814         if (idx)
1815                 *idx = t + i + (subtract_one ? -1 : 0);
1816
1817         return 1;
1818 }
1819
1820 static int generic_array_bisect_plus_one(
1821                 JournalFile *f,
1822                 uint64_t extra,
1823                 uint64_t first,
1824                 uint64_t n,
1825                 uint64_t needle,
1826                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1827                 direction_t direction,
1828                 Object **ret,
1829                 uint64_t *offset,
1830                 uint64_t *idx) {
1831
1832         int r;
1833         bool step_back = false;
1834         Object *o;
1835
1836         assert(f);
1837         assert(test_object);
1838
1839         if (n <= 0)
1840                 return 0;
1841
1842         /* This bisects the array in object 'first', but first checks
1843          * an extra  */
1844         r = test_object(f, extra, needle);
1845         if (r < 0)
1846                 return r;
1847
1848         if (r == TEST_FOUND)
1849                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1850
1851         /* if we are looking with DIRECTION_UP then we need to first
1852            see if in the actual array there is a matching entry, and
1853            return the last one of that. But if there isn't any we need
1854            to return this one. Hence remember this, and return it
1855            below. */
1856         if (r == TEST_LEFT)
1857                 step_back = direction == DIRECTION_UP;
1858
1859         if (r == TEST_RIGHT) {
1860                 if (direction == DIRECTION_DOWN)
1861                         goto found;
1862                 else
1863                         return 0;
1864         }
1865
1866         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1867
1868         if (r == 0 && step_back)
1869                 goto found;
1870
1871         if (r > 0 && idx)
1872                 (*idx) ++;
1873
1874         return r;
1875
1876 found:
1877         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1878         if (r < 0)
1879                 return r;
1880
1881         if (ret)
1882                 *ret = o;
1883
1884         if (offset)
1885                 *offset = extra;
1886
1887         if (idx)
1888                 *idx = 0;
1889
1890         return 1;
1891 }
1892
1893 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1894         assert(f);
1895         assert(p > 0);
1896
1897         if (p == needle)
1898                 return TEST_FOUND;
1899         else if (p < needle)
1900                 return TEST_LEFT;
1901         else
1902                 return TEST_RIGHT;
1903 }
1904
1905 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1906         Object *o;
1907         int r;
1908
1909         assert(f);
1910         assert(p > 0);
1911
1912         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1913         if (r < 0)
1914                 return r;
1915
1916         if (le64toh(o->entry.seqnum) == needle)
1917                 return TEST_FOUND;
1918         else if (le64toh(o->entry.seqnum) < needle)
1919                 return TEST_LEFT;
1920         else
1921                 return TEST_RIGHT;
1922 }
1923
1924 int journal_file_move_to_entry_by_seqnum(
1925                 JournalFile *f,
1926                 uint64_t seqnum,
1927                 direction_t direction,
1928                 Object **ret,
1929                 uint64_t *offset) {
1930
1931         return generic_array_bisect(f,
1932                                     le64toh(f->header->entry_array_offset),
1933                                     le64toh(f->header->n_entries),
1934                                     seqnum,
1935                                     test_object_seqnum,
1936                                     direction,
1937                                     ret, offset, NULL);
1938 }
1939
1940 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1941         Object *o;
1942         int r;
1943
1944         assert(f);
1945         assert(p > 0);
1946
1947         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1948         if (r < 0)
1949                 return r;
1950
1951         if (le64toh(o->entry.realtime) == needle)
1952                 return TEST_FOUND;
1953         else if (le64toh(o->entry.realtime) < needle)
1954                 return TEST_LEFT;
1955         else
1956                 return TEST_RIGHT;
1957 }
1958
1959 int journal_file_move_to_entry_by_realtime(
1960                 JournalFile *f,
1961                 uint64_t realtime,
1962                 direction_t direction,
1963                 Object **ret,
1964                 uint64_t *offset) {
1965
1966         return generic_array_bisect(f,
1967                                     le64toh(f->header->entry_array_offset),
1968                                     le64toh(f->header->n_entries),
1969                                     realtime,
1970                                     test_object_realtime,
1971                                     direction,
1972                                     ret, offset, NULL);
1973 }
1974
1975 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1976         Object *o;
1977         int r;
1978
1979         assert(f);
1980         assert(p > 0);
1981
1982         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1983         if (r < 0)
1984                 return r;
1985
1986         if (le64toh(o->entry.monotonic) == needle)
1987                 return TEST_FOUND;
1988         else if (le64toh(o->entry.monotonic) < needle)
1989                 return TEST_LEFT;
1990         else
1991                 return TEST_RIGHT;
1992 }
1993
1994 static int find_data_object_by_boot_id(
1995                 JournalFile *f,
1996                 sd_id128_t boot_id,
1997                 Object **o,
1998                 uint64_t *b) {
1999
2000         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2001
2002         sd_id128_to_string(boot_id, t + 9);
2003         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2004 }
2005
2006 int journal_file_move_to_entry_by_monotonic(
2007                 JournalFile *f,
2008                 sd_id128_t boot_id,
2009                 uint64_t monotonic,
2010                 direction_t direction,
2011                 Object **ret,
2012                 uint64_t *offset) {
2013
2014         Object *o;
2015         int r;
2016
2017         assert(f);
2018
2019         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2020         if (r < 0)
2021                 return r;
2022         if (r == 0)
2023                 return -ENOENT;
2024
2025         return generic_array_bisect_plus_one(f,
2026                                              le64toh(o->data.entry_offset),
2027                                              le64toh(o->data.entry_array_offset),
2028                                              le64toh(o->data.n_entries),
2029                                              monotonic,
2030                                              test_object_monotonic,
2031                                              direction,
2032                                              ret, offset, NULL);
2033 }
2034
2035 void journal_file_reset_location(JournalFile *f) {
2036         f->location_type = LOCATION_HEAD;
2037         f->current_offset = 0;
2038         f->current_seqnum = 0;
2039         f->current_realtime = 0;
2040         f->current_monotonic = 0;
2041         zero(f->current_boot_id);
2042         f->current_xor_hash = 0;
2043 }
2044
2045 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2046         f->location_type = LOCATION_SEEK;
2047         f->current_offset = offset;
2048         f->current_seqnum = le64toh(o->entry.seqnum);
2049         f->current_realtime = le64toh(o->entry.realtime);
2050         f->current_monotonic = le64toh(o->entry.monotonic);
2051         f->current_boot_id = o->entry.boot_id;
2052         f->current_xor_hash = le64toh(o->entry.xor_hash);
2053 }
2054
2055 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2056         assert(af);
2057         assert(bf);
2058         assert(af->location_type == LOCATION_SEEK);
2059         assert(bf->location_type == LOCATION_SEEK);
2060
2061         /* If contents and timestamps match, these entries are
2062          * identical, even if the seqnum does not match */
2063         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2064             af->current_monotonic == bf->current_monotonic &&
2065             af->current_realtime == bf->current_realtime &&
2066             af->current_xor_hash == bf->current_xor_hash)
2067                 return 0;
2068
2069         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2070
2071                 /* If this is from the same seqnum source, compare
2072                  * seqnums */
2073                 if (af->current_seqnum < bf->current_seqnum)
2074                         return -1;
2075                 if (af->current_seqnum > bf->current_seqnum)
2076                         return 1;
2077
2078                 /* Wow! This is weird, different data but the same
2079                  * seqnums? Something is borked, but let's make the
2080                  * best of it and compare by time. */
2081         }
2082
2083         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2084
2085                 /* If the boot id matches, compare monotonic time */
2086                 if (af->current_monotonic < bf->current_monotonic)
2087                         return -1;
2088                 if (af->current_monotonic > bf->current_monotonic)
2089                         return 1;
2090         }
2091
2092         /* Otherwise, compare UTC time */
2093         if (af->current_realtime < bf->current_realtime)
2094                 return -1;
2095         if (af->current_realtime > bf->current_realtime)
2096                 return 1;
2097
2098         /* Finally, compare by contents */
2099         if (af->current_xor_hash < bf->current_xor_hash)
2100                 return -1;
2101         if (af->current_xor_hash > bf->current_xor_hash)
2102                 return 1;
2103
2104         return 0;
2105 }
2106
2107 int journal_file_next_entry(
2108                 JournalFile *f,
2109                 uint64_t p,
2110                 direction_t direction,
2111                 Object **ret, uint64_t *offset) {
2112
2113         uint64_t i, n, ofs;
2114         int r;
2115
2116         assert(f);
2117
2118         n = le64toh(f->header->n_entries);
2119         if (n <= 0)
2120                 return 0;
2121
2122         if (p == 0)
2123                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2124         else {
2125                 r = generic_array_bisect(f,
2126                                          le64toh(f->header->entry_array_offset),
2127                                          le64toh(f->header->n_entries),
2128                                          p,
2129                                          test_object_offset,
2130                                          DIRECTION_DOWN,
2131                                          NULL, NULL,
2132                                          &i);
2133                 if (r <= 0)
2134                         return r;
2135
2136                 if (direction == DIRECTION_DOWN) {
2137                         if (i >= n - 1)
2138                                 return 0;
2139
2140                         i++;
2141                 } else {
2142                         if (i <= 0)
2143                                 return 0;
2144
2145                         i--;
2146                 }
2147         }
2148
2149         /* And jump to it */
2150         r = generic_array_get(f,
2151                               le64toh(f->header->entry_array_offset),
2152                               i,
2153                               ret, &ofs);
2154         if (r <= 0)
2155                 return r;
2156
2157         if (p > 0 &&
2158             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2159                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2160                           f->path, i);
2161                 return -EBADMSG;
2162         }
2163
2164         if (offset)
2165                 *offset = ofs;
2166
2167         return 1;
2168 }
2169
2170 int journal_file_next_entry_for_data(
2171                 JournalFile *f,
2172                 Object *o, uint64_t p,
2173                 uint64_t data_offset,
2174                 direction_t direction,
2175                 Object **ret, uint64_t *offset) {
2176
2177         uint64_t n, i;
2178         int r;
2179         Object *d;
2180
2181         assert(f);
2182         assert(p > 0 || !o);
2183
2184         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2185         if (r < 0)
2186                 return r;
2187
2188         n = le64toh(d->data.n_entries);
2189         if (n <= 0)
2190                 return n;
2191
2192         if (!o)
2193                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2194         else {
2195                 if (o->object.type != OBJECT_ENTRY)
2196                         return -EINVAL;
2197
2198                 r = generic_array_bisect_plus_one(f,
2199                                                   le64toh(d->data.entry_offset),
2200                                                   le64toh(d->data.entry_array_offset),
2201                                                   le64toh(d->data.n_entries),
2202                                                   p,
2203                                                   test_object_offset,
2204                                                   DIRECTION_DOWN,
2205                                                   NULL, NULL,
2206                                                   &i);
2207
2208                 if (r <= 0)
2209                         return r;
2210
2211                 if (direction == DIRECTION_DOWN) {
2212                         if (i >= n - 1)
2213                                 return 0;
2214
2215                         i++;
2216                 } else {
2217                         if (i <= 0)
2218                                 return 0;
2219
2220                         i--;
2221                 }
2222
2223         }
2224
2225         return generic_array_get_plus_one(f,
2226                                           le64toh(d->data.entry_offset),
2227                                           le64toh(d->data.entry_array_offset),
2228                                           i,
2229                                           ret, offset);
2230 }
2231
2232 int journal_file_move_to_entry_by_offset_for_data(
2233                 JournalFile *f,
2234                 uint64_t data_offset,
2235                 uint64_t p,
2236                 direction_t direction,
2237                 Object **ret, uint64_t *offset) {
2238
2239         int r;
2240         Object *d;
2241
2242         assert(f);
2243
2244         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2245         if (r < 0)
2246                 return r;
2247
2248         return generic_array_bisect_plus_one(f,
2249                                              le64toh(d->data.entry_offset),
2250                                              le64toh(d->data.entry_array_offset),
2251                                              le64toh(d->data.n_entries),
2252                                              p,
2253                                              test_object_offset,
2254                                              direction,
2255                                              ret, offset, NULL);
2256 }
2257
2258 int journal_file_move_to_entry_by_monotonic_for_data(
2259                 JournalFile *f,
2260                 uint64_t data_offset,
2261                 sd_id128_t boot_id,
2262                 uint64_t monotonic,
2263                 direction_t direction,
2264                 Object **ret, uint64_t *offset) {
2265
2266         Object *o, *d;
2267         int r;
2268         uint64_t b, z;
2269
2270         assert(f);
2271
2272         /* First, seek by time */
2273         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2274         if (r < 0)
2275                 return r;
2276         if (r == 0)
2277                 return -ENOENT;
2278
2279         r = generic_array_bisect_plus_one(f,
2280                                           le64toh(o->data.entry_offset),
2281                                           le64toh(o->data.entry_array_offset),
2282                                           le64toh(o->data.n_entries),
2283                                           monotonic,
2284                                           test_object_monotonic,
2285                                           direction,
2286                                           NULL, &z, NULL);
2287         if (r <= 0)
2288                 return r;
2289
2290         /* And now, continue seeking until we find an entry that
2291          * exists in both bisection arrays */
2292
2293         for (;;) {
2294                 Object *qo;
2295                 uint64_t p, q;
2296
2297                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2298                 if (r < 0)
2299                         return r;
2300
2301                 r = generic_array_bisect_plus_one(f,
2302                                                   le64toh(d->data.entry_offset),
2303                                                   le64toh(d->data.entry_array_offset),
2304                                                   le64toh(d->data.n_entries),
2305                                                   z,
2306                                                   test_object_offset,
2307                                                   direction,
2308                                                   NULL, &p, NULL);
2309                 if (r <= 0)
2310                         return r;
2311
2312                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2313                 if (r < 0)
2314                         return r;
2315
2316                 r = generic_array_bisect_plus_one(f,
2317                                                   le64toh(o->data.entry_offset),
2318                                                   le64toh(o->data.entry_array_offset),
2319                                                   le64toh(o->data.n_entries),
2320                                                   p,
2321                                                   test_object_offset,
2322                                                   direction,
2323                                                   &qo, &q, NULL);
2324
2325                 if (r <= 0)
2326                         return r;
2327
2328                 if (p == q) {
2329                         if (ret)
2330                                 *ret = qo;
2331                         if (offset)
2332                                 *offset = q;
2333
2334                         return 1;
2335                 }
2336
2337                 z = q;
2338         }
2339 }
2340
2341 int journal_file_move_to_entry_by_seqnum_for_data(
2342                 JournalFile *f,
2343                 uint64_t data_offset,
2344                 uint64_t seqnum,
2345                 direction_t direction,
2346                 Object **ret, uint64_t *offset) {
2347
2348         Object *d;
2349         int r;
2350
2351         assert(f);
2352
2353         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2354         if (r < 0)
2355                 return r;
2356
2357         return generic_array_bisect_plus_one(f,
2358                                              le64toh(d->data.entry_offset),
2359                                              le64toh(d->data.entry_array_offset),
2360                                              le64toh(d->data.n_entries),
2361                                              seqnum,
2362                                              test_object_seqnum,
2363                                              direction,
2364                                              ret, offset, NULL);
2365 }
2366
2367 int journal_file_move_to_entry_by_realtime_for_data(
2368                 JournalFile *f,
2369                 uint64_t data_offset,
2370                 uint64_t realtime,
2371                 direction_t direction,
2372                 Object **ret, uint64_t *offset) {
2373
2374         Object *d;
2375         int r;
2376
2377         assert(f);
2378
2379         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2380         if (r < 0)
2381                 return r;
2382
2383         return generic_array_bisect_plus_one(f,
2384                                              le64toh(d->data.entry_offset),
2385                                              le64toh(d->data.entry_array_offset),
2386                                              le64toh(d->data.n_entries),
2387                                              realtime,
2388                                              test_object_realtime,
2389                                              direction,
2390                                              ret, offset, NULL);
2391 }
2392
2393 void journal_file_dump(JournalFile *f) {
2394         Object *o;
2395         int r;
2396         uint64_t p;
2397
2398         assert(f);
2399
2400         journal_file_print_header(f);
2401
2402         p = le64toh(f->header->header_size);
2403         while (p != 0) {
2404                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2405                 if (r < 0)
2406                         goto fail;
2407
2408                 switch (o->object.type) {
2409
2410                 case OBJECT_UNUSED:
2411                         printf("Type: OBJECT_UNUSED\n");
2412                         break;
2413
2414                 case OBJECT_DATA:
2415                         printf("Type: OBJECT_DATA\n");
2416                         break;
2417
2418                 case OBJECT_FIELD:
2419                         printf("Type: OBJECT_FIELD\n");
2420                         break;
2421
2422                 case OBJECT_ENTRY:
2423                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2424                                le64toh(o->entry.seqnum),
2425                                le64toh(o->entry.monotonic),
2426                                le64toh(o->entry.realtime));
2427                         break;
2428
2429                 case OBJECT_FIELD_HASH_TABLE:
2430                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2431                         break;
2432
2433                 case OBJECT_DATA_HASH_TABLE:
2434                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2435                         break;
2436
2437                 case OBJECT_ENTRY_ARRAY:
2438                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2439                         break;
2440
2441                 case OBJECT_TAG:
2442                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2443                                le64toh(o->tag.seqnum),
2444                                le64toh(o->tag.epoch));
2445                         break;
2446
2447                 default:
2448                         printf("Type: unknown (%i)\n", o->object.type);
2449                         break;
2450                 }
2451
2452                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2453                         printf("Flags: %s\n",
2454                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2455
2456                 if (p == le64toh(f->header->tail_object_offset))
2457                         p = 0;
2458                 else
2459                         p = p + ALIGN64(le64toh(o->object.size));
2460         }
2461
2462         return;
2463 fail:
2464         log_error("File corrupt");
2465 }
2466
2467 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2468         const char *x;
2469
2470         x = format_timestamp(buf, l, t);
2471         if (x)
2472                 return x;
2473         return " --- ";
2474 }
2475
2476 void journal_file_print_header(JournalFile *f) {
2477         char a[33], b[33], c[33], d[33];
2478         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2479         struct stat st;
2480         char bytes[FORMAT_BYTES_MAX];
2481
2482         assert(f);
2483
2484         printf("File Path: %s\n"
2485                "File ID: %s\n"
2486                "Machine ID: %s\n"
2487                "Boot ID: %s\n"
2488                "Sequential Number ID: %s\n"
2489                "State: %s\n"
2490                "Compatible Flags:%s%s\n"
2491                "Incompatible Flags:%s%s%s\n"
2492                "Header size: %"PRIu64"\n"
2493                "Arena size: %"PRIu64"\n"
2494                "Data Hash Table Size: %"PRIu64"\n"
2495                "Field Hash Table Size: %"PRIu64"\n"
2496                "Rotate Suggested: %s\n"
2497                "Head Sequential Number: %"PRIu64"\n"
2498                "Tail Sequential Number: %"PRIu64"\n"
2499                "Head Realtime Timestamp: %s\n"
2500                "Tail Realtime Timestamp: %s\n"
2501                "Tail Monotonic Timestamp: %s\n"
2502                "Objects: %"PRIu64"\n"
2503                "Entry Objects: %"PRIu64"\n",
2504                f->path,
2505                sd_id128_to_string(f->header->file_id, a),
2506                sd_id128_to_string(f->header->machine_id, b),
2507                sd_id128_to_string(f->header->boot_id, c),
2508                sd_id128_to_string(f->header->seqnum_id, d),
2509                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2510                f->header->state == STATE_ONLINE ? "ONLINE" :
2511                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2512                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2513                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2514                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2515                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2516                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2517                le64toh(f->header->header_size),
2518                le64toh(f->header->arena_size),
2519                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2520                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2521                yes_no(journal_file_rotate_suggested(f, 0)),
2522                le64toh(f->header->head_entry_seqnum),
2523                le64toh(f->header->tail_entry_seqnum),
2524                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2525                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2526                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2527                le64toh(f->header->n_objects),
2528                le64toh(f->header->n_entries));
2529
2530         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2531                 printf("Data Objects: %"PRIu64"\n"
2532                        "Data Hash Table Fill: %.1f%%\n",
2533                        le64toh(f->header->n_data),
2534                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2535
2536         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2537                 printf("Field Objects: %"PRIu64"\n"
2538                        "Field Hash Table Fill: %.1f%%\n",
2539                        le64toh(f->header->n_fields),
2540                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2541
2542         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2543                 printf("Tag Objects: %"PRIu64"\n",
2544                        le64toh(f->header->n_tags));
2545         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2546                 printf("Entry Array Objects: %"PRIu64"\n",
2547                        le64toh(f->header->n_entry_arrays));
2548
2549         if (fstat(f->fd, &st) >= 0)
2550                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2551 }
2552
2553 static int journal_file_warn_btrfs(JournalFile *f) {
2554         unsigned attrs;
2555         int r;
2556
2557         assert(f);
2558
2559         /* Before we write anything, check if the COW logic is turned
2560          * off on btrfs. Given our write pattern that is quite
2561          * unfriendly to COW file systems this should greatly improve
2562          * performance on COW file systems, such as btrfs, at the
2563          * expense of data integrity features (which shouldn't be too
2564          * bad, given that we do our own checksumming). */
2565
2566         r = btrfs_is_filesystem(f->fd);
2567         if (r < 0)
2568                 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2569         if (!r)
2570                 return 0;
2571
2572         r = read_attr_fd(f->fd, &attrs);
2573         if (r < 0)
2574                 return log_warning_errno(r, "Failed to read file attributes: %m");
2575
2576         if (attrs & FS_NOCOW_FL) {
2577                 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2578                 return 0;
2579         }
2580
2581         log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2582                    "This is likely to slow down journal access substantially, please consider turning "
2583                    "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2584
2585         return 1;
2586 }
2587
2588 int journal_file_open(
2589                 const char *fname,
2590                 int flags,
2591                 mode_t mode,
2592                 bool compress,
2593                 bool seal,
2594                 JournalMetrics *metrics,
2595                 MMapCache *mmap_cache,
2596                 JournalFile *template,
2597                 JournalFile **ret) {
2598
2599         bool newly_created = false;
2600         JournalFile *f;
2601         void *h;
2602         int r;
2603
2604         assert(fname);
2605         assert(ret);
2606
2607         if ((flags & O_ACCMODE) != O_RDONLY &&
2608             (flags & O_ACCMODE) != O_RDWR)
2609                 return -EINVAL;
2610
2611         if (!endswith(fname, ".journal") &&
2612             !endswith(fname, ".journal~"))
2613                 return -EINVAL;
2614
2615         f = new0(JournalFile, 1);
2616         if (!f)
2617                 return -ENOMEM;
2618
2619         f->fd = -1;
2620         f->mode = mode;
2621
2622         f->flags = flags;
2623         f->prot = prot_from_flags(flags);
2624         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2625 #if defined(HAVE_LZ4)
2626         f->compress_lz4 = compress;
2627 #elif defined(HAVE_XZ)
2628         f->compress_xz = compress;
2629 #endif
2630 #ifdef HAVE_GCRYPT
2631         f->seal = seal;
2632 #endif
2633
2634         if (mmap_cache)
2635                 f->mmap = mmap_cache_ref(mmap_cache);
2636         else {
2637                 f->mmap = mmap_cache_new();
2638                 if (!f->mmap) {
2639                         r = -ENOMEM;
2640                         goto fail;
2641                 }
2642         }
2643
2644         f->path = strdup(fname);
2645         if (!f->path) {
2646                 r = -ENOMEM;
2647                 goto fail;
2648         }
2649
2650         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2651         if (!f->chain_cache) {
2652                 r = -ENOMEM;
2653                 goto fail;
2654         }
2655
2656         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2657         if (f->fd < 0) {
2658                 r = -errno;
2659                 goto fail;
2660         }
2661
2662         r = journal_file_fstat(f);
2663         if (r < 0)
2664                 goto fail;
2665
2666         if (f->last_stat.st_size == 0 && f->writable) {
2667
2668                 (void) journal_file_warn_btrfs(f);
2669
2670                 /* Let's attach the creation time to the journal file,
2671                  * so that the vacuuming code knows the age of this
2672                  * file even if the file might end up corrupted one
2673                  * day... Ideally we'd just use the creation time many
2674                  * file systems maintain for each file, but there is
2675                  * currently no usable API to query this, hence let's
2676                  * emulate this via extended attributes. If extended
2677                  * attributes are not supported we'll just skip this,
2678                  * and rely solely on mtime/atime/ctime of the file. */
2679
2680                 fd_setcrtime(f->fd, 0);
2681
2682 #ifdef HAVE_GCRYPT
2683                 /* Try to load the FSPRG state, and if we can't, then
2684                  * just don't do sealing */
2685                 if (f->seal) {
2686                         r = journal_file_fss_load(f);
2687                         if (r < 0)
2688                                 f->seal = false;
2689                 }
2690 #endif
2691
2692                 r = journal_file_init_header(f, template);
2693                 if (r < 0)
2694                         goto fail;
2695
2696                 r = journal_file_fstat(f);
2697                 if (r < 0)
2698                         goto fail;
2699
2700                 newly_created = true;
2701         }
2702
2703         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2704                 r = -EIO;
2705                 goto fail;
2706         }
2707
2708         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2709         if (r < 0)
2710                 goto fail;
2711
2712         f->header = h;
2713
2714         if (!newly_created) {
2715                 r = journal_file_verify_header(f);
2716                 if (r < 0)
2717                         goto fail;
2718         }
2719
2720 #ifdef HAVE_GCRYPT
2721         if (!newly_created && f->writable) {
2722                 r = journal_file_fss_load(f);
2723                 if (r < 0)
2724                         goto fail;
2725         }
2726 #endif
2727
2728         if (f->writable) {
2729                 if (metrics) {
2730                         journal_default_metrics(metrics, f->fd);
2731                         f->metrics = *metrics;
2732                 } else if (template)
2733                         f->metrics = template->metrics;
2734
2735                 r = journal_file_refresh_header(f);
2736                 if (r < 0)
2737                         goto fail;
2738         }
2739
2740 #ifdef HAVE_GCRYPT
2741         r = journal_file_hmac_setup(f);
2742         if (r < 0)
2743                 goto fail;
2744 #endif
2745
2746         if (newly_created) {
2747                 r = journal_file_setup_field_hash_table(f);
2748                 if (r < 0)
2749                         goto fail;
2750
2751                 r = journal_file_setup_data_hash_table(f);
2752                 if (r < 0)
2753                         goto fail;
2754
2755 #ifdef HAVE_GCRYPT
2756                 r = journal_file_append_first_tag(f);
2757                 if (r < 0)
2758                         goto fail;
2759 #endif
2760         }
2761
2762         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2763                 r = -EIO;
2764                 goto fail;
2765         }
2766
2767         *ret = f;
2768         return 0;
2769
2770 fail:
2771         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2772                 r = -EIO;
2773
2774         journal_file_close(f);
2775
2776         return r;
2777 }
2778
2779 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2780         _cleanup_free_ char *p = NULL;
2781         size_t l;
2782         JournalFile *old_file, *new_file = NULL;
2783         int r;
2784
2785         assert(f);
2786         assert(*f);
2787
2788         old_file = *f;
2789
2790         if (!old_file->writable)
2791                 return -EINVAL;
2792
2793         if (!endswith(old_file->path, ".journal"))
2794                 return -EINVAL;
2795
2796         l = strlen(old_file->path);
2797         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2798                      (int) l - 8, old_file->path,
2799                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2800                      le64toh((*f)->header->head_entry_seqnum),
2801                      le64toh((*f)->header->head_entry_realtime));
2802         if (r < 0)
2803                 return -ENOMEM;
2804
2805         /* Try to rename the file to the archived version. If the file
2806          * already was deleted, we'll get ENOENT, let's ignore that
2807          * case. */
2808         r = rename(old_file->path, p);
2809         if (r < 0 && errno != ENOENT)
2810                 return -errno;
2811
2812         old_file->header->state = STATE_ARCHIVED;
2813
2814         /* Currently, btrfs is not very good with out write patterns
2815          * and fragments heavily. Let's defrag our journal files when
2816          * we archive them */
2817         old_file->defrag_on_close = true;
2818
2819         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2820         journal_file_close(old_file);
2821
2822         *f = new_file;
2823         return r;
2824 }
2825
2826 int journal_file_open_reliably(
2827                 const char *fname,
2828                 int flags,
2829                 mode_t mode,
2830                 bool compress,
2831                 bool seal,
2832                 JournalMetrics *metrics,
2833                 MMapCache *mmap_cache,
2834                 JournalFile *template,
2835                 JournalFile **ret) {
2836
2837         int r;
2838         size_t l;
2839         _cleanup_free_ char *p = NULL;
2840
2841         r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2842         if (!IN_SET(r,
2843                     -EBADMSG,           /* corrupted */
2844                     -ENODATA,           /* truncated */
2845                     -EHOSTDOWN,         /* other machine */
2846                     -EPROTONOSUPPORT,   /* incompatible feature */
2847                     -EBUSY,             /* unclean shutdown */
2848                     -ESHUTDOWN,         /* already archived */
2849                     -EIO,               /* IO error, including SIGBUS on mmap */
2850                     -EIDRM              /* File has been deleted */))
2851                 return r;
2852
2853         if ((flags & O_ACCMODE) == O_RDONLY)
2854                 return r;
2855
2856         if (!(flags & O_CREAT))
2857                 return r;
2858
2859         if (!endswith(fname, ".journal"))
2860                 return r;
2861
2862         /* The file is corrupted. Rotate it away and try it again (but only once) */
2863
2864         l = strlen(fname);
2865         if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2866                      (int) l - 8, fname,
2867                      now(CLOCK_REALTIME),
2868                      random_u64()) < 0)
2869                 return -ENOMEM;
2870
2871         if (rename(fname, p) < 0)
2872                 return -errno;
2873
2874         /* btrfs doesn't cope well with our write pattern and
2875          * fragments heavily. Let's defrag all files we rotate */
2876
2877         (void) chattr_path(p, false, FS_NOCOW_FL);
2878         (void) btrfs_defrag(p);
2879
2880         log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2881
2882         return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2883 }
2884
2885 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2886         uint64_t i, n;
2887         uint64_t q, xor_hash = 0;
2888         int r;
2889         EntryItem *items;
2890         dual_timestamp ts;
2891
2892         assert(from);
2893         assert(to);
2894         assert(o);
2895         assert(p);
2896
2897         if (!to->writable)
2898                 return -EPERM;
2899
2900         ts.monotonic = le64toh(o->entry.monotonic);
2901         ts.realtime = le64toh(o->entry.realtime);
2902
2903         n = journal_file_entry_n_items(o);
2904         /* alloca() can't take 0, hence let's allocate at least one */
2905         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2906
2907         for (i = 0; i < n; i++) {
2908                 uint64_t l, h;
2909                 le64_t le_hash;
2910                 size_t t;
2911                 void *data;
2912                 Object *u;
2913
2914                 q = le64toh(o->entry.items[i].object_offset);
2915                 le_hash = o->entry.items[i].hash;
2916
2917                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2918                 if (r < 0)
2919                         return r;
2920
2921                 if (le_hash != o->data.hash)
2922                         return -EBADMSG;
2923
2924                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2925                 t = (size_t) l;
2926
2927                 /* We hit the limit on 32bit machines */
2928                 if ((uint64_t) t != l)
2929                         return -E2BIG;
2930
2931                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2932 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2933                         size_t rsize = 0;
2934
2935                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2936                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2937                         if (r < 0)
2938                                 return r;
2939
2940                         data = from->compress_buffer;
2941                         l = rsize;
2942 #else
2943                         return -EPROTONOSUPPORT;
2944 #endif
2945                 } else
2946                         data = o->data.payload;
2947
2948                 r = journal_file_append_data(to, data, l, &u, &h);
2949                 if (r < 0)
2950                         return r;
2951
2952                 xor_hash ^= le64toh(u->data.hash);
2953                 items[i].object_offset = htole64(h);
2954                 items[i].hash = u->data.hash;
2955
2956                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2957                 if (r < 0)
2958                         return r;
2959         }
2960
2961         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2962
2963         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2964                 return -EIO;
2965
2966         return r;
2967 }
2968
2969 void journal_reset_metrics(JournalMetrics *m) {
2970         assert(m);
2971
2972         /* Set everything to "pick automatic values". */
2973
2974         *m = (JournalMetrics) {
2975                 .min_use = (uint64_t) -1,
2976                 .max_use = (uint64_t) -1,
2977                 .min_size = (uint64_t) -1,
2978                 .max_size = (uint64_t) -1,
2979                 .keep_free = (uint64_t) -1,
2980                 .n_max_files = (uint64_t) -1,
2981         };
2982 }
2983
2984 void journal_default_metrics(JournalMetrics *m, int fd) {
2985         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
2986         struct statvfs ss;
2987         uint64_t fs_size;
2988
2989         assert(m);
2990         assert(fd >= 0);
2991
2992         if (fstatvfs(fd, &ss) >= 0)
2993                 fs_size = ss.f_frsize * ss.f_blocks;
2994         else {
2995                 log_debug_errno(errno, "Failed to detremine disk size: %m");
2996                 fs_size = 0;
2997         }
2998
2999         if (m->max_use == (uint64_t) -1) {
3000
3001                 if (fs_size > 0) {
3002                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3003
3004                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
3005                                 m->max_use = DEFAULT_MAX_USE_UPPER;
3006
3007                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
3008                                 m->max_use = DEFAULT_MAX_USE_LOWER;
3009                 } else
3010                         m->max_use = DEFAULT_MAX_USE_LOWER;
3011         } else {
3012                 m->max_use = PAGE_ALIGN(m->max_use);
3013
3014                 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3015                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3016         }
3017
3018         if (m->min_use == (uint64_t) -1)
3019                 m->min_use = DEFAULT_MIN_USE;
3020
3021         if (m->min_use > m->max_use)
3022                 m->min_use = m->max_use;
3023
3024         if (m->max_size == (uint64_t) -1) {
3025                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3026
3027                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3028                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
3029         } else
3030                 m->max_size = PAGE_ALIGN(m->max_size);
3031
3032         if (m->max_size != 0) {
3033                 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3034                         m->max_size = JOURNAL_FILE_SIZE_MIN;
3035
3036                 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3037                         m->max_use = m->max_size*2;
3038         }
3039
3040         if (m->min_size == (uint64_t) -1)
3041                 m->min_size = JOURNAL_FILE_SIZE_MIN;
3042         else {
3043                 m->min_size = PAGE_ALIGN(m->min_size);
3044
3045                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3046                         m->min_size = JOURNAL_FILE_SIZE_MIN;
3047
3048                 if (m->max_size != 0 && m->min_size > m->max_size)
3049                         m->max_size = m->min_size;
3050         }
3051
3052         if (m->keep_free == (uint64_t) -1) {
3053
3054                 if (fs_size > 0) {
3055                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3056
3057                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3058                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3059
3060                 } else
3061                         m->keep_free = DEFAULT_KEEP_FREE;
3062         }
3063
3064         if (m->n_max_files == (uint64_t) -1)
3065                 m->n_max_files = DEFAULT_N_MAX_FILES;
3066
3067         log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3068                   format_bytes(a, sizeof(a), m->min_use),
3069                   format_bytes(b, sizeof(b), m->max_use),
3070                   format_bytes(c, sizeof(c), m->max_size),
3071                   format_bytes(d, sizeof(d), m->min_size),
3072                   format_bytes(e, sizeof(e), m->keep_free),
3073                   m->n_max_files);
3074 }
3075
3076 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3077         assert(f);
3078         assert(from || to);
3079
3080         if (from) {
3081                 if (f->header->head_entry_realtime == 0)
3082                         return -ENOENT;
3083
3084                 *from = le64toh(f->header->head_entry_realtime);
3085         }
3086
3087         if (to) {
3088                 if (f->header->tail_entry_realtime == 0)
3089                         return -ENOENT;
3090
3091                 *to = le64toh(f->header->tail_entry_realtime);
3092         }
3093
3094         return 1;
3095 }
3096
3097 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3098         Object *o;
3099         uint64_t p;
3100         int r;
3101
3102         assert(f);
3103         assert(from || to);
3104
3105         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3106         if (r <= 0)
3107                 return r;
3108
3109         if (le64toh(o->data.n_entries) <= 0)
3110                 return 0;
3111
3112         if (from) {
3113                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3114                 if (r < 0)
3115                         return r;
3116
3117                 *from = le64toh(o->entry.monotonic);
3118         }
3119
3120         if (to) {
3121                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3122                 if (r < 0)
3123                         return r;
3124
3125                 r = generic_array_get_plus_one(f,
3126                                                le64toh(o->data.entry_offset),
3127                                                le64toh(o->data.entry_array_offset),
3128                                                le64toh(o->data.n_entries)-1,
3129                                                &o, NULL);
3130                 if (r <= 0)
3131                         return r;
3132
3133                 *to = le64toh(o->entry.monotonic);
3134         }
3135
3136         return 1;
3137 }
3138
3139 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3140         assert(f);
3141
3142         /* If we gained new header fields we gained new features,
3143          * hence suggest a rotation */
3144         if (le64toh(f->header->header_size) < sizeof(Header)) {
3145                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3146                 return true;
3147         }
3148
3149         /* Let's check if the hash tables grew over a certain fill
3150          * level (75%, borrowing this value from Java's hash table
3151          * implementation), and if so suggest a rotation. To calculate
3152          * the fill level we need the n_data field, which only exists
3153          * in newer versions. */
3154
3155         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3156                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3157                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3158                                   f->path,
3159                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3160                                   le64toh(f->header->n_data),
3161                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3162                                   (unsigned long long) f->last_stat.st_size,
3163                                   f->last_stat.st_size / le64toh(f->header->n_data));
3164                         return true;
3165                 }
3166
3167         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3168                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3169                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3170                                   f->path,
3171                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3172                                   le64toh(f->header->n_fields),
3173                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3174                         return true;
3175                 }
3176
3177         /* Are the data objects properly indexed by field objects? */
3178         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3179             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3180             le64toh(f->header->n_data) > 0 &&
3181             le64toh(f->header->n_fields) == 0)
3182                 return true;
3183
3184         if (max_file_usec > 0) {
3185                 usec_t t, h;
3186
3187                 h = le64toh(f->header->head_entry_realtime);
3188                 t = now(CLOCK_REALTIME);
3189
3190                 if (h > 0 && t > h + max_file_usec)
3191                         return true;
3192         }
3193
3194         return false;
3195 }