1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
36 #include "journal-authenticate.h"
37 #include "journal-def.h"
38 #include "journal-file.h"
40 #include "parse-util.h"
41 #include "random-util.h"
42 #include "string-util.h"
43 #include "xattr-util.h"
45 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
46 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
48 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
50 /* This is the minimum journal file size */
51 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
53 /* These are the lower and upper bounds if we deduce the max_use value
54 * from the file system size */
55 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
56 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
59 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
61 /* This is the upper bound if we deduce max_size from max_use */
62 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
64 /* This is the upper bound if we deduce the keep_free value from the
66 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
68 /* This is the keep_free value when we can't determine the system
70 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
72 /* This is the default maximum number of journal files to keep around. */
73 #define DEFAULT_N_MAX_FILES (100)
75 /* n_data was the first entry we added after the initial file format design */
76 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
78 /* How many entries to keep in the entry array chain cache at max */
79 #define CHAIN_CACHE_MAX 20
81 /* How much to increase the journal file size at once each time we allocate something new. */
82 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
84 /* Reread fstat() of the file for detecting deletions at least this often */
85 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
87 /* The mmap context to use for the header we pick as one above the last defined typed */
88 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
90 static int journal_file_set_online(JournalFile
*f
) {
96 if (!(f
->fd
>= 0 && f
->header
))
99 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
102 switch(f
->header
->state
) {
107 f
->header
->state
= STATE_ONLINE
;
116 int journal_file_set_offline(JournalFile
*f
) {
122 if (!(f
->fd
>= 0 && f
->header
))
125 if (f
->header
->state
!= STATE_ONLINE
)
130 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
133 f
->header
->state
= STATE_OFFLINE
;
135 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
143 JournalFile
* journal_file_close(JournalFile
*f
) {
147 /* Write the final tag */
148 if (f
->seal
&& f
->writable
)
149 journal_file_append_tag(f
);
152 journal_file_set_offline(f
);
154 if (f
->mmap
&& f
->fd
>= 0)
155 mmap_cache_close_fd(f
->mmap
, f
->fd
);
157 if (f
->fd
>= 0 && f
->defrag_on_close
) {
159 /* Be friendly to btrfs: turn COW back on again now,
160 * and defragment the file. We won't write to the file
161 * ever again, hence remove all fragmentation, and
162 * reenable all the good bits COW usually provides
163 * (such as data checksumming). */
165 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
166 (void) btrfs_defrag_fd(f
->fd
);
173 mmap_cache_unref(f
->mmap
);
175 ordered_hashmap_free_free(f
->chain_cache
);
177 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
178 free(f
->compress_buffer
);
183 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
185 free(f
->fsprg_state
);
190 gcry_md_close(f
->hmac
);
197 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
204 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
205 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
207 h
.incompatible_flags
|= htole32(
208 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
209 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
211 h
.compatible_flags
= htole32(
212 f
->seal
* HEADER_COMPATIBLE_SEALED
);
214 r
= sd_id128_randomize(&h
.file_id
);
219 h
.seqnum_id
= template->header
->seqnum_id
;
220 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
222 h
.seqnum_id
= h
.file_id
;
224 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
234 static int journal_file_refresh_header(JournalFile
*f
) {
240 r
= sd_id128_get_machine(&f
->header
->machine_id
);
244 r
= sd_id128_get_boot(&boot_id
);
248 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
249 f
->tail_entry_monotonic_valid
= true;
251 f
->header
->boot_id
= boot_id
;
253 r
= journal_file_set_online(f
);
255 /* Sync the online state to disk */
261 static int journal_file_verify_header(JournalFile
*f
) {
266 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
269 /* In both read and write mode we refuse to open files with
270 * incompatible flags we don't know */
271 flags
= le32toh(f
->header
->incompatible_flags
);
272 if (flags
& ~HEADER_INCOMPATIBLE_SUPPORTED
) {
273 if (flags
& ~HEADER_INCOMPATIBLE_ANY
)
274 log_debug("Journal file %s has unknown incompatible flags %"PRIx32
,
275 f
->path
, flags
& ~HEADER_INCOMPATIBLE_ANY
);
276 flags
= (flags
& HEADER_INCOMPATIBLE_ANY
) & ~HEADER_INCOMPATIBLE_SUPPORTED
;
278 log_debug("Journal file %s uses incompatible flags %"PRIx32
279 " disabled at compilation time.", f
->path
, flags
);
280 return -EPROTONOSUPPORT
;
283 /* When open for writing we refuse to open files with
284 * compatible flags, too */
285 flags
= le32toh(f
->header
->compatible_flags
);
286 if (f
->writable
&& (flags
& ~HEADER_COMPATIBLE_SUPPORTED
)) {
287 if (flags
& ~HEADER_COMPATIBLE_ANY
)
288 log_debug("Journal file %s has unknown compatible flags %"PRIx32
,
289 f
->path
, flags
& ~HEADER_COMPATIBLE_ANY
);
290 flags
= (flags
& HEADER_COMPATIBLE_ANY
) & ~HEADER_COMPATIBLE_SUPPORTED
;
292 log_debug("Journal file %s uses compatible flags %"PRIx32
293 " disabled at compilation time.", f
->path
, flags
);
294 return -EPROTONOSUPPORT
;
297 if (f
->header
->state
>= _STATE_MAX
)
300 /* The first addition was n_data, so check that we are at least this large */
301 if (le64toh(f
->header
->header_size
) < HEADER_SIZE_MIN
)
304 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
307 if ((le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)) > (uint64_t) f
->last_stat
.st_size
)
310 if (le64toh(f
->header
->tail_object_offset
) > (le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)))
313 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
314 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
315 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
316 !VALID64(le64toh(f
->header
->entry_array_offset
)))
321 sd_id128_t machine_id
;
324 r
= sd_id128_get_machine(&machine_id
);
328 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
331 state
= f
->header
->state
;
333 if (state
== STATE_ONLINE
) {
334 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
336 } else if (state
== STATE_ARCHIVED
)
338 else if (state
!= STATE_OFFLINE
) {
339 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
344 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
345 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
347 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
352 static int journal_file_fstat(JournalFile
*f
) {
356 if (fstat(f
->fd
, &f
->last_stat
) < 0)
359 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
361 /* Refuse appending to files that are already deleted */
362 if (f
->last_stat
.st_nlink
<= 0)
368 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
369 uint64_t old_size
, new_size
;
374 /* We assume that this file is not sparse, and we know that
375 * for sure, since we always call posix_fallocate()
378 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
382 le64toh(f
->header
->header_size
) +
383 le64toh(f
->header
->arena_size
);
385 new_size
= PAGE_ALIGN(offset
+ size
);
386 if (new_size
< le64toh(f
->header
->header_size
))
387 new_size
= le64toh(f
->header
->header_size
);
389 if (new_size
<= old_size
) {
391 /* We already pre-allocated enough space, but before
392 * we write to it, let's check with fstat() if the
393 * file got deleted, in order make sure we don't throw
394 * away the data immediately. Don't check fstat() for
395 * all writes though, but only once ever 10s. */
397 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
400 return journal_file_fstat(f
);
403 /* Allocate more space. */
405 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
408 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
411 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
414 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
416 if (new_size
- old_size
> available
)
421 /* Increase by larger blocks at once */
422 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
423 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
424 new_size
= f
->metrics
.max_size
;
426 /* Note that the glibc fallocate() fallback is very
427 inefficient, hence we try to minimize the allocation area
429 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
433 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
435 return journal_file_fstat(f
);
438 static unsigned type_to_context(ObjectType type
) {
439 /* One context for each type, plus one catch-all for the rest */
440 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
441 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
442 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
445 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
) {
454 /* Avoid SIGBUS on invalid accesses */
455 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
456 /* Hmm, out of range? Let's refresh the fstat() data
457 * first, before we trust that check. */
459 r
= journal_file_fstat(f
);
463 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
464 return -EADDRNOTAVAIL
;
467 return mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
470 static uint64_t minimum_header_size(Object
*o
) {
472 static const uint64_t table
[] = {
473 [OBJECT_DATA
] = sizeof(DataObject
),
474 [OBJECT_FIELD
] = sizeof(FieldObject
),
475 [OBJECT_ENTRY
] = sizeof(EntryObject
),
476 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
477 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
478 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
479 [OBJECT_TAG
] = sizeof(TagObject
),
482 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
483 return sizeof(ObjectHeader
);
485 return table
[o
->object
.type
];
488 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
497 /* Objects may only be located at multiple of 64 bit */
498 if (!VALID64(offset
))
501 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
506 s
= le64toh(o
->object
.size
);
508 if (s
< sizeof(ObjectHeader
))
511 if (o
->object
.type
<= OBJECT_UNUSED
)
514 if (s
< minimum_header_size(o
))
517 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
520 if (s
> sizeof(ObjectHeader
)) {
521 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
532 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
537 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
540 /* If an external seqnum counter was passed, we update
541 * both the local and the external one, and set it to
542 * the maximum of both */
550 f
->header
->tail_entry_seqnum
= htole64(r
);
552 if (f
->header
->head_entry_seqnum
== 0)
553 f
->header
->head_entry_seqnum
= htole64(r
);
558 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
565 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
566 assert(size
>= sizeof(ObjectHeader
));
570 r
= journal_file_set_online(f
);
574 p
= le64toh(f
->header
->tail_object_offset
);
576 p
= le64toh(f
->header
->header_size
);
578 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
582 p
+= ALIGN64(le64toh(tail
->object
.size
));
585 r
= journal_file_allocate(f
, p
, size
);
589 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
596 o
->object
.type
= type
;
597 o
->object
.size
= htole64(size
);
599 f
->header
->tail_object_offset
= htole64(p
);
600 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
608 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
615 /* We estimate that we need 1 hash table entry per 768 bytes
616 of journal file and we want to make sure we never get
617 beyond 75% fill level. Calculate the hash table size for
618 the maximum file size based on these metrics. */
620 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
621 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
622 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
624 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
626 r
= journal_file_append_object(f
,
627 OBJECT_DATA_HASH_TABLE
,
628 offsetof(Object
, hash_table
.items
) + s
,
633 memzero(o
->hash_table
.items
, s
);
635 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
636 f
->header
->data_hash_table_size
= htole64(s
);
641 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
648 /* We use a fixed size hash table for the fields as this
649 * number should grow very slowly only */
651 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
652 r
= journal_file_append_object(f
,
653 OBJECT_FIELD_HASH_TABLE
,
654 offsetof(Object
, hash_table
.items
) + s
,
659 memzero(o
->hash_table
.items
, s
);
661 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
662 f
->header
->field_hash_table_size
= htole64(s
);
667 int journal_file_map_data_hash_table(JournalFile
*f
) {
674 if (f
->data_hash_table
)
677 p
= le64toh(f
->header
->data_hash_table_offset
);
678 s
= le64toh(f
->header
->data_hash_table_size
);
680 r
= journal_file_move_to(f
,
681 OBJECT_DATA_HASH_TABLE
,
688 f
->data_hash_table
= t
;
692 int journal_file_map_field_hash_table(JournalFile
*f
) {
699 if (f
->field_hash_table
)
702 p
= le64toh(f
->header
->field_hash_table_offset
);
703 s
= le64toh(f
->header
->field_hash_table_size
);
705 r
= journal_file_move_to(f
,
706 OBJECT_FIELD_HASH_TABLE
,
713 f
->field_hash_table
= t
;
717 static int journal_file_link_field(
730 if (o
->object
.type
!= OBJECT_FIELD
)
733 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
737 /* This might alter the window we are looking at */
738 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
741 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
743 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
745 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
749 o
->field
.next_hash_offset
= htole64(offset
);
752 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
754 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
755 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
760 static int journal_file_link_data(
773 if (o
->object
.type
!= OBJECT_DATA
)
776 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
780 /* This might alter the window we are looking at */
781 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
782 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
783 o
->data
.n_entries
= 0;
786 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
788 /* Only entry in the hash table is easy */
789 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
791 /* Move back to the previous data object, to patch in
794 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
798 o
->data
.next_hash_offset
= htole64(offset
);
801 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
803 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
804 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
809 int journal_file_find_field_object_with_hash(
811 const void *field
, uint64_t size
, uint64_t hash
,
812 Object
**ret
, uint64_t *offset
) {
814 uint64_t p
, osize
, h
, m
;
818 assert(field
&& size
> 0);
820 /* If the field hash table is empty, we can't find anything */
821 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
824 /* Map the field hash table, if it isn't mapped yet. */
825 r
= journal_file_map_field_hash_table(f
);
829 osize
= offsetof(Object
, field
.payload
) + size
;
831 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
836 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
841 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
845 if (le64toh(o
->field
.hash
) == hash
&&
846 le64toh(o
->object
.size
) == osize
&&
847 memcmp(o
->field
.payload
, field
, size
) == 0) {
857 p
= le64toh(o
->field
.next_hash_offset
);
863 int journal_file_find_field_object(
865 const void *field
, uint64_t size
,
866 Object
**ret
, uint64_t *offset
) {
871 assert(field
&& size
> 0);
873 hash
= hash64(field
, size
);
875 return journal_file_find_field_object_with_hash(f
,
880 int journal_file_find_data_object_with_hash(
882 const void *data
, uint64_t size
, uint64_t hash
,
883 Object
**ret
, uint64_t *offset
) {
885 uint64_t p
, osize
, h
, m
;
889 assert(data
|| size
== 0);
891 /* If there's no data hash table, then there's no entry. */
892 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
895 /* Map the data hash table, if it isn't mapped yet. */
896 r
= journal_file_map_data_hash_table(f
);
900 osize
= offsetof(Object
, data
.payload
) + size
;
902 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
907 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
912 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
916 if (le64toh(o
->data
.hash
) != hash
)
919 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
920 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
924 l
= le64toh(o
->object
.size
);
925 if (l
<= offsetof(Object
, data
.payload
))
928 l
-= offsetof(Object
, data
.payload
);
930 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
931 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
936 memcmp(f
->compress_buffer
, data
, size
) == 0) {
947 return -EPROTONOSUPPORT
;
949 } else if (le64toh(o
->object
.size
) == osize
&&
950 memcmp(o
->data
.payload
, data
, size
) == 0) {
962 p
= le64toh(o
->data
.next_hash_offset
);
968 int journal_file_find_data_object(
970 const void *data
, uint64_t size
,
971 Object
**ret
, uint64_t *offset
) {
976 assert(data
|| size
== 0);
978 hash
= hash64(data
, size
);
980 return journal_file_find_data_object_with_hash(f
,
985 static int journal_file_append_field(
987 const void *field
, uint64_t size
,
988 Object
**ret
, uint64_t *offset
) {
996 assert(field
&& size
> 0);
998 hash
= hash64(field
, size
);
1000 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1014 osize
= offsetof(Object
, field
.payload
) + size
;
1015 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1019 o
->field
.hash
= htole64(hash
);
1020 memcpy(o
->field
.payload
, field
, size
);
1022 r
= journal_file_link_field(f
, o
, p
, hash
);
1026 /* The linking might have altered the window, so let's
1027 * refresh our pointer */
1028 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1033 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1047 static int journal_file_append_data(
1049 const void *data
, uint64_t size
,
1050 Object
**ret
, uint64_t *offset
) {
1055 int r
, compression
= 0;
1059 assert(data
|| size
== 0);
1061 hash
= hash64(data
, size
);
1063 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1077 osize
= offsetof(Object
, data
.payload
) + size
;
1078 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1082 o
->data
.hash
= htole64(hash
);
1084 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1085 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1088 compression
= compress_blob(data
, size
, o
->data
.payload
, &rsize
);
1090 if (compression
>= 0) {
1091 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1092 o
->object
.flags
|= compression
;
1094 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1095 size
, rsize
, object_compressed_to_string(compression
));
1097 /* Compression didn't work, we don't really care why, let's continue without compression */
1102 if (compression
== 0 && size
> 0)
1103 memcpy(o
->data
.payload
, data
, size
);
1105 r
= journal_file_link_data(f
, o
, p
, hash
);
1109 /* The linking might have altered the window, so let's
1110 * refresh our pointer */
1111 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1118 eq
= memchr(data
, '=', size
);
1119 if (eq
&& eq
> data
) {
1123 /* Create field object ... */
1124 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1128 /* ... and link it in. */
1129 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1130 fo
->field
.head_data_offset
= le64toh(p
);
1134 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1148 uint64_t journal_file_entry_n_items(Object
*o
) {
1151 if (o
->object
.type
!= OBJECT_ENTRY
)
1154 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1157 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1160 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1163 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1166 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1169 if (o
->object
.type
!= OBJECT_DATA_HASH_TABLE
&&
1170 o
->object
.type
!= OBJECT_FIELD_HASH_TABLE
)
1173 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1176 static int link_entry_into_array(JournalFile
*f
,
1181 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1189 a
= le64toh(*first
);
1190 i
= hidx
= le64toh(*idx
);
1193 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1197 n
= journal_file_entry_array_n_items(o
);
1199 o
->entry_array
.items
[i
] = htole64(p
);
1200 *idx
= htole64(hidx
+ 1);
1206 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1217 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1218 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1224 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1229 o
->entry_array
.items
[i
] = htole64(p
);
1232 *first
= htole64(q
);
1234 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1238 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1241 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1242 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1244 *idx
= htole64(hidx
+ 1);
1249 static int link_entry_into_array_plus_one(JournalFile
*f
,
1264 *extra
= htole64(p
);
1268 i
= htole64(le64toh(*idx
) - 1);
1269 r
= link_entry_into_array(f
, first
, &i
, p
);
1274 *idx
= htole64(le64toh(*idx
) + 1);
1278 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1285 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1289 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1293 return link_entry_into_array_plus_one(f
,
1294 &o
->data
.entry_offset
,
1295 &o
->data
.entry_array_offset
,
1300 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1308 if (o
->object
.type
!= OBJECT_ENTRY
)
1311 __sync_synchronize();
1313 /* Link up the entry itself */
1314 r
= link_entry_into_array(f
,
1315 &f
->header
->entry_array_offset
,
1316 &f
->header
->n_entries
,
1321 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1323 if (f
->header
->head_entry_realtime
== 0)
1324 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1326 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1327 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1329 f
->tail_entry_monotonic_valid
= true;
1331 /* Link up the items */
1332 n
= journal_file_entry_n_items(o
);
1333 for (i
= 0; i
< n
; i
++) {
1334 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1342 static int journal_file_append_entry_internal(
1344 const dual_timestamp
*ts
,
1346 const EntryItem items
[], unsigned n_items
,
1348 Object
**ret
, uint64_t *offset
) {
1355 assert(items
|| n_items
== 0);
1358 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1360 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1364 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1365 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1366 o
->entry
.realtime
= htole64(ts
->realtime
);
1367 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1368 o
->entry
.xor_hash
= htole64(xor_hash
);
1369 o
->entry
.boot_id
= f
->header
->boot_id
;
1372 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1377 r
= journal_file_link_entry(f
, o
, np
);
1390 void journal_file_post_change(JournalFile
*f
) {
1393 /* inotify() does not receive IN_MODIFY events from file
1394 * accesses done via mmap(). After each access we hence
1395 * trigger IN_MODIFY by truncating the journal file to its
1396 * current size which triggers IN_MODIFY. */
1398 __sync_synchronize();
1400 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1401 log_error_errno(errno
, "Failed to truncate file to its own size: %m");
1404 static int entry_item_cmp(const void *_a
, const void *_b
) {
1405 const EntryItem
*a
= _a
, *b
= _b
;
1407 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1409 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1414 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1418 uint64_t xor_hash
= 0;
1419 struct dual_timestamp _ts
;
1422 assert(iovec
|| n_iovec
== 0);
1425 dual_timestamp_get(&_ts
);
1429 if (f
->tail_entry_monotonic_valid
&&
1430 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
1434 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1439 /* alloca() can't take 0, hence let's allocate at least one */
1440 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1442 for (i
= 0; i
< n_iovec
; i
++) {
1446 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1450 xor_hash
^= le64toh(o
->data
.hash
);
1451 items
[i
].object_offset
= htole64(p
);
1452 items
[i
].hash
= o
->data
.hash
;
1455 /* Order by the position on disk, in order to improve seek
1456 * times for rotating media. */
1457 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
1459 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
1461 /* If the memory mapping triggered a SIGBUS then we return an
1462 * IO error and ignore the error code passed down to us, since
1463 * it is very likely just an effect of a nullified replacement
1466 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
1469 journal_file_post_change(f
);
1474 typedef struct ChainCacheItem
{
1475 uint64_t first
; /* the array at the beginning of the chain */
1476 uint64_t array
; /* the cached array */
1477 uint64_t begin
; /* the first item in the cached array */
1478 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
1479 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
1482 static void chain_cache_put(
1489 uint64_t last_index
) {
1492 /* If the chain item to cache for this chain is the
1493 * first one it's not worth caching anything */
1497 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
1498 ci
= ordered_hashmap_steal_first(h
);
1501 ci
= new(ChainCacheItem
, 1);
1508 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
1513 assert(ci
->first
== first
);
1518 ci
->last_index
= last_index
;
1521 static int generic_array_get(
1525 Object
**ret
, uint64_t *offset
) {
1528 uint64_t p
= 0, a
, t
= 0;
1536 /* Try the chain cache first */
1537 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1538 if (ci
&& i
> ci
->total
) {
1547 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1551 k
= journal_file_entry_array_n_items(o
);
1553 p
= le64toh(o
->entry_array
.items
[i
]);
1559 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1565 /* Let's cache this item for the next invocation */
1566 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
1568 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1581 static int generic_array_get_plus_one(
1586 Object
**ret
, uint64_t *offset
) {
1595 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1608 return generic_array_get(f
, first
, i
-1, ret
, offset
);
1617 static int generic_array_bisect(
1622 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1623 direction_t direction
,
1628 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
1629 bool subtract_one
= false;
1630 Object
*o
, *array
= NULL
;
1635 assert(test_object
);
1637 /* Start with the first array in the chain */
1640 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1641 if (ci
&& n
> ci
->total
) {
1642 /* Ah, we have iterated this bisection array chain
1643 * previously! Let's see if we can skip ahead in the
1644 * chain, as far as the last time. But we can't jump
1645 * backwards in the chain, so let's check that
1648 r
= test_object(f
, ci
->begin
, needle
);
1652 if (r
== TEST_LEFT
) {
1653 /* OK, what we are looking for is right of the
1654 * begin of this EntryArray, so let's jump
1655 * straight to previously cached array in the
1661 last_index
= ci
->last_index
;
1666 uint64_t left
, right
, k
, lp
;
1668 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1672 k
= journal_file_entry_array_n_items(array
);
1678 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1682 r
= test_object(f
, p
, needle
);
1686 if (r
== TEST_FOUND
)
1687 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1689 if (r
== TEST_RIGHT
) {
1693 if (last_index
!= (uint64_t) -1) {
1694 assert(last_index
<= right
);
1696 /* If we cached the last index we
1697 * looked at, let's try to not to jump
1698 * too wildly around and see if we can
1699 * limit the range to look at early to
1700 * the immediate neighbors of the last
1701 * index we looked at. */
1703 if (last_index
> 0) {
1704 uint64_t x
= last_index
- 1;
1706 p
= le64toh(array
->entry_array
.items
[x
]);
1710 r
= test_object(f
, p
, needle
);
1714 if (r
== TEST_FOUND
)
1715 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1717 if (r
== TEST_RIGHT
)
1723 if (last_index
< right
) {
1724 uint64_t y
= last_index
+ 1;
1726 p
= le64toh(array
->entry_array
.items
[y
]);
1730 r
= test_object(f
, p
, needle
);
1734 if (r
== TEST_FOUND
)
1735 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1737 if (r
== TEST_RIGHT
)
1745 if (left
== right
) {
1746 if (direction
== DIRECTION_UP
)
1747 subtract_one
= true;
1753 assert(left
< right
);
1754 i
= (left
+ right
) / 2;
1756 p
= le64toh(array
->entry_array
.items
[i
]);
1760 r
= test_object(f
, p
, needle
);
1764 if (r
== TEST_FOUND
)
1765 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1767 if (r
== TEST_RIGHT
)
1775 if (direction
== DIRECTION_UP
) {
1777 subtract_one
= true;
1788 last_index
= (uint64_t) -1;
1789 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1795 if (subtract_one
&& t
== 0 && i
== 0)
1798 /* Let's cache this item for the next invocation */
1799 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
1801 if (subtract_one
&& i
== 0)
1803 else if (subtract_one
)
1804 p
= le64toh(array
->entry_array
.items
[i
-1]);
1806 p
= le64toh(array
->entry_array
.items
[i
]);
1808 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1819 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
1824 static int generic_array_bisect_plus_one(
1830 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1831 direction_t direction
,
1837 bool step_back
= false;
1841 assert(test_object
);
1846 /* This bisects the array in object 'first', but first checks
1848 r
= test_object(f
, extra
, needle
);
1852 if (r
== TEST_FOUND
)
1853 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1855 /* if we are looking with DIRECTION_UP then we need to first
1856 see if in the actual array there is a matching entry, and
1857 return the last one of that. But if there isn't any we need
1858 to return this one. Hence remember this, and return it
1861 step_back
= direction
== DIRECTION_UP
;
1863 if (r
== TEST_RIGHT
) {
1864 if (direction
== DIRECTION_DOWN
)
1870 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1872 if (r
== 0 && step_back
)
1881 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1897 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1903 else if (p
< needle
)
1909 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1916 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1920 if (le64toh(o
->entry
.seqnum
) == needle
)
1922 else if (le64toh(o
->entry
.seqnum
) < needle
)
1928 int journal_file_move_to_entry_by_seqnum(
1931 direction_t direction
,
1935 return generic_array_bisect(f
,
1936 le64toh(f
->header
->entry_array_offset
),
1937 le64toh(f
->header
->n_entries
),
1944 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1951 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1955 if (le64toh(o
->entry
.realtime
) == needle
)
1957 else if (le64toh(o
->entry
.realtime
) < needle
)
1963 int journal_file_move_to_entry_by_realtime(
1966 direction_t direction
,
1970 return generic_array_bisect(f
,
1971 le64toh(f
->header
->entry_array_offset
),
1972 le64toh(f
->header
->n_entries
),
1974 test_object_realtime
,
1979 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1986 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1990 if (le64toh(o
->entry
.monotonic
) == needle
)
1992 else if (le64toh(o
->entry
.monotonic
) < needle
)
1998 static int find_data_object_by_boot_id(
2004 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2006 sd_id128_to_string(boot_id
, t
+ 9);
2007 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2010 int journal_file_move_to_entry_by_monotonic(
2014 direction_t direction
,
2023 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2029 return generic_array_bisect_plus_one(f
,
2030 le64toh(o
->data
.entry_offset
),
2031 le64toh(o
->data
.entry_array_offset
),
2032 le64toh(o
->data
.n_entries
),
2034 test_object_monotonic
,
2039 void journal_file_reset_location(JournalFile
*f
) {
2040 f
->location_type
= LOCATION_HEAD
;
2041 f
->current_offset
= 0;
2042 f
->current_seqnum
= 0;
2043 f
->current_realtime
= 0;
2044 f
->current_monotonic
= 0;
2045 zero(f
->current_boot_id
);
2046 f
->current_xor_hash
= 0;
2049 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2050 f
->location_type
= LOCATION_SEEK
;
2051 f
->current_offset
= offset
;
2052 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2053 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2054 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2055 f
->current_boot_id
= o
->entry
.boot_id
;
2056 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2059 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2062 assert(af
->location_type
== LOCATION_SEEK
);
2063 assert(bf
->location_type
== LOCATION_SEEK
);
2065 /* If contents and timestamps match, these entries are
2066 * identical, even if the seqnum does not match */
2067 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2068 af
->current_monotonic
== bf
->current_monotonic
&&
2069 af
->current_realtime
== bf
->current_realtime
&&
2070 af
->current_xor_hash
== bf
->current_xor_hash
)
2073 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2075 /* If this is from the same seqnum source, compare
2077 if (af
->current_seqnum
< bf
->current_seqnum
)
2079 if (af
->current_seqnum
> bf
->current_seqnum
)
2082 /* Wow! This is weird, different data but the same
2083 * seqnums? Something is borked, but let's make the
2084 * best of it and compare by time. */
2087 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2089 /* If the boot id matches, compare monotonic time */
2090 if (af
->current_monotonic
< bf
->current_monotonic
)
2092 if (af
->current_monotonic
> bf
->current_monotonic
)
2096 /* Otherwise, compare UTC time */
2097 if (af
->current_realtime
< bf
->current_realtime
)
2099 if (af
->current_realtime
> bf
->current_realtime
)
2102 /* Finally, compare by contents */
2103 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2105 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2111 int journal_file_next_entry(
2114 direction_t direction
,
2115 Object
**ret
, uint64_t *offset
) {
2122 n
= le64toh(f
->header
->n_entries
);
2127 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2129 r
= generic_array_bisect(f
,
2130 le64toh(f
->header
->entry_array_offset
),
2131 le64toh(f
->header
->n_entries
),
2140 if (direction
== DIRECTION_DOWN
) {
2153 /* And jump to it */
2154 r
= generic_array_get(f
,
2155 le64toh(f
->header
->entry_array_offset
),
2162 (direction
== DIRECTION_DOWN
? ofs
<= p
: ofs
>= p
)) {
2163 log_debug("%s: entry array corrupted at entry %"PRIu64
,
2174 int journal_file_next_entry_for_data(
2176 Object
*o
, uint64_t p
,
2177 uint64_t data_offset
,
2178 direction_t direction
,
2179 Object
**ret
, uint64_t *offset
) {
2186 assert(p
> 0 || !o
);
2188 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2192 n
= le64toh(d
->data
.n_entries
);
2197 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2199 if (o
->object
.type
!= OBJECT_ENTRY
)
2202 r
= generic_array_bisect_plus_one(f
,
2203 le64toh(d
->data
.entry_offset
),
2204 le64toh(d
->data
.entry_array_offset
),
2205 le64toh(d
->data
.n_entries
),
2215 if (direction
== DIRECTION_DOWN
) {
2229 return generic_array_get_plus_one(f
,
2230 le64toh(d
->data
.entry_offset
),
2231 le64toh(d
->data
.entry_array_offset
),
2236 int journal_file_move_to_entry_by_offset_for_data(
2238 uint64_t data_offset
,
2240 direction_t direction
,
2241 Object
**ret
, uint64_t *offset
) {
2248 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2252 return generic_array_bisect_plus_one(f
,
2253 le64toh(d
->data
.entry_offset
),
2254 le64toh(d
->data
.entry_array_offset
),
2255 le64toh(d
->data
.n_entries
),
2262 int journal_file_move_to_entry_by_monotonic_for_data(
2264 uint64_t data_offset
,
2267 direction_t direction
,
2268 Object
**ret
, uint64_t *offset
) {
2276 /* First, seek by time */
2277 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2283 r
= generic_array_bisect_plus_one(f
,
2284 le64toh(o
->data
.entry_offset
),
2285 le64toh(o
->data
.entry_array_offset
),
2286 le64toh(o
->data
.n_entries
),
2288 test_object_monotonic
,
2294 /* And now, continue seeking until we find an entry that
2295 * exists in both bisection arrays */
2301 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2305 r
= generic_array_bisect_plus_one(f
,
2306 le64toh(d
->data
.entry_offset
),
2307 le64toh(d
->data
.entry_array_offset
),
2308 le64toh(d
->data
.n_entries
),
2316 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2320 r
= generic_array_bisect_plus_one(f
,
2321 le64toh(o
->data
.entry_offset
),
2322 le64toh(o
->data
.entry_array_offset
),
2323 le64toh(o
->data
.n_entries
),
2345 int journal_file_move_to_entry_by_seqnum_for_data(
2347 uint64_t data_offset
,
2349 direction_t direction
,
2350 Object
**ret
, uint64_t *offset
) {
2357 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2361 return generic_array_bisect_plus_one(f
,
2362 le64toh(d
->data
.entry_offset
),
2363 le64toh(d
->data
.entry_array_offset
),
2364 le64toh(d
->data
.n_entries
),
2371 int journal_file_move_to_entry_by_realtime_for_data(
2373 uint64_t data_offset
,
2375 direction_t direction
,
2376 Object
**ret
, uint64_t *offset
) {
2383 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2387 return generic_array_bisect_plus_one(f
,
2388 le64toh(d
->data
.entry_offset
),
2389 le64toh(d
->data
.entry_array_offset
),
2390 le64toh(d
->data
.n_entries
),
2392 test_object_realtime
,
2397 void journal_file_dump(JournalFile
*f
) {
2404 journal_file_print_header(f
);
2406 p
= le64toh(f
->header
->header_size
);
2408 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
2412 switch (o
->object
.type
) {
2415 printf("Type: OBJECT_UNUSED\n");
2419 printf("Type: OBJECT_DATA\n");
2423 printf("Type: OBJECT_FIELD\n");
2427 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
2428 le64toh(o
->entry
.seqnum
),
2429 le64toh(o
->entry
.monotonic
),
2430 le64toh(o
->entry
.realtime
));
2433 case OBJECT_FIELD_HASH_TABLE
:
2434 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2437 case OBJECT_DATA_HASH_TABLE
:
2438 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2441 case OBJECT_ENTRY_ARRAY
:
2442 printf("Type: OBJECT_ENTRY_ARRAY\n");
2446 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
2447 le64toh(o
->tag
.seqnum
),
2448 le64toh(o
->tag
.epoch
));
2452 printf("Type: unknown (%i)\n", o
->object
.type
);
2456 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
2457 printf("Flags: %s\n",
2458 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
2460 if (p
== le64toh(f
->header
->tail_object_offset
))
2463 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
2468 log_error("File corrupt");
2471 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
2474 x
= format_timestamp(buf
, l
, t
);
2480 void journal_file_print_header(JournalFile
*f
) {
2481 char a
[33], b
[33], c
[33], d
[33];
2482 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
2484 char bytes
[FORMAT_BYTES_MAX
];
2488 printf("File Path: %s\n"
2492 "Sequential Number ID: %s\n"
2494 "Compatible Flags:%s%s\n"
2495 "Incompatible Flags:%s%s%s\n"
2496 "Header size: %"PRIu64
"\n"
2497 "Arena size: %"PRIu64
"\n"
2498 "Data Hash Table Size: %"PRIu64
"\n"
2499 "Field Hash Table Size: %"PRIu64
"\n"
2500 "Rotate Suggested: %s\n"
2501 "Head Sequential Number: %"PRIu64
"\n"
2502 "Tail Sequential Number: %"PRIu64
"\n"
2503 "Head Realtime Timestamp: %s\n"
2504 "Tail Realtime Timestamp: %s\n"
2505 "Tail Monotonic Timestamp: %s\n"
2506 "Objects: %"PRIu64
"\n"
2507 "Entry Objects: %"PRIu64
"\n",
2509 sd_id128_to_string(f
->header
->file_id
, a
),
2510 sd_id128_to_string(f
->header
->machine_id
, b
),
2511 sd_id128_to_string(f
->header
->boot_id
, c
),
2512 sd_id128_to_string(f
->header
->seqnum_id
, d
),
2513 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
2514 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
2515 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
2516 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
2517 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
2518 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
2519 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
2520 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
2521 le64toh(f
->header
->header_size
),
2522 le64toh(f
->header
->arena_size
),
2523 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
2524 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
2525 yes_no(journal_file_rotate_suggested(f
, 0)),
2526 le64toh(f
->header
->head_entry_seqnum
),
2527 le64toh(f
->header
->tail_entry_seqnum
),
2528 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)),
2529 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)),
2530 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
),
2531 le64toh(f
->header
->n_objects
),
2532 le64toh(f
->header
->n_entries
));
2534 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
2535 printf("Data Objects: %"PRIu64
"\n"
2536 "Data Hash Table Fill: %.1f%%\n",
2537 le64toh(f
->header
->n_data
),
2538 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
2540 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
2541 printf("Field Objects: %"PRIu64
"\n"
2542 "Field Hash Table Fill: %.1f%%\n",
2543 le64toh(f
->header
->n_fields
),
2544 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
2546 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
2547 printf("Tag Objects: %"PRIu64
"\n",
2548 le64toh(f
->header
->n_tags
));
2549 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
2550 printf("Entry Array Objects: %"PRIu64
"\n",
2551 le64toh(f
->header
->n_entry_arrays
));
2553 if (fstat(f
->fd
, &st
) >= 0)
2554 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
2557 static int journal_file_warn_btrfs(JournalFile
*f
) {
2563 /* Before we write anything, check if the COW logic is turned
2564 * off on btrfs. Given our write pattern that is quite
2565 * unfriendly to COW file systems this should greatly improve
2566 * performance on COW file systems, such as btrfs, at the
2567 * expense of data integrity features (which shouldn't be too
2568 * bad, given that we do our own checksumming). */
2570 r
= btrfs_is_filesystem(f
->fd
);
2572 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
2576 r
= read_attr_fd(f
->fd
, &attrs
);
2578 return log_warning_errno(r
, "Failed to read file attributes: %m");
2580 if (attrs
& FS_NOCOW_FL
) {
2581 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2585 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2586 "This is likely to slow down journal access substantially, please consider turning "
2587 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
2592 int journal_file_open(
2598 JournalMetrics
*metrics
,
2599 MMapCache
*mmap_cache
,
2600 JournalFile
*template,
2601 JournalFile
**ret
) {
2603 bool newly_created
= false;
2611 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
2612 (flags
& O_ACCMODE
) != O_RDWR
)
2615 if (!endswith(fname
, ".journal") &&
2616 !endswith(fname
, ".journal~"))
2619 f
= new0(JournalFile
, 1);
2627 f
->prot
= prot_from_flags(flags
);
2628 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
2629 #if defined(HAVE_LZ4)
2630 f
->compress_lz4
= compress
;
2631 #elif defined(HAVE_XZ)
2632 f
->compress_xz
= compress
;
2639 f
->mmap
= mmap_cache_ref(mmap_cache
);
2641 f
->mmap
= mmap_cache_new();
2648 f
->path
= strdup(fname
);
2654 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
2655 if (!f
->chain_cache
) {
2660 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
2666 r
= journal_file_fstat(f
);
2670 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
2672 (void) journal_file_warn_btrfs(f
);
2674 /* Let's attach the creation time to the journal file,
2675 * so that the vacuuming code knows the age of this
2676 * file even if the file might end up corrupted one
2677 * day... Ideally we'd just use the creation time many
2678 * file systems maintain for each file, but there is
2679 * currently no usable API to query this, hence let's
2680 * emulate this via extended attributes. If extended
2681 * attributes are not supported we'll just skip this,
2682 * and rely solely on mtime/atime/ctime of the file. */
2684 fd_setcrtime(f
->fd
, 0);
2687 /* Try to load the FSPRG state, and if we can't, then
2688 * just don't do sealing */
2690 r
= journal_file_fss_load(f
);
2696 r
= journal_file_init_header(f
, template);
2700 r
= journal_file_fstat(f
);
2704 newly_created
= true;
2707 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
2712 r
= mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
);
2718 if (!newly_created
) {
2719 r
= journal_file_verify_header(f
);
2725 if (!newly_created
&& f
->writable
) {
2726 r
= journal_file_fss_load(f
);
2734 journal_default_metrics(metrics
, f
->fd
);
2735 f
->metrics
= *metrics
;
2736 } else if (template)
2737 f
->metrics
= template->metrics
;
2739 r
= journal_file_refresh_header(f
);
2745 r
= journal_file_hmac_setup(f
);
2750 if (newly_created
) {
2751 r
= journal_file_setup_field_hash_table(f
);
2755 r
= journal_file_setup_data_hash_table(f
);
2760 r
= journal_file_append_first_tag(f
);
2766 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
)) {
2775 if (f
->fd
>= 0 && mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
2778 journal_file_close(f
);
2783 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
) {
2784 _cleanup_free_
char *p
= NULL
;
2786 JournalFile
*old_file
, *new_file
= NULL
;
2794 if (!old_file
->writable
)
2797 if (!endswith(old_file
->path
, ".journal"))
2800 l
= strlen(old_file
->path
);
2801 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
2802 (int) l
- 8, old_file
->path
,
2803 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
2804 le64toh((*f
)->header
->head_entry_seqnum
),
2805 le64toh((*f
)->header
->head_entry_realtime
));
2809 /* Try to rename the file to the archived version. If the file
2810 * already was deleted, we'll get ENOENT, let's ignore that
2812 r
= rename(old_file
->path
, p
);
2813 if (r
< 0 && errno
!= ENOENT
)
2816 old_file
->header
->state
= STATE_ARCHIVED
;
/* Currently, btrfs is not very good with our write patterns
 * and fragments heavily. Let's defrag our journal files when
 * we archive them */
2821 old_file
->defrag_on_close
= true;
2823 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, old_file
, &new_file
);
2824 journal_file_close(old_file
);
2830 int journal_file_open_reliably(
2836 JournalMetrics
*metrics
,
2837 MMapCache
*mmap_cache
,
2838 JournalFile
*template,
2839 JournalFile
**ret
) {
2843 _cleanup_free_
char *p
= NULL
;
2845 r
= journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2847 -EBADMSG
, /* corrupted */
2848 -ENODATA
, /* truncated */
2849 -EHOSTDOWN
, /* other machine */
2850 -EPROTONOSUPPORT
, /* incompatible feature */
2851 -EBUSY
, /* unclean shutdown */
2852 -ESHUTDOWN
, /* already archived */
2853 -EIO
, /* IO error, including SIGBUS on mmap */
2854 -EIDRM
/* File has been deleted */))
2857 if ((flags
& O_ACCMODE
) == O_RDONLY
)
2860 if (!(flags
& O_CREAT
))
2863 if (!endswith(fname
, ".journal"))
2866 /* The file is corrupted. Rotate it away and try it again (but only once) */
2869 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
2871 now(CLOCK_REALTIME
),
2875 if (rename(fname
, p
) < 0)
2878 /* btrfs doesn't cope well with our write pattern and
2879 * fragments heavily. Let's defrag all files we rotate */
2881 (void) chattr_path(p
, false, FS_NOCOW_FL
);
2882 (void) btrfs_defrag(p
);
2884 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
2886 return journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2889 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
2891 uint64_t q
, xor_hash
= 0;
2904 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
2905 ts
.realtime
= le64toh(o
->entry
.realtime
);
2907 n
= journal_file_entry_n_items(o
);
2908 /* alloca() can't take 0, hence let's allocate at least one */
2909 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
2911 for (i
= 0; i
< n
; i
++) {
2918 q
= le64toh(o
->entry
.items
[i
].object_offset
);
2919 le_hash
= o
->entry
.items
[i
].hash
;
2921 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
2925 if (le_hash
!= o
->data
.hash
)
2928 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
2931 /* We hit the limit on 32bit machines */
2932 if ((uint64_t) t
!= l
)
2935 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
2936 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2939 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
2940 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
2944 data
= from
->compress_buffer
;
2947 return -EPROTONOSUPPORT
;
2950 data
= o
->data
.payload
;
2952 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
2956 xor_hash
^= le64toh(u
->data
.hash
);
2957 items
[i
].object_offset
= htole64(h
);
2958 items
[i
].hash
= u
->data
.hash
;
2960 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
2965 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
2967 if (mmap_cache_got_sigbus(to
->mmap
, to
->fd
))
2973 void journal_reset_metrics(JournalMetrics
*m
) {
2976 /* Set everything to "pick automatic values". */
2978 *m
= (JournalMetrics
) {
2979 .min_use
= (uint64_t) -1,
2980 .max_use
= (uint64_t) -1,
2981 .min_size
= (uint64_t) -1,
2982 .max_size
= (uint64_t) -1,
2983 .keep_free
= (uint64_t) -1,
2984 .n_max_files
= (uint64_t) -1,
2988 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
2989 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
2996 if (fstatvfs(fd
, &ss
) >= 0)
2997 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
2999 log_debug_errno(errno
, "Failed to detremine disk size: %m");
3003 if (m
->max_use
== (uint64_t) -1) {
3006 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3008 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3009 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3011 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3012 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3014 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3016 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3018 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3019 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3022 if (m
->min_use
== (uint64_t) -1)
3023 m
->min_use
= DEFAULT_MIN_USE
;
3025 if (m
->min_use
> m
->max_use
)
3026 m
->min_use
= m
->max_use
;
3028 if (m
->max_size
== (uint64_t) -1) {
3029 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3031 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3032 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3034 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3036 if (m
->max_size
!= 0) {
3037 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3038 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3040 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3041 m
->max_use
= m
->max_size
*2;
3044 if (m
->min_size
== (uint64_t) -1)
3045 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3047 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3049 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3050 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3052 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3053 m
->max_size
= m
->min_size
;
3056 if (m
->keep_free
== (uint64_t) -1) {
3059 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3061 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3062 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3065 m
->keep_free
= DEFAULT_KEEP_FREE
;
3068 if (m
->n_max_files
== (uint64_t) -1)
3069 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3071 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3072 format_bytes(a
, sizeof(a
), m
->min_use
),
3073 format_bytes(b
, sizeof(b
), m
->max_use
),
3074 format_bytes(c
, sizeof(c
), m
->max_size
),
3075 format_bytes(d
, sizeof(d
), m
->min_size
),
3076 format_bytes(e
, sizeof(e
), m
->keep_free
),
3080 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3085 if (f
->header
->head_entry_realtime
== 0)
3088 *from
= le64toh(f
->header
->head_entry_realtime
);
3092 if (f
->header
->tail_entry_realtime
== 0)
3095 *to
= le64toh(f
->header
->tail_entry_realtime
);
3101 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3109 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3113 if (le64toh(o
->data
.n_entries
) <= 0)
3117 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3121 *from
= le64toh(o
->entry
.monotonic
);
3125 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3129 r
= generic_array_get_plus_one(f
,
3130 le64toh(o
->data
.entry_offset
),
3131 le64toh(o
->data
.entry_array_offset
),
3132 le64toh(o
->data
.n_entries
)-1,
3137 *to
= le64toh(o
->entry
.monotonic
);
3143 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3146 /* If we gained new header fields we gained new features,
3147 * hence suggest a rotation */
3148 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3149 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3153 /* Let's check if the hash tables grew over a certain fill
3154 * level (75%, borrowing this value from Java's hash table
3155 * implementation), and if so suggest a rotation. To calculate
3156 * the fill level we need the n_data field, which only exists
3157 * in newer versions. */
3159 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3160 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3161 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3163 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3164 le64toh(f
->header
->n_data
),
3165 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3166 (unsigned long long) f
->last_stat
.st_size
,
3167 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3171 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3172 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3173 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3175 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3176 le64toh(f
->header
->n_fields
),
3177 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3181 /* Are the data objects properly indexed by field objects? */
3182 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3183 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3184 le64toh(f
->header
->n_data
) > 0 &&
3185 le64toh(f
->header
->n_fields
) == 0)
3188 if (max_file_usec
> 0) {
3191 h
= le64toh(f
->header
->head_entry_realtime
);
3192 t
= now(CLOCK_REALTIME
);
3194 if (h
> 0 && t
> h
+ max_file_usec
)