src/journal/journal-file.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2011 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <errno.h>
  23 #include <fcntl.h>
  24 #include <linux/fs.h>
  25 #include <stddef.h>
  26 #include <sys/mman.h>
  27 #include <sys/statvfs.h>
  28 #include <sys/uio.h>
  29 #include <unistd.h>
  30
  31 #include "alloc-util.h"
  32 #include "btrfs-util.h"
  33 #include "chattr-util.h"
  34 #include "compress.h"
  35 #include "fd-util.h"
  36 #include "journal-authenticate.h"
  37 #include "journal-def.h"
  38 #include "journal-file.h"
  39 #include "lookup3.h"
  40 #include "parse-util.h"
  41 #include "random-util.h"
  42 #include "sd-event.h"
  43 #include "string-util.h"
  44 #include "xattr-util.h"
  45
  46 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
  47 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
  48
  49 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
  50
  51 /* This is the minimum journal file size */
  52 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL)                 /* 512 KiB */
  53
  54 /* These are the lower and upper bounds if we deduce the max_use value
  55  * from the file system size */
  56 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
  57 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
  58
  59 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
  60 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */
  61
  62 /* This is the upper bound if we deduce max_size from max_use */
  63 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
  64
  65 /* This is the upper bound if we deduce the keep_free value from the
  66  * file system size */
  67 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
  68
  69 /* This is the keep_free value when we can't determine the system
  70  * size */
  71 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
  72
  73 /* This is the default maximum number of journal files to keep around. */
  74 #define DEFAULT_N_MAX_FILES (100)
  75
  76 /* n_data was the first entry we added after the initial file format design */
  77 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
  78
  79 /* How many entries to keep in the entry array chain cache at max */
  80 #define CHAIN_CACHE_MAX 20
  81
  82 /* How much to increase the journal file size at once each time we allocate something new. */
  83 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
  84
  85 /* Reread fstat() of the file for detecting deletions at least this often */
  86 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
  87
  88 /* The mmap context to use for the header we pick as one above the last defined typed */
  89 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
  90
  91 static int journal_file_set_online(JournalFile *f) {
  92         assert(f);
  93
  94         if (!f->writable)
  95                 return -EPERM;
  96
  97         if (!(f->fd >= 0 && f->header))
  98                 return -EINVAL;
  99
 100         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 101                 return -EIO;
 102
 103         switch(f->header->state) {
 104                 case STATE_ONLINE:
 105                         return 0;
 106
 107                 case STATE_OFFLINE:
 108                         f->header->state = STATE_ONLINE;
 109                         fsync(f->fd);
 110                         return 0;
 111
 112                 default:
 113                         return -EINVAL;
 114         }
 115 }
 116
 117 int journal_file_set_offline(JournalFile *f) {
 118         assert(f);
 119
 120         if (!f->writable)
 121                 return -EPERM;
 122
 123         if (!(f->fd >= 0 && f->header))
 124                 return -EINVAL;
 125
 126         if (f->header->state != STATE_ONLINE)
 127                 return 0;
 128
 129         fsync(f->fd);
 130
 131         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 132                 return -EIO;
 133
 134         f->header->state = STATE_OFFLINE;
 135
 136         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 137                 return -EIO;
 138
 139         fsync(f->fd);
 140
 141         return 0;
 142 }
 143
 144 JournalFile* journal_file_close(JournalFile *f) {
 145         assert(f);
 146
 147 #ifdef HAVE_GCRYPT
 148         /* Write the final tag */
 149         if (f->seal && f->writable)
 150                 journal_file_append_tag(f);
 151 #endif
 152
 153         if (f->post_change_timer) {
 154                 int enabled;
 155
 156                 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
 157                         if (enabled == SD_EVENT_ONESHOT)
 158                                 journal_file_post_change(f);
 159
 160                 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
 161                 sd_event_source_unref(f->post_change_timer);
 162         }
 163
 164         journal_file_set_offline(f);
 165
 166         if (f->mmap && f->fd >= 0)
 167                 mmap_cache_close_fd(f->mmap, f->fd);
 168
 169         if (f->fd >= 0 && f->defrag_on_close) {
 170
 171                 /* Be friendly to btrfs: turn COW back on again now,
 172                  * and defragment the file. We won't write to the file
 173                  * ever again, hence remove all fragmentation, and
 174                  * reenable all the good bits COW usually provides
 175                  * (such as data checksumming). */
 176
 177                 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
 178                 (void) btrfs_defrag_fd(f->fd);
 179         }
 180
 181         safe_close(f->fd);
 182         free(f->path);
 183
 184         mmap_cache_unref(f->mmap);
 185
 186         ordered_hashmap_free_free(f->chain_cache);
 187
 188 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 189         free(f->compress_buffer);
 190 #endif
 191
 192 #ifdef HAVE_GCRYPT
 193         if (f->fss_file)
 194                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
 195         else
 196                 free(f->fsprg_state);
 197
 198         free(f->fsprg_seed);
 199
 200         if (f->hmac)
 201                 gcry_md_close(f->hmac);
 202 #endif
 203
 204         free(f);
 205         return NULL;
 206 }
 207
 208 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
 209         Header h = {};
 210         ssize_t k;
 211         int r;
 212
 213         assert(f);
 214
 215         memcpy(h.signature, HEADER_SIGNATURE, 8);
 216         h.header_size = htole64(ALIGN64(sizeof(h)));
 217
 218         h.incompatible_flags |= htole32(
 219                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
 220                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
 221
 222         h.compatible_flags = htole32(
 223                 f->seal * HEADER_COMPATIBLE_SEALED);
 224
 225         r = sd_id128_randomize(&h.file_id);
 226         if (r < 0)
 227                 return r;
 228
 229         if (template) {
 230                 h.seqnum_id = template->header->seqnum_id;
 231                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
 232         } else
 233                 h.seqnum_id = h.file_id;
 234
 235         k = pwrite(f->fd, &h, sizeof(h), 0);
 236         if (k < 0)
 237                 return -errno;
 238
 239         if (k != sizeof(h))
 240                 return -EIO;
 241
 242         return 0;
 243 }
 244
 245 static int journal_file_refresh_header(JournalFile *f) {
 246         sd_id128_t boot_id;
 247         int r;
 248
 249         assert(f);
 250
 251         r = sd_id128_get_machine(&f->header->machine_id);
 252         if (r < 0)
 253                 return r;
 254
 255         r = sd_id128_get_boot(&boot_id);
 256         if (r < 0)
 257                 return r;
 258
 259         if (sd_id128_equal(boot_id, f->header->boot_id))
 260                 f->tail_entry_monotonic_valid = true;
 261
 262         f->header->boot_id = boot_id;
 263
 264         r = journal_file_set_online(f);
 265
 266         /* Sync the online state to disk */
 267         fsync(f->fd);
 268
 269         return r;
 270 }
 271
 272 static int journal_file_verify_header(JournalFile *f) {
 273         uint32_t flags;
 274
 275         assert(f);
 276
 277         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
 278                 return -EBADMSG;
 279
 280         /* In both read and write mode we refuse to open files with
 281          * incompatible flags we don't know */
 282         flags = le32toh(f->header->incompatible_flags);
 283         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
 284                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
 285                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
 286                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
 287                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
 288                 if (flags)
 289                         log_debug("Journal file %s uses incompatible flags %"PRIx32
 290                                   " disabled at compilation time.", f->path, flags);
 291                 return -EPROTONOSUPPORT;
 292         }
 293
 294         /* When open for writing we refuse to open files with
 295          * compatible flags, too */
 296         flags = le32toh(f->header->compatible_flags);
 297         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
 298                 if (flags & ~HEADER_COMPATIBLE_ANY)
 299                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
 300                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
 301                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
 302                 if (flags)
 303                         log_debug("Journal file %s uses compatible flags %"PRIx32
 304                                   " disabled at compilation time.", f->path, flags);
 305                 return -EPROTONOSUPPORT;
 306         }
 307
 308         if (f->header->state >= _STATE_MAX)
 309                 return -EBADMSG;
 310
 311         /* The first addition was n_data, so check that we are at least this large */
 312         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
 313                 return -EBADMSG;
 314
 315         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
 316                 return -EBADMSG;
 317
 318         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
 319                 return -ENODATA;
 320
 321         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
 322                 return -ENODATA;
 323
 324         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
 325             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
 326             !VALID64(le64toh(f->header->tail_object_offset)) ||
 327             !VALID64(le64toh(f->header->entry_array_offset)))
 328                 return -ENODATA;
 329
 330         if (f->writable) {
 331                 uint8_t state;
 332                 sd_id128_t machine_id;
 333                 int r;
 334
 335                 r = sd_id128_get_machine(&machine_id);
 336                 if (r < 0)
 337                         return r;
 338
 339                 if (!sd_id128_equal(machine_id, f->header->machine_id))
 340                         return -EHOSTDOWN;
 341
 342                 state = f->header->state;
 343
 344                 if (state == STATE_ONLINE) {
 345                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
 346                         return -EBUSY;
 347                 } else if (state == STATE_ARCHIVED)
 348                         return -ESHUTDOWN;
 349                 else if (state != STATE_OFFLINE) {
 350                         log_debug("Journal file %s has unknown state %i.", f->path, state);
 351                         return -EBUSY;
 352                 }
 353         }
 354
 355         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
 356         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
 357
 358         f->seal = JOURNAL_HEADER_SEALED(f->header);
 359
 360         return 0;
 361 }
 362
 363 static int journal_file_fstat(JournalFile *f) {
 364         assert(f);
 365         assert(f->fd >= 0);
 366
 367         if (fstat(f->fd, &f->last_stat) < 0)
 368                 return -errno;
 369
 370         f->last_stat_usec = now(CLOCK_MONOTONIC);
 371
 372         /* Refuse appending to files that are already deleted */
 373         if (f->last_stat.st_nlink <= 0)
 374                 return -EIDRM;
 375
 376         return 0;
 377 }
 378
 379 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
 380         uint64_t old_size, new_size;
 381         int r;
 382
 383         assert(f);
 384
 385         /* We assume that this file is not sparse, and we know that
 386          * for sure, since we always call posix_fallocate()
 387          * ourselves */
 388
 389         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 390                 return -EIO;
 391
 392         old_size =
 393                 le64toh(f->header->header_size) +
 394                 le64toh(f->header->arena_size);
 395
 396         new_size = PAGE_ALIGN(offset + size);
 397         if (new_size < le64toh(f->header->header_size))
 398                 new_size = le64toh(f->header->header_size);
 399
 400         if (new_size <= old_size) {
 401
 402                 /* We already pre-allocated enough space, but before
 403                  * we write to it, let's check with fstat() if the
 404                  * file got deleted, in order make sure we don't throw
 405                  * away the data immediately. Don't check fstat() for
 406                  * all writes though, but only once ever 10s. */
 407
 408                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
 409                         return 0;
 410
 411                 return journal_file_fstat(f);
 412         }
 413
 414         /* Allocate more space. */
 415
 416         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 417                 return -E2BIG;
 418
 419         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
 420                 struct statvfs svfs;
 421
 422                 if (fstatvfs(f->fd, &svfs) >= 0) {
 423                         uint64_t available;
 424
 425                         available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
 426
 427                         if (new_size - old_size > available)
 428                                 return -E2BIG;
 429                 }
 430         }
 431
 432         /* Increase by larger blocks at once */
 433         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
 434         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
 435                 new_size = f->metrics.max_size;
 436
 437         /* Note that the glibc fallocate() fallback is very
 438            inefficient, hence we try to minimize the allocation area
 439            as we can. */
 440         r = posix_fallocate(f->fd, old_size, new_size - old_size);
 441         if (r != 0)
 442                 return -r;
 443
 444         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
 445
 446         return journal_file_fstat(f);
 447 }
 448
 449 static unsigned type_to_context(ObjectType type) {
 450         /* One context for each type, plus one catch-all for the rest */
 451         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
 452         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
 453         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
 454 }
 455
 456 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
 457         int r;
 458
 459         assert(f);
 460         assert(ret);
 461
 462         if (size <= 0)
 463                 return -EINVAL;
 464
 465         /* Avoid SIGBUS on invalid accesses */
 466         if (offset + size > (uint64_t) f->last_stat.st_size) {
 467                 /* Hmm, out of range? Let's refresh the fstat() data
 468                  * first, before we trust that check. */
 469
 470                 r = journal_file_fstat(f);
 471                 if (r < 0)
 472                         return r;
 473
 474                 if (offset + size > (uint64_t) f->last_stat.st_size)
 475                         return -EADDRNOTAVAIL;
 476         }
 477
 478         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
 479 }
 480
 481 static uint64_t minimum_header_size(Object *o) {
 482
 483         static const uint64_t table[] = {
 484                 [OBJECT_DATA] = sizeof(DataObject),
 485                 [OBJECT_FIELD] = sizeof(FieldObject),
 486                 [OBJECT_ENTRY] = sizeof(EntryObject),
 487                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
 488                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
 489                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
 490                 [OBJECT_TAG] = sizeof(TagObject),
 491         };
 492
 493         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
 494                 return sizeof(ObjectHeader);
 495
 496         return table[o->object.type];
 497 }
 498
 499 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
 500         int r;
 501         void *t;
 502         Object *o;
 503         uint64_t s;
 504
 505         assert(f);
 506         assert(ret);
 507
 508         /* Objects may only be located at multiple of 64 bit */
 509         if (!VALID64(offset))
 510                 return -EFAULT;
 511
 512         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
 513         if (r < 0)
 514                 return r;
 515
 516         o = (Object*) t;
 517         s = le64toh(o->object.size);
 518
 519         if (s < sizeof(ObjectHeader))
 520                 return -EBADMSG;
 521
 522         if (o->object.type <= OBJECT_UNUSED)
 523                 return -EBADMSG;
 524
 525         if (s < minimum_header_size(o))
 526                 return -EBADMSG;
 527
 528         if (type > OBJECT_UNUSED && o->object.type != type)
 529                 return -EBADMSG;
 530
 531         if (s > sizeof(ObjectHeader)) {
 532                 r = journal_file_move_to(f, type, false, offset, s, &t);
 533                 if (r < 0)
 534                         return r;
 535
 536                 o = (Object*) t;
 537         }
 538
 539         *ret = o;
 540         return 0;
 541 }
 542
 543 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
 544         uint64_t r;
 545
 546         assert(f);
 547
 548         r = le64toh(f->header->tail_entry_seqnum) + 1;
 549
 550         if (seqnum) {
 551                 /* If an external seqnum counter was passed, we update
 552                  * both the local and the external one, and set it to
 553                  * the maximum of both */
 554
 555                 if (*seqnum + 1 > r)
 556                         r = *seqnum + 1;
 557
 558                 *seqnum = r;
 559         }
 560
 561         f->header->tail_entry_seqnum = htole64(r);
 562
 563         if (f->header->head_entry_seqnum == 0)
 564                 f->header->head_entry_seqnum = htole64(r);
 565
 566         return r;
 567 }
 568
 569 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
 570         int r;
 571         uint64_t p;
 572         Object *tail, *o;
 573         void *t;
 574
 575         assert(f);
 576         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
 577         assert(size >= sizeof(ObjectHeader));
 578         assert(offset);
 579         assert(ret);
 580
 581         r = journal_file_set_online(f);
 582         if (r < 0)
 583                 return r;
 584
 585         p = le64toh(f->header->tail_object_offset);
 586         if (p == 0)
 587                 p = le64toh(f->header->header_size);
 588         else {
 589                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
 590                 if (r < 0)
 591                         return r;
 592
 593                 p += ALIGN64(le64toh(tail->object.size));
 594         }
 595
 596         r = journal_file_allocate(f, p, size);
 597         if (r < 0)
 598                 return r;
 599
 600         r = journal_file_move_to(f, type, false, p, size, &t);
 601         if (r < 0)
 602                 return r;
 603
 604         o = (Object*) t;
 605
 606         zero(o->object);
 607         o->object.type = type;
 608         o->object.size = htole64(size);
 609
 610         f->header->tail_object_offset = htole64(p);
 611         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
 612
 613         *ret = o;
 614         *offset = p;
 615
 616         return 0;
 617 }
 618
 619 static int journal_file_setup_data_hash_table(JournalFile *f) {
 620         uint64_t s, p;
 621         Object *o;
 622         int r;
 623
 624         assert(f);
 625
 626         /* We estimate that we need 1 hash table entry per 768 bytes
 627            of journal file and we want to make sure we never get
 628            beyond 75% fill level. Calculate the hash table size for
 629            the maximum file size based on these metrics. */
 630
 631         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
 632         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
 633                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
 634
 635         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
 636
 637         r = journal_file_append_object(f,
 638                                        OBJECT_DATA_HASH_TABLE,
 639                                        offsetof(Object, hash_table.items) + s,
 640                                        &o, &p);
 641         if (r < 0)
 642                 return r;
 643
 644         memzero(o->hash_table.items, s);
 645
 646         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 647         f->header->data_hash_table_size = htole64(s);
 648
 649         return 0;
 650 }
 651
 652 static int journal_file_setup_field_hash_table(JournalFile *f) {
 653         uint64_t s, p;
 654         Object *o;
 655         int r;
 656
 657         assert(f);
 658
 659         /* We use a fixed size hash table for the fields as this
 660          * number should grow very slowly only */
 661
 662         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
 663         r = journal_file_append_object(f,
 664                                        OBJECT_FIELD_HASH_TABLE,
 665                                        offsetof(Object, hash_table.items) + s,
 666                                        &o, &p);
 667         if (r < 0)
 668                 return r;
 669
 670         memzero(o->hash_table.items, s);
 671
 672         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 673         f->header->field_hash_table_size = htole64(s);
 674
 675         return 0;
 676 }
 677
 678 int journal_file_map_data_hash_table(JournalFile *f) {
 679         uint64_t s, p;
 680         void *t;
 681         int r;
 682
 683         assert(f);
 684
 685         if (f->data_hash_table)
 686                 return 0;
 687
 688         p = le64toh(f->header->data_hash_table_offset);
 689         s = le64toh(f->header->data_hash_table_size);
 690
 691         r = journal_file_move_to(f,
 692                                  OBJECT_DATA_HASH_TABLE,
 693                                  true,
 694                                  p, s,
 695                                  &t);
 696         if (r < 0)
 697                 return r;
 698
 699         f->data_hash_table = t;
 700         return 0;
 701 }
 702
 703 int journal_file_map_field_hash_table(JournalFile *f) {
 704         uint64_t s, p;
 705         void *t;
 706         int r;
 707
 708         assert(f);
 709
 710         if (f->field_hash_table)
 711                 return 0;
 712
 713         p = le64toh(f->header->field_hash_table_offset);
 714         s = le64toh(f->header->field_hash_table_size);
 715
 716         r = journal_file_move_to(f,
 717                                  OBJECT_FIELD_HASH_TABLE,
 718                                  true,
 719                                  p, s,
 720                                  &t);
 721         if (r < 0)
 722                 return r;
 723
 724         f->field_hash_table = t;
 725         return 0;
 726 }
 727
 728 static int journal_file_link_field(
 729                 JournalFile *f,
 730                 Object *o,
 731                 uint64_t offset,
 732                 uint64_t hash) {
 733
 734         uint64_t p, h, m;
 735         int r;
 736
 737         assert(f);
 738         assert(o);
 739         assert(offset > 0);
 740
 741         if (o->object.type != OBJECT_FIELD)
 742                 return -EINVAL;
 743
 744         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 745         if (m <= 0)
 746                 return -EBADMSG;
 747
 748         /* This might alter the window we are looking at */
 749         o->field.next_hash_offset = o->field.head_data_offset = 0;
 750
 751         h = hash % m;
 752         p = le64toh(f->field_hash_table[h].tail_hash_offset);
 753         if (p == 0)
 754                 f->field_hash_table[h].head_hash_offset = htole64(offset);
 755         else {
 756                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 757                 if (r < 0)
 758                         return r;
 759
 760                 o->field.next_hash_offset = htole64(offset);
 761         }
 762
 763         f->field_hash_table[h].tail_hash_offset = htole64(offset);
 764
 765         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
 766                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
 767
 768         return 0;
 769 }
 770
 771 static int journal_file_link_data(
 772                 JournalFile *f,
 773                 Object *o,
 774                 uint64_t offset,
 775                 uint64_t hash) {
 776
 777         uint64_t p, h, m;
 778         int r;
 779
 780         assert(f);
 781         assert(o);
 782         assert(offset > 0);
 783
 784         if (o->object.type != OBJECT_DATA)
 785                 return -EINVAL;
 786
 787         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 788         if (m <= 0)
 789                 return -EBADMSG;
 790
 791         /* This might alter the window we are looking at */
 792         o->data.next_hash_offset = o->data.next_field_offset = 0;
 793         o->data.entry_offset = o->data.entry_array_offset = 0;
 794         o->data.n_entries = 0;
 795
 796         h = hash % m;
 797         p = le64toh(f->data_hash_table[h].tail_hash_offset);
 798         if (p == 0)
 799                 /* Only entry in the hash table is easy */
 800                 f->data_hash_table[h].head_hash_offset = htole64(offset);
 801         else {
 802                 /* Move back to the previous data object, to patch in
 803                  * pointer */
 804
 805                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 806                 if (r < 0)
 807                         return r;
 808
 809                 o->data.next_hash_offset = htole64(offset);
 810         }
 811
 812         f->data_hash_table[h].tail_hash_offset = htole64(offset);
 813
 814         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
 815                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
 816
 817         return 0;
 818 }
 819
 820 int journal_file_find_field_object_with_hash(
 821                 JournalFile *f,
 822                 const void *field, uint64_t size, uint64_t hash,
 823                 Object **ret, uint64_t *offset) {
 824
 825         uint64_t p, osize, h, m;
 826         int r;
 827
 828         assert(f);
 829         assert(field && size > 0);
 830
 831         /* If the field hash table is empty, we can't find anything */
 832         if (le64toh(f->header->field_hash_table_size) <= 0)
 833                 return 0;
 834
 835         /* Map the field hash table, if it isn't mapped yet. */
 836         r = journal_file_map_field_hash_table(f);
 837         if (r < 0)
 838                 return r;
 839
 840         osize = offsetof(Object, field.payload) + size;
 841
 842         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 843         if (m <= 0)
 844                 return -EBADMSG;
 845
 846         h = hash % m;
 847         p = le64toh(f->field_hash_table[h].head_hash_offset);
 848
 849         while (p > 0) {
 850                 Object *o;
 851
 852                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 853                 if (r < 0)
 854                         return r;
 855
 856                 if (le64toh(o->field.hash) == hash &&
 857                     le64toh(o->object.size) == osize &&
 858                     memcmp(o->field.payload, field, size) == 0) {
 859
 860                         if (ret)
 861                                 *ret = o;
 862                         if (offset)
 863                                 *offset = p;
 864
 865                         return 1;
 866                 }
 867
 868                 p = le64toh(o->field.next_hash_offset);
 869         }
 870
 871         return 0;
 872 }
 873
 874 int journal_file_find_field_object(
 875                 JournalFile *f,
 876                 const void *field, uint64_t size,
 877                 Object **ret, uint64_t *offset) {
 878
 879         uint64_t hash;
 880
 881         assert(f);
 882         assert(field && size > 0);
 883
 884         hash = hash64(field, size);
 885
 886         return journal_file_find_field_object_with_hash(f,
 887                                                         field, size, hash,
 888                                                         ret, offset);
 889 }
 890
 891 int journal_file_find_data_object_with_hash(
 892                 JournalFile *f,
 893                 const void *data, uint64_t size, uint64_t hash,
 894                 Object **ret, uint64_t *offset) {
 895
 896         uint64_t p, osize, h, m;
 897         int r;
 898
 899         assert(f);
 900         assert(data || size == 0);
 901
 902         /* If there's no data hash table, then there's no entry. */
 903         if (le64toh(f->header->data_hash_table_size) <= 0)
 904                 return 0;
 905
 906         /* Map the data hash table, if it isn't mapped yet. */
 907         r = journal_file_map_data_hash_table(f);
 908         if (r < 0)
 909                 return r;
 910
 911         osize = offsetof(Object, data.payload) + size;
 912
 913         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 914         if (m <= 0)
 915                 return -EBADMSG;
 916
 917         h = hash % m;
 918         p = le64toh(f->data_hash_table[h].head_hash_offset);
 919
 920         while (p > 0) {
 921                 Object *o;
 922
 923                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 924                 if (r < 0)
 925                         return r;
 926
 927                 if (le64toh(o->data.hash) != hash)
 928                         goto next;
 929
 930                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
 931 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 932                         uint64_t l;
 933                         size_t rsize = 0;
 934
 935                         l = le64toh(o->object.size);
 936                         if (l <= offsetof(Object, data.payload))
 937                                 return -EBADMSG;
 938
 939                         l -= offsetof(Object, data.payload);
 940
 941                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
 942                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
 943                         if (r < 0)
 944                                 return r;
 945
 946                         if (rsize == size &&
 947                             memcmp(f->compress_buffer, data, size) == 0) {
 948
 949                                 if (ret)
 950                                         *ret = o;
 951
 952                                 if (offset)
 953                                         *offset = p;
 954
 955                                 return 1;
 956                         }
 957 #else
 958                         return -EPROTONOSUPPORT;
 959 #endif
 960                 } else if (le64toh(o->object.size) == osize &&
 961                            memcmp(o->data.payload, data, size) == 0) {
 962
 963                         if (ret)
 964                                 *ret = o;
 965
 966                         if (offset)
 967                                 *offset = p;
 968
 969                         return 1;
 970                 }
 971
 972         next:
 973                 p = le64toh(o->data.next_hash_offset);
 974         }
 975
 976         return 0;
 977 }
 978
 979 int journal_file_find_data_object(
 980                 JournalFile *f,
 981                 const void *data, uint64_t size,
 982                 Object **ret, uint64_t *offset) {
 983
 984         uint64_t hash;
 985
 986         assert(f);
 987         assert(data || size == 0);
 988
 989         hash = hash64(data, size);
 990
 991         return journal_file_find_data_object_with_hash(f,
 992                                                        data, size, hash,
 993                                                        ret, offset);
 994 }
 995
 996 static int journal_file_append_field(
 997                 JournalFile *f,
 998                 const void *field, uint64_t size,
 999                 Object **ret, uint64_t *offset) {
1000
1001         uint64_t hash, p;
1002         uint64_t osize;
1003         Object *o;
1004         int r;
1005
1006         assert(f);
1007         assert(field && size > 0);
1008
1009         hash = hash64(field, size);
1010
1011         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1012         if (r < 0)
1013                 return r;
1014         else if (r > 0) {
1015
1016                 if (ret)
1017                         *ret = o;
1018
1019                 if (offset)
1020                         *offset = p;
1021
1022                 return 0;
1023         }
1024
1025         osize = offsetof(Object, field.payload) + size;
1026         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1027         if (r < 0)
1028                 return r;
1029
1030         o->field.hash = htole64(hash);
1031         memcpy(o->field.payload, field, size);
1032
1033         r = journal_file_link_field(f, o, p, hash);
1034         if (r < 0)
1035                 return r;
1036
1037         /* The linking might have altered the window, so let's
1038          * refresh our pointer */
1039         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1040         if (r < 0)
1041                 return r;
1042
1043 #ifdef HAVE_GCRYPT
1044         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1045         if (r < 0)
1046                 return r;
1047 #endif
1048
1049         if (ret)
1050                 *ret = o;
1051
1052         if (offset)
1053                 *offset = p;
1054
1055         return 0;
1056 }
1057
1058 static int journal_file_append_data(
1059                 JournalFile *f,
1060                 const void *data, uint64_t size,
1061                 Object **ret, uint64_t *offset) {
1062
1063         uint64_t hash, p;
1064         uint64_t osize;
1065         Object *o;
1066         int r, compression = 0;
1067         const void *eq;
1068
1069         assert(f);
1070         assert(data || size == 0);
1071
1072         hash = hash64(data, size);
1073
1074         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1075         if (r < 0)
1076                 return r;
1077         if (r > 0) {
1078
1079                 if (ret)
1080                         *ret = o;
1081
1082                 if (offset)
1083                         *offset = p;
1084
1085                 return 0;
1086         }
1087
1088         osize = offsetof(Object, data.payload) + size;
1089         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1090         if (r < 0)
1091                 return r;
1092
1093         o->data.hash = htole64(hash);
1094
1095 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1096         if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1097                 size_t rsize = 0;
1098
1099                 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1100
1101                 if (compression >= 0) {
1102                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1103                         o->object.flags |= compression;
1104
1105                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1106                                   size, rsize, object_compressed_to_string(compression));
1107                 } else
1108                         /* Compression didn't work, we don't really care why, let's continue without compression */
1109                         compression = 0;
1110         }
1111 #endif
1112
1113         if (compression == 0 && size > 0)
1114                 memcpy(o->data.payload, data, size);
1115
1116         r = journal_file_link_data(f, o, p, hash);
1117         if (r < 0)
1118                 return r;
1119
1120         /* The linking might have altered the window, so let's
1121          * refresh our pointer */
1122         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1123         if (r < 0)
1124                 return r;
1125
1126         if (!data)
1127                 eq = NULL;
1128         else
1129                 eq = memchr(data, '=', size);
1130         if (eq && eq > data) {
1131                 Object *fo = NULL;
1132                 uint64_t fp;
1133
1134                 /* Create field object ... */
1135                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1136                 if (r < 0)
1137                         return r;
1138
1139                 /* ... and link it in. */
1140                 o->data.next_field_offset = fo->field.head_data_offset;
1141                 fo->field.head_data_offset = le64toh(p);
1142         }
1143
1144 #ifdef HAVE_GCRYPT
1145         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1146         if (r < 0)
1147                 return r;
1148 #endif
1149
1150         if (ret)
1151                 *ret = o;
1152
1153         if (offset)
1154                 *offset = p;
1155
1156         return 0;
1157 }
1158
1159 uint64_t journal_file_entry_n_items(Object *o) {
1160         assert(o);
1161
1162         if (o->object.type != OBJECT_ENTRY)
1163                 return 0;
1164
1165         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1166 }
1167
1168 uint64_t journal_file_entry_array_n_items(Object *o) {
1169         assert(o);
1170
1171         if (o->object.type != OBJECT_ENTRY_ARRAY)
1172                 return 0;
1173
1174         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1175 }
1176
1177 uint64_t journal_file_hash_table_n_items(Object *o) {
1178         assert(o);
1179
1180         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1181             o->object.type != OBJECT_FIELD_HASH_TABLE)
1182                 return 0;
1183
1184         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1185 }
1186
1187 static int link_entry_into_array(JournalFile *f,
1188                                  le64_t *first,
1189                                  le64_t *idx,
1190                                  uint64_t p) {
1191         int r;
1192         uint64_t n = 0, ap = 0, q, i, a, hidx;
1193         Object *o;
1194
1195         assert(f);
1196         assert(first);
1197         assert(idx);
1198         assert(p > 0);
1199
1200         a = le64toh(*first);
1201         i = hidx = le64toh(*idx);
1202         while (a > 0) {
1203
1204                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1205                 if (r < 0)
1206                         return r;
1207
1208                 n = journal_file_entry_array_n_items(o);
1209                 if (i < n) {
1210                         o->entry_array.items[i] = htole64(p);
1211                         *idx = htole64(hidx + 1);
1212                         return 0;
1213                 }
1214
1215                 i -= n;
1216                 ap = a;
1217                 a = le64toh(o->entry_array.next_entry_array_offset);
1218         }
1219
1220         if (hidx > n)
1221                 n = (hidx+1) * 2;
1222         else
1223                 n = n * 2;
1224
1225         if (n < 4)
1226                 n = 4;
1227
1228         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1229                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1230                                        &o, &q);
1231         if (r < 0)
1232                 return r;
1233
1234 #ifdef HAVE_GCRYPT
1235         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1236         if (r < 0)
1237                 return r;
1238 #endif
1239
1240         o->entry_array.items[i] = htole64(p);
1241
1242         if (ap == 0)
1243                 *first = htole64(q);
1244         else {
1245                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1246                 if (r < 0)
1247                         return r;
1248
1249                 o->entry_array.next_entry_array_offset = htole64(q);
1250         }
1251
1252         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1253                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1254
1255         *idx = htole64(hidx + 1);
1256
1257         return 0;
1258 }
1259
1260 static int link_entry_into_array_plus_one(JournalFile *f,
1261                                           le64_t *extra,
1262                                           le64_t *first,
1263                                           le64_t *idx,
1264                                           uint64_t p) {
1265
1266         int r;
1267
1268         assert(f);
1269         assert(extra);
1270         assert(first);
1271         assert(idx);
1272         assert(p > 0);
1273
1274         if (*idx == 0)
1275                 *extra = htole64(p);
1276         else {
1277                 le64_t i;
1278
1279                 i = htole64(le64toh(*idx) - 1);
1280                 r = link_entry_into_array(f, first, &i, p);
1281                 if (r < 0)
1282                         return r;
1283         }
1284
1285         *idx = htole64(le64toh(*idx) + 1);
1286         return 0;
1287 }
1288
1289 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1290         uint64_t p;
1291         int r;
1292         assert(f);
1293         assert(o);
1294         assert(offset > 0);
1295
1296         p = le64toh(o->entry.items[i].object_offset);
1297         if (p == 0)
1298                 return -EINVAL;
1299
1300         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1301         if (r < 0)
1302                 return r;
1303
1304         return link_entry_into_array_plus_one(f,
1305                                               &o->data.entry_offset,
1306                                               &o->data.entry_array_offset,
1307                                               &o->data.n_entries,
1308                                               offset);
1309 }
1310
1311 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1312         uint64_t n, i;
1313         int r;
1314
1315         assert(f);
1316         assert(o);
1317         assert(offset > 0);
1318
1319         if (o->object.type != OBJECT_ENTRY)
1320                 return -EINVAL;
1321
1322         __sync_synchronize();
1323
1324         /* Link up the entry itself */
1325         r = link_entry_into_array(f,
1326                                   &f->header->entry_array_offset,
1327                                   &f->header->n_entries,
1328                                   offset);
1329         if (r < 0)
1330                 return r;
1331
1332         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1333
1334         if (f->header->head_entry_realtime == 0)
1335                 f->header->head_entry_realtime = o->entry.realtime;
1336
1337         f->header->tail_entry_realtime = o->entry.realtime;
1338         f->header->tail_entry_monotonic = o->entry.monotonic;
1339
1340         f->tail_entry_monotonic_valid = true;
1341
1342         /* Link up the items */
1343         n = journal_file_entry_n_items(o);
1344         for (i = 0; i < n; i++) {
1345                 r = journal_file_link_entry_item(f, o, offset, i);
1346                 if (r < 0)
1347                         return r;
1348         }
1349
1350         return 0;
1351 }
1352
1353 static int journal_file_append_entry_internal(
1354                 JournalFile *f,
1355                 const dual_timestamp *ts,
1356                 uint64_t xor_hash,
1357                 const EntryItem items[], unsigned n_items,
1358                 uint64_t *seqnum,
1359                 Object **ret, uint64_t *offset) {
1360         uint64_t np;
1361         uint64_t osize;
1362         Object *o;
1363         int r;
1364
1365         assert(f);
1366         assert(items || n_items == 0);
1367         assert(ts);
1368
1369         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1370
1371         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1372         if (r < 0)
1373                 return r;
1374
1375         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1376         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1377         o->entry.realtime = htole64(ts->realtime);
1378         o->entry.monotonic = htole64(ts->monotonic);
1379         o->entry.xor_hash = htole64(xor_hash);
1380         o->entry.boot_id = f->header->boot_id;
1381
1382 #ifdef HAVE_GCRYPT
1383         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1384         if (r < 0)
1385                 return r;
1386 #endif
1387
1388         r = journal_file_link_entry(f, o, np);
1389         if (r < 0)
1390                 return r;
1391
1392         if (ret)
1393                 *ret = o;
1394
1395         if (offset)
1396                 *offset = np;
1397
1398         return 0;
1399 }
1400
1401 void journal_file_post_change(JournalFile *f) {
1402         assert(f);
1403
1404         /* inotify() does not receive IN_MODIFY events from file
1405          * accesses done via mmap(). After each access we hence
1406          * trigger IN_MODIFY by truncating the journal file to its
1407          * current size which triggers IN_MODIFY. */
1408
1409         __sync_synchronize();
1410
1411         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1412                 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1413 }
1414
1415 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1416         assert(userdata);
1417
1418         journal_file_post_change(userdata);
1419
1420         return 1;
1421 }
1422
1423 static void schedule_post_change(JournalFile *f) {
1424         sd_event_source *timer;
1425         int enabled, r;
1426         uint64_t now;
1427
1428         assert(f);
1429         assert(f->post_change_timer);
1430
1431         timer = f->post_change_timer;
1432
1433         r = sd_event_source_get_enabled(timer, &enabled);
1434         if (r < 0) {
1435                 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1436                 goto fail;
1437         }
1438
1439         if (enabled == SD_EVENT_ONESHOT)
1440                 return;
1441
1442         r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1443         if (r < 0) {
1444                 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1445                 goto fail;
1446         }
1447
1448         r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1449         if (r < 0) {
1450                 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1451                 goto fail;
1452         }
1453
1454         r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1455         if (r < 0) {
1456                 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1457                 goto fail;
1458         }
1459
1460         return;
1461
1462 fail:
1463         /* On failure, let's simply post the change immediately. */
1464         journal_file_post_change(f);
1465 }
1466
1467 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1468 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1469         _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1470         int r;
1471
1472         assert(f);
1473         assert_return(!f->post_change_timer, -EINVAL);
1474         assert(e);
1475         assert(t);
1476
1477         r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1478         if (r < 0)
1479                 return r;
1480
1481         r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1482         if (r < 0)
1483                 return r;
1484
1485         f->post_change_timer = timer;
1486         timer = NULL;
1487         f->post_change_timer_period = t;
1488
1489         return r;
1490 }
1491
1492 static int entry_item_cmp(const void *_a, const void *_b) {
1493         const EntryItem *a = _a, *b = _b;
1494
1495         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1496                 return -1;
1497         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1498                 return 1;
1499         return 0;
1500 }
1501
1502 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1503         unsigned i;
1504         EntryItem *items;
1505         int r;
1506         uint64_t xor_hash = 0;
1507         struct dual_timestamp _ts;
1508
1509         assert(f);
1510         assert(iovec || n_iovec == 0);
1511
1512         if (!ts) {
1513                 dual_timestamp_get(&_ts);
1514                 ts = &_ts;
1515         }
1516
1517         if (f->tail_entry_monotonic_valid &&
1518             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1519                 return -EINVAL;
1520
1521 #ifdef HAVE_GCRYPT
1522         r = journal_file_maybe_append_tag(f, ts->realtime);
1523         if (r < 0)
1524                 return r;
1525 #endif
1526
1527         /* alloca() can't take 0, hence let's allocate at least one */
1528         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1529
1530         for (i = 0; i < n_iovec; i++) {
1531                 uint64_t p;
1532                 Object *o;
1533
1534                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1535                 if (r < 0)
1536                         return r;
1537
1538                 xor_hash ^= le64toh(o->data.hash);
1539                 items[i].object_offset = htole64(p);
1540                 items[i].hash = o->data.hash;
1541         }
1542
1543         /* Order by the position on disk, in order to improve seek
1544          * times for rotating media. */
1545         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1546
1547         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1548
1549         /* If the memory mapping triggered a SIGBUS then we return an
1550          * IO error and ignore the error code passed down to us, since
1551          * it is very likely just an effect of a nullified replacement
1552          * mapping page */
1553
1554         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1555                 r = -EIO;
1556
1557         if (f->post_change_timer)
1558                 schedule_post_change(f);
1559         else
1560                 journal_file_post_change(f);
1561
1562         return r;
1563 }
1564
/* One cached position within a chain of entry arrays. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1572
1573 static void chain_cache_put(
1574                 OrderedHashmap *h,
1575                 ChainCacheItem *ci,
1576                 uint64_t first,
1577                 uint64_t array,
1578                 uint64_t begin,
1579                 uint64_t total,
1580                 uint64_t last_index) {
1581
1582         if (!ci) {
1583                 /* If the chain item to cache for this chain is the
1584                  * first one it's not worth caching anything */
1585                 if (array == first)
1586                         return;
1587
1588                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1589                         ci = ordered_hashmap_steal_first(h);
1590                         assert(ci);
1591                 } else {
1592                         ci = new(ChainCacheItem, 1);
1593                         if (!ci)
1594                                 return;
1595                 }
1596
1597                 ci->first = first;
1598
1599                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1600                         free(ci);
1601                         return;
1602                 }
1603         } else
1604                 assert(ci->first == first);
1605
1606         ci->array = array;
1607         ci->begin = begin;
1608         ci->total = total;
1609         ci->last_index = last_index;
1610 }
1611
1612 static int generic_array_get(
1613                 JournalFile *f,
1614                 uint64_t first,
1615                 uint64_t i,
1616                 Object **ret, uint64_t *offset) {
1617
1618         Object *o;
1619         uint64_t p = 0, a, t = 0;
1620         int r;
1621         ChainCacheItem *ci;
1622
1623         assert(f);
1624
1625         a = first;
1626
1627         /* Try the chain cache first */
1628         ci = ordered_hashmap_get(f->chain_cache, &first);
1629         if (ci && i > ci->total) {
1630                 a = ci->array;
1631                 i -= ci->total;
1632                 t = ci->total;
1633         }
1634
1635         while (a > 0) {
1636                 uint64_t k;
1637
1638                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1639                 if (r < 0)
1640                         return r;
1641
1642                 k = journal_file_entry_array_n_items(o);
1643                 if (i < k) {
1644                         p = le64toh(o->entry_array.items[i]);
1645                         goto found;
1646                 }
1647
1648                 i -= k;
1649                 t += k;
1650                 a = le64toh(o->entry_array.next_entry_array_offset);
1651         }
1652
1653         return 0;
1654
1655 found:
1656         /* Let's cache this item for the next invocation */
1657         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1658
1659         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1660         if (r < 0)
1661                 return r;
1662
1663         if (ret)
1664                 *ret = o;
1665
1666         if (offset)
1667                 *offset = p;
1668
1669         return 1;
1670 }
1671
1672 static int generic_array_get_plus_one(
1673                 JournalFile *f,
1674                 uint64_t extra,
1675                 uint64_t first,
1676                 uint64_t i,
1677                 Object **ret, uint64_t *offset) {
1678
1679         Object *o;
1680
1681         assert(f);
1682
1683         if (i == 0) {
1684                 int r;
1685
1686                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1687                 if (r < 0)
1688                         return r;
1689
1690                 if (ret)
1691                         *ret = o;
1692
1693                 if (offset)
1694                         *offset = extra;
1695
1696                 return 1;
1697         }
1698
1699         return generic_array_get(f, first, i-1, ret, offset);
1700 }
1701
/* Results of the test_object() callbacks used when bisecting entry
 * arrays. NOTE(review): judging from generic_array_bisect(), TEST_LEFT
 * appears to mean that the tested entry lies left of (before) the
 * needle and TEST_RIGHT that it lies right of it - confirm against the
 * callbacks. */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
1707
1708 static int generic_array_bisect(
1709                 JournalFile *f,
1710                 uint64_t first,
1711                 uint64_t n,
1712                 uint64_t needle,
1713                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1714                 direction_t direction,
1715                 Object **ret,
1716                 uint64_t *offset,
1717                 uint64_t *idx) {
1718
1719         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1720         bool subtract_one = false;
1721         Object *o, *array = NULL;
1722         int r;
1723         ChainCacheItem *ci;
1724
1725         assert(f);
1726         assert(test_object);
1727
1728         /* Start with the first array in the chain */
1729         a = first;
1730
1731         ci = ordered_hashmap_get(f->chain_cache, &first);
1732         if (ci && n > ci->total) {
1733                 /* Ah, we have iterated this bisection array chain
1734                  * previously! Let's see if we can skip ahead in the
1735                  * chain, as far as the last time. But we can't jump
1736                  * backwards in the chain, so let's check that
1737                  * first. */
1738
1739                 r = test_object(f, ci->begin, needle);
1740                 if (r < 0)
1741                         return r;
1742
1743                 if (r == TEST_LEFT) {
1744                         /* OK, what we are looking for is right of the
1745                          * begin of this EntryArray, so let's jump
1746                          * straight to previously cached array in the
1747                          * chain */
1748
1749                         a = ci->array;
1750                         n -= ci->total;
1751                         t = ci->total;
1752                         last_index = ci->last_index;
1753                 }
1754         }
1755
1756         while (a > 0) {
1757                 uint64_t left, right, k, lp;
1758
1759                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1760                 if (r < 0)
1761                         return r;
1762
1763                 k = journal_file_entry_array_n_items(array);
1764                 right = MIN(k, n);
1765                 if (right <= 0)
1766                         return 0;
1767
1768                 i = right - 1;
1769                 lp = p = le64toh(array->entry_array.items[i]);
1770                 if (p <= 0)
1771                         return -EBADMSG;
1772
1773                 r = test_object(f, p, needle);
1774                 if (r < 0)
1775                         return r;
1776
1777                 if (r == TEST_FOUND)
1778                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1779
1780                 if (r == TEST_RIGHT) {
1781                         left = 0;
1782                         right -= 1;
1783
1784                         if (last_index != (uint64_t) -1) {
1785                                 assert(last_index <= right);
1786
1787                                 /* If we cached the last index we
1788                                  * looked at, let's try to not to jump
1789                                  * too wildly around and see if we can
1790                                  * limit the range to look at early to
1791                                  * the immediate neighbors of the last
1792                                  * index we looked at. */
1793
1794                                 if (last_index > 0) {
1795                                         uint64_t x = last_index - 1;
1796
1797                                         p = le64toh(array->entry_array.items[x]);
1798                                         if (p <= 0)
1799                                                 return -EBADMSG;
1800
1801                                         r = test_object(f, p, needle);
1802                                         if (r < 0)
1803                                                 return r;
1804
1805                                         if (r == TEST_FOUND)
1806                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1807
1808                                         if (r == TEST_RIGHT)
1809                                                 right = x;
1810                                         else
1811                                                 left = x + 1;
1812                                 }
1813
1814                                 if (last_index < right) {
1815                                         uint64_t y = last_index + 1;
1816
1817                                         p = le64toh(array->entry_array.items[y]);
1818                                         if (p <= 0)
1819                                                 return -EBADMSG;
1820
1821                                         r = test_object(f, p, needle);
1822                                         if (r < 0)
1823                                                 return r;
1824
1825                                         if (r == TEST_FOUND)
1826                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1827
1828                                         if (r == TEST_RIGHT)
1829                                                 right = y;
1830                                         else
1831                                                 left = y + 1;
1832                                 }
1833                         }
1834
1835                         for (;;) {
1836                                 if (left == right) {
1837                                         if (direction == DIRECTION_UP)
1838                                                 subtract_one = true;
1839
1840                                         i = left;
1841                                         goto found;
1842                                 }
1843
1844                                 assert(left < right);
1845                                 i = (left + right) / 2;
1846
1847                                 p = le64toh(array->entry_array.items[i]);
1848                                 if (p <= 0)
1849                                         return -EBADMSG;
1850
1851                                 r = test_object(f, p, needle);
1852                                 if (r < 0)
1853                                         return r;
1854
1855                                 if (r == TEST_FOUND)
1856                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1857
1858                                 if (r == TEST_RIGHT)
1859                                         right = i;
1860                                 else
1861                                         left = i + 1;
1862                         }
1863                 }
1864
1865                 if (k >= n) {
1866                         if (direction == DIRECTION_UP) {
1867                                 i = n;
1868                                 subtract_one = true;
1869                                 goto found;
1870                         }
1871
1872                         return 0;
1873                 }
1874
1875                 last_p = lp;
1876
1877                 n -= k;
1878                 t += k;
1879                 last_index = (uint64_t) -1;
1880                 a = le64toh(array->entry_array.next_entry_array_offset);
1881         }
1882
1883         return 0;
1884
1885 found:
1886         if (subtract_one && t == 0 && i == 0)
1887                 return 0;
1888
1889         /* Let's cache this item for the next invocation */
1890         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1891
1892         if (subtract_one && i == 0)
1893                 p = last_p;
1894         else if (subtract_one)
1895                 p = le64toh(array->entry_array.items[i-1]);
1896         else
1897                 p = le64toh(array->entry_array.items[i]);
1898
1899         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1900         if (r < 0)
1901                 return r;
1902
1903         if (ret)
1904                 *ret = o;
1905
1906         if (offset)
1907                 *offset = p;
1908
1909         if (idx)
1910                 *idx = t + i + (subtract_one ? -1 : 0);
1911
1912         return 1;
1913 }
1914
1915 static int generic_array_bisect_plus_one(
1916                 JournalFile *f,
1917                 uint64_t extra,
1918                 uint64_t first,
1919                 uint64_t n,
1920                 uint64_t needle,
1921                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1922                 direction_t direction,
1923                 Object **ret,
1924                 uint64_t *offset,
1925                 uint64_t *idx) {
1926
1927         int r;
1928         bool step_back = false;
1929         Object *o;
1930
1931         assert(f);
1932         assert(test_object);
1933
1934         if (n <= 0)
1935                 return 0;
1936
1937         /* This bisects the array in object 'first', but first checks
1938          * an extra  */
1939         r = test_object(f, extra, needle);
1940         if (r < 0)
1941                 return r;
1942
1943         if (r == TEST_FOUND)
1944                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1945
1946         /* if we are looking with DIRECTION_UP then we need to first
1947            see if in the actual array there is a matching entry, and
1948            return the last one of that. But if there isn't any we need
1949            to return this one. Hence remember this, and return it
1950            below. */
1951         if (r == TEST_LEFT)
1952                 step_back = direction == DIRECTION_UP;
1953
1954         if (r == TEST_RIGHT) {
1955                 if (direction == DIRECTION_DOWN)
1956                         goto found;
1957                 else
1958                         return 0;
1959         }
1960
1961         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1962
1963         if (r == 0 && step_back)
1964                 goto found;
1965
1966         if (r > 0 && idx)
1967                 (*idx) ++;
1968
1969         return r;
1970
1971 found:
1972         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1973         if (r < 0)
1974                 return r;
1975
1976         if (ret)
1977                 *ret = o;
1978
1979         if (offset)
1980                 *offset = extra;
1981
1982         if (idx)
1983                 *idx = 0;
1984
1985         return 1;
1986 }
1987
1988 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1989         assert(f);
1990         assert(p > 0);
1991
1992         if (p == needle)
1993                 return TEST_FOUND;
1994         else if (p < needle)
1995                 return TEST_LEFT;
1996         else
1997                 return TEST_RIGHT;
1998 }
1999
2000 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2001         Object *o;
2002         int r;
2003
2004         assert(f);
2005         assert(p > 0);
2006
2007         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2008         if (r < 0)
2009                 return r;
2010
2011         if (le64toh(o->entry.seqnum) == needle)
2012                 return TEST_FOUND;
2013         else if (le64toh(o->entry.seqnum) < needle)
2014                 return TEST_LEFT;
2015         else
2016                 return TEST_RIGHT;
2017 }
2018
2019 int journal_file_move_to_entry_by_seqnum(
2020                 JournalFile *f,
2021                 uint64_t seqnum,
2022                 direction_t direction,
2023                 Object **ret,
2024                 uint64_t *offset) {
2025
2026         return generic_array_bisect(f,
2027                                     le64toh(f->header->entry_array_offset),
2028                                     le64toh(f->header->n_entries),
2029                                     seqnum,
2030                                     test_object_seqnum,
2031                                     direction,
2032                                     ret, offset, NULL);
2033 }
2034
2035 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2036         Object *o;
2037         int r;
2038
2039         assert(f);
2040         assert(p > 0);
2041
2042         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2043         if (r < 0)
2044                 return r;
2045
2046         if (le64toh(o->entry.realtime) == needle)
2047                 return TEST_FOUND;
2048         else if (le64toh(o->entry.realtime) < needle)
2049                 return TEST_LEFT;
2050         else
2051                 return TEST_RIGHT;
2052 }
2053
2054 int journal_file_move_to_entry_by_realtime(
2055                 JournalFile *f,
2056                 uint64_t realtime,
2057                 direction_t direction,
2058                 Object **ret,
2059                 uint64_t *offset) {
2060
2061         return generic_array_bisect(f,
2062                                     le64toh(f->header->entry_array_offset),
2063                                     le64toh(f->header->n_entries),
2064                                     realtime,
2065                                     test_object_realtime,
2066                                     direction,
2067                                     ret, offset, NULL);
2068 }
2069
2070 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2071         Object *o;
2072         int r;
2073
2074         assert(f);
2075         assert(p > 0);
2076
2077         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2078         if (r < 0)
2079                 return r;
2080
2081         if (le64toh(o->entry.monotonic) == needle)
2082                 return TEST_FOUND;
2083         else if (le64toh(o->entry.monotonic) < needle)
2084                 return TEST_LEFT;
2085         else
2086                 return TEST_RIGHT;
2087 }
2088
2089 static int find_data_object_by_boot_id(
2090                 JournalFile *f,
2091                 sd_id128_t boot_id,
2092                 Object **o,
2093                 uint64_t *b) {
2094
2095         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2096
2097         sd_id128_to_string(boot_id, t + 9);
2098         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2099 }
2100
2101 int journal_file_move_to_entry_by_monotonic(
2102                 JournalFile *f,
2103                 sd_id128_t boot_id,
2104                 uint64_t monotonic,
2105                 direction_t direction,
2106                 Object **ret,
2107                 uint64_t *offset) {
2108
2109         Object *o;
2110         int r;
2111
2112         assert(f);
2113
2114         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2115         if (r < 0)
2116                 return r;
2117         if (r == 0)
2118                 return -ENOENT;
2119
2120         return generic_array_bisect_plus_one(f,
2121                                              le64toh(o->data.entry_offset),
2122                                              le64toh(o->data.entry_array_offset),
2123                                              le64toh(o->data.n_entries),
2124                                              monotonic,
2125                                              test_object_monotonic,
2126                                              direction,
2127                                              ret, offset, NULL);
2128 }
2129
2130 void journal_file_reset_location(JournalFile *f) {
2131         f->location_type = LOCATION_HEAD;
2132         f->current_offset = 0;
2133         f->current_seqnum = 0;
2134         f->current_realtime = 0;
2135         f->current_monotonic = 0;
2136         zero(f->current_boot_id);
2137         f->current_xor_hash = 0;
2138 }
2139
2140 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2141         f->location_type = LOCATION_SEEK;
2142         f->current_offset = offset;
2143         f->current_seqnum = le64toh(o->entry.seqnum);
2144         f->current_realtime = le64toh(o->entry.realtime);
2145         f->current_monotonic = le64toh(o->entry.monotonic);
2146         f->current_boot_id = o->entry.boot_id;
2147         f->current_xor_hash = le64toh(o->entry.xor_hash);
2148 }
2149
2150 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2151         assert(af);
2152         assert(bf);
2153         assert(af->location_type == LOCATION_SEEK);
2154         assert(bf->location_type == LOCATION_SEEK);
2155
2156         /* If contents and timestamps match, these entries are
2157          * identical, even if the seqnum does not match */
2158         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2159             af->current_monotonic == bf->current_monotonic &&
2160             af->current_realtime == bf->current_realtime &&
2161             af->current_xor_hash == bf->current_xor_hash)
2162                 return 0;
2163
2164         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2165
2166                 /* If this is from the same seqnum source, compare
2167                  * seqnums */
2168                 if (af->current_seqnum < bf->current_seqnum)
2169                         return -1;
2170                 if (af->current_seqnum > bf->current_seqnum)
2171                         return 1;
2172
2173                 /* Wow! This is weird, different data but the same
2174                  * seqnums? Something is borked, but let's make the
2175                  * best of it and compare by time. */
2176         }
2177
2178         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2179
2180                 /* If the boot id matches, compare monotonic time */
2181                 if (af->current_monotonic < bf->current_monotonic)
2182                         return -1;
2183                 if (af->current_monotonic > bf->current_monotonic)
2184                         return 1;
2185         }
2186
2187         /* Otherwise, compare UTC time */
2188         if (af->current_realtime < bf->current_realtime)
2189                 return -1;
2190         if (af->current_realtime > bf->current_realtime)
2191                 return 1;
2192
2193         /* Finally, compare by contents */
2194         if (af->current_xor_hash < bf->current_xor_hash)
2195                 return -1;
2196         if (af->current_xor_hash > bf->current_xor_hash)
2197                 return 1;
2198
2199         return 0;
2200 }
2201
2202 int journal_file_next_entry(
2203                 JournalFile *f,
2204                 uint64_t p,
2205                 direction_t direction,
2206                 Object **ret, uint64_t *offset) {
2207
2208         uint64_t i, n, ofs;
2209         int r;
2210
2211         assert(f);
2212
2213         n = le64toh(f->header->n_entries);
2214         if (n <= 0)
2215                 return 0;
2216
2217         if (p == 0)
2218                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2219         else {
2220                 r = generic_array_bisect(f,
2221                                          le64toh(f->header->entry_array_offset),
2222                                          le64toh(f->header->n_entries),
2223                                          p,
2224                                          test_object_offset,
2225                                          DIRECTION_DOWN,
2226                                          NULL, NULL,
2227                                          &i);
2228                 if (r <= 0)
2229                         return r;
2230
2231                 if (direction == DIRECTION_DOWN) {
2232                         if (i >= n - 1)
2233                                 return 0;
2234
2235                         i++;
2236                 } else {
2237                         if (i <= 0)
2238                                 return 0;
2239
2240                         i--;
2241                 }
2242         }
2243
2244         /* And jump to it */
2245         r = generic_array_get(f,
2246                               le64toh(f->header->entry_array_offset),
2247                               i,
2248                               ret, &ofs);
2249         if (r <= 0)
2250                 return r;
2251
2252         if (p > 0 &&
2253             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2254                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2255                           f->path, i);
2256                 return -EBADMSG;
2257         }
2258
2259         if (offset)
2260                 *offset = ofs;
2261
2262         return 1;
2263 }
2264
2265 int journal_file_next_entry_for_data(
2266                 JournalFile *f,
2267                 Object *o, uint64_t p,
2268                 uint64_t data_offset,
2269                 direction_t direction,
2270                 Object **ret, uint64_t *offset) {
2271
2272         uint64_t n, i;
2273         int r;
2274         Object *d;
2275
2276         assert(f);
2277         assert(p > 0 || !o);
2278
2279         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2280         if (r < 0)
2281                 return r;
2282
2283         n = le64toh(d->data.n_entries);
2284         if (n <= 0)
2285                 return n;
2286
2287         if (!o)
2288                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2289         else {
2290                 if (o->object.type != OBJECT_ENTRY)
2291                         return -EINVAL;
2292
2293                 r = generic_array_bisect_plus_one(f,
2294                                                   le64toh(d->data.entry_offset),
2295                                                   le64toh(d->data.entry_array_offset),
2296                                                   le64toh(d->data.n_entries),
2297                                                   p,
2298                                                   test_object_offset,
2299                                                   DIRECTION_DOWN,
2300                                                   NULL, NULL,
2301                                                   &i);
2302
2303                 if (r <= 0)
2304                         return r;
2305
2306                 if (direction == DIRECTION_DOWN) {
2307                         if (i >= n - 1)
2308                                 return 0;
2309
2310                         i++;
2311                 } else {
2312                         if (i <= 0)
2313                                 return 0;
2314
2315                         i--;
2316                 }
2317
2318         }
2319
2320         return generic_array_get_plus_one(f,
2321                                           le64toh(d->data.entry_offset),
2322                                           le64toh(d->data.entry_array_offset),
2323                                           i,
2324                                           ret, offset);
2325 }
2326
2327 int journal_file_move_to_entry_by_offset_for_data(
2328                 JournalFile *f,
2329                 uint64_t data_offset,
2330                 uint64_t p,
2331                 direction_t direction,
2332                 Object **ret, uint64_t *offset) {
2333
2334         int r;
2335         Object *d;
2336
2337         assert(f);
2338
2339         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2340         if (r < 0)
2341                 return r;
2342
2343         return generic_array_bisect_plus_one(f,
2344                                              le64toh(d->data.entry_offset),
2345                                              le64toh(d->data.entry_array_offset),
2346                                              le64toh(d->data.n_entries),
2347                                              p,
2348                                              test_object_offset,
2349                                              direction,
2350                                              ret, offset, NULL);
2351 }
2352
2353 int journal_file_move_to_entry_by_monotonic_for_data(
2354                 JournalFile *f,
2355                 uint64_t data_offset,
2356                 sd_id128_t boot_id,
2357                 uint64_t monotonic,
2358                 direction_t direction,
2359                 Object **ret, uint64_t *offset) {
2360
2361         Object *o, *d;
2362         int r;
2363         uint64_t b, z;
2364
2365         assert(f);
2366
2367         /* First, seek by time */
2368         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2369         if (r < 0)
2370                 return r;
2371         if (r == 0)
2372                 return -ENOENT;
2373
2374         r = generic_array_bisect_plus_one(f,
2375                                           le64toh(o->data.entry_offset),
2376                                           le64toh(o->data.entry_array_offset),
2377                                           le64toh(o->data.n_entries),
2378                                           monotonic,
2379                                           test_object_monotonic,
2380                                           direction,
2381                                           NULL, &z, NULL);
2382         if (r <= 0)
2383                 return r;
2384
2385         /* And now, continue seeking until we find an entry that
2386          * exists in both bisection arrays */
2387
2388         for (;;) {
2389                 Object *qo;
2390                 uint64_t p, q;
2391
2392                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2393                 if (r < 0)
2394                         return r;
2395
2396                 r = generic_array_bisect_plus_one(f,
2397                                                   le64toh(d->data.entry_offset),
2398                                                   le64toh(d->data.entry_array_offset),
2399                                                   le64toh(d->data.n_entries),
2400                                                   z,
2401                                                   test_object_offset,
2402                                                   direction,
2403                                                   NULL, &p, NULL);
2404                 if (r <= 0)
2405                         return r;
2406
2407                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2408                 if (r < 0)
2409                         return r;
2410
2411                 r = generic_array_bisect_plus_one(f,
2412                                                   le64toh(o->data.entry_offset),
2413                                                   le64toh(o->data.entry_array_offset),
2414                                                   le64toh(o->data.n_entries),
2415                                                   p,
2416                                                   test_object_offset,
2417                                                   direction,
2418                                                   &qo, &q, NULL);
2419
2420                 if (r <= 0)
2421                         return r;
2422
2423                 if (p == q) {
2424                         if (ret)
2425                                 *ret = qo;
2426                         if (offset)
2427                                 *offset = q;
2428
2429                         return 1;
2430                 }
2431
2432                 z = q;
2433         }
2434 }
2435
2436 int journal_file_move_to_entry_by_seqnum_for_data(
2437                 JournalFile *f,
2438                 uint64_t data_offset,
2439                 uint64_t seqnum,
2440                 direction_t direction,
2441                 Object **ret, uint64_t *offset) {
2442
2443         Object *d;
2444         int r;
2445
2446         assert(f);
2447
2448         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2449         if (r < 0)
2450                 return r;
2451
2452         return generic_array_bisect_plus_one(f,
2453                                              le64toh(d->data.entry_offset),
2454                                              le64toh(d->data.entry_array_offset),
2455                                              le64toh(d->data.n_entries),
2456                                              seqnum,
2457                                              test_object_seqnum,
2458                                              direction,
2459                                              ret, offset, NULL);
2460 }
2461
2462 int journal_file_move_to_entry_by_realtime_for_data(
2463                 JournalFile *f,
2464                 uint64_t data_offset,
2465                 uint64_t realtime,
2466                 direction_t direction,
2467                 Object **ret, uint64_t *offset) {
2468
2469         Object *d;
2470         int r;
2471
2472         assert(f);
2473
2474         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2475         if (r < 0)
2476                 return r;
2477
2478         return generic_array_bisect_plus_one(f,
2479                                              le64toh(d->data.entry_offset),
2480                                              le64toh(d->data.entry_array_offset),
2481                                              le64toh(d->data.n_entries),
2482                                              realtime,
2483                                              test_object_realtime,
2484                                              direction,
2485                                              ret, offset, NULL);
2486 }
2487
2488 void journal_file_dump(JournalFile *f) {
2489         Object *o;
2490         int r;
2491         uint64_t p;
2492
2493         assert(f);
2494
2495         journal_file_print_header(f);
2496
2497         p = le64toh(f->header->header_size);
2498         while (p != 0) {
2499                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2500                 if (r < 0)
2501                         goto fail;
2502
2503                 switch (o->object.type) {
2504
2505                 case OBJECT_UNUSED:
2506                         printf("Type: OBJECT_UNUSED\n");
2507                         break;
2508
2509                 case OBJECT_DATA:
2510                         printf("Type: OBJECT_DATA\n");
2511                         break;
2512
2513                 case OBJECT_FIELD:
2514                         printf("Type: OBJECT_FIELD\n");
2515                         break;
2516
2517                 case OBJECT_ENTRY:
2518                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2519                                le64toh(o->entry.seqnum),
2520                                le64toh(o->entry.monotonic),
2521                                le64toh(o->entry.realtime));
2522                         break;
2523
2524                 case OBJECT_FIELD_HASH_TABLE:
2525                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2526                         break;
2527
2528                 case OBJECT_DATA_HASH_TABLE:
2529                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2530                         break;
2531
2532                 case OBJECT_ENTRY_ARRAY:
2533                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2534                         break;
2535
2536                 case OBJECT_TAG:
2537                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2538                                le64toh(o->tag.seqnum),
2539                                le64toh(o->tag.epoch));
2540                         break;
2541
2542                 default:
2543                         printf("Type: unknown (%i)\n", o->object.type);
2544                         break;
2545                 }
2546
2547                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2548                         printf("Flags: %s\n",
2549                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2550
2551                 if (p == le64toh(f->header->tail_object_offset))
2552                         p = 0;
2553                 else
2554                         p = p + ALIGN64(le64toh(o->object.size));
2555         }
2556
2557         return;
2558 fail:
2559         log_error("File corrupt");
2560 }
2561
2562 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2563         const char *x;
2564
2565         x = format_timestamp(buf, l, t);
2566         if (x)
2567                 return x;
2568         return " --- ";
2569 }
2570
2571 void journal_file_print_header(JournalFile *f) {
2572         char a[33], b[33], c[33], d[33];
2573         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2574         struct stat st;
2575         char bytes[FORMAT_BYTES_MAX];
2576
2577         assert(f);
2578
2579         printf("File Path: %s\n"
2580                "File ID: %s\n"
2581                "Machine ID: %s\n"
2582                "Boot ID: %s\n"
2583                "Sequential Number ID: %s\n"
2584                "State: %s\n"
2585                "Compatible Flags:%s%s\n"
2586                "Incompatible Flags:%s%s%s\n"
2587                "Header size: %"PRIu64"\n"
2588                "Arena size: %"PRIu64"\n"
2589                "Data Hash Table Size: %"PRIu64"\n"
2590                "Field Hash Table Size: %"PRIu64"\n"
2591                "Rotate Suggested: %s\n"
2592                "Head Sequential Number: %"PRIu64"\n"
2593                "Tail Sequential Number: %"PRIu64"\n"
2594                "Head Realtime Timestamp: %s\n"
2595                "Tail Realtime Timestamp: %s\n"
2596                "Tail Monotonic Timestamp: %s\n"
2597                "Objects: %"PRIu64"\n"
2598                "Entry Objects: %"PRIu64"\n",
2599                f->path,
2600                sd_id128_to_string(f->header->file_id, a),
2601                sd_id128_to_string(f->header->machine_id, b),
2602                sd_id128_to_string(f->header->boot_id, c),
2603                sd_id128_to_string(f->header->seqnum_id, d),
2604                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2605                f->header->state == STATE_ONLINE ? "ONLINE" :
2606                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2607                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2608                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2609                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2610                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2611                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2612                le64toh(f->header->header_size),
2613                le64toh(f->header->arena_size),
2614                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2615                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2616                yes_no(journal_file_rotate_suggested(f, 0)),
2617                le64toh(f->header->head_entry_seqnum),
2618                le64toh(f->header->tail_entry_seqnum),
2619                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2620                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2621                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2622                le64toh(f->header->n_objects),
2623                le64toh(f->header->n_entries));
2624
2625         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2626                 printf("Data Objects: %"PRIu64"\n"
2627                        "Data Hash Table Fill: %.1f%%\n",
2628                        le64toh(f->header->n_data),
2629                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2630
2631         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2632                 printf("Field Objects: %"PRIu64"\n"
2633                        "Field Hash Table Fill: %.1f%%\n",
2634                        le64toh(f->header->n_fields),
2635                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2636
2637         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2638                 printf("Tag Objects: %"PRIu64"\n",
2639                        le64toh(f->header->n_tags));
2640         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2641                 printf("Entry Array Objects: %"PRIu64"\n",
2642                        le64toh(f->header->n_entry_arrays));
2643
2644         if (fstat(f->fd, &st) >= 0)
2645                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2646 }
2647
2648 static int journal_file_warn_btrfs(JournalFile *f) {
2649         unsigned attrs;
2650         int r;
2651
2652         assert(f);
2653
2654         /* Before we write anything, check if the COW logic is turned
2655          * off on btrfs. Given our write pattern that is quite
2656          * unfriendly to COW file systems this should greatly improve
2657          * performance on COW file systems, such as btrfs, at the
2658          * expense of data integrity features (which shouldn't be too
2659          * bad, given that we do our own checksumming). */
2660
2661         r = btrfs_is_filesystem(f->fd);
2662         if (r < 0)
2663                 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2664         if (!r)
2665                 return 0;
2666
2667         r = read_attr_fd(f->fd, &attrs);
2668         if (r < 0)
2669                 return log_warning_errno(r, "Failed to read file attributes: %m");
2670
2671         if (attrs & FS_NOCOW_FL) {
2672                 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2673                 return 0;
2674         }
2675
2676         log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2677                    "This is likely to slow down journal access substantially, please consider turning "
2678                    "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2679
2680         return 1;
2681 }
2682
2683 int journal_file_open(
2684                 const char *fname,
2685                 int flags,
2686                 mode_t mode,
2687                 bool compress,
2688                 bool seal,
2689                 JournalMetrics *metrics,
2690                 MMapCache *mmap_cache,
2691                 JournalFile *template,
2692                 JournalFile **ret) {
2693
2694         bool newly_created = false;
2695         JournalFile *f;
2696         void *h;
2697         int r;
2698
2699         assert(fname);
2700         assert(ret);
2701
2702         if ((flags & O_ACCMODE) != O_RDONLY &&
2703             (flags & O_ACCMODE) != O_RDWR)
2704                 return -EINVAL;
2705
2706         if (!endswith(fname, ".journal") &&
2707             !endswith(fname, ".journal~"))
2708                 return -EINVAL;
2709
2710         f = new0(JournalFile, 1);
2711         if (!f)
2712                 return -ENOMEM;
2713
2714         f->fd = -1;
2715         f->mode = mode;
2716
2717         f->flags = flags;
2718         f->prot = prot_from_flags(flags);
2719         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2720 #if defined(HAVE_LZ4)
2721         f->compress_lz4 = compress;
2722 #elif defined(HAVE_XZ)
2723         f->compress_xz = compress;
2724 #endif
2725 #ifdef HAVE_GCRYPT
2726         f->seal = seal;
2727 #endif
2728
2729         if (mmap_cache)
2730                 f->mmap = mmap_cache_ref(mmap_cache);
2731         else {
2732                 f->mmap = mmap_cache_new();
2733                 if (!f->mmap) {
2734                         r = -ENOMEM;
2735                         goto fail;
2736                 }
2737         }
2738
2739         f->path = strdup(fname);
2740         if (!f->path) {
2741                 r = -ENOMEM;
2742                 goto fail;
2743         }
2744
2745         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2746         if (!f->chain_cache) {
2747                 r = -ENOMEM;
2748                 goto fail;
2749         }
2750
2751         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2752         if (f->fd < 0) {
2753                 r = -errno;
2754                 goto fail;
2755         }
2756
2757         r = journal_file_fstat(f);
2758         if (r < 0)
2759                 goto fail;
2760
2761         if (f->last_stat.st_size == 0 && f->writable) {
2762
2763                 (void) journal_file_warn_btrfs(f);
2764
2765                 /* Let's attach the creation time to the journal file,
2766                  * so that the vacuuming code knows the age of this
2767                  * file even if the file might end up corrupted one
2768                  * day... Ideally we'd just use the creation time many
2769                  * file systems maintain for each file, but there is
2770                  * currently no usable API to query this, hence let's
2771                  * emulate this via extended attributes. If extended
2772                  * attributes are not supported we'll just skip this,
2773                  * and rely solely on mtime/atime/ctime of the file. */
2774
2775                 fd_setcrtime(f->fd, 0);
2776
2777 #ifdef HAVE_GCRYPT
2778                 /* Try to load the FSPRG state, and if we can't, then
2779                  * just don't do sealing */
2780                 if (f->seal) {
2781                         r = journal_file_fss_load(f);
2782                         if (r < 0)
2783                                 f->seal = false;
2784                 }
2785 #endif
2786
2787                 r = journal_file_init_header(f, template);
2788                 if (r < 0)
2789                         goto fail;
2790
2791                 r = journal_file_fstat(f);
2792                 if (r < 0)
2793                         goto fail;
2794
2795                 newly_created = true;
2796         }
2797
2798         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2799                 r = -ENODATA;
2800                 goto fail;
2801         }
2802
2803         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2804         if (r < 0)
2805                 goto fail;
2806
2807         f->header = h;
2808
2809         if (!newly_created) {
2810                 r = journal_file_verify_header(f);
2811                 if (r < 0)
2812                         goto fail;
2813         }
2814
2815 #ifdef HAVE_GCRYPT
2816         if (!newly_created && f->writable) {
2817                 r = journal_file_fss_load(f);
2818                 if (r < 0)
2819                         goto fail;
2820         }
2821 #endif
2822
2823         if (f->writable) {
2824                 if (metrics) {
2825                         journal_default_metrics(metrics, f->fd);
2826                         f->metrics = *metrics;
2827                 } else if (template)
2828                         f->metrics = template->metrics;
2829
2830                 r = journal_file_refresh_header(f);
2831                 if (r < 0)
2832                         goto fail;
2833         }
2834
2835 #ifdef HAVE_GCRYPT
2836         r = journal_file_hmac_setup(f);
2837         if (r < 0)
2838                 goto fail;
2839 #endif
2840
2841         if (newly_created) {
2842                 r = journal_file_setup_field_hash_table(f);
2843                 if (r < 0)
2844                         goto fail;
2845
2846                 r = journal_file_setup_data_hash_table(f);
2847                 if (r < 0)
2848                         goto fail;
2849
2850 #ifdef HAVE_GCRYPT
2851                 r = journal_file_append_first_tag(f);
2852                 if (r < 0)
2853                         goto fail;
2854 #endif
2855         }
2856
2857         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2858                 r = -EIO;
2859                 goto fail;
2860         }
2861
2862         if (template && template->post_change_timer) {
2863                 r = journal_file_enable_post_change_timer(
2864                                 f,
2865                                 sd_event_source_get_event(template->post_change_timer),
2866                                 template->post_change_timer_period);
2867
2868                 if (r < 0)
2869                         goto fail;
2870         }
2871
2872         *ret = f;
2873         return 0;
2874
2875 fail:
2876         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2877                 r = -EIO;
2878
2879         journal_file_close(f);
2880
2881         return r;
2882 }
2883
2884 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2885         _cleanup_free_ char *p = NULL;
2886         size_t l;
2887         JournalFile *old_file, *new_file = NULL;
2888         int r;
2889
2890         assert(f);
2891         assert(*f);
2892
2893         old_file = *f;
2894
2895         if (!old_file->writable)
2896                 return -EINVAL;
2897
2898         if (!endswith(old_file->path, ".journal"))
2899                 return -EINVAL;
2900
2901         l = strlen(old_file->path);
2902         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2903                      (int) l - 8, old_file->path,
2904                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2905                      le64toh((*f)->header->head_entry_seqnum),
2906                      le64toh((*f)->header->head_entry_realtime));
2907         if (r < 0)
2908                 return -ENOMEM;
2909
2910         /* Try to rename the file to the archived version. If the file
2911          * already was deleted, we'll get ENOENT, let's ignore that
2912          * case. */
2913         r = rename(old_file->path, p);
2914         if (r < 0 && errno != ENOENT)
2915                 return -errno;
2916
2917         old_file->header->state = STATE_ARCHIVED;
2918
2919         /* Currently, btrfs is not very good with out write patterns
2920          * and fragments heavily. Let's defrag our journal files when
2921          * we archive them */
2922         old_file->defrag_on_close = true;
2923
2924         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2925         journal_file_close(old_file);
2926
2927         *f = new_file;
2928         return r;
2929 }
2930
2931 int journal_file_open_reliably(
2932                 const char *fname,
2933                 int flags,
2934                 mode_t mode,
2935                 bool compress,
2936                 bool seal,
2937                 JournalMetrics *metrics,
2938                 MMapCache *mmap_cache,
2939                 JournalFile *template,
2940                 JournalFile **ret) {
2941
2942         int r;
2943         size_t l;
2944         _cleanup_free_ char *p = NULL;
2945
2946         r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2947         if (!IN_SET(r,
2948                     -EBADMSG,           /* corrupted */
2949                     -ENODATA,           /* truncated */
2950                     -EHOSTDOWN,         /* other machine */
2951                     -EPROTONOSUPPORT,   /* incompatible feature */
2952                     -EBUSY,             /* unclean shutdown */
2953                     -ESHUTDOWN,         /* already archived */
2954                     -EIO,               /* IO error, including SIGBUS on mmap */
2955                     -EIDRM              /* File has been deleted */))
2956                 return r;
2957
2958         if ((flags & O_ACCMODE) == O_RDONLY)
2959                 return r;
2960
2961         if (!(flags & O_CREAT))
2962                 return r;
2963
2964         if (!endswith(fname, ".journal"))
2965                 return r;
2966
2967         /* The file is corrupted. Rotate it away and try it again (but only once) */
2968
2969         l = strlen(fname);
2970         if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2971                      (int) l - 8, fname,
2972                      now(CLOCK_REALTIME),
2973                      random_u64()) < 0)
2974                 return -ENOMEM;
2975
2976         if (rename(fname, p) < 0)
2977                 return -errno;
2978
2979         /* btrfs doesn't cope well with our write pattern and
2980          * fragments heavily. Let's defrag all files we rotate */
2981
2982         (void) chattr_path(p, false, FS_NOCOW_FL);
2983         (void) btrfs_defrag(p);
2984
2985         log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2986
2987         return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2988 }
2989
2990 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2991         uint64_t i, n;
2992         uint64_t q, xor_hash = 0;
2993         int r;
2994         EntryItem *items;
2995         dual_timestamp ts;
2996
2997         assert(from);
2998         assert(to);
2999         assert(o);
3000         assert(p);
3001
3002         if (!to->writable)
3003                 return -EPERM;
3004
3005         ts.monotonic = le64toh(o->entry.monotonic);
3006         ts.realtime = le64toh(o->entry.realtime);
3007
3008         n = journal_file_entry_n_items(o);
3009         /* alloca() can't take 0, hence let's allocate at least one */
3010         items = alloca(sizeof(EntryItem) * MAX(1u, n));
3011
3012         for (i = 0; i < n; i++) {
3013                 uint64_t l, h;
3014                 le64_t le_hash;
3015                 size_t t;
3016                 void *data;
3017                 Object *u;
3018
3019                 q = le64toh(o->entry.items[i].object_offset);
3020                 le_hash = o->entry.items[i].hash;
3021
3022                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3023                 if (r < 0)
3024                         return r;
3025
3026                 if (le_hash != o->data.hash)
3027                         return -EBADMSG;
3028
3029                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3030                 t = (size_t) l;
3031
3032                 /* We hit the limit on 32bit machines */
3033                 if ((uint64_t) t != l)
3034                         return -E2BIG;
3035
3036                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3037 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
3038                         size_t rsize = 0;
3039
3040                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3041                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3042                         if (r < 0)
3043                                 return r;
3044
3045                         data = from->compress_buffer;
3046                         l = rsize;
3047 #else
3048                         return -EPROTONOSUPPORT;
3049 #endif
3050                 } else
3051                         data = o->data.payload;
3052
3053                 r = journal_file_append_data(to, data, l, &u, &h);
3054                 if (r < 0)
3055                         return r;
3056
3057                 xor_hash ^= le64toh(u->data.hash);
3058                 items[i].object_offset = htole64(h);
3059                 items[i].hash = u->data.hash;
3060
3061                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3062                 if (r < 0)
3063                         return r;
3064         }
3065
3066         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3067
3068         if (mmap_cache_got_sigbus(to->mmap, to->fd))
3069                 return -EIO;
3070
3071         return r;
3072 }
3073
3074 void journal_reset_metrics(JournalMetrics *m) {
3075         assert(m);
3076
3077         /* Set everything to "pick automatic values". */
3078
3079         *m = (JournalMetrics) {
3080                 .min_use = (uint64_t) -1,
3081                 .max_use = (uint64_t) -1,
3082                 .min_size = (uint64_t) -1,
3083                 .max_size = (uint64_t) -1,
3084                 .keep_free = (uint64_t) -1,
3085                 .n_max_files = (uint64_t) -1,
3086         };
3087 }
3088
3089 void journal_default_metrics(JournalMetrics *m, int fd) {
3090         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3091         struct statvfs ss;
3092         uint64_t fs_size;
3093
3094         assert(m);
3095         assert(fd >= 0);
3096
3097         if (fstatvfs(fd, &ss) >= 0)
3098                 fs_size = ss.f_frsize * ss.f_blocks;
3099         else {
3100                 log_debug_errno(errno, "Failed to detremine disk size: %m");
3101                 fs_size = 0;
3102         }
3103
3104         if (m->max_use == (uint64_t) -1) {
3105
3106                 if (fs_size > 0) {
3107                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3108
3109                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
3110                                 m->max_use = DEFAULT_MAX_USE_UPPER;
3111
3112                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
3113                                 m->max_use = DEFAULT_MAX_USE_LOWER;
3114                 } else
3115                         m->max_use = DEFAULT_MAX_USE_LOWER;
3116         } else {
3117                 m->max_use = PAGE_ALIGN(m->max_use);
3118
3119                 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3120                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3121         }
3122
3123         if (m->min_use == (uint64_t) -1)
3124                 m->min_use = DEFAULT_MIN_USE;
3125
3126         if (m->min_use > m->max_use)
3127                 m->min_use = m->max_use;
3128
3129         if (m->max_size == (uint64_t) -1) {
3130                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3131
3132                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3133                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
3134         } else
3135                 m->max_size = PAGE_ALIGN(m->max_size);
3136
3137         if (m->max_size != 0) {
3138                 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3139                         m->max_size = JOURNAL_FILE_SIZE_MIN;
3140
3141                 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3142                         m->max_use = m->max_size*2;
3143         }
3144
3145         if (m->min_size == (uint64_t) -1)
3146                 m->min_size = JOURNAL_FILE_SIZE_MIN;
3147         else {
3148                 m->min_size = PAGE_ALIGN(m->min_size);
3149
3150                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3151                         m->min_size = JOURNAL_FILE_SIZE_MIN;
3152
3153                 if (m->max_size != 0 && m->min_size > m->max_size)
3154                         m->max_size = m->min_size;
3155         }
3156
3157         if (m->keep_free == (uint64_t) -1) {
3158
3159                 if (fs_size > 0) {
3160                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3161
3162                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3163                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3164
3165                 } else
3166                         m->keep_free = DEFAULT_KEEP_FREE;
3167         }
3168
3169         if (m->n_max_files == (uint64_t) -1)
3170                 m->n_max_files = DEFAULT_N_MAX_FILES;
3171
3172         log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3173                   format_bytes(a, sizeof(a), m->min_use),
3174                   format_bytes(b, sizeof(b), m->max_use),
3175                   format_bytes(c, sizeof(c), m->max_size),
3176                   format_bytes(d, sizeof(d), m->min_size),
3177                   format_bytes(e, sizeof(e), m->keep_free),
3178                   m->n_max_files);
3179 }
3180
3181 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3182         assert(f);
3183         assert(from || to);
3184
3185         if (from) {
3186                 if (f->header->head_entry_realtime == 0)
3187                         return -ENOENT;
3188
3189                 *from = le64toh(f->header->head_entry_realtime);
3190         }
3191
3192         if (to) {
3193                 if (f->header->tail_entry_realtime == 0)
3194                         return -ENOENT;
3195
3196                 *to = le64toh(f->header->tail_entry_realtime);
3197         }
3198
3199         return 1;
3200 }
3201
3202 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3203         Object *o;
3204         uint64_t p;
3205         int r;
3206
3207         assert(f);
3208         assert(from || to);
3209
3210         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3211         if (r <= 0)
3212                 return r;
3213
3214         if (le64toh(o->data.n_entries) <= 0)
3215                 return 0;
3216
3217         if (from) {
3218                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3219                 if (r < 0)
3220                         return r;
3221
3222                 *from = le64toh(o->entry.monotonic);
3223         }
3224
3225         if (to) {
3226                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3227                 if (r < 0)
3228                         return r;
3229
3230                 r = generic_array_get_plus_one(f,
3231                                                le64toh(o->data.entry_offset),
3232                                                le64toh(o->data.entry_array_offset),
3233                                                le64toh(o->data.n_entries)-1,
3234                                                &o, NULL);
3235                 if (r <= 0)
3236                         return r;
3237
3238                 *to = le64toh(o->entry.monotonic);
3239         }
3240
3241         return 1;
3242 }
3243
3244 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3245         assert(f);
3246
3247         /* If we gained new header fields we gained new features,
3248          * hence suggest a rotation */
3249         if (le64toh(f->header->header_size) < sizeof(Header)) {
3250                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3251                 return true;
3252         }
3253
3254         /* Let's check if the hash tables grew over a certain fill
3255          * level (75%, borrowing this value from Java's hash table
3256          * implementation), and if so suggest a rotation. To calculate
3257          * the fill level we need the n_data field, which only exists
3258          * in newer versions. */
3259
3260         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3261                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3262                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3263                                   f->path,
3264                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3265                                   le64toh(f->header->n_data),
3266                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3267                                   (unsigned long long) f->last_stat.st_size,
3268                                   f->last_stat.st_size / le64toh(f->header->n_data));
3269                         return true;
3270                 }
3271
3272         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3273                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3274                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3275                                   f->path,
3276                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3277                                   le64toh(f->header->n_fields),
3278                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3279                         return true;
3280                 }
3281
3282         /* Are the data objects properly indexed by field objects? */
3283         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3284             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3285             le64toh(f->header->n_data) > 0 &&
3286             le64toh(f->header->n_fields) == 0)
3287                 return true;
3288
3289         if (max_file_usec > 0) {
3290                 usec_t t, h;
3291
3292                 h = le64toh(f->header->head_entry_realtime);
3293                 t = now(CLOCK_REALTIME);
3294
3295                 if (h > 0 && t > h + max_file_usec)
3296                         return true;
3297         }
3298
3299         return false;
3300 }