src/journal/journal-file.c

   1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
   2
   3 /***
   4   This file is part of systemd.
   5
   6   Copyright 2011 Lennart Poettering
   7
   8   systemd is free software; you can redistribute it and/or modify it
   9   under the terms of the GNU Lesser General Public License as published by
  10   the Free Software Foundation; either version 2.1 of the License, or
  11   (at your option) any later version.
  12
  13   systemd is distributed in the hope that it will be useful, but
  14   WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16   Lesser General Public License for more details.
  17
  18   You should have received a copy of the GNU Lesser General Public License
  19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  20 ***/
  21
  22 #include <errno.h>
  23 #include <fcntl.h>
  24 #include <linux/fs.h>
  25 #include <stddef.h>
  26 #include <sys/mman.h>
  27 #include <sys/statvfs.h>
  28 #include <sys/uio.h>
  29 #include <unistd.h>
  30
  31 #include "alloc-util.h"
  32 #include "btrfs-util.h"
  33 #include "chattr-util.h"
  34 #include "compress.h"
  35 #include "fd-util.h"
  36 #include "journal-authenticate.h"
  37 #include "journal-def.h"
  38 #include "journal-file.h"
  39 #include "lookup3.h"
  40 #include "parse-util.h"
  41 #include "random-util.h"
  42 #include "sd-event.h"
  43 #include "string-util.h"
  44 #include "xattr-util.h"
  45
/* Hash table sizes in bytes, expressed as a number of HashItem slots: the
 * lower bound for the data hash table, and the fixed size of the field hash
 * table. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* NOTE(review): presumably the minimum payload size (in bytes) for which
 * compression is attempted; not used in the visible part of this file,
 * confirm against the users of this macro. */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL)                 /* 512 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the default minimal use limit, i.e. how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design,
 * hence a valid header must at least reach up to it */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8 MiB */

/* Re-read the fstat() data of the file at least this often, in order to detect deletions */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header: we pick the one right above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
  90
  91 static int journal_file_set_online(JournalFile *f) {
  92         assert(f);
  93
  94         if (!f->writable)
  95                 return -EPERM;
  96
  97         if (!(f->fd >= 0 && f->header))
  98                 return -EINVAL;
  99
 100         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 101                 return -EIO;
 102
 103         switch(f->header->state) {
 104                 case STATE_ONLINE:
 105                         return 0;
 106
 107                 case STATE_OFFLINE:
 108                         f->header->state = STATE_ONLINE;
 109                         fsync(f->fd);
 110                         return 0;
 111
 112                 default:
 113                         return -EINVAL;
 114         }
 115 }
 116
 117 int journal_file_set_offline(JournalFile *f) {
 118         assert(f);
 119
 120         if (!f->writable)
 121                 return -EPERM;
 122
 123         if (!(f->fd >= 0 && f->header))
 124                 return -EINVAL;
 125
 126         if (f->header->state != STATE_ONLINE)
 127                 return 0;
 128
 129         fsync(f->fd);
 130
 131         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 132                 return -EIO;
 133
 134         f->header->state = STATE_OFFLINE;
 135
 136         if (mmap_cache_got_sigbus(f->mmap, f->fd))
 137                 return -EIO;
 138
 139         fsync(f->fd);
 140
 141         return 0;
 142 }
 143
 144 JournalFile* journal_file_close(JournalFile *f) {
 145         assert(f);
 146
 147 #ifdef HAVE_GCRYPT
 148         /* Write the final tag */
 149         if (f->seal && f->writable)
 150                 journal_file_append_tag(f);
 151 #endif
 152
 153         if (f->post_change_timer) {
 154                 int enabled;
 155
 156                 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
 157                         if (enabled == SD_EVENT_ONESHOT)
 158                                 journal_file_post_change(f);
 159
 160                 sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
 161                 sd_event_source_unref(f->post_change_timer);
 162         }
 163
 164         journal_file_set_offline(f);
 165
 166         if (f->mmap && f->fd >= 0)
 167                 mmap_cache_close_fd(f->mmap, f->fd);
 168
 169         if (f->fd >= 0 && f->defrag_on_close) {
 170
 171                 /* Be friendly to btrfs: turn COW back on again now,
 172                  * and defragment the file. We won't write to the file
 173                  * ever again, hence remove all fragmentation, and
 174                  * reenable all the good bits COW usually provides
 175                  * (such as data checksumming). */
 176
 177                 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
 178                 (void) btrfs_defrag_fd(f->fd);
 179         }
 180
 181         safe_close(f->fd);
 182         free(f->path);
 183
 184         mmap_cache_unref(f->mmap);
 185
 186         ordered_hashmap_free_free(f->chain_cache);
 187
 188 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 189         free(f->compress_buffer);
 190 #endif
 191
 192 #ifdef HAVE_GCRYPT
 193         if (f->fss_file)
 194                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
 195         else
 196                 free(f->fsprg_state);
 197
 198         free(f->fsprg_seed);
 199
 200         if (f->hmac)
 201                 gcry_md_close(f->hmac);
 202 #endif
 203
 204         free(f);
 205         return NULL;
 206 }
 207
 208 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
 209         Header h = {};
 210         ssize_t k;
 211         int r;
 212
 213         assert(f);
 214
 215         memcpy(h.signature, HEADER_SIGNATURE, 8);
 216         h.header_size = htole64(ALIGN64(sizeof(h)));
 217
 218         h.incompatible_flags |= htole32(
 219                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
 220                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
 221
 222         h.compatible_flags = htole32(
 223                 f->seal * HEADER_COMPATIBLE_SEALED);
 224
 225         r = sd_id128_randomize(&h.file_id);
 226         if (r < 0)
 227                 return r;
 228
 229         if (template) {
 230                 h.seqnum_id = template->header->seqnum_id;
 231                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
 232         } else
 233                 h.seqnum_id = h.file_id;
 234
 235         k = pwrite(f->fd, &h, sizeof(h), 0);
 236         if (k < 0)
 237                 return -errno;
 238
 239         if (k != sizeof(h))
 240                 return -EIO;
 241
 242         return 0;
 243 }
 244
 245 static int journal_file_refresh_header(JournalFile *f) {
 246         sd_id128_t boot_id;
 247         int r;
 248
 249         assert(f);
 250
 251         r = sd_id128_get_machine(&f->header->machine_id);
 252         if (r < 0)
 253                 return r;
 254
 255         r = sd_id128_get_boot(&boot_id);
 256         if (r < 0)
 257                 return r;
 258
 259         if (sd_id128_equal(boot_id, f->header->boot_id))
 260                 f->tail_entry_monotonic_valid = true;
 261
 262         f->header->boot_id = boot_id;
 263
 264         r = journal_file_set_online(f);
 265
 266         /* Sync the online state to disk */
 267         fsync(f->fd);
 268
 269         return r;
 270 }
 271
 272 static int journal_file_verify_header(JournalFile *f) {
 273         uint32_t flags;
 274
 275         assert(f);
 276
 277         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
 278                 return -EBADMSG;
 279
 280         /* In both read and write mode we refuse to open files with
 281          * incompatible flags we don't know */
 282         flags = le32toh(f->header->incompatible_flags);
 283         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
 284                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
 285                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
 286                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
 287                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
 288                 if (flags)
 289                         log_debug("Journal file %s uses incompatible flags %"PRIx32
 290                                   " disabled at compilation time.", f->path, flags);
 291                 return -EPROTONOSUPPORT;
 292         }
 293
 294         /* When open for writing we refuse to open files with
 295          * compatible flags, too */
 296         flags = le32toh(f->header->compatible_flags);
 297         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
 298                 if (flags & ~HEADER_COMPATIBLE_ANY)
 299                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
 300                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
 301                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
 302                 if (flags)
 303                         log_debug("Journal file %s uses compatible flags %"PRIx32
 304                                   " disabled at compilation time.", f->path, flags);
 305                 return -EPROTONOSUPPORT;
 306         }
 307
 308         if (f->header->state >= _STATE_MAX)
 309                 return -EBADMSG;
 310
 311         /* The first addition was n_data, so check that we are at least this large */
 312         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
 313                 return -EBADMSG;
 314
 315         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
 316                 return -EBADMSG;
 317
 318         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
 319                 return -ENODATA;
 320
 321         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
 322                 return -ENODATA;
 323
 324         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
 325             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
 326             !VALID64(le64toh(f->header->tail_object_offset)) ||
 327             !VALID64(le64toh(f->header->entry_array_offset)))
 328                 return -ENODATA;
 329
 330         if (f->writable) {
 331                 uint8_t state;
 332                 sd_id128_t machine_id;
 333                 int r;
 334
 335                 r = sd_id128_get_machine(&machine_id);
 336                 if (r < 0)
 337                         return r;
 338
 339                 if (!sd_id128_equal(machine_id, f->header->machine_id))
 340                         return -EHOSTDOWN;
 341
 342                 state = f->header->state;
 343
 344                 if (state == STATE_ONLINE) {
 345                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
 346                         return -EBUSY;
 347                 } else if (state == STATE_ARCHIVED)
 348                         return -ESHUTDOWN;
 349                 else if (state != STATE_OFFLINE) {
 350                         log_debug("Journal file %s has unknown state %i.", f->path, state);
 351                         return -EBUSY;
 352                 }
 353         }
 354
 355         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
 356         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
 357
 358         f->seal = JOURNAL_HEADER_SEALED(f->header);
 359
 360         return 0;
 361 }
 362
 363 static int journal_file_fstat(JournalFile *f) {
 364         assert(f);
 365         assert(f->fd >= 0);
 366
 367         if (fstat(f->fd, &f->last_stat) < 0)
 368                 return -errno;
 369
 370         f->last_stat_usec = now(CLOCK_MONOTONIC);
 371
 372         /* Refuse appending to files that are already deleted */
 373         if (f->last_stat.st_nlink <= 0)
 374                 return -EIDRM;
 375
 376         return 0;
 377 }
 378
/* Makes sure the file is large enough to hold size bytes at the given offset,
 * growing it with posix_fallocate() if necessary.
 *
 * Returns 0 on success, -EIO if the mmap cache noticed a SIGBUS for this fd,
 * -E2BIG if growing the file would exceed the max_size metric or eat into the
 * keep_free reserve of the file system, the (negated) posix_fallocate() error
 * if allocation fails, or an error of journal_file_fstat() (e.g. -EIDRM if
 * the file was deleted). */
static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
        uint64_t old_size, new_size;
        int r;

        assert(f);

        /* We assume that this file is not sparse, and we know that
         * for sure, since we always call posix_fallocate()
         * ourselves */

        if (mmap_cache_got_sigbus(f->mmap, f->fd))
                return -EIO;

        /* The size currently accounted for in the header */
        old_size =
                le64toh(f->header->header_size) +
                le64toh(f->header->arena_size);

        /* The size we need, rounded up to full pages, but never smaller than the header */
        new_size = PAGE_ALIGN(offset + size);
        if (new_size < le64toh(f->header->header_size))
                new_size = le64toh(f->header->header_size);

        if (new_size <= old_size) {

                /* We already pre-allocated enough space, but before
                 * we write to it, let's check with fstat() if the
                 * file got deleted, in order to make sure we don't throw
                 * away the data immediately. Don't check fstat() for
                 * all writes though, but at most once every
                 * LAST_STAT_REFRESH_USEC. */

                if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
                        return 0;

                return journal_file_fstat(f);
        }

        /* Allocate more space. */

        if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
                return -E2BIG;

        /* Beyond min_size we honour the keep_free reserve of the file system.
         * If fstatvfs() fails this check is silently skipped. */
        if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
                struct statvfs svfs;

                if (fstatvfs(f->fd, &svfs) >= 0) {
                        uint64_t available;

                        available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);

                        if (new_size - old_size > available)
                                return -E2BIG;
                }
        }

        /* Increase by larger blocks at once, i.e. round up to a multiple of
         * FILE_SIZE_INCREASE, but never go beyond max_size */
        new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
        if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
                new_size = f->metrics.max_size;

        /* Note that the glibc fallocate() fallback is very
           inefficient, hence we try to minimize the allocation area
           as we can. posix_fallocate() returns a positive error
           number on failure, hence negate it. */
        r = posix_fallocate(f->fd, old_size, new_size - old_size);
        if (r != 0)
                return -r;

        f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));

        /* Refresh the cached stat data, so that it reflects the new file size */
        return journal_file_fstat(f);
}
 448
 449 static unsigned type_to_context(ObjectType type) {
 450         /* One context for each type, plus one catch-all for the rest */
 451         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
 452         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
 453         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
 454 }
 455
 456 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
 457         int r;
 458
 459         assert(f);
 460         assert(ret);
 461
 462         if (size <= 0)
 463                 return -EINVAL;
 464
 465         /* Avoid SIGBUS on invalid accesses */
 466         if (offset + size > (uint64_t) f->last_stat.st_size) {
 467                 /* Hmm, out of range? Let's refresh the fstat() data
 468                  * first, before we trust that check. */
 469
 470                 r = journal_file_fstat(f);
 471                 if (r < 0)
 472                         return r;
 473
 474                 if (offset + size > (uint64_t) f->last_stat.st_size)
 475                         return -EADDRNOTAVAIL;
 476         }
 477
 478         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
 479 }
 480
 481 static uint64_t minimum_header_size(Object *o) {
 482
 483         static const uint64_t table[] = {
 484                 [OBJECT_DATA] = sizeof(DataObject),
 485                 [OBJECT_FIELD] = sizeof(FieldObject),
 486                 [OBJECT_ENTRY] = sizeof(EntryObject),
 487                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
 488                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
 489                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
 490                 [OBJECT_TAG] = sizeof(TagObject),
 491         };
 492
 493         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
 494                 return sizeof(ObjectHeader);
 495
 496         return table[o->object.type];
 497 }
 498
 499 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
 500         int r;
 501         void *t;
 502         Object *o;
 503         uint64_t s;
 504
 505         assert(f);
 506         assert(ret);
 507
 508         /* Objects may only be located at multiple of 64 bit */
 509         if (!VALID64(offset))
 510                 return -EFAULT;
 511
 512         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
 513         if (r < 0)
 514                 return r;
 515
 516         o = (Object*) t;
 517         s = le64toh(o->object.size);
 518
 519         if (s < sizeof(ObjectHeader))
 520                 return -EBADMSG;
 521
 522         if (o->object.type <= OBJECT_UNUSED)
 523                 return -EBADMSG;
 524
 525         if (s < minimum_header_size(o))
 526                 return -EBADMSG;
 527
 528         if (type > OBJECT_UNUSED && o->object.type != type)
 529                 return -EBADMSG;
 530
 531         if (s > sizeof(ObjectHeader)) {
 532                 r = journal_file_move_to(f, type, false, offset, s, &t);
 533                 if (r < 0)
 534                         return r;
 535
 536                 o = (Object*) t;
 537         }
 538
 539         *ret = o;
 540         return 0;
 541 }
 542
 543 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
 544         uint64_t r;
 545
 546         assert(f);
 547
 548         r = le64toh(f->header->tail_entry_seqnum) + 1;
 549
 550         if (seqnum) {
 551                 /* If an external seqnum counter was passed, we update
 552                  * both the local and the external one, and set it to
 553                  * the maximum of both */
 554
 555                 if (*seqnum + 1 > r)
 556                         r = *seqnum + 1;
 557
 558                 *seqnum = r;
 559         }
 560
 561         f->header->tail_entry_seqnum = htole64(r);
 562
 563         if (f->header->head_entry_seqnum == 0)
 564                 f->header->head_entry_seqnum = htole64(r);
 565
 566         return r;
 567 }
 568
 569 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
 570         int r;
 571         uint64_t p;
 572         Object *tail, *o;
 573         void *t;
 574
 575         assert(f);
 576         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
 577         assert(size >= sizeof(ObjectHeader));
 578         assert(offset);
 579         assert(ret);
 580
 581         r = journal_file_set_online(f);
 582         if (r < 0)
 583                 return r;
 584
 585         p = le64toh(f->header->tail_object_offset);
 586         if (p == 0)
 587                 p = le64toh(f->header->header_size);
 588         else {
 589                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
 590                 if (r < 0)
 591                         return r;
 592
 593                 p += ALIGN64(le64toh(tail->object.size));
 594         }
 595
 596         r = journal_file_allocate(f, p, size);
 597         if (r < 0)
 598                 return r;
 599
 600         r = journal_file_move_to(f, type, false, p, size, &t);
 601         if (r < 0)
 602                 return r;
 603
 604         o = (Object*) t;
 605
 606         zero(o->object);
 607         o->object.type = type;
 608         o->object.size = htole64(size);
 609
 610         f->header->tail_object_offset = htole64(p);
 611         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
 612
 613         *ret = o;
 614         *offset = p;
 615
 616         return 0;
 617 }
 618
 619 static int journal_file_setup_data_hash_table(JournalFile *f) {
 620         uint64_t s, p;
 621         Object *o;
 622         int r;
 623
 624         assert(f);
 625
 626         /* We estimate that we need 1 hash table entry per 768 bytes
 627            of journal file and we want to make sure we never get
 628            beyond 75% fill level. Calculate the hash table size for
 629            the maximum file size based on these metrics. */
 630
 631         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
 632         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
 633                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
 634
 635         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
 636
 637         r = journal_file_append_object(f,
 638                                        OBJECT_DATA_HASH_TABLE,
 639                                        offsetof(Object, hash_table.items) + s,
 640                                        &o, &p);
 641         if (r < 0)
 642                 return r;
 643
 644         memzero(o->hash_table.items, s);
 645
 646         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 647         f->header->data_hash_table_size = htole64(s);
 648
 649         return 0;
 650 }
 651
 652 static int journal_file_setup_field_hash_table(JournalFile *f) {
 653         uint64_t s, p;
 654         Object *o;
 655         int r;
 656
 657         assert(f);
 658
 659         /* We use a fixed size hash table for the fields as this
 660          * number should grow very slowly only */
 661
 662         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
 663         r = journal_file_append_object(f,
 664                                        OBJECT_FIELD_HASH_TABLE,
 665                                        offsetof(Object, hash_table.items) + s,
 666                                        &o, &p);
 667         if (r < 0)
 668                 return r;
 669
 670         memzero(o->hash_table.items, s);
 671
 672         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
 673         f->header->field_hash_table_size = htole64(s);
 674
 675         return 0;
 676 }
 677
 678 int journal_file_map_data_hash_table(JournalFile *f) {
 679         uint64_t s, p;
 680         void *t;
 681         int r;
 682
 683         assert(f);
 684
 685         if (f->data_hash_table)
 686                 return 0;
 687
 688         p = le64toh(f->header->data_hash_table_offset);
 689         s = le64toh(f->header->data_hash_table_size);
 690
 691         r = journal_file_move_to(f,
 692                                  OBJECT_DATA_HASH_TABLE,
 693                                  true,
 694                                  p, s,
 695                                  &t);
 696         if (r < 0)
 697                 return r;
 698
 699         f->data_hash_table = t;
 700         return 0;
 701 }
 702
 703 int journal_file_map_field_hash_table(JournalFile *f) {
 704         uint64_t s, p;
 705         void *t;
 706         int r;
 707
 708         assert(f);
 709
 710         if (f->field_hash_table)
 711                 return 0;
 712
 713         p = le64toh(f->header->field_hash_table_offset);
 714         s = le64toh(f->header->field_hash_table_size);
 715
 716         r = journal_file_move_to(f,
 717                                  OBJECT_FIELD_HASH_TABLE,
 718                                  true,
 719                                  p, s,
 720                                  &t);
 721         if (r < 0)
 722                 return r;
 723
 724         f->field_hash_table = t;
 725         return 0;
 726 }
 727
 728 static int journal_file_link_field(
 729                 JournalFile *f,
 730                 Object *o,
 731                 uint64_t offset,
 732                 uint64_t hash) {
 733
 734         uint64_t p, h, m;
 735         int r;
 736
 737         assert(f);
 738         assert(o);
 739         assert(offset > 0);
 740
 741         if (o->object.type != OBJECT_FIELD)
 742                 return -EINVAL;
 743
 744         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 745         if (m <= 0)
 746                 return -EBADMSG;
 747
 748         /* This might alter the window we are looking at */
 749         o->field.next_hash_offset = o->field.head_data_offset = 0;
 750
 751         h = hash % m;
 752         p = le64toh(f->field_hash_table[h].tail_hash_offset);
 753         if (p == 0)
 754                 f->field_hash_table[h].head_hash_offset = htole64(offset);
 755         else {
 756                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 757                 if (r < 0)
 758                         return r;
 759
 760                 o->field.next_hash_offset = htole64(offset);
 761         }
 762
 763         f->field_hash_table[h].tail_hash_offset = htole64(offset);
 764
 765         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
 766                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
 767
 768         return 0;
 769 }
 770
 771 static int journal_file_link_data(
 772                 JournalFile *f,
 773                 Object *o,
 774                 uint64_t offset,
 775                 uint64_t hash) {
 776
 777         uint64_t p, h, m;
 778         int r;
 779
 780         assert(f);
 781         assert(o);
 782         assert(offset > 0);
 783
 784         if (o->object.type != OBJECT_DATA)
 785                 return -EINVAL;
 786
 787         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 788         if (m <= 0)
 789                 return -EBADMSG;
 790
 791         /* This might alter the window we are looking at */
 792         o->data.next_hash_offset = o->data.next_field_offset = 0;
 793         o->data.entry_offset = o->data.entry_array_offset = 0;
 794         o->data.n_entries = 0;
 795
 796         h = hash % m;
 797         p = le64toh(f->data_hash_table[h].tail_hash_offset);
 798         if (p == 0)
 799                 /* Only entry in the hash table is easy */
 800                 f->data_hash_table[h].head_hash_offset = htole64(offset);
 801         else {
 802                 /* Move back to the previous data object, to patch in
 803                  * pointer */
 804
 805                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 806                 if (r < 0)
 807                         return r;
 808
 809                 o->data.next_hash_offset = htole64(offset);
 810         }
 811
 812         f->data_hash_table[h].tail_hash_offset = htole64(offset);
 813
 814         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
 815                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
 816
 817         return 0;
 818 }
 819
 820 int journal_file_find_field_object_with_hash(
 821                 JournalFile *f,
 822                 const void *field, uint64_t size, uint64_t hash,
 823                 Object **ret, uint64_t *offset) {
 824
 825         uint64_t p, osize, h, m;
 826         int r;
 827
 828         assert(f);
 829         assert(field && size > 0);
 830
 831         /* If the field hash table is empty, we can't find anything */
 832         if (le64toh(f->header->field_hash_table_size) <= 0)
 833                 return 0;
 834
 835         /* Map the field hash table, if it isn't mapped yet. */
 836         r = journal_file_map_field_hash_table(f);
 837         if (r < 0)
 838                 return r;
 839
 840         osize = offsetof(Object, field.payload) + size;
 841
 842         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
 843         if (m <= 0)
 844                 return -EBADMSG;
 845
 846         h = hash % m;
 847         p = le64toh(f->field_hash_table[h].head_hash_offset);
 848
 849         while (p > 0) {
 850                 Object *o;
 851
 852                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
 853                 if (r < 0)
 854                         return r;
 855
 856                 if (le64toh(o->field.hash) == hash &&
 857                     le64toh(o->object.size) == osize &&
 858                     memcmp(o->field.payload, field, size) == 0) {
 859
 860                         if (ret)
 861                                 *ret = o;
 862                         if (offset)
 863                                 *offset = p;
 864
 865                         return 1;
 866                 }
 867
 868                 p = le64toh(o->field.next_hash_offset);
 869         }
 870
 871         return 0;
 872 }
 873
 874 int journal_file_find_field_object(
 875                 JournalFile *f,
 876                 const void *field, uint64_t size,
 877                 Object **ret, uint64_t *offset) {
 878
 879         uint64_t hash;
 880
 881         assert(f);
 882         assert(field && size > 0);
 883
 884         hash = hash64(field, size);
 885
 886         return journal_file_find_field_object_with_hash(f,
 887                                                         field, size, hash,
 888                                                         ret, offset);
 889 }
 890
 891 int journal_file_find_data_object_with_hash(
 892                 JournalFile *f,
 893                 const void *data, uint64_t size, uint64_t hash,
 894                 Object **ret, uint64_t *offset) {
 895
 896         uint64_t p, osize, h, m;
 897         int r;
 898
 899         assert(f);
 900         assert(data || size == 0);
 901
 902         /* If there's no data hash table, then there's no entry. */
 903         if (le64toh(f->header->data_hash_table_size) <= 0)
 904                 return 0;
 905
 906         /* Map the data hash table, if it isn't mapped yet. */
 907         r = journal_file_map_data_hash_table(f);
 908         if (r < 0)
 909                 return r;
 910
 911         osize = offsetof(Object, data.payload) + size;
 912
 913         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
 914         if (m <= 0)
 915                 return -EBADMSG;
 916
 917         h = hash % m;
 918         p = le64toh(f->data_hash_table[h].head_hash_offset);
 919
 920         while (p > 0) {
 921                 Object *o;
 922
 923                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
 924                 if (r < 0)
 925                         return r;
 926
 927                 if (le64toh(o->data.hash) != hash)
 928                         goto next;
 929
 930                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
 931 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
 932                         uint64_t l;
 933                         size_t rsize = 0;
 934
 935                         l = le64toh(o->object.size);
 936                         if (l <= offsetof(Object, data.payload))
 937                                 return -EBADMSG;
 938
 939                         l -= offsetof(Object, data.payload);
 940
 941                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
 942                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
 943                         if (r < 0)
 944                                 return r;
 945
 946                         if (rsize == size &&
 947                             memcmp(f->compress_buffer, data, size) == 0) {
 948
 949                                 if (ret)
 950                                         *ret = o;
 951
 952                                 if (offset)
 953                                         *offset = p;
 954
 955                                 return 1;
 956                         }
 957 #else
 958                         return -EPROTONOSUPPORT;
 959 #endif
 960                 } else if (le64toh(o->object.size) == osize &&
 961                            memcmp(o->data.payload, data, size) == 0) {
 962
 963                         if (ret)
 964                                 *ret = o;
 965
 966                         if (offset)
 967                                 *offset = p;
 968
 969                         return 1;
 970                 }
 971
 972         next:
 973                 p = le64toh(o->data.next_hash_offset);
 974         }
 975
 976         return 0;
 977 }
 978
 979 int journal_file_find_data_object(
 980                 JournalFile *f,
 981                 const void *data, uint64_t size,
 982                 Object **ret, uint64_t *offset) {
 983
 984         uint64_t hash;
 985
 986         assert(f);
 987         assert(data || size == 0);
 988
 989         hash = hash64(data, size);
 990
 991         return journal_file_find_data_object_with_hash(f,
 992                                                        data, size, hash,
 993                                                        ret, offset);
 994 }
 995
 996 static int journal_file_append_field(
 997                 JournalFile *f,
 998                 const void *field, uint64_t size,
 999                 Object **ret, uint64_t *offset) {
1000
1001         uint64_t hash, p;
1002         uint64_t osize;
1003         Object *o;
1004         int r;
1005
1006         assert(f);
1007         assert(field && size > 0);
1008
1009         hash = hash64(field, size);
1010
1011         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1012         if (r < 0)
1013                 return r;
1014         else if (r > 0) {
1015
1016                 if (ret)
1017                         *ret = o;
1018
1019                 if (offset)
1020                         *offset = p;
1021
1022                 return 0;
1023         }
1024
1025         osize = offsetof(Object, field.payload) + size;
1026         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1027         if (r < 0)
1028                 return r;
1029
1030         o->field.hash = htole64(hash);
1031         memcpy(o->field.payload, field, size);
1032
1033         r = journal_file_link_field(f, o, p, hash);
1034         if (r < 0)
1035                 return r;
1036
1037         /* The linking might have altered the window, so let's
1038          * refresh our pointer */
1039         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1040         if (r < 0)
1041                 return r;
1042
1043 #ifdef HAVE_GCRYPT
1044         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1045         if (r < 0)
1046                 return r;
1047 #endif
1048
1049         if (ret)
1050                 *ret = o;
1051
1052         if (offset)
1053                 *offset = p;
1054
1055         return 0;
1056 }
1057
1058 static int journal_file_append_data(
1059                 JournalFile *f,
1060                 const void *data, uint64_t size,
1061                 Object **ret, uint64_t *offset) {
1062
1063         uint64_t hash, p;
1064         uint64_t osize;
1065         Object *o;
1066         int r, compression = 0;
1067         const void *eq;
1068
1069         assert(f);
1070         assert(data || size == 0);
1071
1072         hash = hash64(data, size);
1073
1074         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1075         if (r < 0)
1076                 return r;
1077         if (r > 0) {
1078
1079                 if (ret)
1080                         *ret = o;
1081
1082                 if (offset)
1083                         *offset = p;
1084
1085                 return 0;
1086         }
1087
1088         osize = offsetof(Object, data.payload) + size;
1089         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1090         if (r < 0)
1091                 return r;
1092
1093         o->data.hash = htole64(hash);
1094
1095 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1096         if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1097                 size_t rsize = 0;
1098
1099                 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1100
1101                 if (compression >= 0) {
1102                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1103                         o->object.flags |= compression;
1104
1105                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1106                                   size, rsize, object_compressed_to_string(compression));
1107                 } else
1108                         /* Compression didn't work, we don't really care why, let's continue without compression */
1109                         compression = 0;
1110         }
1111 #endif
1112
1113         if (compression == 0 && size > 0)
1114                 memcpy(o->data.payload, data, size);
1115
1116         r = journal_file_link_data(f, o, p, hash);
1117         if (r < 0)
1118                 return r;
1119
1120         /* The linking might have altered the window, so let's
1121          * refresh our pointer */
1122         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1123         if (r < 0)
1124                 return r;
1125
1126         if (!data)
1127                 eq = NULL;
1128         else
1129                 eq = memchr(data, '=', size);
1130         if (eq && eq > data) {
1131                 Object *fo = NULL;
1132                 uint64_t fp;
1133
1134                 /* Create field object ... */
1135                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1136                 if (r < 0)
1137                         return r;
1138
1139                 /* ... and link it in. */
1140                 o->data.next_field_offset = fo->field.head_data_offset;
1141                 fo->field.head_data_offset = le64toh(p);
1142         }
1143
1144 #ifdef HAVE_GCRYPT
1145         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1146         if (r < 0)
1147                 return r;
1148 #endif
1149
1150         if (ret)
1151                 *ret = o;
1152
1153         if (offset)
1154                 *offset = p;
1155
1156         return 0;
1157 }
1158
1159 uint64_t journal_file_entry_n_items(Object *o) {
1160         assert(o);
1161
1162         if (o->object.type != OBJECT_ENTRY)
1163                 return 0;
1164
1165         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1166 }
1167
1168 uint64_t journal_file_entry_array_n_items(Object *o) {
1169         assert(o);
1170
1171         if (o->object.type != OBJECT_ENTRY_ARRAY)
1172                 return 0;
1173
1174         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1175 }
1176
1177 uint64_t journal_file_hash_table_n_items(Object *o) {
1178         assert(o);
1179
1180         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1181             o->object.type != OBJECT_FIELD_HASH_TABLE)
1182                 return 0;
1183
1184         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1185 }
1186
1187 static int link_entry_into_array(JournalFile *f,
1188                                  le64_t *first,
1189                                  le64_t *idx,
1190                                  uint64_t p) {
1191         int r;
1192         uint64_t n = 0, ap = 0, q, i, a, hidx;
1193         Object *o;
1194
1195         assert(f);
1196         assert(first);
1197         assert(idx);
1198         assert(p > 0);
1199
1200         a = le64toh(*first);
1201         i = hidx = le64toh(*idx);
1202         while (a > 0) {
1203
1204                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1205                 if (r < 0)
1206                         return r;
1207
1208                 n = journal_file_entry_array_n_items(o);
1209                 if (i < n) {
1210                         o->entry_array.items[i] = htole64(p);
1211                         *idx = htole64(hidx + 1);
1212                         return 0;
1213                 }
1214
1215                 i -= n;
1216                 ap = a;
1217                 a = le64toh(o->entry_array.next_entry_array_offset);
1218         }
1219
1220         if (hidx > n)
1221                 n = (hidx+1) * 2;
1222         else
1223                 n = n * 2;
1224
1225         if (n < 4)
1226                 n = 4;
1227
1228         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1229                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1230                                        &o, &q);
1231         if (r < 0)
1232                 return r;
1233
1234 #ifdef HAVE_GCRYPT
1235         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1236         if (r < 0)
1237                 return r;
1238 #endif
1239
1240         o->entry_array.items[i] = htole64(p);
1241
1242         if (ap == 0)
1243                 *first = htole64(q);
1244         else {
1245                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1246                 if (r < 0)
1247                         return r;
1248
1249                 o->entry_array.next_entry_array_offset = htole64(q);
1250         }
1251
1252         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1253                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1254
1255         *idx = htole64(hidx + 1);
1256
1257         return 0;
1258 }
1259
1260 static int link_entry_into_array_plus_one(JournalFile *f,
1261                                           le64_t *extra,
1262                                           le64_t *first,
1263                                           le64_t *idx,
1264                                           uint64_t p) {
1265
1266         int r;
1267
1268         assert(f);
1269         assert(extra);
1270         assert(first);
1271         assert(idx);
1272         assert(p > 0);
1273
1274         if (*idx == 0)
1275                 *extra = htole64(p);
1276         else {
1277                 le64_t i;
1278
1279                 i = htole64(le64toh(*idx) - 1);
1280                 r = link_entry_into_array(f, first, &i, p);
1281                 if (r < 0)
1282                         return r;
1283         }
1284
1285         *idx = htole64(le64toh(*idx) + 1);
1286         return 0;
1287 }
1288
1289 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1290         uint64_t p;
1291         int r;
1292         assert(f);
1293         assert(o);
1294         assert(offset > 0);
1295
1296         p = le64toh(o->entry.items[i].object_offset);
1297         if (p == 0)
1298                 return -EINVAL;
1299
1300         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1301         if (r < 0)
1302                 return r;
1303
1304         return link_entry_into_array_plus_one(f,
1305                                               &o->data.entry_offset,
1306                                               &o->data.entry_array_offset,
1307                                               &o->data.n_entries,
1308                                               offset);
1309 }
1310
1311 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1312         uint64_t n, i;
1313         int r;
1314
1315         assert(f);
1316         assert(o);
1317         assert(offset > 0);
1318
1319         if (o->object.type != OBJECT_ENTRY)
1320                 return -EINVAL;
1321
1322         __sync_synchronize();
1323
1324         /* Link up the entry itself */
1325         r = link_entry_into_array(f,
1326                                   &f->header->entry_array_offset,
1327                                   &f->header->n_entries,
1328                                   offset);
1329         if (r < 0)
1330                 return r;
1331
1332         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1333
1334         if (f->header->head_entry_realtime == 0)
1335                 f->header->head_entry_realtime = o->entry.realtime;
1336
1337         f->header->tail_entry_realtime = o->entry.realtime;
1338         f->header->tail_entry_monotonic = o->entry.monotonic;
1339
1340         f->tail_entry_monotonic_valid = true;
1341
1342         /* Link up the items */
1343         n = journal_file_entry_n_items(o);
1344         for (i = 0; i < n; i++) {
1345                 r = journal_file_link_entry_item(f, o, offset, i);
1346                 if (r < 0)
1347                         return r;
1348         }
1349
1350         return 0;
1351 }
1352
1353 static int journal_file_append_entry_internal(
1354                 JournalFile *f,
1355                 const dual_timestamp *ts,
1356                 uint64_t xor_hash,
1357                 const EntryItem items[], unsigned n_items,
1358                 uint64_t *seqnum,
1359                 Object **ret, uint64_t *offset) {
1360         uint64_t np;
1361         uint64_t osize;
1362         Object *o;
1363         int r;
1364
1365         assert(f);
1366         assert(items || n_items == 0);
1367         assert(ts);
1368
1369         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1370
1371         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1372         if (r < 0)
1373                 return r;
1374
1375         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1376         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1377         o->entry.realtime = htole64(ts->realtime);
1378         o->entry.monotonic = htole64(ts->monotonic);
1379         o->entry.xor_hash = htole64(xor_hash);
1380         o->entry.boot_id = f->header->boot_id;
1381
1382 #ifdef HAVE_GCRYPT
1383         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1384         if (r < 0)
1385                 return r;
1386 #endif
1387
1388         r = journal_file_link_entry(f, o, np);
1389         if (r < 0)
1390                 return r;
1391
1392         if (ret)
1393                 *ret = o;
1394
1395         if (offset)
1396                 *offset = np;
1397
1398         return 0;
1399 }
1400
1401 void journal_file_post_change(JournalFile *f) {
1402         assert(f);
1403
1404         /* inotify() does not receive IN_MODIFY events from file
1405          * accesses done via mmap(). After each access we hence
1406          * trigger IN_MODIFY by truncating the journal file to its
1407          * current size which triggers IN_MODIFY. */
1408
1409         __sync_synchronize();
1410
1411         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1412                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1413 }
1414
1415 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1416         assert(userdata);
1417
1418         journal_file_post_change(userdata);
1419
1420         return 1;
1421 }
1422
1423 static void schedule_post_change(JournalFile *f) {
1424         sd_event_source *timer;
1425         int enabled, r;
1426         uint64_t now;
1427
1428         assert(f);
1429         assert(f->post_change_timer);
1430
1431         timer = f->post_change_timer;
1432
1433         r = sd_event_source_get_enabled(timer, &enabled);
1434         if (r < 0) {
1435                 log_error_errno(-r, "Failed to get ftruncate timer state: %m");
1436                 return;
1437         }
1438
1439         if (enabled == SD_EVENT_ONESHOT)
1440                 return;
1441
1442         r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1443         if (r < 0) {
1444                 log_error_errno(-r, "Failed to get clock's now for scheduling ftruncate: %m");
1445                 return;
1446         }
1447
1448         r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1449         if (r < 0) {
1450                 log_error_errno(-r, "Failed to set time for scheduling ftruncate: %m");
1451                 return;
1452         }
1453
1454         r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1455         if (r < 0) {
1456                 log_error_errno(-r, "Failed to enable scheduled ftruncate: %m");
1457                 return;
1458         }
1459 }
1460
1461 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1462 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1463         _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1464         int r;
1465
1466         assert(f);
1467         assert_return(!f->post_change_timer, -EINVAL);
1468         assert(e);
1469         assert(t);
1470
1471         r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1472         if (r < 0)
1473                 return r;
1474
1475         r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1476         if (r < 0)
1477                 return r;
1478
1479         f->post_change_timer = timer;
1480         timer = NULL;
1481         f->post_change_timer_period = t;
1482
1483         return r;
1484 }
1485
1486 static int entry_item_cmp(const void *_a, const void *_b) {
1487         const EntryItem *a = _a, *b = _b;
1488
1489         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1490                 return -1;
1491         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1492                 return 1;
1493         return 0;
1494 }
1495
1496 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1497         unsigned i;
1498         EntryItem *items;
1499         int r;
1500         uint64_t xor_hash = 0;
1501         struct dual_timestamp _ts;
1502
1503         assert(f);
1504         assert(iovec || n_iovec == 0);
1505
1506         if (!ts) {
1507                 dual_timestamp_get(&_ts);
1508                 ts = &_ts;
1509         }
1510
1511         if (f->tail_entry_monotonic_valid &&
1512             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1513                 return -EINVAL;
1514
1515 #ifdef HAVE_GCRYPT
1516         r = journal_file_maybe_append_tag(f, ts->realtime);
1517         if (r < 0)
1518                 return r;
1519 #endif
1520
1521         /* alloca() can't take 0, hence let's allocate at least one */
1522         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1523
1524         for (i = 0; i < n_iovec; i++) {
1525                 uint64_t p;
1526                 Object *o;
1527
1528                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1529                 if (r < 0)
1530                         return r;
1531
1532                 xor_hash ^= le64toh(o->data.hash);
1533                 items[i].object_offset = htole64(p);
1534                 items[i].hash = o->data.hash;
1535         }
1536
1537         /* Order by the position on disk, in order to improve seek
1538          * times for rotating media. */
1539         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1540
1541         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1542
1543         /* If the memory mapping triggered a SIGBUS then we return an
1544          * IO error and ignore the error code passed down to us, since
1545          * it is very likely just an effect of a nullified replacement
1546          * mapping page */
1547
1548         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1549                 r = -EIO;
1550
1551         if (f->post_change_timer)
1552                 schedule_post_change(f);
1553         else
1554                 journal_file_post_change(f);
1555
1556         return r;
1557 }
1558
/* One entry of a file's chain cache. It remembers a position inside a
 * chain of entry array objects, so that later lookups in the same chain
 * (see generic_array_get()) can start from the cached array instead of
 * from the beginning. Entries are stored in an OrderedHashmap keyed by
 * the 'first' member (see chain_cache_put()). */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1566
1567 static void chain_cache_put(
1568                 OrderedHashmap *h,
1569                 ChainCacheItem *ci,
1570                 uint64_t first,
1571                 uint64_t array,
1572                 uint64_t begin,
1573                 uint64_t total,
1574                 uint64_t last_index) {
1575
1576         if (!ci) {
1577                 /* If the chain item to cache for this chain is the
1578                  * first one it's not worth caching anything */
1579                 if (array == first)
1580                         return;
1581
1582                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1583                         ci = ordered_hashmap_steal_first(h);
1584                         assert(ci);
1585                 } else {
1586                         ci = new(ChainCacheItem, 1);
1587                         if (!ci)
1588                                 return;
1589                 }
1590
1591                 ci->first = first;
1592
1593                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1594                         free(ci);
1595                         return;
1596                 }
1597         } else
1598                 assert(ci->first == first);
1599
1600         ci->array = array;
1601         ci->begin = begin;
1602         ci->total = total;
1603         ci->last_index = last_index;
1604 }
1605
1606 static int generic_array_get(
1607                 JournalFile *f,
1608                 uint64_t first,
1609                 uint64_t i,
1610                 Object **ret, uint64_t *offset) {
1611
1612         Object *o;
1613         uint64_t p = 0, a, t = 0;
1614         int r;
1615         ChainCacheItem *ci;
1616
1617         assert(f);
1618
1619         a = first;
1620
1621         /* Try the chain cache first */
1622         ci = ordered_hashmap_get(f->chain_cache, &first);
1623         if (ci && i > ci->total) {
1624                 a = ci->array;
1625                 i -= ci->total;
1626                 t = ci->total;
1627         }
1628
1629         while (a > 0) {
1630                 uint64_t k;
1631
1632                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1633                 if (r < 0)
1634                         return r;
1635
1636                 k = journal_file_entry_array_n_items(o);
1637                 if (i < k) {
1638                         p = le64toh(o->entry_array.items[i]);
1639                         goto found;
1640                 }
1641
1642                 i -= k;
1643                 t += k;
1644                 a = le64toh(o->entry_array.next_entry_array_offset);
1645         }
1646
1647         return 0;
1648
1649 found:
1650         /* Let's cache this item for the next invocation */
1651         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1652
1653         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1654         if (r < 0)
1655                 return r;
1656
1657         if (ret)
1658                 *ret = o;
1659
1660         if (offset)
1661                 *offset = p;
1662
1663         return 1;
1664 }
1665
1666 static int generic_array_get_plus_one(
1667                 JournalFile *f,
1668                 uint64_t extra,
1669                 uint64_t first,
1670                 uint64_t i,
1671                 Object **ret, uint64_t *offset) {
1672
1673         Object *o;
1674
1675         assert(f);
1676
1677         if (i == 0) {
1678                 int r;
1679
1680                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1681                 if (r < 0)
1682                         return r;
1683
1684                 if (ret)
1685                         *ret = o;
1686
1687                 if (offset)
1688                         *offset = extra;
1689
1690                 return 1;
1691         }
1692
1693         return generic_array_get(f, first, i-1, ret, offset);
1694 }
1695
/* Verdicts returned by the test_object() callbacks that drive
 * generic_array_bisect(). */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* bisection continues to the right of the tested object */
        TEST_RIGHT = 2, /* bisection continues to the left of the tested object */
};
1701
1702 static int generic_array_bisect(
1703                 JournalFile *f,
1704                 uint64_t first,
1705                 uint64_t n,
1706                 uint64_t needle,
1707                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1708                 direction_t direction,
1709                 Object **ret,
1710                 uint64_t *offset,
1711                 uint64_t *idx) {
1712
1713         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1714         bool subtract_one = false;
1715         Object *o, *array = NULL;
1716         int r;
1717         ChainCacheItem *ci;
1718
1719         assert(f);
1720         assert(test_object);
1721
1722         /* Start with the first array in the chain */
1723         a = first;
1724
1725         ci = ordered_hashmap_get(f->chain_cache, &first);
1726         if (ci && n > ci->total) {
1727                 /* Ah, we have iterated this bisection array chain
1728                  * previously! Let's see if we can skip ahead in the
1729                  * chain, as far as the last time. But we can't jump
1730                  * backwards in the chain, so let's check that
1731                  * first. */
1732
1733                 r = test_object(f, ci->begin, needle);
1734                 if (r < 0)
1735                         return r;
1736
1737                 if (r == TEST_LEFT) {
1738                         /* OK, what we are looking for is right of the
1739                          * begin of this EntryArray, so let's jump
1740                          * straight to previously cached array in the
1741                          * chain */
1742
1743                         a = ci->array;
1744                         n -= ci->total;
1745                         t = ci->total;
1746                         last_index = ci->last_index;
1747                 }
1748         }
1749
1750         while (a > 0) {
1751                 uint64_t left, right, k, lp;
1752
1753                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1754                 if (r < 0)
1755                         return r;
1756
1757                 k = journal_file_entry_array_n_items(array);
1758                 right = MIN(k, n);
1759                 if (right <= 0)
1760                         return 0;
1761
1762                 i = right - 1;
1763                 lp = p = le64toh(array->entry_array.items[i]);
1764                 if (p <= 0)
1765                         return -EBADMSG;
1766
1767                 r = test_object(f, p, needle);
1768                 if (r < 0)
1769                         return r;
1770
1771                 if (r == TEST_FOUND)
1772                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1773
1774                 if (r == TEST_RIGHT) {
1775                         left = 0;
1776                         right -= 1;
1777
1778                         if (last_index != (uint64_t) -1) {
1779                                 assert(last_index <= right);
1780
1781                                 /* If we cached the last index we
1782                                  * looked at, let's try to not to jump
1783                                  * too wildly around and see if we can
1784                                  * limit the range to look at early to
1785                                  * the immediate neighbors of the last
1786                                  * index we looked at. */
1787
1788                                 if (last_index > 0) {
1789                                         uint64_t x = last_index - 1;
1790
1791                                         p = le64toh(array->entry_array.items[x]);
1792                                         if (p <= 0)
1793                                                 return -EBADMSG;
1794
1795                                         r = test_object(f, p, needle);
1796                                         if (r < 0)
1797                                                 return r;
1798
1799                                         if (r == TEST_FOUND)
1800                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1801
1802                                         if (r == TEST_RIGHT)
1803                                                 right = x;
1804                                         else
1805                                                 left = x + 1;
1806                                 }
1807
1808                                 if (last_index < right) {
1809                                         uint64_t y = last_index + 1;
1810
1811                                         p = le64toh(array->entry_array.items[y]);
1812                                         if (p <= 0)
1813                                                 return -EBADMSG;
1814
1815                                         r = test_object(f, p, needle);
1816                                         if (r < 0)
1817                                                 return r;
1818
1819                                         if (r == TEST_FOUND)
1820                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1821
1822                                         if (r == TEST_RIGHT)
1823                                                 right = y;
1824                                         else
1825                                                 left = y + 1;
1826                                 }
1827                         }
1828
1829                         for (;;) {
1830                                 if (left == right) {
1831                                         if (direction == DIRECTION_UP)
1832                                                 subtract_one = true;
1833
1834                                         i = left;
1835                                         goto found;
1836                                 }
1837
1838                                 assert(left < right);
1839                                 i = (left + right) / 2;
1840
1841                                 p = le64toh(array->entry_array.items[i]);
1842                                 if (p <= 0)
1843                                         return -EBADMSG;
1844
1845                                 r = test_object(f, p, needle);
1846                                 if (r < 0)
1847                                         return r;
1848
1849                                 if (r == TEST_FOUND)
1850                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1851
1852                                 if (r == TEST_RIGHT)
1853                                         right = i;
1854                                 else
1855                                         left = i + 1;
1856                         }
1857                 }
1858
1859                 if (k >= n) {
1860                         if (direction == DIRECTION_UP) {
1861                                 i = n;
1862                                 subtract_one = true;
1863                                 goto found;
1864                         }
1865
1866                         return 0;
1867                 }
1868
1869                 last_p = lp;
1870
1871                 n -= k;
1872                 t += k;
1873                 last_index = (uint64_t) -1;
1874                 a = le64toh(array->entry_array.next_entry_array_offset);
1875         }
1876
1877         return 0;
1878
1879 found:
1880         if (subtract_one && t == 0 && i == 0)
1881                 return 0;
1882
1883         /* Let's cache this item for the next invocation */
1884         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1885
1886         if (subtract_one && i == 0)
1887                 p = last_p;
1888         else if (subtract_one)
1889                 p = le64toh(array->entry_array.items[i-1]);
1890         else
1891                 p = le64toh(array->entry_array.items[i]);
1892
1893         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1894         if (r < 0)
1895                 return r;
1896
1897         if (ret)
1898                 *ret = o;
1899
1900         if (offset)
1901                 *offset = p;
1902
1903         if (idx)
1904                 *idx = t + i + (subtract_one ? -1 : 0);
1905
1906         return 1;
1907 }
1908
1909 static int generic_array_bisect_plus_one(
1910                 JournalFile *f,
1911                 uint64_t extra,
1912                 uint64_t first,
1913                 uint64_t n,
1914                 uint64_t needle,
1915                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1916                 direction_t direction,
1917                 Object **ret,
1918                 uint64_t *offset,
1919                 uint64_t *idx) {
1920
1921         int r;
1922         bool step_back = false;
1923         Object *o;
1924
1925         assert(f);
1926         assert(test_object);
1927
1928         if (n <= 0)
1929                 return 0;
1930
1931         /* This bisects the array in object 'first', but first checks
1932          * an extra  */
1933         r = test_object(f, extra, needle);
1934         if (r < 0)
1935                 return r;
1936
1937         if (r == TEST_FOUND)
1938                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1939
1940         /* if we are looking with DIRECTION_UP then we need to first
1941            see if in the actual array there is a matching entry, and
1942            return the last one of that. But if there isn't any we need
1943            to return this one. Hence remember this, and return it
1944            below. */
1945         if (r == TEST_LEFT)
1946                 step_back = direction == DIRECTION_UP;
1947
1948         if (r == TEST_RIGHT) {
1949                 if (direction == DIRECTION_DOWN)
1950                         goto found;
1951                 else
1952                         return 0;
1953         }
1954
1955         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1956
1957         if (r == 0 && step_back)
1958                 goto found;
1959
1960         if (r > 0 && idx)
1961                 (*idx) ++;
1962
1963         return r;
1964
1965 found:
1966         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1967         if (r < 0)
1968                 return r;
1969
1970         if (ret)
1971                 *ret = o;
1972
1973         if (offset)
1974                 *offset = extra;
1975
1976         if (idx)
1977                 *idx = 0;
1978
1979         return 1;
1980 }
1981
1982 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1983         assert(f);
1984         assert(p > 0);
1985
1986         if (p == needle)
1987                 return TEST_FOUND;
1988         else if (p < needle)
1989                 return TEST_LEFT;
1990         else
1991                 return TEST_RIGHT;
1992 }
1993
1994 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1995         Object *o;
1996         int r;
1997
1998         assert(f);
1999         assert(p > 0);
2000
2001         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2002         if (r < 0)
2003                 return r;
2004
2005         if (le64toh(o->entry.seqnum) == needle)
2006                 return TEST_FOUND;
2007         else if (le64toh(o->entry.seqnum) < needle)
2008                 return TEST_LEFT;
2009         else
2010                 return TEST_RIGHT;
2011 }
2012
2013 int journal_file_move_to_entry_by_seqnum(
2014                 JournalFile *f,
2015                 uint64_t seqnum,
2016                 direction_t direction,
2017                 Object **ret,
2018                 uint64_t *offset) {
2019
2020         return generic_array_bisect(f,
2021                                     le64toh(f->header->entry_array_offset),
2022                                     le64toh(f->header->n_entries),
2023                                     seqnum,
2024                                     test_object_seqnum,
2025                                     direction,
2026                                     ret, offset, NULL);
2027 }
2028
2029 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2030         Object *o;
2031         int r;
2032
2033         assert(f);
2034         assert(p > 0);
2035
2036         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2037         if (r < 0)
2038                 return r;
2039
2040         if (le64toh(o->entry.realtime) == needle)
2041                 return TEST_FOUND;
2042         else if (le64toh(o->entry.realtime) < needle)
2043                 return TEST_LEFT;
2044         else
2045                 return TEST_RIGHT;
2046 }
2047
2048 int journal_file_move_to_entry_by_realtime(
2049                 JournalFile *f,
2050                 uint64_t realtime,
2051                 direction_t direction,
2052                 Object **ret,
2053                 uint64_t *offset) {
2054
2055         return generic_array_bisect(f,
2056                                     le64toh(f->header->entry_array_offset),
2057                                     le64toh(f->header->n_entries),
2058                                     realtime,
2059                                     test_object_realtime,
2060                                     direction,
2061                                     ret, offset, NULL);
2062 }
2063
2064 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2065         Object *o;
2066         int r;
2067
2068         assert(f);
2069         assert(p > 0);
2070
2071         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2072         if (r < 0)
2073                 return r;
2074
2075         if (le64toh(o->entry.monotonic) == needle)
2076                 return TEST_FOUND;
2077         else if (le64toh(o->entry.monotonic) < needle)
2078                 return TEST_LEFT;
2079         else
2080                 return TEST_RIGHT;
2081 }
2082
2083 static int find_data_object_by_boot_id(
2084                 JournalFile *f,
2085                 sd_id128_t boot_id,
2086                 Object **o,
2087                 uint64_t *b) {
2088
2089         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2090
2091         sd_id128_to_string(boot_id, t + 9);
2092         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2093 }
2094
2095 int journal_file_move_to_entry_by_monotonic(
2096                 JournalFile *f,
2097                 sd_id128_t boot_id,
2098                 uint64_t monotonic,
2099                 direction_t direction,
2100                 Object **ret,
2101                 uint64_t *offset) {
2102
2103         Object *o;
2104         int r;
2105
2106         assert(f);
2107
2108         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2109         if (r < 0)
2110                 return r;
2111         if (r == 0)
2112                 return -ENOENT;
2113
2114         return generic_array_bisect_plus_one(f,
2115                                              le64toh(o->data.entry_offset),
2116                                              le64toh(o->data.entry_array_offset),
2117                                              le64toh(o->data.n_entries),
2118                                              monotonic,
2119                                              test_object_monotonic,
2120                                              direction,
2121                                              ret, offset, NULL);
2122 }
2123
2124 void journal_file_reset_location(JournalFile *f) {
2125         f->location_type = LOCATION_HEAD;
2126         f->current_offset = 0;
2127         f->current_seqnum = 0;
2128         f->current_realtime = 0;
2129         f->current_monotonic = 0;
2130         zero(f->current_boot_id);
2131         f->current_xor_hash = 0;
2132 }
2133
2134 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2135         f->location_type = LOCATION_SEEK;
2136         f->current_offset = offset;
2137         f->current_seqnum = le64toh(o->entry.seqnum);
2138         f->current_realtime = le64toh(o->entry.realtime);
2139         f->current_monotonic = le64toh(o->entry.monotonic);
2140         f->current_boot_id = o->entry.boot_id;
2141         f->current_xor_hash = le64toh(o->entry.xor_hash);
2142 }
2143
2144 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2145         assert(af);
2146         assert(bf);
2147         assert(af->location_type == LOCATION_SEEK);
2148         assert(bf->location_type == LOCATION_SEEK);
2149
2150         /* If contents and timestamps match, these entries are
2151          * identical, even if the seqnum does not match */
2152         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2153             af->current_monotonic == bf->current_monotonic &&
2154             af->current_realtime == bf->current_realtime &&
2155             af->current_xor_hash == bf->current_xor_hash)
2156                 return 0;
2157
2158         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2159
2160                 /* If this is from the same seqnum source, compare
2161                  * seqnums */
2162                 if (af->current_seqnum < bf->current_seqnum)
2163                         return -1;
2164                 if (af->current_seqnum > bf->current_seqnum)
2165                         return 1;
2166
2167                 /* Wow! This is weird, different data but the same
2168                  * seqnums? Something is borked, but let's make the
2169                  * best of it and compare by time. */
2170         }
2171
2172         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2173
2174                 /* If the boot id matches, compare monotonic time */
2175                 if (af->current_monotonic < bf->current_monotonic)
2176                         return -1;
2177                 if (af->current_monotonic > bf->current_monotonic)
2178                         return 1;
2179         }
2180
2181         /* Otherwise, compare UTC time */
2182         if (af->current_realtime < bf->current_realtime)
2183                 return -1;
2184         if (af->current_realtime > bf->current_realtime)
2185                 return 1;
2186
2187         /* Finally, compare by contents */
2188         if (af->current_xor_hash < bf->current_xor_hash)
2189                 return -1;
2190         if (af->current_xor_hash > bf->current_xor_hash)
2191                 return 1;
2192
2193         return 0;
2194 }
2195
2196 int journal_file_next_entry(
2197                 JournalFile *f,
2198                 uint64_t p,
2199                 direction_t direction,
2200                 Object **ret, uint64_t *offset) {
2201
2202         uint64_t i, n, ofs;
2203         int r;
2204
2205         assert(f);
2206
2207         n = le64toh(f->header->n_entries);
2208         if (n <= 0)
2209                 return 0;
2210
2211         if (p == 0)
2212                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2213         else {
2214                 r = generic_array_bisect(f,
2215                                          le64toh(f->header->entry_array_offset),
2216                                          le64toh(f->header->n_entries),
2217                                          p,
2218                                          test_object_offset,
2219                                          DIRECTION_DOWN,
2220                                          NULL, NULL,
2221                                          &i);
2222                 if (r <= 0)
2223                         return r;
2224
2225                 if (direction == DIRECTION_DOWN) {
2226                         if (i >= n - 1)
2227                                 return 0;
2228
2229                         i++;
2230                 } else {
2231                         if (i <= 0)
2232                                 return 0;
2233
2234                         i--;
2235                 }
2236         }
2237
2238         /* And jump to it */
2239         r = generic_array_get(f,
2240                               le64toh(f->header->entry_array_offset),
2241                               i,
2242                               ret, &ofs);
2243         if (r <= 0)
2244                 return r;
2245
2246         if (p > 0 &&
2247             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2248                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2249                           f->path, i);
2250                 return -EBADMSG;
2251         }
2252
2253         if (offset)
2254                 *offset = ofs;
2255
2256         return 1;
2257 }
2258
2259 int journal_file_next_entry_for_data(
2260                 JournalFile *f,
2261                 Object *o, uint64_t p,
2262                 uint64_t data_offset,
2263                 direction_t direction,
2264                 Object **ret, uint64_t *offset) {
2265
2266         uint64_t n, i;
2267         int r;
2268         Object *d;
2269
2270         assert(f);
2271         assert(p > 0 || !o);
2272
2273         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2274         if (r < 0)
2275                 return r;
2276
2277         n = le64toh(d->data.n_entries);
2278         if (n <= 0)
2279                 return n;
2280
2281         if (!o)
2282                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2283         else {
2284                 if (o->object.type != OBJECT_ENTRY)
2285                         return -EINVAL;
2286
2287                 r = generic_array_bisect_plus_one(f,
2288                                                   le64toh(d->data.entry_offset),
2289                                                   le64toh(d->data.entry_array_offset),
2290                                                   le64toh(d->data.n_entries),
2291                                                   p,
2292                                                   test_object_offset,
2293                                                   DIRECTION_DOWN,
2294                                                   NULL, NULL,
2295                                                   &i);
2296
2297                 if (r <= 0)
2298                         return r;
2299
2300                 if (direction == DIRECTION_DOWN) {
2301                         if (i >= n - 1)
2302                                 return 0;
2303
2304                         i++;
2305                 } else {
2306                         if (i <= 0)
2307                                 return 0;
2308
2309                         i--;
2310                 }
2311
2312         }
2313
2314         return generic_array_get_plus_one(f,
2315                                           le64toh(d->data.entry_offset),
2316                                           le64toh(d->data.entry_array_offset),
2317                                           i,
2318                                           ret, offset);
2319 }
2320
2321 int journal_file_move_to_entry_by_offset_for_data(
2322                 JournalFile *f,
2323                 uint64_t data_offset,
2324                 uint64_t p,
2325                 direction_t direction,
2326                 Object **ret, uint64_t *offset) {
2327
2328         int r;
2329         Object *d;
2330
2331         assert(f);
2332
2333         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2334         if (r < 0)
2335                 return r;
2336
2337         return generic_array_bisect_plus_one(f,
2338                                              le64toh(d->data.entry_offset),
2339                                              le64toh(d->data.entry_array_offset),
2340                                              le64toh(d->data.n_entries),
2341                                              p,
2342                                              test_object_offset,
2343                                              direction,
2344                                              ret, offset, NULL);
2345 }
2346
2347 int journal_file_move_to_entry_by_monotonic_for_data(
2348                 JournalFile *f,
2349                 uint64_t data_offset,
2350                 sd_id128_t boot_id,
2351                 uint64_t monotonic,
2352                 direction_t direction,
2353                 Object **ret, uint64_t *offset) {
2354
2355         Object *o, *d;
2356         int r;
2357         uint64_t b, z;
2358
2359         assert(f);
2360
2361         /* First, seek by time */
2362         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2363         if (r < 0)
2364                 return r;
2365         if (r == 0)
2366                 return -ENOENT;
2367
2368         r = generic_array_bisect_plus_one(f,
2369                                           le64toh(o->data.entry_offset),
2370                                           le64toh(o->data.entry_array_offset),
2371                                           le64toh(o->data.n_entries),
2372                                           monotonic,
2373                                           test_object_monotonic,
2374                                           direction,
2375                                           NULL, &z, NULL);
2376         if (r <= 0)
2377                 return r;
2378
2379         /* And now, continue seeking until we find an entry that
2380          * exists in both bisection arrays */
2381
2382         for (;;) {
2383                 Object *qo;
2384                 uint64_t p, q;
2385
2386                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2387                 if (r < 0)
2388                         return r;
2389
2390                 r = generic_array_bisect_plus_one(f,
2391                                                   le64toh(d->data.entry_offset),
2392                                                   le64toh(d->data.entry_array_offset),
2393                                                   le64toh(d->data.n_entries),
2394                                                   z,
2395                                                   test_object_offset,
2396                                                   direction,
2397                                                   NULL, &p, NULL);
2398                 if (r <= 0)
2399                         return r;
2400
2401                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2402                 if (r < 0)
2403                         return r;
2404
2405                 r = generic_array_bisect_plus_one(f,
2406                                                   le64toh(o->data.entry_offset),
2407                                                   le64toh(o->data.entry_array_offset),
2408                                                   le64toh(o->data.n_entries),
2409                                                   p,
2410                                                   test_object_offset,
2411                                                   direction,
2412                                                   &qo, &q, NULL);
2413
2414                 if (r <= 0)
2415                         return r;
2416
2417                 if (p == q) {
2418                         if (ret)
2419                                 *ret = qo;
2420                         if (offset)
2421                                 *offset = q;
2422
2423                         return 1;
2424                 }
2425
2426                 z = q;
2427         }
2428 }
2429
2430 int journal_file_move_to_entry_by_seqnum_for_data(
2431                 JournalFile *f,
2432                 uint64_t data_offset,
2433                 uint64_t seqnum,
2434                 direction_t direction,
2435                 Object **ret, uint64_t *offset) {
2436
2437         Object *d;
2438         int r;
2439
2440         assert(f);
2441
2442         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2443         if (r < 0)
2444                 return r;
2445
2446         return generic_array_bisect_plus_one(f,
2447                                              le64toh(d->data.entry_offset),
2448                                              le64toh(d->data.entry_array_offset),
2449                                              le64toh(d->data.n_entries),
2450                                              seqnum,
2451                                              test_object_seqnum,
2452                                              direction,
2453                                              ret, offset, NULL);
2454 }
2455
2456 int journal_file_move_to_entry_by_realtime_for_data(
2457                 JournalFile *f,
2458                 uint64_t data_offset,
2459                 uint64_t realtime,
2460                 direction_t direction,
2461                 Object **ret, uint64_t *offset) {
2462
2463         Object *d;
2464         int r;
2465
2466         assert(f);
2467
2468         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2469         if (r < 0)
2470                 return r;
2471
2472         return generic_array_bisect_plus_one(f,
2473                                              le64toh(d->data.entry_offset),
2474                                              le64toh(d->data.entry_array_offset),
2475                                              le64toh(d->data.n_entries),
2476                                              realtime,
2477                                              test_object_realtime,
2478                                              direction,
2479                                              ret, offset, NULL);
2480 }
2481
2482 void journal_file_dump(JournalFile *f) {
2483         Object *o;
2484         int r;
2485         uint64_t p;
2486
2487         assert(f);
2488
2489         journal_file_print_header(f);
2490
2491         p = le64toh(f->header->header_size);
2492         while (p != 0) {
2493                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2494                 if (r < 0)
2495                         goto fail;
2496
2497                 switch (o->object.type) {
2498
2499                 case OBJECT_UNUSED:
2500                         printf("Type: OBJECT_UNUSED\n");
2501                         break;
2502
2503                 case OBJECT_DATA:
2504                         printf("Type: OBJECT_DATA\n");
2505                         break;
2506
2507                 case OBJECT_FIELD:
2508                         printf("Type: OBJECT_FIELD\n");
2509                         break;
2510
2511                 case OBJECT_ENTRY:
2512                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2513                                le64toh(o->entry.seqnum),
2514                                le64toh(o->entry.monotonic),
2515                                le64toh(o->entry.realtime));
2516                         break;
2517
2518                 case OBJECT_FIELD_HASH_TABLE:
2519                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2520                         break;
2521
2522                 case OBJECT_DATA_HASH_TABLE:
2523                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2524                         break;
2525
2526                 case OBJECT_ENTRY_ARRAY:
2527                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2528                         break;
2529
2530                 case OBJECT_TAG:
2531                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2532                                le64toh(o->tag.seqnum),
2533                                le64toh(o->tag.epoch));
2534                         break;
2535
2536                 default:
2537                         printf("Type: unknown (%i)\n", o->object.type);
2538                         break;
2539                 }
2540
2541                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2542                         printf("Flags: %s\n",
2543                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2544
2545                 if (p == le64toh(f->header->tail_object_offset))
2546                         p = 0;
2547                 else
2548                         p = p + ALIGN64(le64toh(o->object.size));
2549         }
2550
2551         return;
2552 fail:
2553         log_error("File corrupt");
2554 }
2555
2556 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2557         const char *x;
2558
2559         x = format_timestamp(buf, l, t);
2560         if (x)
2561                 return x;
2562         return " --- ";
2563 }
2564
2565 void journal_file_print_header(JournalFile *f) {
2566         char a[33], b[33], c[33], d[33];
2567         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2568         struct stat st;
2569         char bytes[FORMAT_BYTES_MAX];
2570
2571         assert(f);
2572
2573         printf("File Path: %s\n"
2574                "File ID: %s\n"
2575                "Machine ID: %s\n"
2576                "Boot ID: %s\n"
2577                "Sequential Number ID: %s\n"
2578                "State: %s\n"
2579                "Compatible Flags:%s%s\n"
2580                "Incompatible Flags:%s%s%s\n"
2581                "Header size: %"PRIu64"\n"
2582                "Arena size: %"PRIu64"\n"
2583                "Data Hash Table Size: %"PRIu64"\n"
2584                "Field Hash Table Size: %"PRIu64"\n"
2585                "Rotate Suggested: %s\n"
2586                "Head Sequential Number: %"PRIu64"\n"
2587                "Tail Sequential Number: %"PRIu64"\n"
2588                "Head Realtime Timestamp: %s\n"
2589                "Tail Realtime Timestamp: %s\n"
2590                "Tail Monotonic Timestamp: %s\n"
2591                "Objects: %"PRIu64"\n"
2592                "Entry Objects: %"PRIu64"\n",
2593                f->path,
2594                sd_id128_to_string(f->header->file_id, a),
2595                sd_id128_to_string(f->header->machine_id, b),
2596                sd_id128_to_string(f->header->boot_id, c),
2597                sd_id128_to_string(f->header->seqnum_id, d),
2598                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2599                f->header->state == STATE_ONLINE ? "ONLINE" :
2600                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2601                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2602                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2603                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2604                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2605                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2606                le64toh(f->header->header_size),
2607                le64toh(f->header->arena_size),
2608                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2609                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2610                yes_no(journal_file_rotate_suggested(f, 0)),
2611                le64toh(f->header->head_entry_seqnum),
2612                le64toh(f->header->tail_entry_seqnum),
2613                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2614                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2615                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2616                le64toh(f->header->n_objects),
2617                le64toh(f->header->n_entries));
2618
2619         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2620                 printf("Data Objects: %"PRIu64"\n"
2621                        "Data Hash Table Fill: %.1f%%\n",
2622                        le64toh(f->header->n_data),
2623                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2624
2625         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2626                 printf("Field Objects: %"PRIu64"\n"
2627                        "Field Hash Table Fill: %.1f%%\n",
2628                        le64toh(f->header->n_fields),
2629                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2630
2631         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2632                 printf("Tag Objects: %"PRIu64"\n",
2633                        le64toh(f->header->n_tags));
2634         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2635                 printf("Entry Array Objects: %"PRIu64"\n",
2636                        le64toh(f->header->n_entry_arrays));
2637
2638         if (fstat(f->fd, &st) >= 0)
2639                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2640 }
2641
2642 static int journal_file_warn_btrfs(JournalFile *f) {
2643         unsigned attrs;
2644         int r;
2645
2646         assert(f);
2647
2648         /* Before we write anything, check if the COW logic is turned
2649          * off on btrfs. Given our write pattern that is quite
2650          * unfriendly to COW file systems this should greatly improve
2651          * performance on COW file systems, such as btrfs, at the
2652          * expense of data integrity features (which shouldn't be too
2653          * bad, given that we do our own checksumming). */
2654
2655         r = btrfs_is_filesystem(f->fd);
2656         if (r < 0)
2657                 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2658         if (!r)
2659                 return 0;
2660
2661         r = read_attr_fd(f->fd, &attrs);
2662         if (r < 0)
2663                 return log_warning_errno(r, "Failed to read file attributes: %m");
2664
2665         if (attrs & FS_NOCOW_FL) {
2666                 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2667                 return 0;
2668         }
2669
2670         log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2671                    "This is likely to slow down journal access substantially, please consider turning "
2672                    "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2673
2674         return 1;
2675 }
2676
2677 int journal_file_open(
2678                 const char *fname,
2679                 int flags,
2680                 mode_t mode,
2681                 bool compress,
2682                 bool seal,
2683                 JournalMetrics *metrics,
2684                 MMapCache *mmap_cache,
2685                 JournalFile *template,
2686                 JournalFile **ret) {
2687
2688         bool newly_created = false;
2689         JournalFile *f;
2690         void *h;
2691         int r;
2692
2693         assert(fname);
2694         assert(ret);
2695
2696         if ((flags & O_ACCMODE) != O_RDONLY &&
2697             (flags & O_ACCMODE) != O_RDWR)
2698                 return -EINVAL;
2699
2700         if (!endswith(fname, ".journal") &&
2701             !endswith(fname, ".journal~"))
2702                 return -EINVAL;
2703
2704         f = new0(JournalFile, 1);
2705         if (!f)
2706                 return -ENOMEM;
2707
2708         f->fd = -1;
2709         f->mode = mode;
2710
2711         f->flags = flags;
2712         f->prot = prot_from_flags(flags);
2713         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2714 #if defined(HAVE_LZ4)
2715         f->compress_lz4 = compress;
2716 #elif defined(HAVE_XZ)
2717         f->compress_xz = compress;
2718 #endif
2719 #ifdef HAVE_GCRYPT
2720         f->seal = seal;
2721 #endif
2722
2723         if (mmap_cache)
2724                 f->mmap = mmap_cache_ref(mmap_cache);
2725         else {
2726                 f->mmap = mmap_cache_new();
2727                 if (!f->mmap) {
2728                         r = -ENOMEM;
2729                         goto fail;
2730                 }
2731         }
2732
2733         f->path = strdup(fname);
2734         if (!f->path) {
2735                 r = -ENOMEM;
2736                 goto fail;
2737         }
2738
2739         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2740         if (!f->chain_cache) {
2741                 r = -ENOMEM;
2742                 goto fail;
2743         }
2744
2745         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2746         if (f->fd < 0) {
2747                 r = -errno;
2748                 goto fail;
2749         }
2750
2751         r = journal_file_fstat(f);
2752         if (r < 0)
2753                 goto fail;
2754
2755         if (f->last_stat.st_size == 0 && f->writable) {
2756
2757                 (void) journal_file_warn_btrfs(f);
2758
2759                 /* Let's attach the creation time to the journal file,
2760                  * so that the vacuuming code knows the age of this
2761                  * file even if the file might end up corrupted one
2762                  * day... Ideally we'd just use the creation time many
2763                  * file systems maintain for each file, but there is
2764                  * currently no usable API to query this, hence let's
2765                  * emulate this via extended attributes. If extended
2766                  * attributes are not supported we'll just skip this,
2767                  * and rely solely on mtime/atime/ctime of the file. */
2768
2769                 fd_setcrtime(f->fd, 0);
2770
2771 #ifdef HAVE_GCRYPT
2772                 /* Try to load the FSPRG state, and if we can't, then
2773                  * just don't do sealing */
2774                 if (f->seal) {
2775                         r = journal_file_fss_load(f);
2776                         if (r < 0)
2777                                 f->seal = false;
2778                 }
2779 #endif
2780
2781                 r = journal_file_init_header(f, template);
2782                 if (r < 0)
2783                         goto fail;
2784
2785                 r = journal_file_fstat(f);
2786                 if (r < 0)
2787                         goto fail;
2788
2789                 newly_created = true;
2790         }
2791
2792         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2793                 r = -ENODATA;
2794                 goto fail;
2795         }
2796
2797         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2798         if (r < 0)
2799                 goto fail;
2800
2801         f->header = h;
2802
2803         if (!newly_created) {
2804                 r = journal_file_verify_header(f);
2805                 if (r < 0)
2806                         goto fail;
2807         }
2808
2809 #ifdef HAVE_GCRYPT
2810         if (!newly_created && f->writable) {
2811                 r = journal_file_fss_load(f);
2812                 if (r < 0)
2813                         goto fail;
2814         }
2815 #endif
2816
2817         if (f->writable) {
2818                 if (metrics) {
2819                         journal_default_metrics(metrics, f->fd);
2820                         f->metrics = *metrics;
2821                 } else if (template)
2822                         f->metrics = template->metrics;
2823
2824                 r = journal_file_refresh_header(f);
2825                 if (r < 0)
2826                         goto fail;
2827         }
2828
2829 #ifdef HAVE_GCRYPT
2830         r = journal_file_hmac_setup(f);
2831         if (r < 0)
2832                 goto fail;
2833 #endif
2834
2835         if (newly_created) {
2836                 r = journal_file_setup_field_hash_table(f);
2837                 if (r < 0)
2838                         goto fail;
2839
2840                 r = journal_file_setup_data_hash_table(f);
2841                 if (r < 0)
2842                         goto fail;
2843
2844 #ifdef HAVE_GCRYPT
2845                 r = journal_file_append_first_tag(f);
2846                 if (r < 0)
2847                         goto fail;
2848 #endif
2849         }
2850
2851         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2852                 r = -EIO;
2853                 goto fail;
2854         }
2855
2856         if (template && template->post_change_timer) {
2857                 sd_event *e = sd_event_source_get_event(template->post_change_timer);
2858
2859                 r = journal_file_enable_post_change_timer(f, e, template->post_change_timer_period);
2860                 if (r < 0)
2861                         goto fail;
2862         }
2863
2864         *ret = f;
2865         return 0;
2866
2867 fail:
2868         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2869                 r = -EIO;
2870
2871         journal_file_close(f);
2872
2873         return r;
2874 }
2875
2876 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2877         _cleanup_free_ char *p = NULL;
2878         size_t l;
2879         JournalFile *old_file, *new_file = NULL;
2880         int r;
2881
2882         assert(f);
2883         assert(*f);
2884
2885         old_file = *f;
2886
2887         if (!old_file->writable)
2888                 return -EINVAL;
2889
2890         if (!endswith(old_file->path, ".journal"))
2891                 return -EINVAL;
2892
2893         l = strlen(old_file->path);
2894         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2895                      (int) l - 8, old_file->path,
2896                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2897                      le64toh((*f)->header->head_entry_seqnum),
2898                      le64toh((*f)->header->head_entry_realtime));
2899         if (r < 0)
2900                 return -ENOMEM;
2901
2902         /* Try to rename the file to the archived version. If the file
2903          * already was deleted, we'll get ENOENT, let's ignore that
2904          * case. */
2905         r = rename(old_file->path, p);
2906         if (r < 0 && errno != ENOENT)
2907                 return -errno;
2908
2909         old_file->header->state = STATE_ARCHIVED;
2910
2911         /* Currently, btrfs is not very good with out write patterns
2912          * and fragments heavily. Let's defrag our journal files when
2913          * we archive them */
2914         old_file->defrag_on_close = true;
2915
2916         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2917         journal_file_close(old_file);
2918
2919         *f = new_file;
2920         return r;
2921 }
2922
2923 int journal_file_open_reliably(
2924                 const char *fname,
2925                 int flags,
2926                 mode_t mode,
2927                 bool compress,
2928                 bool seal,
2929                 JournalMetrics *metrics,
2930                 MMapCache *mmap_cache,
2931                 JournalFile *template,
2932                 JournalFile **ret) {
2933
2934         int r;
2935         size_t l;
2936         _cleanup_free_ char *p = NULL;
2937
2938         r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2939         if (!IN_SET(r,
2940                     -EBADMSG,           /* corrupted */
2941                     -ENODATA,           /* truncated */
2942                     -EHOSTDOWN,         /* other machine */
2943                     -EPROTONOSUPPORT,   /* incompatible feature */
2944                     -EBUSY,             /* unclean shutdown */
2945                     -ESHUTDOWN,         /* already archived */
2946                     -EIO,               /* IO error, including SIGBUS on mmap */
2947                     -EIDRM              /* File has been deleted */))
2948                 return r;
2949
2950         if ((flags & O_ACCMODE) == O_RDONLY)
2951                 return r;
2952
2953         if (!(flags & O_CREAT))
2954                 return r;
2955
2956         if (!endswith(fname, ".journal"))
2957                 return r;
2958
2959         /* The file is corrupted. Rotate it away and try it again (but only once) */
2960
2961         l = strlen(fname);
2962         if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2963                      (int) l - 8, fname,
2964                      now(CLOCK_REALTIME),
2965                      random_u64()) < 0)
2966                 return -ENOMEM;
2967
2968         if (rename(fname, p) < 0)
2969                 return -errno;
2970
2971         /* btrfs doesn't cope well with our write pattern and
2972          * fragments heavily. Let's defrag all files we rotate */
2973
2974         (void) chattr_path(p, false, FS_NOCOW_FL);
2975         (void) btrfs_defrag(p);
2976
2977         log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2978
2979         return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2980 }
2981
2982 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2983         uint64_t i, n;
2984         uint64_t q, xor_hash = 0;
2985         int r;
2986         EntryItem *items;
2987         dual_timestamp ts;
2988
2989         assert(from);
2990         assert(to);
2991         assert(o);
2992         assert(p);
2993
2994         if (!to->writable)
2995                 return -EPERM;
2996
2997         ts.monotonic = le64toh(o->entry.monotonic);
2998         ts.realtime = le64toh(o->entry.realtime);
2999
3000         n = journal_file_entry_n_items(o);
3001         /* alloca() can't take 0, hence let's allocate at least one */
3002         items = alloca(sizeof(EntryItem) * MAX(1u, n));
3003
3004         for (i = 0; i < n; i++) {
3005                 uint64_t l, h;
3006                 le64_t le_hash;
3007                 size_t t;
3008                 void *data;
3009                 Object *u;
3010
3011                 q = le64toh(o->entry.items[i].object_offset);
3012                 le_hash = o->entry.items[i].hash;
3013
3014                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3015                 if (r < 0)
3016                         return r;
3017
3018                 if (le_hash != o->data.hash)
3019                         return -EBADMSG;
3020
3021                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3022                 t = (size_t) l;
3023
3024                 /* We hit the limit on 32bit machines */
3025                 if ((uint64_t) t != l)
3026                         return -E2BIG;
3027
3028                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3029 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
3030                         size_t rsize = 0;
3031
3032                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3033                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3034                         if (r < 0)
3035                                 return r;
3036
3037                         data = from->compress_buffer;
3038                         l = rsize;
3039 #else
3040                         return -EPROTONOSUPPORT;
3041 #endif
3042                 } else
3043                         data = o->data.payload;
3044
3045                 r = journal_file_append_data(to, data, l, &u, &h);
3046                 if (r < 0)
3047                         return r;
3048
3049                 xor_hash ^= le64toh(u->data.hash);
3050                 items[i].object_offset = htole64(h);
3051                 items[i].hash = u->data.hash;
3052
3053                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3054                 if (r < 0)
3055                         return r;
3056         }
3057
3058         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3059
3060         if (mmap_cache_got_sigbus(to->mmap, to->fd))
3061                 return -EIO;
3062
3063         return r;
3064 }
3065
3066 void journal_reset_metrics(JournalMetrics *m) {
3067         assert(m);
3068
3069         /* Set everything to "pick automatic values". */
3070
3071         *m = (JournalMetrics) {
3072                 .min_use = (uint64_t) -1,
3073                 .max_use = (uint64_t) -1,
3074                 .min_size = (uint64_t) -1,
3075                 .max_size = (uint64_t) -1,
3076                 .keep_free = (uint64_t) -1,
3077                 .n_max_files = (uint64_t) -1,
3078         };
3079 }
3080
3081 void journal_default_metrics(JournalMetrics *m, int fd) {
3082         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3083         struct statvfs ss;
3084         uint64_t fs_size;
3085
3086         assert(m);
3087         assert(fd >= 0);
3088
3089         if (fstatvfs(fd, &ss) >= 0)
3090                 fs_size = ss.f_frsize * ss.f_blocks;
3091         else {
3092                 log_debug_errno(errno, "Failed to detremine disk size: %m");
3093                 fs_size = 0;
3094         }
3095
3096         if (m->max_use == (uint64_t) -1) {
3097
3098                 if (fs_size > 0) {
3099                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3100
3101                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
3102                                 m->max_use = DEFAULT_MAX_USE_UPPER;
3103
3104                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
3105                                 m->max_use = DEFAULT_MAX_USE_LOWER;
3106                 } else
3107                         m->max_use = DEFAULT_MAX_USE_LOWER;
3108         } else {
3109                 m->max_use = PAGE_ALIGN(m->max_use);
3110
3111                 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3112                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3113         }
3114
3115         if (m->min_use == (uint64_t) -1)
3116                 m->min_use = DEFAULT_MIN_USE;
3117
3118         if (m->min_use > m->max_use)
3119                 m->min_use = m->max_use;
3120
3121         if (m->max_size == (uint64_t) -1) {
3122                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3123
3124                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3125                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
3126         } else
3127                 m->max_size = PAGE_ALIGN(m->max_size);
3128
3129         if (m->max_size != 0) {
3130                 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3131                         m->max_size = JOURNAL_FILE_SIZE_MIN;
3132
3133                 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3134                         m->max_use = m->max_size*2;
3135         }
3136
3137         if (m->min_size == (uint64_t) -1)
3138                 m->min_size = JOURNAL_FILE_SIZE_MIN;
3139         else {
3140                 m->min_size = PAGE_ALIGN(m->min_size);
3141
3142                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3143                         m->min_size = JOURNAL_FILE_SIZE_MIN;
3144
3145                 if (m->max_size != 0 && m->min_size > m->max_size)
3146                         m->max_size = m->min_size;
3147         }
3148
3149         if (m->keep_free == (uint64_t) -1) {
3150
3151                 if (fs_size > 0) {
3152                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3153
3154                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3155                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3156
3157                 } else
3158                         m->keep_free = DEFAULT_KEEP_FREE;
3159         }
3160
3161         if (m->n_max_files == (uint64_t) -1)
3162                 m->n_max_files = DEFAULT_N_MAX_FILES;
3163
3164         log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3165                   format_bytes(a, sizeof(a), m->min_use),
3166                   format_bytes(b, sizeof(b), m->max_use),
3167                   format_bytes(c, sizeof(c), m->max_size),
3168                   format_bytes(d, sizeof(d), m->min_size),
3169                   format_bytes(e, sizeof(e), m->keep_free),
3170                   m->n_max_files);
3171 }
3172
3173 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3174         assert(f);
3175         assert(from || to);
3176
3177         if (from) {
3178                 if (f->header->head_entry_realtime == 0)
3179                         return -ENOENT;
3180
3181                 *from = le64toh(f->header->head_entry_realtime);
3182         }
3183
3184         if (to) {
3185                 if (f->header->tail_entry_realtime == 0)
3186                         return -ENOENT;
3187
3188                 *to = le64toh(f->header->tail_entry_realtime);
3189         }
3190
3191         return 1;
3192 }
3193
3194 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3195         Object *o;
3196         uint64_t p;
3197         int r;
3198
3199         assert(f);
3200         assert(from || to);
3201
3202         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3203         if (r <= 0)
3204                 return r;
3205
3206         if (le64toh(o->data.n_entries) <= 0)
3207                 return 0;
3208
3209         if (from) {
3210                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3211                 if (r < 0)
3212                         return r;
3213
3214                 *from = le64toh(o->entry.monotonic);
3215         }
3216
3217         if (to) {
3218                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3219                 if (r < 0)
3220                         return r;
3221
3222                 r = generic_array_get_plus_one(f,
3223                                                le64toh(o->data.entry_offset),
3224                                                le64toh(o->data.entry_array_offset),
3225                                                le64toh(o->data.n_entries)-1,
3226                                                &o, NULL);
3227                 if (r <= 0)
3228                         return r;
3229
3230                 *to = le64toh(o->entry.monotonic);
3231         }
3232
3233         return 1;
3234 }
3235
3236 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3237         assert(f);
3238
3239         /* If we gained new header fields we gained new features,
3240          * hence suggest a rotation */
3241         if (le64toh(f->header->header_size) < sizeof(Header)) {
3242                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3243                 return true;
3244         }
3245
3246         /* Let's check if the hash tables grew over a certain fill
3247          * level (75%, borrowing this value from Java's hash table
3248          * implementation), and if so suggest a rotation. To calculate
3249          * the fill level we need the n_data field, which only exists
3250          * in newer versions. */
3251
3252         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3253                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3254                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3255                                   f->path,
3256                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3257                                   le64toh(f->header->n_data),
3258                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3259                                   (unsigned long long) f->last_stat.st_size,
3260                                   f->last_stat.st_size / le64toh(f->header->n_data));
3261                         return true;
3262                 }
3263
3264         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3265                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3266                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3267                                   f->path,
3268                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3269                                   le64toh(f->header->n_fields),
3270                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3271                         return true;
3272                 }
3273
3274         /* Are the data objects properly indexed by field objects? */
3275         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3276             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3277             le64toh(f->header->n_data) > 0 &&
3278             le64toh(f->header->n_fields) == 0)
3279                 return true;
3280
3281         if (max_file_usec > 0) {
3282                 usec_t t, h;
3283
3284                 h = le64toh(f->header->head_entry_realtime);
3285                 t = now(CLOCK_REALTIME);
3286
3287                 if (h > 0 && t > h + max_file_usec)
3288                         return true;
3289         }
3290
3291         return false;
3292 }