]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
journald: turn off COW for journal files on btrfs
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <linux/fs.h>
30
31 #include "btrfs-util.h"
32 #include "journal-def.h"
33 #include "journal-file.h"
34 #include "journal-authenticate.h"
35 #include "lookup3.h"
36 #include "compress.h"
37 #include "fsprg.h"
38
39 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
41
42 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43
44 /* This is the minimum journal file size */
45 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
46
47 /* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51
52 /* This is the upper bound if we deduce max_size from max_use */
53 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54
55 /* This is the upper bound if we deduce the keep_free value from the
56 * file system size */
57 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58
59 /* This is the keep_free value when we can't determine the system
60 * size */
61 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
62
63 /* n_data was the first entry we added after the initial file format design */
64 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
65
66 /* How many entries to keep in the entry array chain cache at max */
67 #define CHAIN_CACHE_MAX 20
68
69 /* How much to increase the journal file size at once each time we allocate something new. */
70 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
71
72 /* Reread fstat() of the file for detecting deletions at least this often */
73 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
74
75 /* The mmap context to use for the header we pick as one above the last defined type */
76 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
77
78 static int journal_file_set_online(JournalFile *f) {
79 assert(f);
80
81 if (!f->writable)
82 return -EPERM;
83
84 if (!(f->fd >= 0 && f->header))
85 return -EINVAL;
86
87 if (mmap_cache_got_sigbus(f->mmap, f->fd))
88 return -EIO;
89
90 switch(f->header->state) {
91 case STATE_ONLINE:
92 return 0;
93
94 case STATE_OFFLINE:
95 f->header->state = STATE_ONLINE;
96 fsync(f->fd);
97 return 0;
98
99 default:
100 return -EINVAL;
101 }
102 }
103
104 int journal_file_set_offline(JournalFile *f) {
105 assert(f);
106
107 if (!f->writable)
108 return -EPERM;
109
110 if (!(f->fd >= 0 && f->header))
111 return -EINVAL;
112
113 if (f->header->state != STATE_ONLINE)
114 return 0;
115
116 fsync(f->fd);
117
118 if (mmap_cache_got_sigbus(f->mmap, f->fd))
119 return -EIO;
120
121 f->header->state = STATE_OFFLINE;
122
123 if (mmap_cache_got_sigbus(f->mmap, f->fd))
124 return -EIO;
125
126 fsync(f->fd);
127
128 return 0;
129 }
130
131 void journal_file_close(JournalFile *f) {
132 assert(f);
133
134 #ifdef HAVE_GCRYPT
135 /* Write the final tag */
136 if (f->seal && f->writable)
137 journal_file_append_tag(f);
138 #endif
139
140 journal_file_set_offline(f);
141
142 if (f->mmap && f->fd >= 0)
143 mmap_cache_close_fd(f->mmap, f->fd);
144
145 if (f->fd >= 0 && f->defrag_on_close) {
146
147 /* Be friendly to btrfs: turn COW back on again now,
148 * and defragment the file. We won't write to the file
149 * ever again, hence remove all fragmentation, and
150 * reenable all the good bits COW usually provides
151 * (such as data checksumming). */
152
153 (void) chattr_fd(f->fd, false, FS_NOCOW_FL);
154 (void) btrfs_defrag_fd(f->fd);
155 }
156
157 safe_close(f->fd);
158 free(f->path);
159
160 if (f->mmap)
161 mmap_cache_unref(f->mmap);
162
163 ordered_hashmap_free_free(f->chain_cache);
164
165 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
166 free(f->compress_buffer);
167 #endif
168
169 #ifdef HAVE_GCRYPT
170 if (f->fss_file)
171 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
172 else if (f->fsprg_state)
173 free(f->fsprg_state);
174
175 free(f->fsprg_seed);
176
177 if (f->hmac)
178 gcry_md_close(f->hmac);
179 #endif
180
181 free(f);
182 }
183
184 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
185 Header h = {};
186 ssize_t k;
187 int r;
188
189 assert(f);
190
191 memcpy(h.signature, HEADER_SIGNATURE, 8);
192 h.header_size = htole64(ALIGN64(sizeof(h)));
193
194 h.incompatible_flags |= htole32(
195 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
196 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
197
198 h.compatible_flags = htole32(
199 f->seal * HEADER_COMPATIBLE_SEALED);
200
201 r = sd_id128_randomize(&h.file_id);
202 if (r < 0)
203 return r;
204
205 if (template) {
206 h.seqnum_id = template->header->seqnum_id;
207 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
208 } else
209 h.seqnum_id = h.file_id;
210
211 k = pwrite(f->fd, &h, sizeof(h), 0);
212 if (k < 0)
213 return -errno;
214
215 if (k != sizeof(h))
216 return -EIO;
217
218 return 0;
219 }
220
221 static int journal_file_refresh_header(JournalFile *f) {
222 sd_id128_t boot_id;
223 int r;
224
225 assert(f);
226
227 r = sd_id128_get_machine(&f->header->machine_id);
228 if (r < 0)
229 return r;
230
231 r = sd_id128_get_boot(&boot_id);
232 if (r < 0)
233 return r;
234
235 if (sd_id128_equal(boot_id, f->header->boot_id))
236 f->tail_entry_monotonic_valid = true;
237
238 f->header->boot_id = boot_id;
239
240 r = journal_file_set_online(f);
241
242 /* Sync the online state to disk */
243 fsync(f->fd);
244
245 return r;
246 }
247
248 static int journal_file_verify_header(JournalFile *f) {
249 uint32_t flags;
250
251 assert(f);
252
253 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
254 return -EBADMSG;
255
256 /* In both read and write mode we refuse to open files with
257 * incompatible flags we don't know */
258 flags = le32toh(f->header->incompatible_flags);
259 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
260 if (flags & ~HEADER_INCOMPATIBLE_ANY)
261 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
262 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
263 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
264 if (flags)
265 log_debug("Journal file %s uses incompatible flags %"PRIx32
266 " disabled at compilation time.", f->path, flags);
267 return -EPROTONOSUPPORT;
268 }
269
270 /* When open for writing we refuse to open files with
271 * compatible flags, too */
272 flags = le32toh(f->header->compatible_flags);
273 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
274 if (flags & ~HEADER_COMPATIBLE_ANY)
275 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
276 f->path, flags & ~HEADER_COMPATIBLE_ANY);
277 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
278 if (flags)
279 log_debug("Journal file %s uses compatible flags %"PRIx32
280 " disabled at compilation time.", f->path, flags);
281 return -EPROTONOSUPPORT;
282 }
283
284 if (f->header->state >= _STATE_MAX)
285 return -EBADMSG;
286
287 /* The first addition was n_data, so check that we are at least this large */
288 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
289 return -EBADMSG;
290
291 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
292 return -EBADMSG;
293
294 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
295 return -ENODATA;
296
297 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
298 return -ENODATA;
299
300 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
301 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
302 !VALID64(le64toh(f->header->tail_object_offset)) ||
303 !VALID64(le64toh(f->header->entry_array_offset)))
304 return -ENODATA;
305
306 if (f->writable) {
307 uint8_t state;
308 sd_id128_t machine_id;
309 int r;
310
311 r = sd_id128_get_machine(&machine_id);
312 if (r < 0)
313 return r;
314
315 if (!sd_id128_equal(machine_id, f->header->machine_id))
316 return -EHOSTDOWN;
317
318 state = f->header->state;
319
320 if (state == STATE_ONLINE) {
321 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
322 return -EBUSY;
323 } else if (state == STATE_ARCHIVED)
324 return -ESHUTDOWN;
325 else if (state != STATE_OFFLINE) {
326 log_debug("Journal file %s has unknown state %u.", f->path, state);
327 return -EBUSY;
328 }
329 }
330
331 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
332 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
333
334 f->seal = JOURNAL_HEADER_SEALED(f->header);
335
336 return 0;
337 }
338
339 static int journal_file_fstat(JournalFile *f) {
340 assert(f);
341 assert(f->fd >= 0);
342
343 if (fstat(f->fd, &f->last_stat) < 0)
344 return -errno;
345
346 f->last_stat_usec = now(CLOCK_MONOTONIC);
347
348 /* Refuse appending to files that are already deleted */
349 if (f->last_stat.st_nlink <= 0)
350 return -EIDRM;
351
352 return 0;
353 }
354
355 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
356 uint64_t old_size, new_size;
357 int r;
358
359 assert(f);
360
361 /* We assume that this file is not sparse, and we know that
362 * for sure, since we always call posix_fallocate()
363 * ourselves */
364
365 if (mmap_cache_got_sigbus(f->mmap, f->fd))
366 return -EIO;
367
368 old_size =
369 le64toh(f->header->header_size) +
370 le64toh(f->header->arena_size);
371
372 new_size = PAGE_ALIGN(offset + size);
373 if (new_size < le64toh(f->header->header_size))
374 new_size = le64toh(f->header->header_size);
375
376 if (new_size <= old_size) {
377
378 /* We already pre-allocated enough space, but before
379 * we write to it, let's check with fstat() if the
380 * file got deleted, in order make sure we don't throw
381 * away the data immediately. Don't check fstat() for
382 * all writes though, but only once ever 10s. */
383
384 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
385 return 0;
386
387 return journal_file_fstat(f);
388 }
389
390 /* Allocate more space. */
391
392 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
393 return -E2BIG;
394
395 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
396 struct statvfs svfs;
397
398 if (fstatvfs(f->fd, &svfs) >= 0) {
399 uint64_t available;
400
401 available = svfs.f_bfree * svfs.f_bsize;
402
403 if (available >= f->metrics.keep_free)
404 available -= f->metrics.keep_free;
405 else
406 available = 0;
407
408 if (new_size - old_size > available)
409 return -E2BIG;
410 }
411 }
412
413 /* Increase by larger blocks at once */
414 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
415 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
416 new_size = f->metrics.max_size;
417
418 /* Note that the glibc fallocate() fallback is very
419 inefficient, hence we try to minimize the allocation area
420 as we can. */
421 r = posix_fallocate(f->fd, old_size, new_size - old_size);
422 if (r != 0)
423 return -r;
424
425 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
426
427 return journal_file_fstat(f);
428 }
429
430 static unsigned type_to_context(ObjectType type) {
431 /* One context for each type, plus one catch-all for the rest */
432 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
433 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
434 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
435 }
436
437 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
438 int r;
439
440 assert(f);
441 assert(ret);
442
443 if (size <= 0)
444 return -EINVAL;
445
446 /* Avoid SIGBUS on invalid accesses */
447 if (offset + size > (uint64_t) f->last_stat.st_size) {
448 /* Hmm, out of range? Let's refresh the fstat() data
449 * first, before we trust that check. */
450
451 r = journal_file_fstat(f);
452 if (r < 0)
453 return r;
454
455 if (offset + size > (uint64_t) f->last_stat.st_size)
456 return -EADDRNOTAVAIL;
457 }
458
459 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
460 }
461
462 static uint64_t minimum_header_size(Object *o) {
463
464 static const uint64_t table[] = {
465 [OBJECT_DATA] = sizeof(DataObject),
466 [OBJECT_FIELD] = sizeof(FieldObject),
467 [OBJECT_ENTRY] = sizeof(EntryObject),
468 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
469 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
470 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
471 [OBJECT_TAG] = sizeof(TagObject),
472 };
473
474 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
475 return sizeof(ObjectHeader);
476
477 return table[o->object.type];
478 }
479
480 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
481 int r;
482 void *t;
483 Object *o;
484 uint64_t s;
485
486 assert(f);
487 assert(ret);
488
489 /* Objects may only be located at multiple of 64 bit */
490 if (!VALID64(offset))
491 return -EFAULT;
492
493 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
494 if (r < 0)
495 return r;
496
497 o = (Object*) t;
498 s = le64toh(o->object.size);
499
500 if (s < sizeof(ObjectHeader))
501 return -EBADMSG;
502
503 if (o->object.type <= OBJECT_UNUSED)
504 return -EBADMSG;
505
506 if (s < minimum_header_size(o))
507 return -EBADMSG;
508
509 if (type > OBJECT_UNUSED && o->object.type != type)
510 return -EBADMSG;
511
512 if (s > sizeof(ObjectHeader)) {
513 r = journal_file_move_to(f, type, false, offset, s, &t);
514 if (r < 0)
515 return r;
516
517 o = (Object*) t;
518 }
519
520 *ret = o;
521 return 0;
522 }
523
524 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
525 uint64_t r;
526
527 assert(f);
528
529 r = le64toh(f->header->tail_entry_seqnum) + 1;
530
531 if (seqnum) {
532 /* If an external seqnum counter was passed, we update
533 * both the local and the external one, and set it to
534 * the maximum of both */
535
536 if (*seqnum + 1 > r)
537 r = *seqnum + 1;
538
539 *seqnum = r;
540 }
541
542 f->header->tail_entry_seqnum = htole64(r);
543
544 if (f->header->head_entry_seqnum == 0)
545 f->header->head_entry_seqnum = htole64(r);
546
547 return r;
548 }
549
550 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
551 int r;
552 uint64_t p;
553 Object *tail, *o;
554 void *t;
555
556 assert(f);
557 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
558 assert(size >= sizeof(ObjectHeader));
559 assert(offset);
560 assert(ret);
561
562 r = journal_file_set_online(f);
563 if (r < 0)
564 return r;
565
566 p = le64toh(f->header->tail_object_offset);
567 if (p == 0)
568 p = le64toh(f->header->header_size);
569 else {
570 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
571 if (r < 0)
572 return r;
573
574 p += ALIGN64(le64toh(tail->object.size));
575 }
576
577 r = journal_file_allocate(f, p, size);
578 if (r < 0)
579 return r;
580
581 r = journal_file_move_to(f, type, false, p, size, &t);
582 if (r < 0)
583 return r;
584
585 o = (Object*) t;
586
587 zero(o->object);
588 o->object.type = type;
589 o->object.size = htole64(size);
590
591 f->header->tail_object_offset = htole64(p);
592 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
593
594 *ret = o;
595 *offset = p;
596
597 return 0;
598 }
599
600 static int journal_file_setup_data_hash_table(JournalFile *f) {
601 uint64_t s, p;
602 Object *o;
603 int r;
604
605 assert(f);
606
607 /* We estimate that we need 1 hash table entry per 768 of
608 journal file and we want to make sure we never get beyond
609 75% fill level. Calculate the hash table size for the
610 maximum file size based on these metrics. */
611
612 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
613 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
614 s = DEFAULT_DATA_HASH_TABLE_SIZE;
615
616 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
617
618 r = journal_file_append_object(f,
619 OBJECT_DATA_HASH_TABLE,
620 offsetof(Object, hash_table.items) + s,
621 &o, &p);
622 if (r < 0)
623 return r;
624
625 memzero(o->hash_table.items, s);
626
627 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
628 f->header->data_hash_table_size = htole64(s);
629
630 return 0;
631 }
632
633 static int journal_file_setup_field_hash_table(JournalFile *f) {
634 uint64_t s, p;
635 Object *o;
636 int r;
637
638 assert(f);
639
640 /* We use a fixed size hash table for the fields as this
641 * number should grow very slowly only */
642
643 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
644 r = journal_file_append_object(f,
645 OBJECT_FIELD_HASH_TABLE,
646 offsetof(Object, hash_table.items) + s,
647 &o, &p);
648 if (r < 0)
649 return r;
650
651 memzero(o->hash_table.items, s);
652
653 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
654 f->header->field_hash_table_size = htole64(s);
655
656 return 0;
657 }
658
659 static int journal_file_map_data_hash_table(JournalFile *f) {
660 uint64_t s, p;
661 void *t;
662 int r;
663
664 assert(f);
665
666 p = le64toh(f->header->data_hash_table_offset);
667 s = le64toh(f->header->data_hash_table_size);
668
669 r = journal_file_move_to(f,
670 OBJECT_DATA_HASH_TABLE,
671 true,
672 p, s,
673 &t);
674 if (r < 0)
675 return r;
676
677 f->data_hash_table = t;
678 return 0;
679 }
680
681 static int journal_file_map_field_hash_table(JournalFile *f) {
682 uint64_t s, p;
683 void *t;
684 int r;
685
686 assert(f);
687
688 p = le64toh(f->header->field_hash_table_offset);
689 s = le64toh(f->header->field_hash_table_size);
690
691 r = journal_file_move_to(f,
692 OBJECT_FIELD_HASH_TABLE,
693 true,
694 p, s,
695 &t);
696 if (r < 0)
697 return r;
698
699 f->field_hash_table = t;
700 return 0;
701 }
702
703 static int journal_file_link_field(
704 JournalFile *f,
705 Object *o,
706 uint64_t offset,
707 uint64_t hash) {
708
709 uint64_t p, h, m;
710 int r;
711
712 assert(f);
713 assert(o);
714 assert(offset > 0);
715
716 if (o->object.type != OBJECT_FIELD)
717 return -EINVAL;
718
719 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
720 if (m <= 0)
721 return -EBADMSG;
722
723 /* This might alter the window we are looking at */
724 o->field.next_hash_offset = o->field.head_data_offset = 0;
725
726 h = hash % m;
727 p = le64toh(f->field_hash_table[h].tail_hash_offset);
728 if (p == 0)
729 f->field_hash_table[h].head_hash_offset = htole64(offset);
730 else {
731 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
732 if (r < 0)
733 return r;
734
735 o->field.next_hash_offset = htole64(offset);
736 }
737
738 f->field_hash_table[h].tail_hash_offset = htole64(offset);
739
740 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
741 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
742
743 return 0;
744 }
745
746 static int journal_file_link_data(
747 JournalFile *f,
748 Object *o,
749 uint64_t offset,
750 uint64_t hash) {
751
752 uint64_t p, h, m;
753 int r;
754
755 assert(f);
756 assert(o);
757 assert(offset > 0);
758
759 if (o->object.type != OBJECT_DATA)
760 return -EINVAL;
761
762 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
763 if (m <= 0)
764 return -EBADMSG;
765
766 /* This might alter the window we are looking at */
767 o->data.next_hash_offset = o->data.next_field_offset = 0;
768 o->data.entry_offset = o->data.entry_array_offset = 0;
769 o->data.n_entries = 0;
770
771 h = hash % m;
772 p = le64toh(f->data_hash_table[h].tail_hash_offset);
773 if (p == 0)
774 /* Only entry in the hash table is easy */
775 f->data_hash_table[h].head_hash_offset = htole64(offset);
776 else {
777 /* Move back to the previous data object, to patch in
778 * pointer */
779
780 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
781 if (r < 0)
782 return r;
783
784 o->data.next_hash_offset = htole64(offset);
785 }
786
787 f->data_hash_table[h].tail_hash_offset = htole64(offset);
788
789 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
790 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
791
792 return 0;
793 }
794
795 int journal_file_find_field_object_with_hash(
796 JournalFile *f,
797 const void *field, uint64_t size, uint64_t hash,
798 Object **ret, uint64_t *offset) {
799
800 uint64_t p, osize, h, m;
801 int r;
802
803 assert(f);
804 assert(field && size > 0);
805
806 osize = offsetof(Object, field.payload) + size;
807
808 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
809
810 if (m <= 0)
811 return -EBADMSG;
812
813 h = hash % m;
814 p = le64toh(f->field_hash_table[h].head_hash_offset);
815
816 while (p > 0) {
817 Object *o;
818
819 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
820 if (r < 0)
821 return r;
822
823 if (le64toh(o->field.hash) == hash &&
824 le64toh(o->object.size) == osize &&
825 memcmp(o->field.payload, field, size) == 0) {
826
827 if (ret)
828 *ret = o;
829 if (offset)
830 *offset = p;
831
832 return 1;
833 }
834
835 p = le64toh(o->field.next_hash_offset);
836 }
837
838 return 0;
839 }
840
841 int journal_file_find_field_object(
842 JournalFile *f,
843 const void *field, uint64_t size,
844 Object **ret, uint64_t *offset) {
845
846 uint64_t hash;
847
848 assert(f);
849 assert(field && size > 0);
850
851 hash = hash64(field, size);
852
853 return journal_file_find_field_object_with_hash(f,
854 field, size, hash,
855 ret, offset);
856 }
857
858 int journal_file_find_data_object_with_hash(
859 JournalFile *f,
860 const void *data, uint64_t size, uint64_t hash,
861 Object **ret, uint64_t *offset) {
862
863 uint64_t p, osize, h, m;
864 int r;
865
866 assert(f);
867 assert(data || size == 0);
868
869 osize = offsetof(Object, data.payload) + size;
870
871 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
872 if (m <= 0)
873 return -EBADMSG;
874
875 h = hash % m;
876 p = le64toh(f->data_hash_table[h].head_hash_offset);
877
878 while (p > 0) {
879 Object *o;
880
881 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
882 if (r < 0)
883 return r;
884
885 if (le64toh(o->data.hash) != hash)
886 goto next;
887
888 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
889 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
890 uint64_t l;
891 size_t rsize;
892
893 l = le64toh(o->object.size);
894 if (l <= offsetof(Object, data.payload))
895 return -EBADMSG;
896
897 l -= offsetof(Object, data.payload);
898
899 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
900 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
901 if (r < 0)
902 return r;
903
904 if (rsize == size &&
905 memcmp(f->compress_buffer, data, size) == 0) {
906
907 if (ret)
908 *ret = o;
909
910 if (offset)
911 *offset = p;
912
913 return 1;
914 }
915 #else
916 return -EPROTONOSUPPORT;
917 #endif
918 } else if (le64toh(o->object.size) == osize &&
919 memcmp(o->data.payload, data, size) == 0) {
920
921 if (ret)
922 *ret = o;
923
924 if (offset)
925 *offset = p;
926
927 return 1;
928 }
929
930 next:
931 p = le64toh(o->data.next_hash_offset);
932 }
933
934 return 0;
935 }
936
937 int journal_file_find_data_object(
938 JournalFile *f,
939 const void *data, uint64_t size,
940 Object **ret, uint64_t *offset) {
941
942 uint64_t hash;
943
944 assert(f);
945 assert(data || size == 0);
946
947 hash = hash64(data, size);
948
949 return journal_file_find_data_object_with_hash(f,
950 data, size, hash,
951 ret, offset);
952 }
953
954 static int journal_file_append_field(
955 JournalFile *f,
956 const void *field, uint64_t size,
957 Object **ret, uint64_t *offset) {
958
959 uint64_t hash, p;
960 uint64_t osize;
961 Object *o;
962 int r;
963
964 assert(f);
965 assert(field && size > 0);
966
967 hash = hash64(field, size);
968
969 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
970 if (r < 0)
971 return r;
972 else if (r > 0) {
973
974 if (ret)
975 *ret = o;
976
977 if (offset)
978 *offset = p;
979
980 return 0;
981 }
982
983 osize = offsetof(Object, field.payload) + size;
984 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
985 if (r < 0)
986 return r;
987
988 o->field.hash = htole64(hash);
989 memcpy(o->field.payload, field, size);
990
991 r = journal_file_link_field(f, o, p, hash);
992 if (r < 0)
993 return r;
994
995 /* The linking might have altered the window, so let's
996 * refresh our pointer */
997 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
998 if (r < 0)
999 return r;
1000
1001 #ifdef HAVE_GCRYPT
1002 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1003 if (r < 0)
1004 return r;
1005 #endif
1006
1007 if (ret)
1008 *ret = o;
1009
1010 if (offset)
1011 *offset = p;
1012
1013 return 0;
1014 }
1015
1016 static int journal_file_append_data(
1017 JournalFile *f,
1018 const void *data, uint64_t size,
1019 Object **ret, uint64_t *offset) {
1020
1021 uint64_t hash, p;
1022 uint64_t osize;
1023 Object *o;
1024 int r, compression = 0;
1025 const void *eq;
1026
1027 assert(f);
1028 assert(data || size == 0);
1029
1030 hash = hash64(data, size);
1031
1032 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1033 if (r < 0)
1034 return r;
1035 else if (r > 0) {
1036
1037 if (ret)
1038 *ret = o;
1039
1040 if (offset)
1041 *offset = p;
1042
1043 return 0;
1044 }
1045
1046 osize = offsetof(Object, data.payload) + size;
1047 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1048 if (r < 0)
1049 return r;
1050
1051 o->data.hash = htole64(hash);
1052
1053 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1054 if (f->compress_xz &&
1055 size >= COMPRESSION_SIZE_THRESHOLD) {
1056 size_t rsize;
1057
1058 compression = compress_blob(data, size, o->data.payload, &rsize);
1059
1060 if (compression) {
1061 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1062 o->object.flags |= compression;
1063
1064 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1065 size, rsize, object_compressed_to_string(compression));
1066 }
1067 }
1068 #endif
1069
1070 if (!compression && size > 0)
1071 memcpy(o->data.payload, data, size);
1072
1073 r = journal_file_link_data(f, o, p, hash);
1074 if (r < 0)
1075 return r;
1076
1077 /* The linking might have altered the window, so let's
1078 * refresh our pointer */
1079 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1080 if (r < 0)
1081 return r;
1082
1083 if (!data)
1084 eq = NULL;
1085 else
1086 eq = memchr(data, '=', size);
1087 if (eq && eq > data) {
1088 Object *fo = NULL;
1089 uint64_t fp;
1090
1091 /* Create field object ... */
1092 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1093 if (r < 0)
1094 return r;
1095
1096 /* ... and link it in. */
1097 o->data.next_field_offset = fo->field.head_data_offset;
1098 fo->field.head_data_offset = le64toh(p);
1099 }
1100
1101 #ifdef HAVE_GCRYPT
1102 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1103 if (r < 0)
1104 return r;
1105 #endif
1106
1107 if (ret)
1108 *ret = o;
1109
1110 if (offset)
1111 *offset = p;
1112
1113 return 0;
1114 }
1115
1116 uint64_t journal_file_entry_n_items(Object *o) {
1117 assert(o);
1118
1119 if (o->object.type != OBJECT_ENTRY)
1120 return 0;
1121
1122 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1123 }
1124
1125 uint64_t journal_file_entry_array_n_items(Object *o) {
1126 assert(o);
1127
1128 if (o->object.type != OBJECT_ENTRY_ARRAY)
1129 return 0;
1130
1131 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1132 }
1133
1134 uint64_t journal_file_hash_table_n_items(Object *o) {
1135 assert(o);
1136
1137 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1138 o->object.type != OBJECT_FIELD_HASH_TABLE)
1139 return 0;
1140
1141 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1142 }
1143
1144 static int link_entry_into_array(JournalFile *f,
1145 le64_t *first,
1146 le64_t *idx,
1147 uint64_t p) {
1148 int r;
1149 uint64_t n = 0, ap = 0, q, i, a, hidx;
1150 Object *o;
1151
1152 assert(f);
1153 assert(first);
1154 assert(idx);
1155 assert(p > 0);
1156
1157 a = le64toh(*first);
1158 i = hidx = le64toh(*idx);
1159 while (a > 0) {
1160
1161 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1162 if (r < 0)
1163 return r;
1164
1165 n = journal_file_entry_array_n_items(o);
1166 if (i < n) {
1167 o->entry_array.items[i] = htole64(p);
1168 *idx = htole64(hidx + 1);
1169 return 0;
1170 }
1171
1172 i -= n;
1173 ap = a;
1174 a = le64toh(o->entry_array.next_entry_array_offset);
1175 }
1176
1177 if (hidx > n)
1178 n = (hidx+1) * 2;
1179 else
1180 n = n * 2;
1181
1182 if (n < 4)
1183 n = 4;
1184
1185 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1186 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1187 &o, &q);
1188 if (r < 0)
1189 return r;
1190
1191 #ifdef HAVE_GCRYPT
1192 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1193 if (r < 0)
1194 return r;
1195 #endif
1196
1197 o->entry_array.items[i] = htole64(p);
1198
1199 if (ap == 0)
1200 *first = htole64(q);
1201 else {
1202 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1203 if (r < 0)
1204 return r;
1205
1206 o->entry_array.next_entry_array_offset = htole64(q);
1207 }
1208
1209 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1210 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1211
1212 *idx = htole64(hidx + 1);
1213
1214 return 0;
1215 }
1216
1217 static int link_entry_into_array_plus_one(JournalFile *f,
1218 le64_t *extra,
1219 le64_t *first,
1220 le64_t *idx,
1221 uint64_t p) {
1222
1223 int r;
1224
1225 assert(f);
1226 assert(extra);
1227 assert(first);
1228 assert(idx);
1229 assert(p > 0);
1230
1231 if (*idx == 0)
1232 *extra = htole64(p);
1233 else {
1234 le64_t i;
1235
1236 i = htole64(le64toh(*idx) - 1);
1237 r = link_entry_into_array(f, first, &i, p);
1238 if (r < 0)
1239 return r;
1240 }
1241
1242 *idx = htole64(le64toh(*idx) + 1);
1243 return 0;
1244 }
1245
1246 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1247 uint64_t p;
1248 int r;
1249 assert(f);
1250 assert(o);
1251 assert(offset > 0);
1252
1253 p = le64toh(o->entry.items[i].object_offset);
1254 if (p == 0)
1255 return -EINVAL;
1256
1257 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1258 if (r < 0)
1259 return r;
1260
1261 return link_entry_into_array_plus_one(f,
1262 &o->data.entry_offset,
1263 &o->data.entry_array_offset,
1264 &o->data.n_entries,
1265 offset);
1266 }
1267
1268 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1269 uint64_t n, i;
1270 int r;
1271
1272 assert(f);
1273 assert(o);
1274 assert(offset > 0);
1275
1276 if (o->object.type != OBJECT_ENTRY)
1277 return -EINVAL;
1278
1279 __sync_synchronize();
1280
1281 /* Link up the entry itself */
1282 r = link_entry_into_array(f,
1283 &f->header->entry_array_offset,
1284 &f->header->n_entries,
1285 offset);
1286 if (r < 0)
1287 return r;
1288
1289 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1290
1291 if (f->header->head_entry_realtime == 0)
1292 f->header->head_entry_realtime = o->entry.realtime;
1293
1294 f->header->tail_entry_realtime = o->entry.realtime;
1295 f->header->tail_entry_monotonic = o->entry.monotonic;
1296
1297 f->tail_entry_monotonic_valid = true;
1298
1299 /* Link up the items */
1300 n = journal_file_entry_n_items(o);
1301 for (i = 0; i < n; i++) {
1302 r = journal_file_link_entry_item(f, o, offset, i);
1303 if (r < 0)
1304 return r;
1305 }
1306
1307 return 0;
1308 }
1309
1310 static int journal_file_append_entry_internal(
1311 JournalFile *f,
1312 const dual_timestamp *ts,
1313 uint64_t xor_hash,
1314 const EntryItem items[], unsigned n_items,
1315 uint64_t *seqnum,
1316 Object **ret, uint64_t *offset) {
1317 uint64_t np;
1318 uint64_t osize;
1319 Object *o;
1320 int r;
1321
1322 assert(f);
1323 assert(items || n_items == 0);
1324 assert(ts);
1325
1326 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1327
1328 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1329 if (r < 0)
1330 return r;
1331
1332 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1333 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1334 o->entry.realtime = htole64(ts->realtime);
1335 o->entry.monotonic = htole64(ts->monotonic);
1336 o->entry.xor_hash = htole64(xor_hash);
1337 o->entry.boot_id = f->header->boot_id;
1338
1339 #ifdef HAVE_GCRYPT
1340 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1341 if (r < 0)
1342 return r;
1343 #endif
1344
1345 r = journal_file_link_entry(f, o, np);
1346 if (r < 0)
1347 return r;
1348
1349 if (ret)
1350 *ret = o;
1351
1352 if (offset)
1353 *offset = np;
1354
1355 return 0;
1356 }
1357
1358 void journal_file_post_change(JournalFile *f) {
1359 assert(f);
1360
1361 /* inotify() does not receive IN_MODIFY events from file
1362 * accesses done via mmap(). After each access we hence
1363 * trigger IN_MODIFY by truncating the journal file to its
1364 * current size which triggers IN_MODIFY. */
1365
1366 __sync_synchronize();
1367
1368 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1369 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1370 }
1371
1372 static int entry_item_cmp(const void *_a, const void *_b) {
1373 const EntryItem *a = _a, *b = _b;
1374
1375 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1376 return -1;
1377 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1378 return 1;
1379 return 0;
1380 }
1381
1382 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1383 unsigned i;
1384 EntryItem *items;
1385 int r;
1386 uint64_t xor_hash = 0;
1387 struct dual_timestamp _ts;
1388
1389 assert(f);
1390 assert(iovec || n_iovec == 0);
1391
1392 if (!ts) {
1393 dual_timestamp_get(&_ts);
1394 ts = &_ts;
1395 }
1396
1397 if (f->tail_entry_monotonic_valid &&
1398 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1399 return -EINVAL;
1400
1401 #ifdef HAVE_GCRYPT
1402 r = journal_file_maybe_append_tag(f, ts->realtime);
1403 if (r < 0)
1404 return r;
1405 #endif
1406
1407 /* alloca() can't take 0, hence let's allocate at least one */
1408 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1409
1410 for (i = 0; i < n_iovec; i++) {
1411 uint64_t p;
1412 Object *o;
1413
1414 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1415 if (r < 0)
1416 return r;
1417
1418 xor_hash ^= le64toh(o->data.hash);
1419 items[i].object_offset = htole64(p);
1420 items[i].hash = o->data.hash;
1421 }
1422
1423 /* Order by the position on disk, in order to improve seek
1424 * times for rotating media. */
1425 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1426
1427 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1428
1429 /* If the memory mapping triggered a SIGBUS then we return an
1430 * IO error and ignore the error code passed down to us, since
1431 * it is very likely just an effect of a nullified replacement
1432 * mapping page */
1433
1434 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1435 r = -EIO;
1436
1437 journal_file_post_change(f);
1438
1439 return r;
1440 }
1441
/* One cached position inside an entry array chain, keyed by the offset of
 * the chain's first array. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1449
1450 static void chain_cache_put(
1451 OrderedHashmap *h,
1452 ChainCacheItem *ci,
1453 uint64_t first,
1454 uint64_t array,
1455 uint64_t begin,
1456 uint64_t total,
1457 uint64_t last_index) {
1458
1459 if (!ci) {
1460 /* If the chain item to cache for this chain is the
1461 * first one it's not worth caching anything */
1462 if (array == first)
1463 return;
1464
1465 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1466 ci = ordered_hashmap_steal_first(h);
1467 assert(ci);
1468 } else {
1469 ci = new(ChainCacheItem, 1);
1470 if (!ci)
1471 return;
1472 }
1473
1474 ci->first = first;
1475
1476 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1477 free(ci);
1478 return;
1479 }
1480 } else
1481 assert(ci->first == first);
1482
1483 ci->array = array;
1484 ci->begin = begin;
1485 ci->total = total;
1486 ci->last_index = last_index;
1487 }
1488
1489 static int generic_array_get(
1490 JournalFile *f,
1491 uint64_t first,
1492 uint64_t i,
1493 Object **ret, uint64_t *offset) {
1494
1495 Object *o;
1496 uint64_t p = 0, a, t = 0;
1497 int r;
1498 ChainCacheItem *ci;
1499
1500 assert(f);
1501
1502 a = first;
1503
1504 /* Try the chain cache first */
1505 ci = ordered_hashmap_get(f->chain_cache, &first);
1506 if (ci && i > ci->total) {
1507 a = ci->array;
1508 i -= ci->total;
1509 t = ci->total;
1510 }
1511
1512 while (a > 0) {
1513 uint64_t k;
1514
1515 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1516 if (r < 0)
1517 return r;
1518
1519 k = journal_file_entry_array_n_items(o);
1520 if (i < k) {
1521 p = le64toh(o->entry_array.items[i]);
1522 goto found;
1523 }
1524
1525 i -= k;
1526 t += k;
1527 a = le64toh(o->entry_array.next_entry_array_offset);
1528 }
1529
1530 return 0;
1531
1532 found:
1533 /* Let's cache this item for the next invocation */
1534 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1535
1536 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1537 if (r < 0)
1538 return r;
1539
1540 if (ret)
1541 *ret = o;
1542
1543 if (offset)
1544 *offset = p;
1545
1546 return 1;
1547 }
1548
1549 static int generic_array_get_plus_one(
1550 JournalFile *f,
1551 uint64_t extra,
1552 uint64_t first,
1553 uint64_t i,
1554 Object **ret, uint64_t *offset) {
1555
1556 Object *o;
1557
1558 assert(f);
1559
1560 if (i == 0) {
1561 int r;
1562
1563 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1564 if (r < 0)
1565 return r;
1566
1567 if (ret)
1568 *ret = o;
1569
1570 if (offset)
1571 *offset = extra;
1572
1573 return 1;
1574 }
1575
1576 return generic_array_get(f, first, i-1, ret, offset);
1577 }
1578
/* Results of the test_object callbacks used while bisecting: the tested
 * entry matches the needle, or the needle lies to the right (TEST_LEFT) or
 * to the left (TEST_RIGHT) of it. */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
1584
1585 static int generic_array_bisect(
1586 JournalFile *f,
1587 uint64_t first,
1588 uint64_t n,
1589 uint64_t needle,
1590 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1591 direction_t direction,
1592 Object **ret,
1593 uint64_t *offset,
1594 uint64_t *idx) {
1595
1596 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1597 bool subtract_one = false;
1598 Object *o, *array = NULL;
1599 int r;
1600 ChainCacheItem *ci;
1601
1602 assert(f);
1603 assert(test_object);
1604
1605 /* Start with the first array in the chain */
1606 a = first;
1607
1608 ci = ordered_hashmap_get(f->chain_cache, &first);
1609 if (ci && n > ci->total) {
1610 /* Ah, we have iterated this bisection array chain
1611 * previously! Let's see if we can skip ahead in the
1612 * chain, as far as the last time. But we can't jump
1613 * backwards in the chain, so let's check that
1614 * first. */
1615
1616 r = test_object(f, ci->begin, needle);
1617 if (r < 0)
1618 return r;
1619
1620 if (r == TEST_LEFT) {
1621 /* OK, what we are looking for is right of the
1622 * begin of this EntryArray, so let's jump
1623 * straight to previously cached array in the
1624 * chain */
1625
1626 a = ci->array;
1627 n -= ci->total;
1628 t = ci->total;
1629 last_index = ci->last_index;
1630 }
1631 }
1632
1633 while (a > 0) {
1634 uint64_t left, right, k, lp;
1635
1636 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1637 if (r < 0)
1638 return r;
1639
1640 k = journal_file_entry_array_n_items(array);
1641 right = MIN(k, n);
1642 if (right <= 0)
1643 return 0;
1644
1645 i = right - 1;
1646 lp = p = le64toh(array->entry_array.items[i]);
1647 if (p <= 0)
1648 return -EBADMSG;
1649
1650 r = test_object(f, p, needle);
1651 if (r < 0)
1652 return r;
1653
1654 if (r == TEST_FOUND)
1655 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1656
1657 if (r == TEST_RIGHT) {
1658 left = 0;
1659 right -= 1;
1660
1661 if (last_index != (uint64_t) -1) {
1662 assert(last_index <= right);
1663
1664 /* If we cached the last index we
1665 * looked at, let's try to not to jump
1666 * too wildly around and see if we can
1667 * limit the range to look at early to
1668 * the immediate neighbors of the last
1669 * index we looked at. */
1670
1671 if (last_index > 0) {
1672 uint64_t x = last_index - 1;
1673
1674 p = le64toh(array->entry_array.items[x]);
1675 if (p <= 0)
1676 return -EBADMSG;
1677
1678 r = test_object(f, p, needle);
1679 if (r < 0)
1680 return r;
1681
1682 if (r == TEST_FOUND)
1683 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1684
1685 if (r == TEST_RIGHT)
1686 right = x;
1687 else
1688 left = x + 1;
1689 }
1690
1691 if (last_index < right) {
1692 uint64_t y = last_index + 1;
1693
1694 p = le64toh(array->entry_array.items[y]);
1695 if (p <= 0)
1696 return -EBADMSG;
1697
1698 r = test_object(f, p, needle);
1699 if (r < 0)
1700 return r;
1701
1702 if (r == TEST_FOUND)
1703 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1704
1705 if (r == TEST_RIGHT)
1706 right = y;
1707 else
1708 left = y + 1;
1709 }
1710 }
1711
1712 for (;;) {
1713 if (left == right) {
1714 if (direction == DIRECTION_UP)
1715 subtract_one = true;
1716
1717 i = left;
1718 goto found;
1719 }
1720
1721 assert(left < right);
1722 i = (left + right) / 2;
1723
1724 p = le64toh(array->entry_array.items[i]);
1725 if (p <= 0)
1726 return -EBADMSG;
1727
1728 r = test_object(f, p, needle);
1729 if (r < 0)
1730 return r;
1731
1732 if (r == TEST_FOUND)
1733 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1734
1735 if (r == TEST_RIGHT)
1736 right = i;
1737 else
1738 left = i + 1;
1739 }
1740 }
1741
1742 if (k >= n) {
1743 if (direction == DIRECTION_UP) {
1744 i = n;
1745 subtract_one = true;
1746 goto found;
1747 }
1748
1749 return 0;
1750 }
1751
1752 last_p = lp;
1753
1754 n -= k;
1755 t += k;
1756 last_index = (uint64_t) -1;
1757 a = le64toh(array->entry_array.next_entry_array_offset);
1758 }
1759
1760 return 0;
1761
1762 found:
1763 if (subtract_one && t == 0 && i == 0)
1764 return 0;
1765
1766 /* Let's cache this item for the next invocation */
1767 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1768
1769 if (subtract_one && i == 0)
1770 p = last_p;
1771 else if (subtract_one)
1772 p = le64toh(array->entry_array.items[i-1]);
1773 else
1774 p = le64toh(array->entry_array.items[i]);
1775
1776 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1777 if (r < 0)
1778 return r;
1779
1780 if (ret)
1781 *ret = o;
1782
1783 if (offset)
1784 *offset = p;
1785
1786 if (idx)
1787 *idx = t + i + (subtract_one ? -1 : 0);
1788
1789 return 1;
1790 }
1791
1792 static int generic_array_bisect_plus_one(
1793 JournalFile *f,
1794 uint64_t extra,
1795 uint64_t first,
1796 uint64_t n,
1797 uint64_t needle,
1798 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1799 direction_t direction,
1800 Object **ret,
1801 uint64_t *offset,
1802 uint64_t *idx) {
1803
1804 int r;
1805 bool step_back = false;
1806 Object *o;
1807
1808 assert(f);
1809 assert(test_object);
1810
1811 if (n <= 0)
1812 return 0;
1813
1814 /* This bisects the array in object 'first', but first checks
1815 * an extra */
1816 r = test_object(f, extra, needle);
1817 if (r < 0)
1818 return r;
1819
1820 if (r == TEST_FOUND)
1821 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1822
1823 /* if we are looking with DIRECTION_UP then we need to first
1824 see if in the actual array there is a matching entry, and
1825 return the last one of that. But if there isn't any we need
1826 to return this one. Hence remember this, and return it
1827 below. */
1828 if (r == TEST_LEFT)
1829 step_back = direction == DIRECTION_UP;
1830
1831 if (r == TEST_RIGHT) {
1832 if (direction == DIRECTION_DOWN)
1833 goto found;
1834 else
1835 return 0;
1836 }
1837
1838 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1839
1840 if (r == 0 && step_back)
1841 goto found;
1842
1843 if (r > 0 && idx)
1844 (*idx) ++;
1845
1846 return r;
1847
1848 found:
1849 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1850 if (r < 0)
1851 return r;
1852
1853 if (ret)
1854 *ret = o;
1855
1856 if (offset)
1857 *offset = extra;
1858
1859 if (idx)
1860 *idx = 0;
1861
1862 return 1;
1863 }
1864
1865 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1866 assert(f);
1867 assert(p > 0);
1868
1869 if (p == needle)
1870 return TEST_FOUND;
1871 else if (p < needle)
1872 return TEST_LEFT;
1873 else
1874 return TEST_RIGHT;
1875 }
1876
1877 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1878 Object *o;
1879 int r;
1880
1881 assert(f);
1882 assert(p > 0);
1883
1884 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1885 if (r < 0)
1886 return r;
1887
1888 if (le64toh(o->entry.seqnum) == needle)
1889 return TEST_FOUND;
1890 else if (le64toh(o->entry.seqnum) < needle)
1891 return TEST_LEFT;
1892 else
1893 return TEST_RIGHT;
1894 }
1895
1896 int journal_file_move_to_entry_by_seqnum(
1897 JournalFile *f,
1898 uint64_t seqnum,
1899 direction_t direction,
1900 Object **ret,
1901 uint64_t *offset) {
1902
1903 return generic_array_bisect(f,
1904 le64toh(f->header->entry_array_offset),
1905 le64toh(f->header->n_entries),
1906 seqnum,
1907 test_object_seqnum,
1908 direction,
1909 ret, offset, NULL);
1910 }
1911
1912 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1913 Object *o;
1914 int r;
1915
1916 assert(f);
1917 assert(p > 0);
1918
1919 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1920 if (r < 0)
1921 return r;
1922
1923 if (le64toh(o->entry.realtime) == needle)
1924 return TEST_FOUND;
1925 else if (le64toh(o->entry.realtime) < needle)
1926 return TEST_LEFT;
1927 else
1928 return TEST_RIGHT;
1929 }
1930
1931 int journal_file_move_to_entry_by_realtime(
1932 JournalFile *f,
1933 uint64_t realtime,
1934 direction_t direction,
1935 Object **ret,
1936 uint64_t *offset) {
1937
1938 return generic_array_bisect(f,
1939 le64toh(f->header->entry_array_offset),
1940 le64toh(f->header->n_entries),
1941 realtime,
1942 test_object_realtime,
1943 direction,
1944 ret, offset, NULL);
1945 }
1946
1947 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1948 Object *o;
1949 int r;
1950
1951 assert(f);
1952 assert(p > 0);
1953
1954 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1955 if (r < 0)
1956 return r;
1957
1958 if (le64toh(o->entry.monotonic) == needle)
1959 return TEST_FOUND;
1960 else if (le64toh(o->entry.monotonic) < needle)
1961 return TEST_LEFT;
1962 else
1963 return TEST_RIGHT;
1964 }
1965
1966 static inline int find_data_object_by_boot_id(
1967 JournalFile *f,
1968 sd_id128_t boot_id,
1969 Object **o,
1970 uint64_t *b) {
1971 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1972
1973 sd_id128_to_string(boot_id, t + 9);
1974 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1975 }
1976
1977 int journal_file_move_to_entry_by_monotonic(
1978 JournalFile *f,
1979 sd_id128_t boot_id,
1980 uint64_t monotonic,
1981 direction_t direction,
1982 Object **ret,
1983 uint64_t *offset) {
1984
1985 Object *o;
1986 int r;
1987
1988 assert(f);
1989
1990 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1991 if (r < 0)
1992 return r;
1993 if (r == 0)
1994 return -ENOENT;
1995
1996 return generic_array_bisect_plus_one(f,
1997 le64toh(o->data.entry_offset),
1998 le64toh(o->data.entry_array_offset),
1999 le64toh(o->data.n_entries),
2000 monotonic,
2001 test_object_monotonic,
2002 direction,
2003 ret, offset, NULL);
2004 }
2005
2006 void journal_file_reset_location(JournalFile *f) {
2007 f->location_type = LOCATION_HEAD;
2008 f->current_offset = 0;
2009 f->current_seqnum = 0;
2010 f->current_realtime = 0;
2011 f->current_monotonic = 0;
2012 zero(f->current_boot_id);
2013 f->current_xor_hash = 0;
2014 }
2015
2016 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
2017 f->last_direction = direction;
2018 f->location_type = LOCATION_SEEK;
2019 f->current_offset = offset;
2020 f->current_seqnum = le64toh(o->entry.seqnum);
2021 f->current_realtime = le64toh(o->entry.realtime);
2022 f->current_monotonic = le64toh(o->entry.monotonic);
2023 f->current_boot_id = o->entry.boot_id;
2024 f->current_xor_hash = le64toh(o->entry.xor_hash);
2025 }
2026
2027 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2028 assert(af);
2029 assert(bf);
2030 assert(af->location_type == LOCATION_SEEK);
2031 assert(bf->location_type == LOCATION_SEEK);
2032
2033 /* If contents and timestamps match, these entries are
2034 * identical, even if the seqnum does not match */
2035 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2036 af->current_monotonic == bf->current_monotonic &&
2037 af->current_realtime == bf->current_realtime &&
2038 af->current_xor_hash == bf->current_xor_hash)
2039 return 0;
2040
2041 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2042
2043 /* If this is from the same seqnum source, compare
2044 * seqnums */
2045 if (af->current_seqnum < bf->current_seqnum)
2046 return -1;
2047 if (af->current_seqnum > bf->current_seqnum)
2048 return 1;
2049
2050 /* Wow! This is weird, different data but the same
2051 * seqnums? Something is borked, but let's make the
2052 * best of it and compare by time. */
2053 }
2054
2055 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2056
2057 /* If the boot id matches, compare monotonic time */
2058 if (af->current_monotonic < bf->current_monotonic)
2059 return -1;
2060 if (af->current_monotonic > bf->current_monotonic)
2061 return 1;
2062 }
2063
2064 /* Otherwise, compare UTC time */
2065 if (af->current_realtime < bf->current_realtime)
2066 return -1;
2067 if (af->current_realtime > bf->current_realtime)
2068 return 1;
2069
2070 /* Finally, compare by contents */
2071 if (af->current_xor_hash < bf->current_xor_hash)
2072 return -1;
2073 if (af->current_xor_hash > bf->current_xor_hash)
2074 return 1;
2075
2076 return 0;
2077 }
2078
2079 int journal_file_next_entry(
2080 JournalFile *f,
2081 uint64_t p,
2082 direction_t direction,
2083 Object **ret, uint64_t *offset) {
2084
2085 uint64_t i, n, ofs;
2086 int r;
2087
2088 assert(f);
2089
2090 n = le64toh(f->header->n_entries);
2091 if (n <= 0)
2092 return 0;
2093
2094 if (p == 0)
2095 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2096 else {
2097 r = generic_array_bisect(f,
2098 le64toh(f->header->entry_array_offset),
2099 le64toh(f->header->n_entries),
2100 p,
2101 test_object_offset,
2102 DIRECTION_DOWN,
2103 NULL, NULL,
2104 &i);
2105 if (r <= 0)
2106 return r;
2107
2108 if (direction == DIRECTION_DOWN) {
2109 if (i >= n - 1)
2110 return 0;
2111
2112 i++;
2113 } else {
2114 if (i <= 0)
2115 return 0;
2116
2117 i--;
2118 }
2119 }
2120
2121 /* And jump to it */
2122 r = generic_array_get(f,
2123 le64toh(f->header->entry_array_offset),
2124 i,
2125 ret, &ofs);
2126 if (r <= 0)
2127 return r;
2128
2129 if (p > 0 &&
2130 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2131 log_debug("%s: entry array corrupted at entry %"PRIu64,
2132 f->path, i);
2133 return -EBADMSG;
2134 }
2135
2136 if (offset)
2137 *offset = ofs;
2138
2139 return 1;
2140 }
2141
2142 int journal_file_next_entry_for_data(
2143 JournalFile *f,
2144 Object *o, uint64_t p,
2145 uint64_t data_offset,
2146 direction_t direction,
2147 Object **ret, uint64_t *offset) {
2148
2149 uint64_t n, i;
2150 int r;
2151 Object *d;
2152
2153 assert(f);
2154 assert(p > 0 || !o);
2155
2156 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2157 if (r < 0)
2158 return r;
2159
2160 n = le64toh(d->data.n_entries);
2161 if (n <= 0)
2162 return n;
2163
2164 if (!o)
2165 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2166 else {
2167 if (o->object.type != OBJECT_ENTRY)
2168 return -EINVAL;
2169
2170 r = generic_array_bisect_plus_one(f,
2171 le64toh(d->data.entry_offset),
2172 le64toh(d->data.entry_array_offset),
2173 le64toh(d->data.n_entries),
2174 p,
2175 test_object_offset,
2176 DIRECTION_DOWN,
2177 NULL, NULL,
2178 &i);
2179
2180 if (r <= 0)
2181 return r;
2182
2183 if (direction == DIRECTION_DOWN) {
2184 if (i >= n - 1)
2185 return 0;
2186
2187 i++;
2188 } else {
2189 if (i <= 0)
2190 return 0;
2191
2192 i--;
2193 }
2194
2195 }
2196
2197 return generic_array_get_plus_one(f,
2198 le64toh(d->data.entry_offset),
2199 le64toh(d->data.entry_array_offset),
2200 i,
2201 ret, offset);
2202 }
2203
2204 int journal_file_move_to_entry_by_offset_for_data(
2205 JournalFile *f,
2206 uint64_t data_offset,
2207 uint64_t p,
2208 direction_t direction,
2209 Object **ret, uint64_t *offset) {
2210
2211 int r;
2212 Object *d;
2213
2214 assert(f);
2215
2216 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2217 if (r < 0)
2218 return r;
2219
2220 return generic_array_bisect_plus_one(f,
2221 le64toh(d->data.entry_offset),
2222 le64toh(d->data.entry_array_offset),
2223 le64toh(d->data.n_entries),
2224 p,
2225 test_object_offset,
2226 direction,
2227 ret, offset, NULL);
2228 }
2229
2230 int journal_file_move_to_entry_by_monotonic_for_data(
2231 JournalFile *f,
2232 uint64_t data_offset,
2233 sd_id128_t boot_id,
2234 uint64_t monotonic,
2235 direction_t direction,
2236 Object **ret, uint64_t *offset) {
2237
2238 Object *o, *d;
2239 int r;
2240 uint64_t b, z;
2241
2242 assert(f);
2243
2244 /* First, seek by time */
2245 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2246 if (r < 0)
2247 return r;
2248 if (r == 0)
2249 return -ENOENT;
2250
2251 r = generic_array_bisect_plus_one(f,
2252 le64toh(o->data.entry_offset),
2253 le64toh(o->data.entry_array_offset),
2254 le64toh(o->data.n_entries),
2255 monotonic,
2256 test_object_monotonic,
2257 direction,
2258 NULL, &z, NULL);
2259 if (r <= 0)
2260 return r;
2261
2262 /* And now, continue seeking until we find an entry that
2263 * exists in both bisection arrays */
2264
2265 for (;;) {
2266 Object *qo;
2267 uint64_t p, q;
2268
2269 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2270 if (r < 0)
2271 return r;
2272
2273 r = generic_array_bisect_plus_one(f,
2274 le64toh(d->data.entry_offset),
2275 le64toh(d->data.entry_array_offset),
2276 le64toh(d->data.n_entries),
2277 z,
2278 test_object_offset,
2279 direction,
2280 NULL, &p, NULL);
2281 if (r <= 0)
2282 return r;
2283
2284 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2285 if (r < 0)
2286 return r;
2287
2288 r = generic_array_bisect_plus_one(f,
2289 le64toh(o->data.entry_offset),
2290 le64toh(o->data.entry_array_offset),
2291 le64toh(o->data.n_entries),
2292 p,
2293 test_object_offset,
2294 direction,
2295 &qo, &q, NULL);
2296
2297 if (r <= 0)
2298 return r;
2299
2300 if (p == q) {
2301 if (ret)
2302 *ret = qo;
2303 if (offset)
2304 *offset = q;
2305
2306 return 1;
2307 }
2308
2309 z = q;
2310 }
2311 }
2312
2313 int journal_file_move_to_entry_by_seqnum_for_data(
2314 JournalFile *f,
2315 uint64_t data_offset,
2316 uint64_t seqnum,
2317 direction_t direction,
2318 Object **ret, uint64_t *offset) {
2319
2320 Object *d;
2321 int r;
2322
2323 assert(f);
2324
2325 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2326 if (r < 0)
2327 return r;
2328
2329 return generic_array_bisect_plus_one(f,
2330 le64toh(d->data.entry_offset),
2331 le64toh(d->data.entry_array_offset),
2332 le64toh(d->data.n_entries),
2333 seqnum,
2334 test_object_seqnum,
2335 direction,
2336 ret, offset, NULL);
2337 }
2338
2339 int journal_file_move_to_entry_by_realtime_for_data(
2340 JournalFile *f,
2341 uint64_t data_offset,
2342 uint64_t realtime,
2343 direction_t direction,
2344 Object **ret, uint64_t *offset) {
2345
2346 Object *d;
2347 int r;
2348
2349 assert(f);
2350
2351 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2352 if (r < 0)
2353 return r;
2354
2355 return generic_array_bisect_plus_one(f,
2356 le64toh(d->data.entry_offset),
2357 le64toh(d->data.entry_array_offset),
2358 le64toh(d->data.n_entries),
2359 realtime,
2360 test_object_realtime,
2361 direction,
2362 ret, offset, NULL);
2363 }
2364
2365 void journal_file_dump(JournalFile *f) {
2366 Object *o;
2367 int r;
2368 uint64_t p;
2369
2370 assert(f);
2371
2372 journal_file_print_header(f);
2373
2374 p = le64toh(f->header->header_size);
2375 while (p != 0) {
2376 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2377 if (r < 0)
2378 goto fail;
2379
2380 switch (o->object.type) {
2381
2382 case OBJECT_UNUSED:
2383 printf("Type: OBJECT_UNUSED\n");
2384 break;
2385
2386 case OBJECT_DATA:
2387 printf("Type: OBJECT_DATA\n");
2388 break;
2389
2390 case OBJECT_FIELD:
2391 printf("Type: OBJECT_FIELD\n");
2392 break;
2393
2394 case OBJECT_ENTRY:
2395 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2396 le64toh(o->entry.seqnum),
2397 le64toh(o->entry.monotonic),
2398 le64toh(o->entry.realtime));
2399 break;
2400
2401 case OBJECT_FIELD_HASH_TABLE:
2402 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2403 break;
2404
2405 case OBJECT_DATA_HASH_TABLE:
2406 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2407 break;
2408
2409 case OBJECT_ENTRY_ARRAY:
2410 printf("Type: OBJECT_ENTRY_ARRAY\n");
2411 break;
2412
2413 case OBJECT_TAG:
2414 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2415 le64toh(o->tag.seqnum),
2416 le64toh(o->tag.epoch));
2417 break;
2418
2419 default:
2420 printf("Type: unknown (%u)\n", o->object.type);
2421 break;
2422 }
2423
2424 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2425 printf("Flags: %s\n",
2426 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2427
2428 if (p == le64toh(f->header->tail_object_offset))
2429 p = 0;
2430 else
2431 p = p + ALIGN64(le64toh(o->object.size));
2432 }
2433
2434 return;
2435 fail:
2436 log_error("File corrupt");
2437 }
2438
2439 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2440 const char *x;
2441
2442 x = format_timestamp(buf, l, t);
2443 if (x)
2444 return x;
2445 return " --- ";
2446 }
2447
2448 void journal_file_print_header(JournalFile *f) {
2449 char a[33], b[33], c[33], d[33];
2450 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2451 struct stat st;
2452 char bytes[FORMAT_BYTES_MAX];
2453
2454 assert(f);
2455
2456 printf("File Path: %s\n"
2457 "File ID: %s\n"
2458 "Machine ID: %s\n"
2459 "Boot ID: %s\n"
2460 "Sequential Number ID: %s\n"
2461 "State: %s\n"
2462 "Compatible Flags:%s%s\n"
2463 "Incompatible Flags:%s%s%s\n"
2464 "Header size: %"PRIu64"\n"
2465 "Arena size: %"PRIu64"\n"
2466 "Data Hash Table Size: %"PRIu64"\n"
2467 "Field Hash Table Size: %"PRIu64"\n"
2468 "Rotate Suggested: %s\n"
2469 "Head Sequential Number: %"PRIu64"\n"
2470 "Tail Sequential Number: %"PRIu64"\n"
2471 "Head Realtime Timestamp: %s\n"
2472 "Tail Realtime Timestamp: %s\n"
2473 "Tail Monotonic Timestamp: %s\n"
2474 "Objects: %"PRIu64"\n"
2475 "Entry Objects: %"PRIu64"\n",
2476 f->path,
2477 sd_id128_to_string(f->header->file_id, a),
2478 sd_id128_to_string(f->header->machine_id, b),
2479 sd_id128_to_string(f->header->boot_id, c),
2480 sd_id128_to_string(f->header->seqnum_id, d),
2481 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2482 f->header->state == STATE_ONLINE ? "ONLINE" :
2483 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2484 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2485 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2486 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2487 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2488 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2489 le64toh(f->header->header_size),
2490 le64toh(f->header->arena_size),
2491 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2492 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2493 yes_no(journal_file_rotate_suggested(f, 0)),
2494 le64toh(f->header->head_entry_seqnum),
2495 le64toh(f->header->tail_entry_seqnum),
2496 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2497 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2498 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2499 le64toh(f->header->n_objects),
2500 le64toh(f->header->n_entries));
2501
2502 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2503 printf("Data Objects: %"PRIu64"\n"
2504 "Data Hash Table Fill: %.1f%%\n",
2505 le64toh(f->header->n_data),
2506 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2507
2508 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2509 printf("Field Objects: %"PRIu64"\n"
2510 "Field Hash Table Fill: %.1f%%\n",
2511 le64toh(f->header->n_fields),
2512 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2513
2514 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2515 printf("Tag Objects: %"PRIu64"\n",
2516 le64toh(f->header->n_tags));
2517 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2518 printf("Entry Array Objects: %"PRIu64"\n",
2519 le64toh(f->header->n_entry_arrays));
2520
2521 if (fstat(f->fd, &st) >= 0)
2522 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2523 }
2524
2525 int journal_file_open(
2526 const char *fname,
2527 int flags,
2528 mode_t mode,
2529 bool compress,
2530 bool seal,
2531 JournalMetrics *metrics,
2532 MMapCache *mmap_cache,
2533 JournalFile *template,
2534 JournalFile **ret) {
2535
2536 bool newly_created = false;
2537 JournalFile *f;
2538 void *h;
2539 int r;
2540
2541 assert(fname);
2542 assert(ret);
2543
2544 if ((flags & O_ACCMODE) != O_RDONLY &&
2545 (flags & O_ACCMODE) != O_RDWR)
2546 return -EINVAL;
2547
2548 if (!endswith(fname, ".journal") &&
2549 !endswith(fname, ".journal~"))
2550 return -EINVAL;
2551
2552 f = new0(JournalFile, 1);
2553 if (!f)
2554 return -ENOMEM;
2555
2556 f->fd = -1;
2557 f->mode = mode;
2558
2559 f->flags = flags;
2560 f->prot = prot_from_flags(flags);
2561 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2562 #if defined(HAVE_LZ4)
2563 f->compress_lz4 = compress;
2564 #elif defined(HAVE_XZ)
2565 f->compress_xz = compress;
2566 #endif
2567 #ifdef HAVE_GCRYPT
2568 f->seal = seal;
2569 #endif
2570
2571 if (mmap_cache)
2572 f->mmap = mmap_cache_ref(mmap_cache);
2573 else {
2574 f->mmap = mmap_cache_new();
2575 if (!f->mmap) {
2576 r = -ENOMEM;
2577 goto fail;
2578 }
2579 }
2580
2581 f->path = strdup(fname);
2582 if (!f->path) {
2583 r = -ENOMEM;
2584 goto fail;
2585 }
2586
2587 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2588 if (!f->chain_cache) {
2589 r = -ENOMEM;
2590 goto fail;
2591 }
2592
2593 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2594 if (f->fd < 0) {
2595 r = -errno;
2596 goto fail;
2597 }
2598
2599 r = journal_file_fstat(f);
2600 if (r < 0)
2601 goto fail;
2602
2603 if (f->last_stat.st_size == 0 && f->writable) {
2604
2605 /* Before we write anything, turn off COW logic. Given
2606 * our write pattern that is quite unfriendly to COW
2607 * file systems this should greatly improve
2608 * performance on COW file systems, such as btrfs, at
2609 * the expense of data integrity features (which
2610 * shouldn't be too bad, given that we do our own
2611 * checksumming). */
2612 r = chattr_fd(f->fd, true, FS_NOCOW_FL);
2613 if (r < 0)
2614 log_warning_errno(errno, "Failed to set file attributes: %m");
2615
2616 /* Let's attach the creation time to the journal file,
2617 * so that the vacuuming code knows the age of this
2618 * file even if the file might end up corrupted one
2619 * day... Ideally we'd just use the creation time many
2620 * file systems maintain for each file, but there is
2621 * currently no usable API to query this, hence let's
2622 * emulate this via extended attributes. If extended
2623 * attributes are not supported we'll just skip this,
2624 * and rely solely on mtime/atime/ctime of the file. */
2625
2626 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2627
2628 #ifdef HAVE_GCRYPT
2629 /* Try to load the FSPRG state, and if we can't, then
2630 * just don't do sealing */
2631 if (f->seal) {
2632 r = journal_file_fss_load(f);
2633 if (r < 0)
2634 f->seal = false;
2635 }
2636 #endif
2637
2638 r = journal_file_init_header(f, template);
2639 if (r < 0)
2640 goto fail;
2641
2642 r = journal_file_fstat(f);
2643 if (r < 0)
2644 goto fail;
2645
2646 newly_created = true;
2647 }
2648
2649 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2650 r = -EIO;
2651 goto fail;
2652 }
2653
2654 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2655 if (r < 0) {
2656 r = -errno;
2657 goto fail;
2658 }
2659
2660 f->header = h;
2661
2662 if (!newly_created) {
2663 r = journal_file_verify_header(f);
2664 if (r < 0)
2665 goto fail;
2666 }
2667
2668 #ifdef HAVE_GCRYPT
2669 if (!newly_created && f->writable) {
2670 r = journal_file_fss_load(f);
2671 if (r < 0)
2672 goto fail;
2673 }
2674 #endif
2675
2676 if (f->writable) {
2677 if (metrics) {
2678 journal_default_metrics(metrics, f->fd);
2679 f->metrics = *metrics;
2680 } else if (template)
2681 f->metrics = template->metrics;
2682
2683 r = journal_file_refresh_header(f);
2684 if (r < 0)
2685 goto fail;
2686 }
2687
2688 #ifdef HAVE_GCRYPT
2689 r = journal_file_hmac_setup(f);
2690 if (r < 0)
2691 goto fail;
2692 #endif
2693
2694 if (newly_created) {
2695 r = journal_file_setup_field_hash_table(f);
2696 if (r < 0)
2697 goto fail;
2698
2699 r = journal_file_setup_data_hash_table(f);
2700 if (r < 0)
2701 goto fail;
2702
2703 #ifdef HAVE_GCRYPT
2704 r = journal_file_append_first_tag(f);
2705 if (r < 0)
2706 goto fail;
2707 #endif
2708 }
2709
2710 r = journal_file_map_field_hash_table(f);
2711 if (r < 0)
2712 goto fail;
2713
2714 r = journal_file_map_data_hash_table(f);
2715 if (r < 0)
2716 goto fail;
2717
2718 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2719 r = -EIO;
2720 goto fail;
2721 }
2722
2723 *ret = f;
2724 return 0;
2725
2726 fail:
2727 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2728 r = -EIO;
2729
2730 journal_file_close(f);
2731
2732 return r;
2733 }
2734
2735 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2736 _cleanup_free_ char *p = NULL;
2737 size_t l;
2738 JournalFile *old_file, *new_file = NULL;
2739 int r;
2740
2741 assert(f);
2742 assert(*f);
2743
2744 old_file = *f;
2745
2746 if (!old_file->writable)
2747 return -EINVAL;
2748
2749 if (!endswith(old_file->path, ".journal"))
2750 return -EINVAL;
2751
2752 l = strlen(old_file->path);
2753 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2754 (int) l - 8, old_file->path,
2755 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2756 le64toh((*f)->header->head_entry_seqnum),
2757 le64toh((*f)->header->head_entry_realtime));
2758 if (r < 0)
2759 return -ENOMEM;
2760
2761 /* Try to rename the file to the archived version. If the file
2762 * already was deleted, we'll get ENOENT, let's ignore that
2763 * case. */
2764 r = rename(old_file->path, p);
2765 if (r < 0 && errno != ENOENT)
2766 return -errno;
2767
2768 old_file->header->state = STATE_ARCHIVED;
2769
2770 /* Currently, btrfs is not very good with out write patterns
2771 * and fragments heavily. Let's defrag our journal files when
2772 * we archive them */
2773 old_file->defrag_on_close = true;
2774
2775 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2776 journal_file_close(old_file);
2777
2778 *f = new_file;
2779 return r;
2780 }
2781
2782 int journal_file_open_reliably(
2783 const char *fname,
2784 int flags,
2785 mode_t mode,
2786 bool compress,
2787 bool seal,
2788 JournalMetrics *metrics,
2789 MMapCache *mmap_cache,
2790 JournalFile *template,
2791 JournalFile **ret) {
2792
2793 int r;
2794 size_t l;
2795 _cleanup_free_ char *p = NULL;
2796
2797 r = journal_file_open(fname, flags, mode, compress, seal,
2798 metrics, mmap_cache, template, ret);
2799 if (r != -EBADMSG && /* corrupted */
2800 r != -ENODATA && /* truncated */
2801 r != -EHOSTDOWN && /* other machine */
2802 r != -EPROTONOSUPPORT && /* incompatible feature */
2803 r != -EBUSY && /* unclean shutdown */
2804 r != -ESHUTDOWN && /* already archived */
2805 r != -EIO && /* IO error, including SIGBUS on mmap */
2806 r != -EIDRM /* File has been deleted */)
2807 return r;
2808
2809 if ((flags & O_ACCMODE) == O_RDONLY)
2810 return r;
2811
2812 if (!(flags & O_CREAT))
2813 return r;
2814
2815 if (!endswith(fname, ".journal"))
2816 return r;
2817
2818 /* The file is corrupted. Rotate it away and try it again (but only once) */
2819
2820 l = strlen(fname);
2821 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2822 (int) l - 8, fname,
2823 (unsigned long long) now(CLOCK_REALTIME),
2824 random_u64()) < 0)
2825 return -ENOMEM;
2826
2827 r = rename(fname, p);
2828 if (r < 0)
2829 return -errno;
2830
2831 /* btrfs doesn't cope well with our write pattern and
2832 * fragments heavily. Let's defrag all files we rotate */
2833
2834 (void) chattr_path(p, false, FS_NOCOW_FL);
2835 (void) btrfs_defrag(p);
2836
2837 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2838
2839 return journal_file_open(fname, flags, mode, compress, seal,
2840 metrics, mmap_cache, template, ret);
2841 }
2842
2843 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2844 uint64_t i, n;
2845 uint64_t q, xor_hash = 0;
2846 int r;
2847 EntryItem *items;
2848 dual_timestamp ts;
2849
2850 assert(from);
2851 assert(to);
2852 assert(o);
2853 assert(p);
2854
2855 if (!to->writable)
2856 return -EPERM;
2857
2858 ts.monotonic = le64toh(o->entry.monotonic);
2859 ts.realtime = le64toh(o->entry.realtime);
2860
2861 n = journal_file_entry_n_items(o);
2862 /* alloca() can't take 0, hence let's allocate at least one */
2863 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2864
2865 for (i = 0; i < n; i++) {
2866 uint64_t l, h;
2867 le64_t le_hash;
2868 size_t t;
2869 void *data;
2870 Object *u;
2871
2872 q = le64toh(o->entry.items[i].object_offset);
2873 le_hash = o->entry.items[i].hash;
2874
2875 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2876 if (r < 0)
2877 return r;
2878
2879 if (le_hash != o->data.hash)
2880 return -EBADMSG;
2881
2882 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2883 t = (size_t) l;
2884
2885 /* We hit the limit on 32bit machines */
2886 if ((uint64_t) t != l)
2887 return -E2BIG;
2888
2889 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2890 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2891 size_t rsize;
2892
2893 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2894 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2895 if (r < 0)
2896 return r;
2897
2898 data = from->compress_buffer;
2899 l = rsize;
2900 #else
2901 return -EPROTONOSUPPORT;
2902 #endif
2903 } else
2904 data = o->data.payload;
2905
2906 r = journal_file_append_data(to, data, l, &u, &h);
2907 if (r < 0)
2908 return r;
2909
2910 xor_hash ^= le64toh(u->data.hash);
2911 items[i].object_offset = htole64(h);
2912 items[i].hash = u->data.hash;
2913
2914 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2915 if (r < 0)
2916 return r;
2917 }
2918
2919 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2920
2921 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2922 return -EIO;
2923
2924 return r;
2925 }
2926
2927 void journal_default_metrics(JournalMetrics *m, int fd) {
2928 uint64_t fs_size = 0;
2929 struct statvfs ss;
2930 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2931
2932 assert(m);
2933 assert(fd >= 0);
2934
2935 if (fstatvfs(fd, &ss) >= 0)
2936 fs_size = ss.f_frsize * ss.f_blocks;
2937
2938 if (m->max_use == (uint64_t) -1) {
2939
2940 if (fs_size > 0) {
2941 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2942
2943 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2944 m->max_use = DEFAULT_MAX_USE_UPPER;
2945
2946 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2947 m->max_use = DEFAULT_MAX_USE_LOWER;
2948 } else
2949 m->max_use = DEFAULT_MAX_USE_LOWER;
2950 } else {
2951 m->max_use = PAGE_ALIGN(m->max_use);
2952
2953 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2954 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2955 }
2956
2957 if (m->max_size == (uint64_t) -1) {
2958 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2959
2960 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2961 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2962 } else
2963 m->max_size = PAGE_ALIGN(m->max_size);
2964
2965 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2966 m->max_size = JOURNAL_FILE_SIZE_MIN;
2967
2968 if (m->max_size*2 > m->max_use)
2969 m->max_use = m->max_size*2;
2970
2971 if (m->min_size == (uint64_t) -1)
2972 m->min_size = JOURNAL_FILE_SIZE_MIN;
2973 else {
2974 m->min_size = PAGE_ALIGN(m->min_size);
2975
2976 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2977 m->min_size = JOURNAL_FILE_SIZE_MIN;
2978
2979 if (m->min_size > m->max_size)
2980 m->max_size = m->min_size;
2981 }
2982
2983 if (m->keep_free == (uint64_t) -1) {
2984
2985 if (fs_size > 0) {
2986 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2987
2988 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2989 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2990
2991 } else
2992 m->keep_free = DEFAULT_KEEP_FREE;
2993 }
2994
2995 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2996 format_bytes(a, sizeof(a), m->max_use),
2997 format_bytes(b, sizeof(b), m->max_size),
2998 format_bytes(c, sizeof(c), m->min_size),
2999 format_bytes(d, sizeof(d), m->keep_free));
3000 }
3001
3002 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3003 assert(f);
3004 assert(from || to);
3005
3006 if (from) {
3007 if (f->header->head_entry_realtime == 0)
3008 return -ENOENT;
3009
3010 *from = le64toh(f->header->head_entry_realtime);
3011 }
3012
3013 if (to) {
3014 if (f->header->tail_entry_realtime == 0)
3015 return -ENOENT;
3016
3017 *to = le64toh(f->header->tail_entry_realtime);
3018 }
3019
3020 return 1;
3021 }
3022
3023 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3024 Object *o;
3025 uint64_t p;
3026 int r;
3027
3028 assert(f);
3029 assert(from || to);
3030
3031 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3032 if (r <= 0)
3033 return r;
3034
3035 if (le64toh(o->data.n_entries) <= 0)
3036 return 0;
3037
3038 if (from) {
3039 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3040 if (r < 0)
3041 return r;
3042
3043 *from = le64toh(o->entry.monotonic);
3044 }
3045
3046 if (to) {
3047 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3048 if (r < 0)
3049 return r;
3050
3051 r = generic_array_get_plus_one(f,
3052 le64toh(o->data.entry_offset),
3053 le64toh(o->data.entry_array_offset),
3054 le64toh(o->data.n_entries)-1,
3055 &o, NULL);
3056 if (r <= 0)
3057 return r;
3058
3059 *to = le64toh(o->entry.monotonic);
3060 }
3061
3062 return 1;
3063 }
3064
3065 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3066 assert(f);
3067
3068 /* If we gained new header fields we gained new features,
3069 * hence suggest a rotation */
3070 if (le64toh(f->header->header_size) < sizeof(Header)) {
3071 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3072 return true;
3073 }
3074
3075 /* Let's check if the hash tables grew over a certain fill
3076 * level (75%, borrowing this value from Java's hash table
3077 * implementation), and if so suggest a rotation. To calculate
3078 * the fill level we need the n_data field, which only exists
3079 * in newer versions. */
3080
3081 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3082 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3083 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3084 f->path,
3085 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3086 le64toh(f->header->n_data),
3087 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3088 (unsigned long long) f->last_stat.st_size,
3089 f->last_stat.st_size / le64toh(f->header->n_data));
3090 return true;
3091 }
3092
3093 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3094 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3095 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3096 f->path,
3097 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3098 le64toh(f->header->n_fields),
3099 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3100 return true;
3101 }
3102
3103 /* Are the data objects properly indexed by field objects? */
3104 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3105 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3106 le64toh(f->header->n_data) > 0 &&
3107 le64toh(f->header->n_fields) == 0)
3108 return true;
3109
3110 if (max_file_usec > 0) {
3111 usec_t t, h;
3112
3113 h = le64toh(f->header->head_entry_realtime);
3114 t = now(CLOCK_REALTIME);
3115
3116 if (h > 0 && t > h + max_file_usec)
3117 return true;
3118 }
3119
3120 return false;
3121 }