1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include "btrfs-util.h"
32 #include "journal-def.h"
33 #include "journal-file.h"
34 #include "journal-authenticate.h"
37 #include "random-util.h"
39 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
42 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
44 /* This is the minimum journal file size */
45 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
47 /* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
52 /* This is the upper bound if we deduce max_size from max_use */
53 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
55 /* This is the upper bound if we deduce the keep_free value from the file system size */
57 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
59 /* This is the keep_free value when we can't determine the file system size */
61 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
63 /* n_data was the first entry we added after the initial file format design */
64 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
66 /* How many entries to keep in the entry array chain cache at max */
67 #define CHAIN_CACHE_MAX 20
69 /* How much to increase the journal file size at once each time we allocate something new. */
70 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
72 /* Reread fstat() of the file for detecting deletions at least this often */
73 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
75 /* The mmap context to use for the header we pick as one above the last defined type */
76 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
78 static int journal_file_set_online(JournalFile
*f
) {
84 if (!(f
->fd
>= 0 && f
->header
))
87 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
90 switch(f
->header
->state
) {
95 f
->header
->state
= STATE_ONLINE
;
104 int journal_file_set_offline(JournalFile
*f
) {
110 if (!(f
->fd
>= 0 && f
->header
))
113 if (f
->header
->state
!= STATE_ONLINE
)
118 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
121 f
->header
->state
= STATE_OFFLINE
;
123 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
131 void journal_file_close(JournalFile
*f
) {
135 /* Write the final tag */
136 if (f
->seal
&& f
->writable
)
137 journal_file_append_tag(f
);
140 journal_file_set_offline(f
);
142 if (f
->mmap
&& f
->fd
>= 0)
143 mmap_cache_close_fd(f
->mmap
, f
->fd
);
145 if (f
->fd
>= 0 && f
->defrag_on_close
) {
147 /* Be friendly to btrfs: turn COW back on again now,
148 * and defragment the file. We won't write to the file
149 * ever again, hence remove all fragmentation, and
150 * reenable all the good bits COW usually provides
151 * (such as data checksumming). */
153 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
154 (void) btrfs_defrag_fd(f
->fd
);
161 mmap_cache_unref(f
->mmap
);
163 ordered_hashmap_free_free(f
->chain_cache
);
165 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
166 free(f
->compress_buffer
);
171 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
173 free(f
->fsprg_state
);
178 gcry_md_close(f
->hmac
);
184 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
191 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
192 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
194 h
.incompatible_flags
|= htole32(
195 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
196 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
198 h
.compatible_flags
= htole32(
199 f
->seal
* HEADER_COMPATIBLE_SEALED
);
201 r
= sd_id128_randomize(&h
.file_id
);
206 h
.seqnum_id
= template->header
->seqnum_id
;
207 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
209 h
.seqnum_id
= h
.file_id
;
211 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
221 static int journal_file_refresh_header(JournalFile
*f
) {
227 r
= sd_id128_get_machine(&f
->header
->machine_id
);
231 r
= sd_id128_get_boot(&boot_id
);
235 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
236 f
->tail_entry_monotonic_valid
= true;
238 f
->header
->boot_id
= boot_id
;
240 r
= journal_file_set_online(f
);
242 /* Sync the online state to disk */
248 static int journal_file_verify_header(JournalFile
*f
) {
253 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
256 /* In both read and write mode we refuse to open files with
257 * incompatible flags we don't know */
258 flags
= le32toh(f
->header
->incompatible_flags
);
259 if (flags
& ~HEADER_INCOMPATIBLE_SUPPORTED
) {
260 if (flags
& ~HEADER_INCOMPATIBLE_ANY
)
261 log_debug("Journal file %s has unknown incompatible flags %"PRIx32
,
262 f
->path
, flags
& ~HEADER_INCOMPATIBLE_ANY
);
263 flags
= (flags
& HEADER_INCOMPATIBLE_ANY
) & ~HEADER_INCOMPATIBLE_SUPPORTED
;
265 log_debug("Journal file %s uses incompatible flags %"PRIx32
266 " disabled at compilation time.", f
->path
, flags
);
267 return -EPROTONOSUPPORT
;
270 /* When open for writing we refuse to open files with
271 * unknown compatible flags, too */
272 flags
= le32toh(f
->header
->compatible_flags
);
273 if (f
->writable
&& (flags
& ~HEADER_COMPATIBLE_SUPPORTED
)) {
274 if (flags
& ~HEADER_COMPATIBLE_ANY
)
275 log_debug("Journal file %s has unknown compatible flags %"PRIx32
,
276 f
->path
, flags
& ~HEADER_COMPATIBLE_ANY
);
277 flags
= (flags
& HEADER_COMPATIBLE_ANY
) & ~HEADER_COMPATIBLE_SUPPORTED
;
279 log_debug("Journal file %s uses compatible flags %"PRIx32
280 " disabled at compilation time.", f
->path
, flags
);
281 return -EPROTONOSUPPORT
;
284 if (f
->header
->state
>= _STATE_MAX
)
287 /* The first addition was n_data, so check that we are at least this large */
288 if (le64toh(f
->header
->header_size
) < HEADER_SIZE_MIN
)
291 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
294 if ((le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)) > (uint64_t) f
->last_stat
.st_size
)
297 if (le64toh(f
->header
->tail_object_offset
) > (le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)))
300 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
301 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
302 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
303 !VALID64(le64toh(f
->header
->entry_array_offset
)))
308 sd_id128_t machine_id
;
311 r
= sd_id128_get_machine(&machine_id
);
315 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
318 state
= f
->header
->state
;
320 if (state
== STATE_ONLINE
) {
321 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
323 } else if (state
== STATE_ARCHIVED
)
325 else if (state
!= STATE_OFFLINE
) {
326 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
331 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
332 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
334 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
339 static int journal_file_fstat(JournalFile
*f
) {
343 if (fstat(f
->fd
, &f
->last_stat
) < 0)
346 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
348 /* Refuse appending to files that are already deleted */
349 if (f
->last_stat
.st_nlink
<= 0)
355 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
356 uint64_t old_size
, new_size
;
361 /* We assume that this file is not sparse, and we know that
362 * for sure, since we always call posix_fallocate() ourselves. */
365 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
369 le64toh(f
->header
->header_size
) +
370 le64toh(f
->header
->arena_size
);
372 new_size
= PAGE_ALIGN(offset
+ size
);
373 if (new_size
< le64toh(f
->header
->header_size
))
374 new_size
= le64toh(f
->header
->header_size
);
376 if (new_size
<= old_size
) {
378 /* We already pre-allocated enough space, but before
379 * we write to it, let's check with fstat() if the
380 * file got deleted, in order to make sure we don't throw
381 * away the data immediately. Don't check fstat() for
382 * all writes though, but only once every 5s (LAST_STAT_REFRESH_USEC). */
384 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
387 return journal_file_fstat(f
);
390 /* Allocate more space. */
392 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
395 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
398 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
401 available
= svfs
.f_bfree
* svfs
.f_bsize
;
403 if (available
>= f
->metrics
.keep_free
)
404 available
-= f
->metrics
.keep_free
;
408 if (new_size
- old_size
> available
)
413 /* Increase by larger blocks at once */
414 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
415 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
416 new_size
= f
->metrics
.max_size
;
418 /* Note that the glibc fallocate() fallback is very
419 inefficient, hence we try to minimize the allocation area as much as we can. */
421 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
425 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
427 return journal_file_fstat(f
);
430 static unsigned type_to_context(ObjectType type
) {
431 /* One context for each type, plus one catch-all for the rest */
432 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
433 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
434 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
437 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
) {
446 /* Avoid SIGBUS on invalid accesses */
447 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
448 /* Hmm, out of range? Let's refresh the fstat() data
449 * first, before we trust that check. */
451 r
= journal_file_fstat(f
);
455 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
456 return -EADDRNOTAVAIL
;
459 return mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
462 static uint64_t minimum_header_size(Object
*o
) {
464 static const uint64_t table
[] = {
465 [OBJECT_DATA
] = sizeof(DataObject
),
466 [OBJECT_FIELD
] = sizeof(FieldObject
),
467 [OBJECT_ENTRY
] = sizeof(EntryObject
),
468 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
469 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
470 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
471 [OBJECT_TAG
] = sizeof(TagObject
),
474 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
475 return sizeof(ObjectHeader
);
477 return table
[o
->object
.type
];
480 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
489 /* Objects may only be located at multiple of 64 bit */
490 if (!VALID64(offset
))
493 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
498 s
= le64toh(o
->object
.size
);
500 if (s
< sizeof(ObjectHeader
))
503 if (o
->object
.type
<= OBJECT_UNUSED
)
506 if (s
< minimum_header_size(o
))
509 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
512 if (s
> sizeof(ObjectHeader
)) {
513 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
524 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
529 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
532 /* If an external seqnum counter was passed, we update
533 * both the local and the external one, and set it to
534 * the maximum of both */
542 f
->header
->tail_entry_seqnum
= htole64(r
);
544 if (f
->header
->head_entry_seqnum
== 0)
545 f
->header
->head_entry_seqnum
= htole64(r
);
550 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
557 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
558 assert(size
>= sizeof(ObjectHeader
));
562 r
= journal_file_set_online(f
);
566 p
= le64toh(f
->header
->tail_object_offset
);
568 p
= le64toh(f
->header
->header_size
);
570 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
574 p
+= ALIGN64(le64toh(tail
->object
.size
));
577 r
= journal_file_allocate(f
, p
, size
);
581 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
588 o
->object
.type
= type
;
589 o
->object
.size
= htole64(size
);
591 f
->header
->tail_object_offset
= htole64(p
);
592 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
600 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
607 /* We estimate that we need 1 hash table entry per 768 bytes of
608 journal file and we want to make sure we never get beyond
609 75% fill level. Calculate the hash table size for the
610 maximum file size based on these metrics. */
612 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
613 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
614 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
616 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
618 r
= journal_file_append_object(f
,
619 OBJECT_DATA_HASH_TABLE
,
620 offsetof(Object
, hash_table
.items
) + s
,
625 memzero(o
->hash_table
.items
, s
);
627 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
628 f
->header
->data_hash_table_size
= htole64(s
);
633 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
640 /* We use a fixed size hash table for the fields as this
641 * number should grow very slowly only */
643 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
644 r
= journal_file_append_object(f
,
645 OBJECT_FIELD_HASH_TABLE
,
646 offsetof(Object
, hash_table
.items
) + s
,
651 memzero(o
->hash_table
.items
, s
);
653 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
654 f
->header
->field_hash_table_size
= htole64(s
);
659 int journal_file_map_data_hash_table(JournalFile
*f
) {
666 if (f
->data_hash_table
)
669 p
= le64toh(f
->header
->data_hash_table_offset
);
670 s
= le64toh(f
->header
->data_hash_table_size
);
672 r
= journal_file_move_to(f
,
673 OBJECT_DATA_HASH_TABLE
,
680 f
->data_hash_table
= t
;
684 int journal_file_map_field_hash_table(JournalFile
*f
) {
691 if (f
->field_hash_table
)
694 p
= le64toh(f
->header
->field_hash_table_offset
);
695 s
= le64toh(f
->header
->field_hash_table_size
);
697 r
= journal_file_move_to(f
,
698 OBJECT_FIELD_HASH_TABLE
,
705 f
->field_hash_table
= t
;
709 static int journal_file_link_field(
722 if (o
->object
.type
!= OBJECT_FIELD
)
725 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
729 /* This might alter the window we are looking at */
730 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
733 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
735 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
737 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
741 o
->field
.next_hash_offset
= htole64(offset
);
744 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
746 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
747 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
752 static int journal_file_link_data(
765 if (o
->object
.type
!= OBJECT_DATA
)
768 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
772 /* This might alter the window we are looking at */
773 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
774 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
775 o
->data
.n_entries
= 0;
778 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
780 /* Only entry in the hash table is easy */
781 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
783 /* Move back to the previous data object, to patch in its next_hash_offset pointer */
786 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
790 o
->data
.next_hash_offset
= htole64(offset
);
793 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
795 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
796 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
801 int journal_file_find_field_object_with_hash(
803 const void *field
, uint64_t size
, uint64_t hash
,
804 Object
**ret
, uint64_t *offset
) {
806 uint64_t p
, osize
, h
, m
;
810 assert(field
&& size
> 0);
812 /* If the field hash table is empty, we can't find anything */
813 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
816 /* Map the field hash table, if it isn't mapped yet. */
817 r
= journal_file_map_field_hash_table(f
);
821 osize
= offsetof(Object
, field
.payload
) + size
;
823 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
828 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
833 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
837 if (le64toh(o
->field
.hash
) == hash
&&
838 le64toh(o
->object
.size
) == osize
&&
839 memcmp(o
->field
.payload
, field
, size
) == 0) {
849 p
= le64toh(o
->field
.next_hash_offset
);
855 int journal_file_find_field_object(
857 const void *field
, uint64_t size
,
858 Object
**ret
, uint64_t *offset
) {
863 assert(field
&& size
> 0);
865 hash
= hash64(field
, size
);
867 return journal_file_find_field_object_with_hash(f
,
872 int journal_file_find_data_object_with_hash(
874 const void *data
, uint64_t size
, uint64_t hash
,
875 Object
**ret
, uint64_t *offset
) {
877 uint64_t p
, osize
, h
, m
;
881 assert(data
|| size
== 0);
883 /* If there's no data hash table, then there's no entry. */
884 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
887 /* Map the data hash table, if it isn't mapped yet. */
888 r
= journal_file_map_data_hash_table(f
);
892 osize
= offsetof(Object
, data
.payload
) + size
;
894 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
899 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
904 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
908 if (le64toh(o
->data
.hash
) != hash
)
911 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
912 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
916 l
= le64toh(o
->object
.size
);
917 if (l
<= offsetof(Object
, data
.payload
))
920 l
-= offsetof(Object
, data
.payload
);
922 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
923 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
928 memcmp(f
->compress_buffer
, data
, size
) == 0) {
939 return -EPROTONOSUPPORT
;
941 } else if (le64toh(o
->object
.size
) == osize
&&
942 memcmp(o
->data
.payload
, data
, size
) == 0) {
954 p
= le64toh(o
->data
.next_hash_offset
);
960 int journal_file_find_data_object(
962 const void *data
, uint64_t size
,
963 Object
**ret
, uint64_t *offset
) {
968 assert(data
|| size
== 0);
970 hash
= hash64(data
, size
);
972 return journal_file_find_data_object_with_hash(f
,
977 static int journal_file_append_field(
979 const void *field
, uint64_t size
,
980 Object
**ret
, uint64_t *offset
) {
988 assert(field
&& size
> 0);
990 hash
= hash64(field
, size
);
992 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1006 osize
= offsetof(Object
, field
.payload
) + size
;
1007 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1011 o
->field
.hash
= htole64(hash
);
1012 memcpy(o
->field
.payload
, field
, size
);
1014 r
= journal_file_link_field(f
, o
, p
, hash
);
1018 /* The linking might have altered the window, so let's
1019 * refresh our pointer */
1020 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1025 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1039 static int journal_file_append_data(
1041 const void *data
, uint64_t size
,
1042 Object
**ret
, uint64_t *offset
) {
1047 int r
, compression
= 0;
1051 assert(data
|| size
== 0);
1053 hash
= hash64(data
, size
);
1055 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1069 osize
= offsetof(Object
, data
.payload
) + size
;
1070 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1074 o
->data
.hash
= htole64(hash
);
1076 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1077 if (f
->compress_xz
&&
1078 size
>= COMPRESSION_SIZE_THRESHOLD
) {
1081 compression
= compress_blob(data
, size
, o
->data
.payload
, &rsize
);
1084 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1085 o
->object
.flags
|= compression
;
1087 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1088 size
, rsize
, object_compressed_to_string(compression
));
1093 if (!compression
&& size
> 0)
1094 memcpy(o
->data
.payload
, data
, size
);
1096 r
= journal_file_link_data(f
, o
, p
, hash
);
1100 /* The linking might have altered the window, so let's
1101 * refresh our pointer */
1102 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1109 eq
= memchr(data
, '=', size
);
1110 if (eq
&& eq
> data
) {
1114 /* Create field object ... */
1115 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1119 /* ... and link it in. */
1120 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1121 fo
->field
.head_data_offset
= le64toh(p
);
1125 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1139 uint64_t journal_file_entry_n_items(Object
*o
) {
1142 if (o
->object
.type
!= OBJECT_ENTRY
)
1145 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1148 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1151 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1154 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1157 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1160 if (o
->object
.type
!= OBJECT_DATA_HASH_TABLE
&&
1161 o
->object
.type
!= OBJECT_FIELD_HASH_TABLE
)
1164 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1167 static int link_entry_into_array(JournalFile
*f
,
1172 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1180 a
= le64toh(*first
);
1181 i
= hidx
= le64toh(*idx
);
1184 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1188 n
= journal_file_entry_array_n_items(o
);
1190 o
->entry_array
.items
[i
] = htole64(p
);
1191 *idx
= htole64(hidx
+ 1);
1197 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1208 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1209 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1215 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1220 o
->entry_array
.items
[i
] = htole64(p
);
1223 *first
= htole64(q
);
1225 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1229 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1232 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1233 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1235 *idx
= htole64(hidx
+ 1);
1240 static int link_entry_into_array_plus_one(JournalFile
*f
,
1255 *extra
= htole64(p
);
1259 i
= htole64(le64toh(*idx
) - 1);
1260 r
= link_entry_into_array(f
, first
, &i
, p
);
1265 *idx
= htole64(le64toh(*idx
) + 1);
1269 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1276 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1280 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1284 return link_entry_into_array_plus_one(f
,
1285 &o
->data
.entry_offset
,
1286 &o
->data
.entry_array_offset
,
1291 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1299 if (o
->object
.type
!= OBJECT_ENTRY
)
1302 __sync_synchronize();
1304 /* Link up the entry itself */
1305 r
= link_entry_into_array(f
,
1306 &f
->header
->entry_array_offset
,
1307 &f
->header
->n_entries
,
1312 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1314 if (f
->header
->head_entry_realtime
== 0)
1315 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1317 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1318 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1320 f
->tail_entry_monotonic_valid
= true;
1322 /* Link up the items */
1323 n
= journal_file_entry_n_items(o
);
1324 for (i
= 0; i
< n
; i
++) {
1325 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1333 static int journal_file_append_entry_internal(
1335 const dual_timestamp
*ts
,
1337 const EntryItem items
[], unsigned n_items
,
1339 Object
**ret
, uint64_t *offset
) {
1346 assert(items
|| n_items
== 0);
1349 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1351 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1355 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1356 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1357 o
->entry
.realtime
= htole64(ts
->realtime
);
1358 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1359 o
->entry
.xor_hash
= htole64(xor_hash
);
1360 o
->entry
.boot_id
= f
->header
->boot_id
;
1363 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1368 r
= journal_file_link_entry(f
, o
, np
);
1381 void journal_file_post_change(JournalFile
*f
) {
1384 /* inotify() does not receive IN_MODIFY events from file
1385 * accesses done via mmap(). After each access we hence
1386 * trigger IN_MODIFY by truncating the journal file to its
1387 * current size which triggers IN_MODIFY. */
1389 __sync_synchronize();
1391 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1392 log_error_errno(errno
, "Failed to truncate file to its own size: %m");
1395 static int entry_item_cmp(const void *_a
, const void *_b
) {
1396 const EntryItem
*a
= _a
, *b
= _b
;
1398 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1400 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1405 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1409 uint64_t xor_hash
= 0;
1410 struct dual_timestamp _ts
;
1413 assert(iovec
|| n_iovec
== 0);
1416 dual_timestamp_get(&_ts
);
1420 if (f
->tail_entry_monotonic_valid
&&
1421 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
1425 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1430 /* alloca() can't take 0, hence let's allocate at least one */
1431 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1433 for (i
= 0; i
< n_iovec
; i
++) {
1437 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1441 xor_hash
^= le64toh(o
->data
.hash
);
1442 items
[i
].object_offset
= htole64(p
);
1443 items
[i
].hash
= o
->data
.hash
;
1446 /* Order by the position on disk, in order to improve seek
1447 * times for rotating media. */
1448 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
1450 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
1452 /* If the memory mapping triggered a SIGBUS then we return an
1453 * IO error and ignore the error code passed down to us, since
1454 * it is very likely just an effect of a nullified replacement page. */
1457 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
1460 journal_file_post_change(f
);
1465 typedef struct ChainCacheItem
{
1466 uint64_t first
; /* the array at the beginning of the chain */
1467 uint64_t array
; /* the cached array */
1468 uint64_t begin
; /* the first item in the cached array */
1469 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
1470 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
1473 static void chain_cache_put(
1480 uint64_t last_index
) {
1483 /* If the chain item to cache for this chain is the
1484 * first one it's not worth caching anything */
1488 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
1489 ci
= ordered_hashmap_steal_first(h
);
1492 ci
= new(ChainCacheItem
, 1);
1499 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
1504 assert(ci
->first
== first
);
1509 ci
->last_index
= last_index
;
1512 static int generic_array_get(
1516 Object
**ret
, uint64_t *offset
) {
1519 uint64_t p
= 0, a
, t
= 0;
1527 /* Try the chain cache first */
1528 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1529 if (ci
&& i
> ci
->total
) {
1538 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1542 k
= journal_file_entry_array_n_items(o
);
1544 p
= le64toh(o
->entry_array
.items
[i
]);
1550 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1556 /* Let's cache this item for the next invocation */
1557 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
1559 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1572 static int generic_array_get_plus_one(
1577 Object
**ret
, uint64_t *offset
) {
1586 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1599 return generic_array_get(f
, first
, i
-1, ret
, offset
);
1608 static int generic_array_bisect(
1613 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1614 direction_t direction
,
1619 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
1620 bool subtract_one
= false;
1621 Object
*o
, *array
= NULL
;
1626 assert(test_object
);
1628 /* Start with the first array in the chain */
1631 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1632 if (ci
&& n
> ci
->total
) {
1633 /* Ah, we have iterated this bisection array chain
1634 * previously! Let's see if we can skip ahead in the
1635 * chain, as far as the last time. But we can't jump
1636 * backwards in the chain, so let's check that first. */
1639 r
= test_object(f
, ci
->begin
, needle
);
1643 if (r
== TEST_LEFT
) {
1644 /* OK, what we are looking for is right of the
1645 * begin of this EntryArray, so let's jump
1646 * straight to the previously cached array in the chain. */
1652 last_index
= ci
->last_index
;
1657 uint64_t left
, right
, k
, lp
;
1659 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1663 k
= journal_file_entry_array_n_items(array
);
1669 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1673 r
= test_object(f
, p
, needle
);
1677 if (r
== TEST_FOUND
)
1678 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1680 if (r
== TEST_RIGHT
) {
1684 if (last_index
!= (uint64_t) -1) {
1685 assert(last_index
<= right
);
1687 /* If we cached the last index we
1688 * looked at, let's try not to jump
1689 * too wildly around and see if we can
1690 * limit the range to look at early to
1691 * the immediate neighbors of the last
1692 * index we looked at. */
1694 if (last_index
> 0) {
1695 uint64_t x
= last_index
- 1;
1697 p
= le64toh(array
->entry_array
.items
[x
]);
1701 r
= test_object(f
, p
, needle
);
1705 if (r
== TEST_FOUND
)
1706 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1708 if (r
== TEST_RIGHT
)
1714 if (last_index
< right
) {
1715 uint64_t y
= last_index
+ 1;
1717 p
= le64toh(array
->entry_array
.items
[y
]);
1721 r
= test_object(f
, p
, needle
);
1725 if (r
== TEST_FOUND
)
1726 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1728 if (r
== TEST_RIGHT
)
1736 if (left
== right
) {
1737 if (direction
== DIRECTION_UP
)
1738 subtract_one
= true;
1744 assert(left
< right
);
1745 i
= (left
+ right
) / 2;
1747 p
= le64toh(array
->entry_array
.items
[i
]);
1751 r
= test_object(f
, p
, needle
);
1755 if (r
== TEST_FOUND
)
1756 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1758 if (r
== TEST_RIGHT
)
1766 if (direction
== DIRECTION_UP
) {
1768 subtract_one
= true;
1779 last_index
= (uint64_t) -1;
1780 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1786 if (subtract_one
&& t
== 0 && i
== 0)
1789 /* Let's cache this item for the next invocation */
1790 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
1792 if (subtract_one
&& i
== 0)
1794 else if (subtract_one
)
1795 p
= le64toh(array
->entry_array
.items
[i
-1]);
1797 p
= le64toh(array
->entry_array
.items
[i
]);
1799 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1810 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
1815 static int generic_array_bisect_plus_one(
1821 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1822 direction_t direction
,
1828 bool step_back
= false;
1832 assert(test_object
);
1837 /* This bisects the array in object 'first', but first checks the 'extra' entry. */
1839 r
= test_object(f
, extra
, needle
);
1843 if (r
== TEST_FOUND
)
1844 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1846 /* if we are looking with DIRECTION_UP then we need to first
1847 see if in the actual array there is a matching entry, and
1848 return the last one of that. But if there isn't any we need
1849 to return this one. Hence remember this, and return it below. */
1852 step_back
= direction
== DIRECTION_UP
;
1854 if (r
== TEST_RIGHT
) {
1855 if (direction
== DIRECTION_DOWN
)
1861 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1863 if (r
== 0 && step_back
)
1872 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1888 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1894 else if (p
< needle
)
1900 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1907 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1911 if (le64toh(o
->entry
.seqnum
) == needle
)
1913 else if (le64toh(o
->entry
.seqnum
) < needle
)
1919 int journal_file_move_to_entry_by_seqnum(
1922 direction_t direction
,
1926 return generic_array_bisect(f
,
1927 le64toh(f
->header
->entry_array_offset
),
1928 le64toh(f
->header
->n_entries
),
1935 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1942 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1946 if (le64toh(o
->entry
.realtime
) == needle
)
1948 else if (le64toh(o
->entry
.realtime
) < needle
)
1954 int journal_file_move_to_entry_by_realtime(
1957 direction_t direction
,
1961 return generic_array_bisect(f
,
1962 le64toh(f
->header
->entry_array_offset
),
1963 le64toh(f
->header
->n_entries
),
1965 test_object_realtime
,
1970 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1977 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1981 if (le64toh(o
->entry
.monotonic
) == needle
)
1983 else if (le64toh(o
->entry
.monotonic
) < needle
)
1989 static int find_data_object_by_boot_id(
1995 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1997 sd_id128_to_string(boot_id
, t
+ 9);
1998 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2001 int journal_file_move_to_entry_by_monotonic(
2005 direction_t direction
,
2014 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2020 return generic_array_bisect_plus_one(f
,
2021 le64toh(o
->data
.entry_offset
),
2022 le64toh(o
->data
.entry_array_offset
),
2023 le64toh(o
->data
.n_entries
),
2025 test_object_monotonic
,
2030 void journal_file_reset_location(JournalFile
*f
) {
2031 f
->location_type
= LOCATION_HEAD
;
2032 f
->current_offset
= 0;
2033 f
->current_seqnum
= 0;
2034 f
->current_realtime
= 0;
2035 f
->current_monotonic
= 0;
2036 zero(f
->current_boot_id
);
2037 f
->current_xor_hash
= 0;
2040 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2041 f
->location_type
= LOCATION_SEEK
;
2042 f
->current_offset
= offset
;
2043 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2044 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2045 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2046 f
->current_boot_id
= o
->entry
.boot_id
;
2047 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2050 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2053 assert(af
->location_type
== LOCATION_SEEK
);
2054 assert(bf
->location_type
== LOCATION_SEEK
);
2056 /* If contents and timestamps match, these entries are
2057 * identical, even if the seqnum does not match */
2058 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2059 af
->current_monotonic
== bf
->current_monotonic
&&
2060 af
->current_realtime
== bf
->current_realtime
&&
2061 af
->current_xor_hash
== bf
->current_xor_hash
)
2064 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2066 /* If this is from the same seqnum source, compare
2068 if (af
->current_seqnum
< bf
->current_seqnum
)
2070 if (af
->current_seqnum
> bf
->current_seqnum
)
2073 /* Wow! This is weird, different data but the same
2074 * seqnums? Something is borked, but let's make the
2075 * best of it and compare by time. */
2078 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2080 /* If the boot id matches, compare monotonic time */
2081 if (af
->current_monotonic
< bf
->current_monotonic
)
2083 if (af
->current_monotonic
> bf
->current_monotonic
)
2087 /* Otherwise, compare UTC time */
2088 if (af
->current_realtime
< bf
->current_realtime
)
2090 if (af
->current_realtime
> bf
->current_realtime
)
2093 /* Finally, compare by contents */
2094 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2096 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2102 int journal_file_next_entry(
2105 direction_t direction
,
2106 Object
**ret
, uint64_t *offset
) {
2113 n
= le64toh(f
->header
->n_entries
);
2118 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2120 r
= generic_array_bisect(f
,
2121 le64toh(f
->header
->entry_array_offset
),
2122 le64toh(f
->header
->n_entries
),
2131 if (direction
== DIRECTION_DOWN
) {
2144 /* And jump to it */
2145 r
= generic_array_get(f
,
2146 le64toh(f
->header
->entry_array_offset
),
2153 (direction
== DIRECTION_DOWN
? ofs
<= p
: ofs
>= p
)) {
2154 log_debug("%s: entry array corrupted at entry %"PRIu64
,
2165 int journal_file_next_entry_for_data(
2167 Object
*o
, uint64_t p
,
2168 uint64_t data_offset
,
2169 direction_t direction
,
2170 Object
**ret
, uint64_t *offset
) {
2177 assert(p
> 0 || !o
);
2179 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2183 n
= le64toh(d
->data
.n_entries
);
2188 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2190 if (o
->object
.type
!= OBJECT_ENTRY
)
2193 r
= generic_array_bisect_plus_one(f
,
2194 le64toh(d
->data
.entry_offset
),
2195 le64toh(d
->data
.entry_array_offset
),
2196 le64toh(d
->data
.n_entries
),
2206 if (direction
== DIRECTION_DOWN
) {
2220 return generic_array_get_plus_one(f
,
2221 le64toh(d
->data
.entry_offset
),
2222 le64toh(d
->data
.entry_array_offset
),
2227 int journal_file_move_to_entry_by_offset_for_data(
2229 uint64_t data_offset
,
2231 direction_t direction
,
2232 Object
**ret
, uint64_t *offset
) {
2239 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2243 return generic_array_bisect_plus_one(f
,
2244 le64toh(d
->data
.entry_offset
),
2245 le64toh(d
->data
.entry_array_offset
),
2246 le64toh(d
->data
.n_entries
),
2253 int journal_file_move_to_entry_by_monotonic_for_data(
2255 uint64_t data_offset
,
2258 direction_t direction
,
2259 Object
**ret
, uint64_t *offset
) {
2267 /* First, seek by time */
2268 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2274 r
= generic_array_bisect_plus_one(f
,
2275 le64toh(o
->data
.entry_offset
),
2276 le64toh(o
->data
.entry_array_offset
),
2277 le64toh(o
->data
.n_entries
),
2279 test_object_monotonic
,
2285 /* And now, continue seeking until we find an entry that
2286 * exists in both bisection arrays */
2292 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2296 r
= generic_array_bisect_plus_one(f
,
2297 le64toh(d
->data
.entry_offset
),
2298 le64toh(d
->data
.entry_array_offset
),
2299 le64toh(d
->data
.n_entries
),
2307 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2311 r
= generic_array_bisect_plus_one(f
,
2312 le64toh(o
->data
.entry_offset
),
2313 le64toh(o
->data
.entry_array_offset
),
2314 le64toh(o
->data
.n_entries
),
2336 int journal_file_move_to_entry_by_seqnum_for_data(
2338 uint64_t data_offset
,
2340 direction_t direction
,
2341 Object
**ret
, uint64_t *offset
) {
2348 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2352 return generic_array_bisect_plus_one(f
,
2353 le64toh(d
->data
.entry_offset
),
2354 le64toh(d
->data
.entry_array_offset
),
2355 le64toh(d
->data
.n_entries
),
2362 int journal_file_move_to_entry_by_realtime_for_data(
2364 uint64_t data_offset
,
2366 direction_t direction
,
2367 Object
**ret
, uint64_t *offset
) {
2374 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2378 return generic_array_bisect_plus_one(f
,
2379 le64toh(d
->data
.entry_offset
),
2380 le64toh(d
->data
.entry_array_offset
),
2381 le64toh(d
->data
.n_entries
),
2383 test_object_realtime
,
2388 void journal_file_dump(JournalFile
*f
) {
2395 journal_file_print_header(f
);
2397 p
= le64toh(f
->header
->header_size
);
2399 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
2403 switch (o
->object
.type
) {
2406 printf("Type: OBJECT_UNUSED\n");
2410 printf("Type: OBJECT_DATA\n");
2414 printf("Type: OBJECT_FIELD\n");
2418 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
2419 le64toh(o
->entry
.seqnum
),
2420 le64toh(o
->entry
.monotonic
),
2421 le64toh(o
->entry
.realtime
));
2424 case OBJECT_FIELD_HASH_TABLE
:
2425 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2428 case OBJECT_DATA_HASH_TABLE
:
2429 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2432 case OBJECT_ENTRY_ARRAY
:
2433 printf("Type: OBJECT_ENTRY_ARRAY\n");
2437 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
2438 le64toh(o
->tag
.seqnum
),
2439 le64toh(o
->tag
.epoch
));
2443 printf("Type: unknown (%i)\n", o
->object
.type
);
2447 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
2448 printf("Flags: %s\n",
2449 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
2451 if (p
== le64toh(f
->header
->tail_object_offset
))
2454 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
2459 log_error("File corrupt");
2462 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
2465 x
= format_timestamp(buf
, l
, t
);
2471 void journal_file_print_header(JournalFile
*f
) {
2472 char a
[33], b
[33], c
[33], d
[33];
2473 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
2475 char bytes
[FORMAT_BYTES_MAX
];
2479 printf("File Path: %s\n"
2483 "Sequential Number ID: %s\n"
2485 "Compatible Flags:%s%s\n"
2486 "Incompatible Flags:%s%s%s\n"
2487 "Header size: %"PRIu64
"\n"
2488 "Arena size: %"PRIu64
"\n"
2489 "Data Hash Table Size: %"PRIu64
"\n"
2490 "Field Hash Table Size: %"PRIu64
"\n"
2491 "Rotate Suggested: %s\n"
2492 "Head Sequential Number: %"PRIu64
"\n"
2493 "Tail Sequential Number: %"PRIu64
"\n"
2494 "Head Realtime Timestamp: %s\n"
2495 "Tail Realtime Timestamp: %s\n"
2496 "Tail Monotonic Timestamp: %s\n"
2497 "Objects: %"PRIu64
"\n"
2498 "Entry Objects: %"PRIu64
"\n",
2500 sd_id128_to_string(f
->header
->file_id
, a
),
2501 sd_id128_to_string(f
->header
->machine_id
, b
),
2502 sd_id128_to_string(f
->header
->boot_id
, c
),
2503 sd_id128_to_string(f
->header
->seqnum_id
, d
),
2504 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
2505 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
2506 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
2507 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
2508 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
2509 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
2510 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
2511 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
2512 le64toh(f
->header
->header_size
),
2513 le64toh(f
->header
->arena_size
),
2514 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
2515 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
2516 yes_no(journal_file_rotate_suggested(f
, 0)),
2517 le64toh(f
->header
->head_entry_seqnum
),
2518 le64toh(f
->header
->tail_entry_seqnum
),
2519 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)),
2520 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)),
2521 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
),
2522 le64toh(f
->header
->n_objects
),
2523 le64toh(f
->header
->n_entries
));
2525 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
2526 printf("Data Objects: %"PRIu64
"\n"
2527 "Data Hash Table Fill: %.1f%%\n",
2528 le64toh(f
->header
->n_data
),
2529 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
2531 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
2532 printf("Field Objects: %"PRIu64
"\n"
2533 "Field Hash Table Fill: %.1f%%\n",
2534 le64toh(f
->header
->n_fields
),
2535 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
2537 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
2538 printf("Tag Objects: %"PRIu64
"\n",
2539 le64toh(f
->header
->n_tags
));
2540 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
2541 printf("Entry Array Objects: %"PRIu64
"\n",
2542 le64toh(f
->header
->n_entry_arrays
));
2544 if (fstat(f
->fd
, &st
) >= 0)
2545 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (off_t
) st
.st_blocks
* 512ULL));
2548 static int journal_file_warn_btrfs(JournalFile
*f
) {
2554 /* Before we write anything, check if the COW logic is turned
2555 * off on btrfs. Given our write pattern that is quite
2556 * unfriendly to COW file systems this should greatly improve
2557 * performance on COW file systems, such as btrfs, at the
2558 * expense of data integrity features (which shouldn't be too
2559 * bad, given that we do our own checksumming). */
2561 r
= btrfs_is_filesystem(f
->fd
);
2563 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
2567 r
= read_attr_fd(f
->fd
, &attrs
);
2569 return log_warning_errno(r
, "Failed to read file attributes: %m");
2571 if (attrs
& FS_NOCOW_FL
) {
2572 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2576 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2577 "This is likely to slow down journal access substantially, please consider turning "
2578 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
2583 int journal_file_open(
2589 JournalMetrics
*metrics
,
2590 MMapCache
*mmap_cache
,
2591 JournalFile
*template,
2592 JournalFile
**ret
) {
2594 bool newly_created
= false;
2602 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
2603 (flags
& O_ACCMODE
) != O_RDWR
)
2606 if (!endswith(fname
, ".journal") &&
2607 !endswith(fname
, ".journal~"))
2610 f
= new0(JournalFile
, 1);
2618 f
->prot
= prot_from_flags(flags
);
2619 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
2620 #if defined(HAVE_LZ4)
2621 f
->compress_lz4
= compress
;
2622 #elif defined(HAVE_XZ)
2623 f
->compress_xz
= compress
;
2630 f
->mmap
= mmap_cache_ref(mmap_cache
);
2632 f
->mmap
= mmap_cache_new();
2639 f
->path
= strdup(fname
);
2645 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
2646 if (!f
->chain_cache
) {
2651 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
2657 r
= journal_file_fstat(f
);
2661 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
2663 (void) journal_file_warn_btrfs(f
);
2665 /* Let's attach the creation time to the journal file,
2666 * so that the vacuuming code knows the age of this
2667 * file even if the file might end up corrupted one
2668 * day... Ideally we'd just use the creation time many
2669 * file systems maintain for each file, but there is
2670 * currently no usable API to query this, hence let's
2671 * emulate this via extended attributes. If extended
2672 * attributes are not supported we'll just skip this,
2673 * and rely solely on mtime/atime/ctime of the file. */
2675 fd_setcrtime(f
->fd
, 0);
2678 /* Try to load the FSPRG state, and if we can't, then
2679 * just don't do sealing */
2681 r
= journal_file_fss_load(f
);
2687 r
= journal_file_init_header(f
, template);
2691 r
= journal_file_fstat(f
);
2695 newly_created
= true;
2698 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
2703 r
= mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
);
2709 if (!newly_created
) {
2710 r
= journal_file_verify_header(f
);
2716 if (!newly_created
&& f
->writable
) {
2717 r
= journal_file_fss_load(f
);
2725 journal_default_metrics(metrics
, f
->fd
);
2726 f
->metrics
= *metrics
;
2727 } else if (template)
2728 f
->metrics
= template->metrics
;
2730 r
= journal_file_refresh_header(f
);
2736 r
= journal_file_hmac_setup(f
);
2741 if (newly_created
) {
2742 r
= journal_file_setup_field_hash_table(f
);
2746 r
= journal_file_setup_data_hash_table(f
);
2751 r
= journal_file_append_first_tag(f
);
2757 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
)) {
2766 if (f
->fd
>= 0 && mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
2769 journal_file_close(f
);
2774 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
) {
2775 _cleanup_free_
char *p
= NULL
;
2777 JournalFile
*old_file
, *new_file
= NULL
;
2785 if (!old_file
->writable
)
2788 if (!endswith(old_file
->path
, ".journal"))
2791 l
= strlen(old_file
->path
);
2792 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
2793 (int) l
- 8, old_file
->path
,
2794 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
2795 le64toh((*f
)->header
->head_entry_seqnum
),
2796 le64toh((*f
)->header
->head_entry_realtime
));
2800 /* Try to rename the file to the archived version. If the file
2801 * already was deleted, we'll get ENOENT, let's ignore that
2803 r
= rename(old_file
->path
, p
);
2804 if (r
< 0 && errno
!= ENOENT
)
2807 old_file
->header
->state
= STATE_ARCHIVED
;
        /* Currently, btrfs is not very good with our write patterns
         * and fragments heavily. Let's defrag our journal files when
         * we archive them */
2812 old_file
->defrag_on_close
= true;
2814 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, old_file
, &new_file
);
2815 journal_file_close(old_file
);
2821 int journal_file_open_reliably(
2827 JournalMetrics
*metrics
,
2828 MMapCache
*mmap_cache
,
2829 JournalFile
*template,
2830 JournalFile
**ret
) {
2834 _cleanup_free_
char *p
= NULL
;
2836 r
= journal_file_open(fname
, flags
, mode
, compress
, seal
,
2837 metrics
, mmap_cache
, template, ret
);
2839 -EBADMSG
, /* corrupted */
2840 -ENODATA
, /* truncated */
2841 -EHOSTDOWN
, /* other machine */
2842 -EPROTONOSUPPORT
, /* incompatible feature */
2843 -EBUSY
, /* unclean shutdown */
2844 -ESHUTDOWN
, /* already archived */
2845 -EIO
, /* IO error, including SIGBUS on mmap */
2846 -EIDRM
/* File has been deleted */))
2849 if ((flags
& O_ACCMODE
) == O_RDONLY
)
2852 if (!(flags
& O_CREAT
))
2855 if (!endswith(fname
, ".journal"))
2858 /* The file is corrupted. Rotate it away and try it again (but only once) */
2861 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
2863 now(CLOCK_REALTIME
),
2867 r
= rename(fname
, p
);
2871 /* btrfs doesn't cope well with our write pattern and
2872 * fragments heavily. Let's defrag all files we rotate */
2874 (void) chattr_path(p
, false, FS_NOCOW_FL
);
2875 (void) btrfs_defrag(p
);
2877 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
2879 return journal_file_open(fname
, flags
, mode
, compress
, seal
,
2880 metrics
, mmap_cache
, template, ret
);
2883 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
2885 uint64_t q
, xor_hash
= 0;
2898 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
2899 ts
.realtime
= le64toh(o
->entry
.realtime
);
2901 n
= journal_file_entry_n_items(o
);
2902 /* alloca() can't take 0, hence let's allocate at least one */
2903 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
2905 for (i
= 0; i
< n
; i
++) {
2912 q
= le64toh(o
->entry
.items
[i
].object_offset
);
2913 le_hash
= o
->entry
.items
[i
].hash
;
2915 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
2919 if (le_hash
!= o
->data
.hash
)
2922 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
2925 /* We hit the limit on 32bit machines */
2926 if ((uint64_t) t
!= l
)
2929 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
2930 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2933 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
2934 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
2938 data
= from
->compress_buffer
;
2941 return -EPROTONOSUPPORT
;
2944 data
= o
->data
.payload
;
2946 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
2950 xor_hash
^= le64toh(u
->data
.hash
);
2951 items
[i
].object_offset
= htole64(h
);
2952 items
[i
].hash
= u
->data
.hash
;
2954 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
2959 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
2961 if (mmap_cache_got_sigbus(to
->mmap
, to
->fd
))
2967 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
2968 uint64_t fs_size
= 0;
2970 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
];
2975 if (fstatvfs(fd
, &ss
) >= 0)
2976 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
2978 if (m
->max_use
== (uint64_t) -1) {
2981 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
2983 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
2984 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
2986 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
2987 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
2989 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
2991 m
->max_use
= PAGE_ALIGN(m
->max_use
);
2993 if (m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
2994 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
2997 if (m
->max_size
== (uint64_t) -1) {
2998 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3000 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3001 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3003 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3005 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3006 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3008 if (m
->max_size
*2 > m
->max_use
)
3009 m
->max_use
= m
->max_size
*2;
3011 if (m
->min_size
== (uint64_t) -1)
3012 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3014 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3016 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3017 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3019 if (m
->min_size
> m
->max_size
)
3020 m
->max_size
= m
->min_size
;
3023 if (m
->keep_free
== (uint64_t) -1) {
3026 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3028 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3029 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3032 m
->keep_free
= DEFAULT_KEEP_FREE
;
3035 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
3036 format_bytes(a
, sizeof(a
), m
->max_use
),
3037 format_bytes(b
, sizeof(b
), m
->max_size
),
3038 format_bytes(c
, sizeof(c
), m
->min_size
),
3039 format_bytes(d
, sizeof(d
), m
->keep_free
));
3042 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3047 if (f
->header
->head_entry_realtime
== 0)
3050 *from
= le64toh(f
->header
->head_entry_realtime
);
3054 if (f
->header
->tail_entry_realtime
== 0)
3057 *to
= le64toh(f
->header
->tail_entry_realtime
);
3063 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3071 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3075 if (le64toh(o
->data
.n_entries
) <= 0)
3079 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3083 *from
= le64toh(o
->entry
.monotonic
);
3087 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3091 r
= generic_array_get_plus_one(f
,
3092 le64toh(o
->data
.entry_offset
),
3093 le64toh(o
->data
.entry_array_offset
),
3094 le64toh(o
->data
.n_entries
)-1,
3099 *to
= le64toh(o
->entry
.monotonic
);
3105 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3108 /* If we gained new header fields we gained new features,
3109 * hence suggest a rotation */
3110 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3111 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3115 /* Let's check if the hash tables grew over a certain fill
3116 * level (75%, borrowing this value from Java's hash table
3117 * implementation), and if so suggest a rotation. To calculate
3118 * the fill level we need the n_data field, which only exists
3119 * in newer versions. */
3121 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3122 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3123 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3125 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3126 le64toh(f
->header
->n_data
),
3127 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3128 (unsigned long long) f
->last_stat
.st_size
,
3129 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3133 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3134 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3135 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3137 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3138 le64toh(f
->header
->n_fields
),
3139 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3143 /* Are the data objects properly indexed by field objects? */
3144 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3145 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3146 le64toh(f
->header
->n_data
) > 0 &&
3147 le64toh(f
->header
->n_fields
) == 0)
3150 if (max_file_usec
> 0) {
3153 h
= le64toh(f
->header
->head_entry_realtime
);
3154 t
= now(CLOCK_REALTIME
);
3156 if (h
> 0 && t
> h
+ max_file_usec
)