1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "btrfs-util.h"
33 #include "journal-authenticate.h"
34 #include "journal-def.h"
36 #include "random-util.h"
37 #include "string-util.h"
38 #include "journal-file.h"
40 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
41 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
43 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
45 /* This is the minimum journal file size */
46 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
48 /* These are the lower and upper bounds if we deduce the max_use value
49 * from the file system size */
50 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
51 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
53 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
54 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
56 /* This is the upper bound if we deduce max_size from max_use */
57 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
59 /* This is the upper bound if we deduce the keep_free value from the
61 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
63 /* This is the keep_free value when we can't determine the system
65 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
67 /* This is the default maximum number of journal files to keep around. */
68 #define DEFAULT_N_MAX_FILES (100)
70 /* n_data was the first entry we added after the initial file format design */
71 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
73 /* How many entries to keep in the entry array chain cache at max */
74 #define CHAIN_CACHE_MAX 20
76 /* How much to increase the journal file size at once each time we allocate something new. */
77 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
79 /* Reread fstat() of the file for detecting deletions at least this often */
80 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
82 /* The mmap context to use for the header: we pick the one just above the last defined object type */
83 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
85 static int journal_file_set_online(JournalFile
*f
) {
91 if (!(f
->fd
>= 0 && f
->header
))
94 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
97 switch(f
->header
->state
) {
102 f
->header
->state
= STATE_ONLINE
;
111 int journal_file_set_offline(JournalFile
*f
) {
117 if (!(f
->fd
>= 0 && f
->header
))
120 if (f
->header
->state
!= STATE_ONLINE
)
125 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
128 f
->header
->state
= STATE_OFFLINE
;
130 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
138 JournalFile
* journal_file_close(JournalFile
*f
) {
142 /* Write the final tag */
143 if (f
->seal
&& f
->writable
)
144 journal_file_append_tag(f
);
147 journal_file_set_offline(f
);
149 if (f
->mmap
&& f
->fd
>= 0)
150 mmap_cache_close_fd(f
->mmap
, f
->fd
);
152 if (f
->fd
>= 0 && f
->defrag_on_close
) {
154 /* Be friendly to btrfs: turn COW back on again now,
155 * and defragment the file. We won't write to the file
156 * ever again, hence remove all fragmentation, and
157 * reenable all the good bits COW usually provides
158 * (such as data checksumming). */
160 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
161 (void) btrfs_defrag_fd(f
->fd
);
168 mmap_cache_unref(f
->mmap
);
170 ordered_hashmap_free_free(f
->chain_cache
);
172 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
173 free(f
->compress_buffer
);
178 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
180 free(f
->fsprg_state
);
185 gcry_md_close(f
->hmac
);
192 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
199 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
200 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
202 h
.incompatible_flags
|= htole32(
203 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
204 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
206 h
.compatible_flags
= htole32(
207 f
->seal
* HEADER_COMPATIBLE_SEALED
);
209 r
= sd_id128_randomize(&h
.file_id
);
214 h
.seqnum_id
= template->header
->seqnum_id
;
215 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
217 h
.seqnum_id
= h
.file_id
;
219 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
229 static int journal_file_refresh_header(JournalFile
*f
) {
235 r
= sd_id128_get_machine(&f
->header
->machine_id
);
239 r
= sd_id128_get_boot(&boot_id
);
243 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
244 f
->tail_entry_monotonic_valid
= true;
246 f
->header
->boot_id
= boot_id
;
248 r
= journal_file_set_online(f
);
250 /* Sync the online state to disk */
256 static int journal_file_verify_header(JournalFile
*f
) {
261 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
264 /* In both read and write mode we refuse to open files with
265 * incompatible flags we don't know */
266 flags
= le32toh(f
->header
->incompatible_flags
);
267 if (flags
& ~HEADER_INCOMPATIBLE_SUPPORTED
) {
268 if (flags
& ~HEADER_INCOMPATIBLE_ANY
)
269 log_debug("Journal file %s has unknown incompatible flags %"PRIx32
,
270 f
->path
, flags
& ~HEADER_INCOMPATIBLE_ANY
);
271 flags
= (flags
& HEADER_INCOMPATIBLE_ANY
) & ~HEADER_INCOMPATIBLE_SUPPORTED
;
273 log_debug("Journal file %s uses incompatible flags %"PRIx32
274 " disabled at compilation time.", f
->path
, flags
);
275 return -EPROTONOSUPPORT
;
278 /* When opening for writing we additionally refuse files that
279 * carry compatible flags we don't support */
280 flags
= le32toh(f
->header
->compatible_flags
);
281 if (f
->writable
&& (flags
& ~HEADER_COMPATIBLE_SUPPORTED
)) {
282 if (flags
& ~HEADER_COMPATIBLE_ANY
)
283 log_debug("Journal file %s has unknown compatible flags %"PRIx32
,
284 f
->path
, flags
& ~HEADER_COMPATIBLE_ANY
);
285 flags
= (flags
& HEADER_COMPATIBLE_ANY
) & ~HEADER_COMPATIBLE_SUPPORTED
;
287 log_debug("Journal file %s uses compatible flags %"PRIx32
288 " disabled at compilation time.", f
->path
, flags
);
289 return -EPROTONOSUPPORT
;
292 if (f
->header
->state
>= _STATE_MAX
)
295 /* The first addition was n_data, so check that we are at least this large */
296 if (le64toh(f
->header
->header_size
) < HEADER_SIZE_MIN
)
299 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
302 if ((le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)) > (uint64_t) f
->last_stat
.st_size
)
305 if (le64toh(f
->header
->tail_object_offset
) > (le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)))
308 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
309 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
310 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
311 !VALID64(le64toh(f
->header
->entry_array_offset
)))
316 sd_id128_t machine_id
;
319 r
= sd_id128_get_machine(&machine_id
);
323 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
326 state
= f
->header
->state
;
328 if (state
== STATE_ONLINE
) {
329 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
331 } else if (state
== STATE_ARCHIVED
)
333 else if (state
!= STATE_OFFLINE
) {
334 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
339 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
340 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
342 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
347 static int journal_file_fstat(JournalFile
*f
) {
351 if (fstat(f
->fd
, &f
->last_stat
) < 0)
354 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
356 /* Refuse appending to files that are already deleted */
357 if (f
->last_stat
.st_nlink
<= 0)
363 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
364 uint64_t old_size
, new_size
;
369 /* We assume that this file is not sparse, and we know that
370 * for sure, since we always call posix_fallocate()
373 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
377 le64toh(f
->header
->header_size
) +
378 le64toh(f
->header
->arena_size
);
380 new_size
= PAGE_ALIGN(offset
+ size
);
381 if (new_size
< le64toh(f
->header
->header_size
))
382 new_size
= le64toh(f
->header
->header_size
);
384 if (new_size
<= old_size
) {
386 /* We already pre-allocated enough space, but before
387 * we write to it, let's check with fstat() if the
388 * file got deleted, in order to make sure we don't throw
389 * away the data immediately. Don't check fstat() for all
390 * writes though, but only once every LAST_STAT_REFRESH_USEC. */
392 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
395 return journal_file_fstat(f
);
398 /* Allocate more space. */
400 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
403 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
406 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
409 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
411 if (new_size
- old_size
> available
)
416 /* Increase by larger blocks at once */
417 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
418 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
419 new_size
= f
->metrics
.max_size
;
421 /* Note that the glibc fallocate() fallback is very
422 inefficient, hence we try to minimize the allocation area
424 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
428 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
430 return journal_file_fstat(f
);
433 static unsigned type_to_context(ObjectType type
) {
434 /* One context for each type, plus one catch-all for the rest */
435 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
436 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
437 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
440 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
) {
449 /* Avoid SIGBUS on invalid accesses */
450 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
451 /* Hmm, out of range? Let's refresh the fstat() data
452 * first, before we trust that check. */
454 r
= journal_file_fstat(f
);
458 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
459 return -EADDRNOTAVAIL
;
462 return mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
465 static uint64_t minimum_header_size(Object
*o
) {
467 static const uint64_t table
[] = {
468 [OBJECT_DATA
] = sizeof(DataObject
),
469 [OBJECT_FIELD
] = sizeof(FieldObject
),
470 [OBJECT_ENTRY
] = sizeof(EntryObject
),
471 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
472 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
473 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
474 [OBJECT_TAG
] = sizeof(TagObject
),
477 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
478 return sizeof(ObjectHeader
);
480 return table
[o
->object
.type
];
483 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
492 /* Objects may only be located at offsets that are multiples of 64 bit */
493 if (!VALID64(offset
))
496 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
501 s
= le64toh(o
->object
.size
);
503 if (s
< sizeof(ObjectHeader
))
506 if (o
->object
.type
<= OBJECT_UNUSED
)
509 if (s
< minimum_header_size(o
))
512 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
515 if (s
> sizeof(ObjectHeader
)) {
516 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
527 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
532 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
535 /* If an external seqnum counter was passed, we update
536 * both the local and the external one, and set it to
537 * the maximum of both */
545 f
->header
->tail_entry_seqnum
= htole64(r
);
547 if (f
->header
->head_entry_seqnum
== 0)
548 f
->header
->head_entry_seqnum
= htole64(r
);
553 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
560 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
561 assert(size
>= sizeof(ObjectHeader
));
565 r
= journal_file_set_online(f
);
569 p
= le64toh(f
->header
->tail_object_offset
);
571 p
= le64toh(f
->header
->header_size
);
573 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
577 p
+= ALIGN64(le64toh(tail
->object
.size
));
580 r
= journal_file_allocate(f
, p
, size
);
584 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
591 o
->object
.type
= type
;
592 o
->object
.size
= htole64(size
);
594 f
->header
->tail_object_offset
= htole64(p
);
595 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
603 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
610 /* We estimate that we need 1 hash table entry per 768 bytes
611 of journal file and we want to make sure we never get
612 beyond 75% fill level. Calculate the hash table size for
613 the maximum file size based on these metrics. */
615 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
616 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
617 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
619 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
621 r
= journal_file_append_object(f
,
622 OBJECT_DATA_HASH_TABLE
,
623 offsetof(Object
, hash_table
.items
) + s
,
628 memzero(o
->hash_table
.items
, s
);
630 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
631 f
->header
->data_hash_table_size
= htole64(s
);
636 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
643 /* We use a fixed size hash table for the fields as this
644 * number should grow very slowly only */
646 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
647 r
= journal_file_append_object(f
,
648 OBJECT_FIELD_HASH_TABLE
,
649 offsetof(Object
, hash_table
.items
) + s
,
654 memzero(o
->hash_table
.items
, s
);
656 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
657 f
->header
->field_hash_table_size
= htole64(s
);
662 int journal_file_map_data_hash_table(JournalFile
*f
) {
669 if (f
->data_hash_table
)
672 p
= le64toh(f
->header
->data_hash_table_offset
);
673 s
= le64toh(f
->header
->data_hash_table_size
);
675 r
= journal_file_move_to(f
,
676 OBJECT_DATA_HASH_TABLE
,
683 f
->data_hash_table
= t
;
687 int journal_file_map_field_hash_table(JournalFile
*f
) {
694 if (f
->field_hash_table
)
697 p
= le64toh(f
->header
->field_hash_table_offset
);
698 s
= le64toh(f
->header
->field_hash_table_size
);
700 r
= journal_file_move_to(f
,
701 OBJECT_FIELD_HASH_TABLE
,
708 f
->field_hash_table
= t
;
712 static int journal_file_link_field(
725 if (o
->object
.type
!= OBJECT_FIELD
)
728 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
732 /* This might alter the window we are looking at */
733 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
736 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
738 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
740 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
744 o
->field
.next_hash_offset
= htole64(offset
);
747 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
749 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
750 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
755 static int journal_file_link_data(
768 if (o
->object
.type
!= OBJECT_DATA
)
771 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
775 /* This might alter the window we are looking at */
776 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
777 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
778 o
->data
.n_entries
= 0;
781 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
783 /* Only entry in the hash table is easy */
784 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
786 /* Move back to the previous data object, to patch in
789 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
793 o
->data
.next_hash_offset
= htole64(offset
);
796 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
798 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
799 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
804 int journal_file_find_field_object_with_hash(
806 const void *field
, uint64_t size
, uint64_t hash
,
807 Object
**ret
, uint64_t *offset
) {
809 uint64_t p
, osize
, h
, m
;
813 assert(field
&& size
> 0);
815 /* If the field hash table is empty, we can't find anything */
816 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
819 /* Map the field hash table, if it isn't mapped yet. */
820 r
= journal_file_map_field_hash_table(f
);
824 osize
= offsetof(Object
, field
.payload
) + size
;
826 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
831 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
836 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
840 if (le64toh(o
->field
.hash
) == hash
&&
841 le64toh(o
->object
.size
) == osize
&&
842 memcmp(o
->field
.payload
, field
, size
) == 0) {
852 p
= le64toh(o
->field
.next_hash_offset
);
858 int journal_file_find_field_object(
860 const void *field
, uint64_t size
,
861 Object
**ret
, uint64_t *offset
) {
866 assert(field
&& size
> 0);
868 hash
= hash64(field
, size
);
870 return journal_file_find_field_object_with_hash(f
,
875 int journal_file_find_data_object_with_hash(
877 const void *data
, uint64_t size
, uint64_t hash
,
878 Object
**ret
, uint64_t *offset
) {
880 uint64_t p
, osize
, h
, m
;
884 assert(data
|| size
== 0);
886 /* If there's no data hash table, then there's no entry. */
887 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
890 /* Map the data hash table, if it isn't mapped yet. */
891 r
= journal_file_map_data_hash_table(f
);
895 osize
= offsetof(Object
, data
.payload
) + size
;
897 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
902 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
907 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
911 if (le64toh(o
->data
.hash
) != hash
)
914 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
915 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
919 l
= le64toh(o
->object
.size
);
920 if (l
<= offsetof(Object
, data
.payload
))
923 l
-= offsetof(Object
, data
.payload
);
925 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
926 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
931 memcmp(f
->compress_buffer
, data
, size
) == 0) {
942 return -EPROTONOSUPPORT
;
944 } else if (le64toh(o
->object
.size
) == osize
&&
945 memcmp(o
->data
.payload
, data
, size
) == 0) {
957 p
= le64toh(o
->data
.next_hash_offset
);
963 int journal_file_find_data_object(
965 const void *data
, uint64_t size
,
966 Object
**ret
, uint64_t *offset
) {
971 assert(data
|| size
== 0);
973 hash
= hash64(data
, size
);
975 return journal_file_find_data_object_with_hash(f
,
980 static int journal_file_append_field(
982 const void *field
, uint64_t size
,
983 Object
**ret
, uint64_t *offset
) {
991 assert(field
&& size
> 0);
993 hash
= hash64(field
, size
);
995 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1009 osize
= offsetof(Object
, field
.payload
) + size
;
1010 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1014 o
->field
.hash
= htole64(hash
);
1015 memcpy(o
->field
.payload
, field
, size
);
1017 r
= journal_file_link_field(f
, o
, p
, hash
);
1021 /* The linking might have altered the window, so let's
1022 * refresh our pointer */
1023 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1028 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1042 static int journal_file_append_data(
1044 const void *data
, uint64_t size
,
1045 Object
**ret
, uint64_t *offset
) {
1050 int r
, compression
= 0;
1054 assert(data
|| size
== 0);
1056 hash
= hash64(data
, size
);
1058 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1072 osize
= offsetof(Object
, data
.payload
) + size
;
1073 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1077 o
->data
.hash
= htole64(hash
);
1079 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1080 if (f
->compress_xz
&&
1081 size
>= COMPRESSION_SIZE_THRESHOLD
) {
1084 compression
= compress_blob(data
, size
, o
->data
.payload
, &rsize
);
1087 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1088 o
->object
.flags
|= compression
;
1090 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1091 size
, rsize
, object_compressed_to_string(compression
));
1096 if (!compression
&& size
> 0)
1097 memcpy(o
->data
.payload
, data
, size
);
1099 r
= journal_file_link_data(f
, o
, p
, hash
);
1103 /* The linking might have altered the window, so let's
1104 * refresh our pointer */
1105 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1112 eq
= memchr(data
, '=', size
);
1113 if (eq
&& eq
> data
) {
1117 /* Create field object ... */
1118 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1122 /* ... and link it in. */
1123 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1124 fo
->field
.head_data_offset
= le64toh(p
);
1128 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1142 uint64_t journal_file_entry_n_items(Object
*o
) {
1145 if (o
->object
.type
!= OBJECT_ENTRY
)
1148 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1151 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1154 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1157 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1160 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1163 if (o
->object
.type
!= OBJECT_DATA_HASH_TABLE
&&
1164 o
->object
.type
!= OBJECT_FIELD_HASH_TABLE
)
1167 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1170 static int link_entry_into_array(JournalFile
*f
,
1175 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1183 a
= le64toh(*first
);
1184 i
= hidx
= le64toh(*idx
);
1187 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1191 n
= journal_file_entry_array_n_items(o
);
1193 o
->entry_array
.items
[i
] = htole64(p
);
1194 *idx
= htole64(hidx
+ 1);
1200 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1211 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1212 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1218 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1223 o
->entry_array
.items
[i
] = htole64(p
);
1226 *first
= htole64(q
);
1228 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1232 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1235 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1236 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1238 *idx
= htole64(hidx
+ 1);
1243 static int link_entry_into_array_plus_one(JournalFile
*f
,
1258 *extra
= htole64(p
);
1262 i
= htole64(le64toh(*idx
) - 1);
1263 r
= link_entry_into_array(f
, first
, &i
, p
);
1268 *idx
= htole64(le64toh(*idx
) + 1);
1272 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1279 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1283 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1287 return link_entry_into_array_plus_one(f
,
1288 &o
->data
.entry_offset
,
1289 &o
->data
.entry_array_offset
,
1294 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1302 if (o
->object
.type
!= OBJECT_ENTRY
)
1305 __sync_synchronize();
1307 /* Link up the entry itself */
1308 r
= link_entry_into_array(f
,
1309 &f
->header
->entry_array_offset
,
1310 &f
->header
->n_entries
,
1315 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1317 if (f
->header
->head_entry_realtime
== 0)
1318 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1320 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1321 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1323 f
->tail_entry_monotonic_valid
= true;
1325 /* Link up the items */
1326 n
= journal_file_entry_n_items(o
);
1327 for (i
= 0; i
< n
; i
++) {
1328 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1336 static int journal_file_append_entry_internal(
1338 const dual_timestamp
*ts
,
1340 const EntryItem items
[], unsigned n_items
,
1342 Object
**ret
, uint64_t *offset
) {
1349 assert(items
|| n_items
== 0);
1352 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1354 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1358 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1359 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1360 o
->entry
.realtime
= htole64(ts
->realtime
);
1361 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1362 o
->entry
.xor_hash
= htole64(xor_hash
);
1363 o
->entry
.boot_id
= f
->header
->boot_id
;
1366 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1371 r
= journal_file_link_entry(f
, o
, np
);
1384 void journal_file_post_change(JournalFile
*f
) {
1387 /* inotify does not generate IN_MODIFY events for file
1388 * changes done via mmap(). After each change we hence
1389 * truncate the journal file to its current size, which
1390 * triggers IN_MODIFY explicitly. */
1392 __sync_synchronize();
1394 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1395 log_error_errno(errno
, "Failed to truncate file to its own size: %m");
1398 static int entry_item_cmp(const void *_a
, const void *_b
) {
1399 const EntryItem
*a
= _a
, *b
= _b
;
1401 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1403 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1408 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1412 uint64_t xor_hash
= 0;
1413 struct dual_timestamp _ts
;
1416 assert(iovec
|| n_iovec
== 0);
1419 dual_timestamp_get(&_ts
);
1423 if (f
->tail_entry_monotonic_valid
&&
1424 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
1428 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1433 /* alloca() can't take 0, hence let's allocate at least one */
1434 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1436 for (i
= 0; i
< n_iovec
; i
++) {
1440 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1444 xor_hash
^= le64toh(o
->data
.hash
);
1445 items
[i
].object_offset
= htole64(p
);
1446 items
[i
].hash
= o
->data
.hash
;
1449 /* Order by the position on disk, in order to improve seek
1450 * times for rotating media. */
1451 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
1453 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
1455 /* If the memory mapping triggered a SIGBUS then we return an
1456 * IO error and ignore the error code passed down to us, since
1457 * it is very likely just an effect of a nullified replacement
1460 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
1463 journal_file_post_change(f
);
1468 typedef struct ChainCacheItem
{
1469 uint64_t first
; /* the array at the beginning of the chain */
1470 uint64_t array
; /* the cached array */
1471 uint64_t begin
; /* the first item in the cached array */
1472 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
1473 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
1476 static void chain_cache_put(
1483 uint64_t last_index
) {
1486 /* If the chain item to cache for this chain is the
1487 * first one it's not worth caching anything */
1491 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
1492 ci
= ordered_hashmap_steal_first(h
);
1495 ci
= new(ChainCacheItem
, 1);
1502 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
1507 assert(ci
->first
== first
);
1512 ci
->last_index
= last_index
;
1515 static int generic_array_get(
1519 Object
**ret
, uint64_t *offset
) {
1522 uint64_t p
= 0, a
, t
= 0;
1530 /* Try the chain cache first */
1531 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1532 if (ci
&& i
> ci
->total
) {
1541 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1545 k
= journal_file_entry_array_n_items(o
);
1547 p
= le64toh(o
->entry_array
.items
[i
]);
1553 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1559 /* Let's cache this item for the next invocation */
1560 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
1562 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1575 static int generic_array_get_plus_one(
1580 Object
**ret
, uint64_t *offset
) {
1589 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1602 return generic_array_get(f
, first
, i
-1, ret
, offset
);
1611 static int generic_array_bisect(
1616 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1617 direction_t direction
,
1622 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
1623 bool subtract_one
= false;
1624 Object
*o
, *array
= NULL
;
1629 assert(test_object
);
1631 /* Start with the first array in the chain */
1634 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1635 if (ci
&& n
> ci
->total
) {
1636 /* Ah, we have iterated this bisection array chain
1637 * previously! Let's see if we can skip ahead in the
1638 * chain, as far as the last time. But we can't jump
1639 * backwards in the chain, so let's check that
1642 r
= test_object(f
, ci
->begin
, needle
);
1646 if (r
== TEST_LEFT
) {
1647 /* OK, what we are looking for is right of the
1648 * begin of this EntryArray, so let's jump
1649 * straight to previously cached array in the
1655 last_index
= ci
->last_index
;
1660 uint64_t left
, right
, k
, lp
;
1662 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1666 k
= journal_file_entry_array_n_items(array
);
1672 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1676 r
= test_object(f
, p
, needle
);
1680 if (r
== TEST_FOUND
)
1681 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1683 if (r
== TEST_RIGHT
) {
1687 if (last_index
!= (uint64_t) -1) {
1688 assert(last_index
<= right
);
1690 /* If we cached the last index we
1691 * looked at, let's try not to jump
1692 * around too wildly, and see if we can
1693 * narrow the range early on to the
1694 * immediate neighbors of the last
1695 * index we looked at. */
1697 if (last_index
> 0) {
1698 uint64_t x
= last_index
- 1;
1700 p
= le64toh(array
->entry_array
.items
[x
]);
1704 r
= test_object(f
, p
, needle
);
1708 if (r
== TEST_FOUND
)
1709 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1711 if (r
== TEST_RIGHT
)
1717 if (last_index
< right
) {
1718 uint64_t y
= last_index
+ 1;
1720 p
= le64toh(array
->entry_array
.items
[y
]);
1724 r
= test_object(f
, p
, needle
);
1728 if (r
== TEST_FOUND
)
1729 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1731 if (r
== TEST_RIGHT
)
1739 if (left
== right
) {
1740 if (direction
== DIRECTION_UP
)
1741 subtract_one
= true;
1747 assert(left
< right
);
1748 i
= (left
+ right
) / 2;
1750 p
= le64toh(array
->entry_array
.items
[i
]);
1754 r
= test_object(f
, p
, needle
);
1758 if (r
== TEST_FOUND
)
1759 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1761 if (r
== TEST_RIGHT
)
1769 if (direction
== DIRECTION_UP
) {
1771 subtract_one
= true;
1782 last_index
= (uint64_t) -1;
1783 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1789 if (subtract_one
&& t
== 0 && i
== 0)
1792 /* Let's cache this item for the next invocation */
1793 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
1795 if (subtract_one
&& i
== 0)
1797 else if (subtract_one
)
1798 p
= le64toh(array
->entry_array
.items
[i
-1]);
1800 p
= le64toh(array
->entry_array
.items
[i
]);
1802 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1813 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
1818 static int generic_array_bisect_plus_one(
1824 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1825 direction_t direction
,
1831 bool step_back
= false;
1835 assert(test_object
);
1840 /* This bisects the array in object 'first', but first checks
1842 r
= test_object(f
, extra
, needle
);
1846 if (r
== TEST_FOUND
)
1847 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1849 /* if we are looking with DIRECTION_UP then we need to first
1850 see if in the actual array there is a matching entry, and
1851 return the last one of that. But if there isn't any we need
1852 to return this one. Hence remember this, and return it
1855 step_back
= direction
== DIRECTION_UP
;
1857 if (r
== TEST_RIGHT
) {
1858 if (direction
== DIRECTION_DOWN
)
1864 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1866 if (r
== 0 && step_back
)
1875 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1891 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1897 else if (p
< needle
)
/* Comparator handed to the bisection helpers when searching by sequence
 * number: resolves the entry object at offset p with
 * journal_file_move_to_object() and compares its seqnum field (converted
 * with le64toh) against needle.
 *
 * NOTE(review): the statements executed for the "equal" and "smaller"
 * branches, and the handling of a failed object lookup, are not visible in
 * this listing; presumably they yield the TEST_FOUND/TEST_LEFT/TEST_RIGHT
 * codes consumed by the bisection code -- confirm against the full source. */
1903 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1910 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1914 if (le64toh(o
->entry
.seqnum
) == needle
)
1916 else if (le64toh(o
->entry
.seqnum
) < needle
)
1922 int journal_file_move_to_entry_by_seqnum(
1925 direction_t direction
,
1929 return generic_array_bisect(f
,
1930 le64toh(f
->header
->entry_array_offset
),
1931 le64toh(f
->header
->n_entries
),
/* Comparator handed to the bisection helpers when searching by wallclock
 * time: resolves the entry object at offset p with
 * journal_file_move_to_object() and compares its realtime field (converted
 * with le64toh) against needle.
 *
 * NOTE(review): the statements executed for the "equal" and "smaller"
 * branches, and the handling of a failed object lookup, are not visible in
 * this listing; presumably they yield the TEST_FOUND/TEST_LEFT/TEST_RIGHT
 * codes consumed by the bisection code -- confirm against the full source. */
1938 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1945 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1949 if (le64toh(o
->entry
.realtime
) == needle
)
1951 else if (le64toh(o
->entry
.realtime
) < needle
)
1957 int journal_file_move_to_entry_by_realtime(
1960 direction_t direction
,
1964 return generic_array_bisect(f
,
1965 le64toh(f
->header
->entry_array_offset
),
1966 le64toh(f
->header
->n_entries
),
1968 test_object_realtime
,
/* Comparator handed to the bisection helpers when searching by monotonic
 * time: resolves the entry object at offset p with
 * journal_file_move_to_object() and compares its monotonic field (converted
 * with le64toh) against needle.
 *
 * NOTE(review): the statements executed for the "equal" and "smaller"
 * branches, and the handling of a failed object lookup, are not visible in
 * this listing; presumably they yield the TEST_FOUND/TEST_LEFT/TEST_RIGHT
 * codes consumed by the bisection code -- confirm against the full source. */
1973 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1980 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1984 if (le64toh(o
->entry
.monotonic
) == needle
)
1986 else if (le64toh(o
->entry
.monotonic
) < needle
)
/* Looks up the data object that stores the "_BOOT_ID=<id>" field for the
 * given boot ID. The field string is assembled in a stack buffer sized for
 * the 9-byte "_BOOT_ID=" prefix, 32 bytes of ID text and a terminating NUL,
 * and is then passed to journal_file_find_data_object() without the NUL
 * (sizeof(t) - 1). The result of that call is returned unchanged.
 *
 * NOTE(review): the parameter list is elided in this listing; the body
 * references f, boot_id, o and b. */
1992 static int find_data_object_by_boot_id(
1998 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
/* t + 9 skips the "_BOOT_ID=" prefix so the ID text is appended right after it */
2000 sd_id128_to_string(boot_id
, t
+ 9);
2001 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2004 int journal_file_move_to_entry_by_monotonic(
2008 direction_t direction
,
2017 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2023 return generic_array_bisect_plus_one(f
,
2024 le64toh(o
->data
.entry_offset
),
2025 le64toh(o
->data
.entry_array_offset
),
2026 le64toh(o
->data
.n_entries
),
2028 test_object_monotonic
,
2033 void journal_file_reset_location(JournalFile
*f
) {
2034 f
->location_type
= LOCATION_HEAD
;
2035 f
->current_offset
= 0;
2036 f
->current_seqnum
= 0;
2037 f
->current_realtime
= 0;
2038 f
->current_monotonic
= 0;
2039 zero(f
->current_boot_id
);
2040 f
->current_xor_hash
= 0;
2043 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2044 f
->location_type
= LOCATION_SEEK
;
2045 f
->current_offset
= offset
;
2046 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2047 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2048 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2049 f
->current_boot_id
= o
->entry
.boot_id
;
2050 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
/* Orders the saved read positions of two journal files, af and bf. Both
 * files must currently be at LOCATION_SEEK (asserted below).
 *
 * The visible comparison cascade is:
 *   1. boot ID, monotonic time, realtime and xor hash all equal: the two
 *      entries are treated as identical, regardless of seqnum;
 *   2. both headers carry the same seqnum_id: compare current_seqnum;
 *   3. same boot ID: compare current_monotonic;
 *   4. compare current_realtime;
 *   5. compare current_xor_hash.
 *
 * NOTE(review): the return statements are elided in this listing;
 * presumably each "<" / ">" test yields a negative / positive result in the
 * usual comparator convention and the identical case yields 0 -- confirm
 * against the full source. */
2053 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2056 assert(af
->location_type
== LOCATION_SEEK
);
2057 assert(bf
->location_type
== LOCATION_SEEK
);
2059 /* If contents and timestamps match, these entries are
2060 * identical, even if the seqnum does not match */
2061 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2062 af
->current_monotonic
== bf
->current_monotonic
&&
2063 af
->current_realtime
== bf
->current_realtime
&&
2064 af
->current_xor_hash
== bf
->current_xor_hash
)
2067 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2069 /* If this is from the same seqnum source, compare
2071 if (af
->current_seqnum
< bf
->current_seqnum
)
2073 if (af
->current_seqnum
> bf
->current_seqnum
)
2076 /* Wow! This is weird, different data but the same
2077 * seqnums? Something is borked, but let's make the
2078 * best of it and compare by time. */
2081 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2083 /* If the boot id matches, compare monotonic time */
2084 if (af
->current_monotonic
< bf
->current_monotonic
)
2086 if (af
->current_monotonic
> bf
->current_monotonic
)
2090 /* Otherwise, compare UTC time */
2091 if (af
->current_realtime
< bf
->current_realtime
)
2093 if (af
->current_realtime
> bf
->current_realtime
)
2096 /* Finally, compare by contents */
2097 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2099 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2105 int journal_file_next_entry(
2108 direction_t direction
,
2109 Object
**ret
, uint64_t *offset
) {
2116 n
= le64toh(f
->header
->n_entries
);
2121 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2123 r
= generic_array_bisect(f
,
2124 le64toh(f
->header
->entry_array_offset
),
2125 le64toh(f
->header
->n_entries
),
2134 if (direction
== DIRECTION_DOWN
) {
2147 /* And jump to it */
2148 r
= generic_array_get(f
,
2149 le64toh(f
->header
->entry_array_offset
),
2156 (direction
== DIRECTION_DOWN
? ofs
<= p
: ofs
>= p
)) {
2157 log_debug("%s: entry array corrupted at entry %"PRIu64
,
2168 int journal_file_next_entry_for_data(
2170 Object
*o
, uint64_t p
,
2171 uint64_t data_offset
,
2172 direction_t direction
,
2173 Object
**ret
, uint64_t *offset
) {
2180 assert(p
> 0 || !o
);
2182 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2186 n
= le64toh(d
->data
.n_entries
);
2191 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2193 if (o
->object
.type
!= OBJECT_ENTRY
)
2196 r
= generic_array_bisect_plus_one(f
,
2197 le64toh(d
->data
.entry_offset
),
2198 le64toh(d
->data
.entry_array_offset
),
2199 le64toh(d
->data
.n_entries
),
2209 if (direction
== DIRECTION_DOWN
) {
2223 return generic_array_get_plus_one(f
,
2224 le64toh(d
->data
.entry_offset
),
2225 le64toh(d
->data
.entry_array_offset
),
2230 int journal_file_move_to_entry_by_offset_for_data(
2232 uint64_t data_offset
,
2234 direction_t direction
,
2235 Object
**ret
, uint64_t *offset
) {
2242 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2246 return generic_array_bisect_plus_one(f
,
2247 le64toh(d
->data
.entry_offset
),
2248 le64toh(d
->data
.entry_array_offset
),
2249 le64toh(d
->data
.n_entries
),
2256 int journal_file_move_to_entry_by_monotonic_for_data(
2258 uint64_t data_offset
,
2261 direction_t direction
,
2262 Object
**ret
, uint64_t *offset
) {
2270 /* First, seek by time */
2271 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2277 r
= generic_array_bisect_plus_one(f
,
2278 le64toh(o
->data
.entry_offset
),
2279 le64toh(o
->data
.entry_array_offset
),
2280 le64toh(o
->data
.n_entries
),
2282 test_object_monotonic
,
2288 /* And now, continue seeking until we find an entry that
2289 * exists in both bisection arrays */
2295 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2299 r
= generic_array_bisect_plus_one(f
,
2300 le64toh(d
->data
.entry_offset
),
2301 le64toh(d
->data
.entry_array_offset
),
2302 le64toh(d
->data
.n_entries
),
2310 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2314 r
= generic_array_bisect_plus_one(f
,
2315 le64toh(o
->data
.entry_offset
),
2316 le64toh(o
->data
.entry_array_offset
),
2317 le64toh(o
->data
.n_entries
),
2339 int journal_file_move_to_entry_by_seqnum_for_data(
2341 uint64_t data_offset
,
2343 direction_t direction
,
2344 Object
**ret
, uint64_t *offset
) {
2351 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2355 return generic_array_bisect_plus_one(f
,
2356 le64toh(d
->data
.entry_offset
),
2357 le64toh(d
->data
.entry_array_offset
),
2358 le64toh(d
->data
.n_entries
),
2365 int journal_file_move_to_entry_by_realtime_for_data(
2367 uint64_t data_offset
,
2369 direction_t direction
,
2370 Object
**ret
, uint64_t *offset
) {
2377 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2381 return generic_array_bisect_plus_one(f
,
2382 le64toh(d
->data
.entry_offset
),
2383 le64toh(d
->data
.entry_array_offset
),
2384 le64toh(d
->data
.n_entries
),
2386 test_object_realtime
,
2391 void journal_file_dump(JournalFile
*f
) {
2398 journal_file_print_header(f
);
2400 p
= le64toh(f
->header
->header_size
);
2402 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
2406 switch (o
->object
.type
) {
2409 printf("Type: OBJECT_UNUSED\n");
2413 printf("Type: OBJECT_DATA\n");
2417 printf("Type: OBJECT_FIELD\n");
2421 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
2422 le64toh(o
->entry
.seqnum
),
2423 le64toh(o
->entry
.monotonic
),
2424 le64toh(o
->entry
.realtime
));
2427 case OBJECT_FIELD_HASH_TABLE
:
2428 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2431 case OBJECT_DATA_HASH_TABLE
:
2432 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2435 case OBJECT_ENTRY_ARRAY
:
2436 printf("Type: OBJECT_ENTRY_ARRAY\n");
2440 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
2441 le64toh(o
->tag
.seqnum
),
2442 le64toh(o
->tag
.epoch
));
2446 printf("Type: unknown (%i)\n", o
->object
.type
);
2450 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
2451 printf("Flags: %s\n",
2452 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
2454 if (p
== le64toh(f
->header
->tail_object_offset
))
2457 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
2462 log_error("File corrupt");
2465 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
2468 x
= format_timestamp(buf
, l
, t
);
2474 void journal_file_print_header(JournalFile
*f
) {
2475 char a
[33], b
[33], c
[33], d
[33];
2476 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
2478 char bytes
[FORMAT_BYTES_MAX
];
2482 printf("File Path: %s\n"
2486 "Sequential Number ID: %s\n"
2488 "Compatible Flags:%s%s\n"
2489 "Incompatible Flags:%s%s%s\n"
2490 "Header size: %"PRIu64
"\n"
2491 "Arena size: %"PRIu64
"\n"
2492 "Data Hash Table Size: %"PRIu64
"\n"
2493 "Field Hash Table Size: %"PRIu64
"\n"
2494 "Rotate Suggested: %s\n"
2495 "Head Sequential Number: %"PRIu64
"\n"
2496 "Tail Sequential Number: %"PRIu64
"\n"
2497 "Head Realtime Timestamp: %s\n"
2498 "Tail Realtime Timestamp: %s\n"
2499 "Tail Monotonic Timestamp: %s\n"
2500 "Objects: %"PRIu64
"\n"
2501 "Entry Objects: %"PRIu64
"\n",
2503 sd_id128_to_string(f
->header
->file_id
, a
),
2504 sd_id128_to_string(f
->header
->machine_id
, b
),
2505 sd_id128_to_string(f
->header
->boot_id
, c
),
2506 sd_id128_to_string(f
->header
->seqnum_id
, d
),
2507 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
2508 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
2509 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
2510 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
2511 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
2512 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
2513 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
2514 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
2515 le64toh(f
->header
->header_size
),
2516 le64toh(f
->header
->arena_size
),
2517 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
2518 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
2519 yes_no(journal_file_rotate_suggested(f
, 0)),
2520 le64toh(f
->header
->head_entry_seqnum
),
2521 le64toh(f
->header
->tail_entry_seqnum
),
2522 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)),
2523 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)),
2524 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
),
2525 le64toh(f
->header
->n_objects
),
2526 le64toh(f
->header
->n_entries
));
2528 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
2529 printf("Data Objects: %"PRIu64
"\n"
2530 "Data Hash Table Fill: %.1f%%\n",
2531 le64toh(f
->header
->n_data
),
2532 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
2534 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
2535 printf("Field Objects: %"PRIu64
"\n"
2536 "Field Hash Table Fill: %.1f%%\n",
2537 le64toh(f
->header
->n_fields
),
2538 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
2540 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
2541 printf("Tag Objects: %"PRIu64
"\n",
2542 le64toh(f
->header
->n_tags
));
2543 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
2544 printf("Entry Array Objects: %"PRIu64
"\n",
2545 le64toh(f
->header
->n_entry_arrays
));
2547 if (fstat(f
->fd
, &st
) >= 0)
2548 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
/* Advisory check for journal files on btrfs: probes the file descriptor with
 * btrfs_is_filesystem() and reads the file attributes with read_attr_fd().
 * If FS_NOCOW_FL is set a debug message confirms that copy-on-write is off;
 * otherwise a notice naming f->path recommends disabling copy-on-write on
 * the journal directory with chattr +C.
 *
 * A failure of either probe is logged as a warning and its
 * log_warning_errno() result is returned.
 *
 * NOTE(review): the conditions guarding the two error returns and the
 * non-btrfs case, as well as the final return value, are elided in this
 * listing. */
2551 static int journal_file_warn_btrfs(JournalFile
*f
) {
2557 /* Before we write anything, check if the COW logic is turned
2558 * off on btrfs. Given our write pattern that is quite
2559 * unfriendly to COW file systems this should greatly improve
2560 * performance on COW file systems, such as btrfs, at the
2561 * expense of data integrity features (which shouldn't be too
2562 * bad, given that we do our own checksumming). */
2564 r
= btrfs_is_filesystem(f
->fd
);
2566 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
2570 r
= read_attr_fd(f
->fd
, &attrs
);
2572 return log_warning_errno(r
, "Failed to read file attributes: %m");
2574 if (attrs
& FS_NOCOW_FL
) {
2575 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2579 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2580 "This is likely to slow down journal access substantially, please consider turning "
2581 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
2586 int journal_file_open(
2592 JournalMetrics
*metrics
,
2593 MMapCache
*mmap_cache
,
2594 JournalFile
*template,
2595 JournalFile
**ret
) {
2597 bool newly_created
= false;
2605 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
2606 (flags
& O_ACCMODE
) != O_RDWR
)
2609 if (!endswith(fname
, ".journal") &&
2610 !endswith(fname
, ".journal~"))
2613 f
= new0(JournalFile
, 1);
2621 f
->prot
= prot_from_flags(flags
);
2622 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
2623 #if defined(HAVE_LZ4)
2624 f
->compress_lz4
= compress
;
2625 #elif defined(HAVE_XZ)
2626 f
->compress_xz
= compress
;
2633 f
->mmap
= mmap_cache_ref(mmap_cache
);
2635 f
->mmap
= mmap_cache_new();
2642 f
->path
= strdup(fname
);
2648 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
2649 if (!f
->chain_cache
) {
2654 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
2660 r
= journal_file_fstat(f
);
2664 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
2666 (void) journal_file_warn_btrfs(f
);
2668 /* Let's attach the creation time to the journal file,
2669 * so that the vacuuming code knows the age of this
2670 * file even if the file might end up corrupted one
2671 * day... Ideally we'd just use the creation time many
2672 * file systems maintain for each file, but there is
2673 * currently no usable API to query this, hence let's
2674 * emulate this via extended attributes. If extended
2675 * attributes are not supported we'll just skip this,
2676 * and rely solely on mtime/atime/ctime of the file. */
2678 fd_setcrtime(f
->fd
, 0);
2681 /* Try to load the FSPRG state, and if we can't, then
2682 * just don't do sealing */
2684 r
= journal_file_fss_load(f
);
2690 r
= journal_file_init_header(f
, template);
2694 r
= journal_file_fstat(f
);
2698 newly_created
= true;
2701 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
2706 r
= mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
);
2712 if (!newly_created
) {
2713 r
= journal_file_verify_header(f
);
2719 if (!newly_created
&& f
->writable
) {
2720 r
= journal_file_fss_load(f
);
2728 journal_default_metrics(metrics
, f
->fd
);
2729 f
->metrics
= *metrics
;
2730 } else if (template)
2731 f
->metrics
= template->metrics
;
2733 r
= journal_file_refresh_header(f
);
2739 r
= journal_file_hmac_setup(f
);
2744 if (newly_created
) {
2745 r
= journal_file_setup_field_hash_table(f
);
2749 r
= journal_file_setup_data_hash_table(f
);
2754 r
= journal_file_append_first_tag(f
);
2760 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
)) {
2769 if (f
->fd
>= 0 && mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
2772 journal_file_close(f
);
2777 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
) {
2778 _cleanup_free_
char *p
= NULL
;
2780 JournalFile
*old_file
, *new_file
= NULL
;
2788 if (!old_file
->writable
)
2791 if (!endswith(old_file
->path
, ".journal"))
2794 l
= strlen(old_file
->path
);
2795 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
2796 (int) l
- 8, old_file
->path
,
2797 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
2798 le64toh((*f
)->header
->head_entry_seqnum
),
2799 le64toh((*f
)->header
->head_entry_realtime
));
2803 /* Try to rename the file to the archived version. If the file
2804 * already was deleted, we'll get ENOENT, let's ignore that
2806 r
= rename(old_file
->path
, p
);
2807 if (r
< 0 && errno
!= ENOENT
)
2810 old_file
->header
->state
= STATE_ARCHIVED
;
2812 /* Currently, btrfs is not very good with our write patterns
2813 * and fragments heavily. Let's defrag our journal files when
2814 * we archive them */
2815 old_file
->defrag_on_close
= true;
2817 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, old_file
, &new_file
);
2818 journal_file_close(old_file
);
2824 int journal_file_open_reliably(
2830 JournalMetrics
*metrics
,
2831 MMapCache
*mmap_cache
,
2832 JournalFile
*template,
2833 JournalFile
**ret
) {
2837 _cleanup_free_
char *p
= NULL
;
2839 r
= journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2841 -EBADMSG
, /* corrupted */
2842 -ENODATA
, /* truncated */
2843 -EHOSTDOWN
, /* other machine */
2844 -EPROTONOSUPPORT
, /* incompatible feature */
2845 -EBUSY
, /* unclean shutdown */
2846 -ESHUTDOWN
, /* already archived */
2847 -EIO
, /* IO error, including SIGBUS on mmap */
2848 -EIDRM
/* File has been deleted */))
2851 if ((flags
& O_ACCMODE
) == O_RDONLY
)
2854 if (!(flags
& O_CREAT
))
2857 if (!endswith(fname
, ".journal"))
2860 /* The file is corrupted. Rotate it away and try it again (but only once) */
2863 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
2865 now(CLOCK_REALTIME
),
2869 if (rename(fname
, p
) < 0)
2872 /* btrfs doesn't cope well with our write pattern and
2873 * fragments heavily. Let's defrag all files we rotate */
2875 (void) chattr_path(p
, false, FS_NOCOW_FL
);
2876 (void) btrfs_defrag(p
);
2878 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
2880 return journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2883 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
2885 uint64_t q
, xor_hash
= 0;
2898 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
2899 ts
.realtime
= le64toh(o
->entry
.realtime
);
2901 n
= journal_file_entry_n_items(o
);
2902 /* alloca() can't take 0, hence let's allocate at least one */
2903 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
2905 for (i
= 0; i
< n
; i
++) {
2912 q
= le64toh(o
->entry
.items
[i
].object_offset
);
2913 le_hash
= o
->entry
.items
[i
].hash
;
2915 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
2919 if (le_hash
!= o
->data
.hash
)
2922 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
2925 /* We hit the limit on 32bit machines */
2926 if ((uint64_t) t
!= l
)
2929 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
2930 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2933 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
2934 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
2938 data
= from
->compress_buffer
;
2941 return -EPROTONOSUPPORT
;
2944 data
= o
->data
.payload
;
2946 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
2950 xor_hash
^= le64toh(u
->data
.hash
);
2951 items
[i
].object_offset
= htole64(h
);
2952 items
[i
].hash
= u
->data
.hash
;
2954 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
2959 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
2961 if (mmap_cache_got_sigbus(to
->mmap
, to
->fd
))
/* Resets all six limits in *m (min_use, max_use, min_size, max_size,
 * keep_free, n_max_files) to (uint64_t) -1. That value is the marker for
 * "pick an automatic value": journal_default_metrics() tests each field
 * against (uint64_t) -1 to decide whether to fill in a default. */
2967 void journal_reset_metrics(JournalMetrics
*m
) {
2970 /* Set everything to "pick automatic values". */
2972 *m
= (JournalMetrics
) {
2973 .min_use
= (uint64_t) -1,
2974 .max_use
= (uint64_t) -1,
2975 .min_size
= (uint64_t) -1,
2976 .max_size
= (uint64_t) -1,
2977 .keep_free
= (uint64_t) -1,
2978 .n_max_files
= (uint64_t) -1,
2982 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
2983 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
2990 if (fstatvfs(fd
, &ss
) >= 0)
2991 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
2993 log_debug_errno(errno
, "Failed to detremine disk size: %m");
2997 if (m
->max_use
== (uint64_t) -1) {
3000 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3002 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3003 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3005 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3006 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3008 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3010 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3012 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3013 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3016 if (m
->min_use
== (uint64_t) -1)
3017 m
->min_use
= DEFAULT_MIN_USE
;
3019 if (m
->min_use
> m
->max_use
)
3020 m
->min_use
= m
->max_use
;
3022 if (m
->max_size
== (uint64_t) -1) {
3023 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3025 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3026 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3028 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3030 if (m
->max_size
!= 0) {
3031 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3032 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3034 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3035 m
->max_use
= m
->max_size
*2;
3038 if (m
->min_size
== (uint64_t) -1)
3039 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3041 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3043 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3044 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3046 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3047 m
->max_size
= m
->min_size
;
3050 if (m
->keep_free
== (uint64_t) -1) {
3053 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3055 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3056 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3059 m
->keep_free
= DEFAULT_KEEP_FREE
;
3062 if (m
->n_max_files
== (uint64_t) -1)
3063 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3065 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3066 format_bytes(a
, sizeof(a
), m
->min_use
),
3067 format_bytes(b
, sizeof(b
), m
->max_use
),
3068 format_bytes(c
, sizeof(c
), m
->max_size
),
3069 format_bytes(d
, sizeof(d
), m
->min_size
),
3070 format_bytes(e
, sizeof(e
), m
->keep_free
),
/* Reports the wallclock range covered by journal file f, taken from its
 * header: *from receives head_entry_realtime and *to receives
 * tail_entry_realtime, both converted with le64toh.
 *
 * A header value of 0 is special-cased for each bound before it is stored.
 *
 * NOTE(review): what happens in the zero case, whether from/to may be NULL,
 * and the final return value are elided in this listing -- confirm against
 * the full source. */
3074 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3079 if (f
->header
->head_entry_realtime
== 0)
3082 *from
= le64toh(f
->header
->head_entry_realtime
);
3086 if (f
->header
->tail_entry_realtime
== 0)
3089 *to
= le64toh(f
->header
->tail_entry_realtime
);
3095 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3103 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3107 if (le64toh(o
->data
.n_entries
) <= 0)
3111 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3115 *from
= le64toh(o
->entry
.monotonic
);
3119 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3123 r
= generic_array_get_plus_one(f
,
3124 le64toh(o
->data
.entry_offset
),
3125 le64toh(o
->data
.entry_array_offset
),
3126 le64toh(o
->data
.n_entries
)-1,
3131 *to
= le64toh(o
->entry
.monotonic
);
3137 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3140 /* If we gained new header fields we gained new features,
3141 * hence suggest a rotation */
3142 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3143 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3147 /* Let's check if the hash tables grew over a certain fill
3148 * level (75%, borrowing this value from Java's hash table
3149 * implementation), and if so suggest a rotation. To calculate
3150 * the fill level we need the n_data field, which only exists
3151 * in newer versions. */
3153 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3154 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3155 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3157 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3158 le64toh(f
->header
->n_data
),
3159 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3160 (unsigned long long) f
->last_stat
.st_size
,
3161 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3165 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3166 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3167 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3169 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3170 le64toh(f
->header
->n_fields
),
3171 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3175 /* Are the data objects properly indexed by field objects? */
3176 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3177 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3178 le64toh(f
->header
->n_data
) > 0 &&
3179 le64toh(f
->header
->n_fields
) == 0)
3182 if (max_file_usec
> 0) {
3185 h
= le64toh(f
->header
->head_entry_realtime
);
3186 t
= now(CLOCK_REALTIME
);
3188 if (h
> 0 && t
> h
+ max_file_usec
)