1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include "btrfs-util.h"
32 #include "journal-def.h"
33 #include "journal-file.h"
34 #include "journal-authenticate.h"
37 #include "random-util.h"
39 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
42 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
44 /* This is the minimum journal file size */
45 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
47 /* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
52 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
53 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
55 /* This is the upper bound if we deduce max_size from max_use */
56 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
58 /* This is the upper bound if we deduce the keep_free value from the file system size */
60 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
62 /* This is the keep_free value when we can't determine the file system size */
64 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
66 /* This is the default maximum number of journal files to keep around. */
67 #define DEFAULT_N_MAX_FILES (100)
69 /* n_data was the first entry we added after the initial file format design */
70 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
72 /* How many entries to keep in the entry array chain cache at max */
73 #define CHAIN_CACHE_MAX 20
75 /* How much to increase the journal file size at once each time we allocate something new. */
76 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
78 /* Reread fstat() of the file for detecting deletions at least this often */
79 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
81 /* The mmap context to use for the header; we pick the value one above the last defined object type */
82 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
84 static int journal_file_set_online(JournalFile
*f
) {
90 if (!(f
->fd
>= 0 && f
->header
))
93 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
96 switch(f
->header
->state
) {
101 f
->header
->state
= STATE_ONLINE
;
110 int journal_file_set_offline(JournalFile
*f
) {
116 if (!(f
->fd
>= 0 && f
->header
))
119 if (f
->header
->state
!= STATE_ONLINE
)
124 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
127 f
->header
->state
= STATE_OFFLINE
;
129 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
137 JournalFile
* journal_file_close(JournalFile
*f
) {
141 /* Write the final tag */
142 if (f
->seal
&& f
->writable
)
143 journal_file_append_tag(f
);
146 journal_file_set_offline(f
);
148 if (f
->mmap
&& f
->fd
>= 0)
149 mmap_cache_close_fd(f
->mmap
, f
->fd
);
151 if (f
->fd
>= 0 && f
->defrag_on_close
) {
153 /* Be friendly to btrfs: turn COW back on again now,
154 * and defragment the file. We won't write to the file
155 * ever again, hence remove all fragmentation, and
156 * reenable all the good bits COW usually provides
157 * (such as data checksumming). */
159 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
160 (void) btrfs_defrag_fd(f
->fd
);
167 mmap_cache_unref(f
->mmap
);
169 ordered_hashmap_free_free(f
->chain_cache
);
171 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
172 free(f
->compress_buffer
);
177 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
179 free(f
->fsprg_state
);
184 gcry_md_close(f
->hmac
);
191 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
198 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
199 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
201 h
.incompatible_flags
|= htole32(
202 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
203 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
205 h
.compatible_flags
= htole32(
206 f
->seal
* HEADER_COMPATIBLE_SEALED
);
208 r
= sd_id128_randomize(&h
.file_id
);
213 h
.seqnum_id
= template->header
->seqnum_id
;
214 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
216 h
.seqnum_id
= h
.file_id
;
218 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
228 static int journal_file_refresh_header(JournalFile
*f
) {
234 r
= sd_id128_get_machine(&f
->header
->machine_id
);
238 r
= sd_id128_get_boot(&boot_id
);
242 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
243 f
->tail_entry_monotonic_valid
= true;
245 f
->header
->boot_id
= boot_id
;
247 r
= journal_file_set_online(f
);
249 /* Sync the online state to disk */
255 static int journal_file_verify_header(JournalFile
*f
) {
260 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
263 /* In both read and write mode we refuse to open files with
264 * incompatible flags we don't know */
265 flags
= le32toh(f
->header
->incompatible_flags
);
266 if (flags
& ~HEADER_INCOMPATIBLE_SUPPORTED
) {
267 if (flags
& ~HEADER_INCOMPATIBLE_ANY
)
268 log_debug("Journal file %s has unknown incompatible flags %"PRIx32
,
269 f
->path
, flags
& ~HEADER_INCOMPATIBLE_ANY
);
270 flags
= (flags
& HEADER_INCOMPATIBLE_ANY
) & ~HEADER_INCOMPATIBLE_SUPPORTED
;
272 log_debug("Journal file %s uses incompatible flags %"PRIx32
273 " disabled at compilation time.", f
->path
, flags
);
274 return -EPROTONOSUPPORT
;
277 /* When open for writing we refuse to open files with
278 * compatible flags, too */
279 flags
= le32toh(f
->header
->compatible_flags
);
280 if (f
->writable
&& (flags
& ~HEADER_COMPATIBLE_SUPPORTED
)) {
281 if (flags
& ~HEADER_COMPATIBLE_ANY
)
282 log_debug("Journal file %s has unknown compatible flags %"PRIx32
,
283 f
->path
, flags
& ~HEADER_COMPATIBLE_ANY
);
284 flags
= (flags
& HEADER_COMPATIBLE_ANY
) & ~HEADER_COMPATIBLE_SUPPORTED
;
286 log_debug("Journal file %s uses compatible flags %"PRIx32
287 " disabled at compilation time.", f
->path
, flags
);
288 return -EPROTONOSUPPORT
;
291 if (f
->header
->state
>= _STATE_MAX
)
294 /* The first addition was n_data, so check that we are at least this large */
295 if (le64toh(f
->header
->header_size
) < HEADER_SIZE_MIN
)
298 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
301 if ((le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)) > (uint64_t) f
->last_stat
.st_size
)
304 if (le64toh(f
->header
->tail_object_offset
) > (le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)))
307 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
308 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
309 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
310 !VALID64(le64toh(f
->header
->entry_array_offset
)))
315 sd_id128_t machine_id
;
318 r
= sd_id128_get_machine(&machine_id
);
322 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
325 state
= f
->header
->state
;
327 if (state
== STATE_ONLINE
) {
328 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
330 } else if (state
== STATE_ARCHIVED
)
332 else if (state
!= STATE_OFFLINE
) {
333 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
338 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
339 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
341 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
346 static int journal_file_fstat(JournalFile
*f
) {
350 if (fstat(f
->fd
, &f
->last_stat
) < 0)
353 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
355 /* Refuse appending to files that are already deleted */
356 if (f
->last_stat
.st_nlink
<= 0)
362 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
363 uint64_t old_size
, new_size
;
368 /* We assume that this file is not sparse, and we know that
369 * for sure, since we always call posix_fallocate()
372 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
376 le64toh(f
->header
->header_size
) +
377 le64toh(f
->header
->arena_size
);
379 new_size
= PAGE_ALIGN(offset
+ size
);
380 if (new_size
< le64toh(f
->header
->header_size
))
381 new_size
= le64toh(f
->header
->header_size
);
383 if (new_size
<= old_size
) {
385 /* We already pre-allocated enough space, but before
386 * we write to it, let's check with fstat() if the
387 * file got deleted, in order to make sure we don't throw
388 * away the data immediately. Don't check fstat() for
389 * all writes though, but only once every LAST_STAT_REFRESH_USEC (5s). */
391 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
394 return journal_file_fstat(f
);
397 /* Allocate more space. */
399 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
402 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
405 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
408 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
410 if (new_size
- old_size
> available
)
415 /* Increase by larger blocks at once */
416 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
417 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
418 new_size
= f
->metrics
.max_size
;
420 /* Note that the glibc fallocate() fallback is very
421 inefficient, hence we try to minimize the allocation area
423 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
427 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
429 return journal_file_fstat(f
);
432 static unsigned type_to_context(ObjectType type
) {
433 /* One context for each type, plus one catch-all for the rest */
434 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
435 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
436 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
439 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
) {
448 /* Avoid SIGBUS on invalid accesses */
449 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
450 /* Hmm, out of range? Let's refresh the fstat() data
451 * first, before we trust that check. */
453 r
= journal_file_fstat(f
);
457 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
458 return -EADDRNOTAVAIL
;
461 return mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
464 static uint64_t minimum_header_size(Object
*o
) {
466 static const uint64_t table
[] = {
467 [OBJECT_DATA
] = sizeof(DataObject
),
468 [OBJECT_FIELD
] = sizeof(FieldObject
),
469 [OBJECT_ENTRY
] = sizeof(EntryObject
),
470 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
471 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
472 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
473 [OBJECT_TAG
] = sizeof(TagObject
),
476 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
477 return sizeof(ObjectHeader
);
479 return table
[o
->object
.type
];
482 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
491 /* Objects may only be located at multiple of 64 bit */
492 if (!VALID64(offset
))
495 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
500 s
= le64toh(o
->object
.size
);
502 if (s
< sizeof(ObjectHeader
))
505 if (o
->object
.type
<= OBJECT_UNUSED
)
508 if (s
< minimum_header_size(o
))
511 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
514 if (s
> sizeof(ObjectHeader
)) {
515 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
526 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
531 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
534 /* If an external seqnum counter was passed, we update
535 * both the local and the external one, and set it to
536 * the maximum of both */
544 f
->header
->tail_entry_seqnum
= htole64(r
);
546 if (f
->header
->head_entry_seqnum
== 0)
547 f
->header
->head_entry_seqnum
= htole64(r
);
552 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
559 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
560 assert(size
>= sizeof(ObjectHeader
));
564 r
= journal_file_set_online(f
);
568 p
= le64toh(f
->header
->tail_object_offset
);
570 p
= le64toh(f
->header
->header_size
);
572 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
576 p
+= ALIGN64(le64toh(tail
->object
.size
));
579 r
= journal_file_allocate(f
, p
, size
);
583 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
590 o
->object
.type
= type
;
591 o
->object
.size
= htole64(size
);
593 f
->header
->tail_object_offset
= htole64(p
);
594 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
602 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
609 /* We estimate that we need 1 hash table entry per 768 bytes
610 of journal file and we want to make sure we never get
611 beyond 75% fill level. Calculate the hash table size for
612 the maximum file size based on these metrics. */
614 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
615 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
616 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
618 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
620 r
= journal_file_append_object(f
,
621 OBJECT_DATA_HASH_TABLE
,
622 offsetof(Object
, hash_table
.items
) + s
,
627 memzero(o
->hash_table
.items
, s
);
629 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
630 f
->header
->data_hash_table_size
= htole64(s
);
635 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
642 /* We use a fixed size hash table for the fields as this
643 * number should grow very slowly only */
645 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
646 r
= journal_file_append_object(f
,
647 OBJECT_FIELD_HASH_TABLE
,
648 offsetof(Object
, hash_table
.items
) + s
,
653 memzero(o
->hash_table
.items
, s
);
655 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
656 f
->header
->field_hash_table_size
= htole64(s
);
661 int journal_file_map_data_hash_table(JournalFile
*f
) {
668 if (f
->data_hash_table
)
671 p
= le64toh(f
->header
->data_hash_table_offset
);
672 s
= le64toh(f
->header
->data_hash_table_size
);
674 r
= journal_file_move_to(f
,
675 OBJECT_DATA_HASH_TABLE
,
682 f
->data_hash_table
= t
;
686 int journal_file_map_field_hash_table(JournalFile
*f
) {
693 if (f
->field_hash_table
)
696 p
= le64toh(f
->header
->field_hash_table_offset
);
697 s
= le64toh(f
->header
->field_hash_table_size
);
699 r
= journal_file_move_to(f
,
700 OBJECT_FIELD_HASH_TABLE
,
707 f
->field_hash_table
= t
;
711 static int journal_file_link_field(
724 if (o
->object
.type
!= OBJECT_FIELD
)
727 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
731 /* This might alter the window we are looking at */
732 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
735 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
737 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
739 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
743 o
->field
.next_hash_offset
= htole64(offset
);
746 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
748 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
749 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
754 static int journal_file_link_data(
767 if (o
->object
.type
!= OBJECT_DATA
)
770 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
774 /* This might alter the window we are looking at */
775 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
776 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
777 o
->data
.n_entries
= 0;
780 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
782 /* Only entry in the hash table is easy */
783 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
785 /* Move back to the previous data object, to patch in
788 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
792 o
->data
.next_hash_offset
= htole64(offset
);
795 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
797 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
798 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
803 int journal_file_find_field_object_with_hash(
805 const void *field
, uint64_t size
, uint64_t hash
,
806 Object
**ret
, uint64_t *offset
) {
808 uint64_t p
, osize
, h
, m
;
812 assert(field
&& size
> 0);
814 /* If the field hash table is empty, we can't find anything */
815 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
818 /* Map the field hash table, if it isn't mapped yet. */
819 r
= journal_file_map_field_hash_table(f
);
823 osize
= offsetof(Object
, field
.payload
) + size
;
825 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
830 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
835 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
839 if (le64toh(o
->field
.hash
) == hash
&&
840 le64toh(o
->object
.size
) == osize
&&
841 memcmp(o
->field
.payload
, field
, size
) == 0) {
851 p
= le64toh(o
->field
.next_hash_offset
);
857 int journal_file_find_field_object(
859 const void *field
, uint64_t size
,
860 Object
**ret
, uint64_t *offset
) {
865 assert(field
&& size
> 0);
867 hash
= hash64(field
, size
);
869 return journal_file_find_field_object_with_hash(f
,
874 int journal_file_find_data_object_with_hash(
876 const void *data
, uint64_t size
, uint64_t hash
,
877 Object
**ret
, uint64_t *offset
) {
879 uint64_t p
, osize
, h
, m
;
883 assert(data
|| size
== 0);
885 /* If there's no data hash table, then there's no entry. */
886 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
889 /* Map the data hash table, if it isn't mapped yet. */
890 r
= journal_file_map_data_hash_table(f
);
894 osize
= offsetof(Object
, data
.payload
) + size
;
896 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
901 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
906 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
910 if (le64toh(o
->data
.hash
) != hash
)
913 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
914 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
918 l
= le64toh(o
->object
.size
);
919 if (l
<= offsetof(Object
, data
.payload
))
922 l
-= offsetof(Object
, data
.payload
);
924 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
925 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
930 memcmp(f
->compress_buffer
, data
, size
) == 0) {
941 return -EPROTONOSUPPORT
;
943 } else if (le64toh(o
->object
.size
) == osize
&&
944 memcmp(o
->data
.payload
, data
, size
) == 0) {
956 p
= le64toh(o
->data
.next_hash_offset
);
962 int journal_file_find_data_object(
964 const void *data
, uint64_t size
,
965 Object
**ret
, uint64_t *offset
) {
970 assert(data
|| size
== 0);
972 hash
= hash64(data
, size
);
974 return journal_file_find_data_object_with_hash(f
,
979 static int journal_file_append_field(
981 const void *field
, uint64_t size
,
982 Object
**ret
, uint64_t *offset
) {
990 assert(field
&& size
> 0);
992 hash
= hash64(field
, size
);
994 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1008 osize
= offsetof(Object
, field
.payload
) + size
;
1009 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1013 o
->field
.hash
= htole64(hash
);
1014 memcpy(o
->field
.payload
, field
, size
);
1016 r
= journal_file_link_field(f
, o
, p
, hash
);
1020 /* The linking might have altered the window, so let's
1021 * refresh our pointer */
1022 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1027 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1041 static int journal_file_append_data(
1043 const void *data
, uint64_t size
,
1044 Object
**ret
, uint64_t *offset
) {
1049 int r
, compression
= 0;
1053 assert(data
|| size
== 0);
1055 hash
= hash64(data
, size
);
1057 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1071 osize
= offsetof(Object
, data
.payload
) + size
;
1072 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1076 o
->data
.hash
= htole64(hash
);
1078 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1079 if (f
->compress_xz
&&
1080 size
>= COMPRESSION_SIZE_THRESHOLD
) {
1083 compression
= compress_blob(data
, size
, o
->data
.payload
, &rsize
);
1086 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1087 o
->object
.flags
|= compression
;
1089 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1090 size
, rsize
, object_compressed_to_string(compression
));
1095 if (!compression
&& size
> 0)
1096 memcpy(o
->data
.payload
, data
, size
);
1098 r
= journal_file_link_data(f
, o
, p
, hash
);
1102 /* The linking might have altered the window, so let's
1103 * refresh our pointer */
1104 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1111 eq
= memchr(data
, '=', size
);
1112 if (eq
&& eq
> data
) {
1116 /* Create field object ... */
1117 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1121 /* ... and link it in. */
1122 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1123 fo
->field
.head_data_offset
= le64toh(p
);
1127 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1141 uint64_t journal_file_entry_n_items(Object
*o
) {
1144 if (o
->object
.type
!= OBJECT_ENTRY
)
1147 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1150 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1153 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1156 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1159 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1162 if (o
->object
.type
!= OBJECT_DATA_HASH_TABLE
&&
1163 o
->object
.type
!= OBJECT_FIELD_HASH_TABLE
)
1166 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1169 static int link_entry_into_array(JournalFile
*f
,
1174 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1182 a
= le64toh(*first
);
1183 i
= hidx
= le64toh(*idx
);
1186 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1190 n
= journal_file_entry_array_n_items(o
);
1192 o
->entry_array
.items
[i
] = htole64(p
);
1193 *idx
= htole64(hidx
+ 1);
1199 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1210 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1211 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1217 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1222 o
->entry_array
.items
[i
] = htole64(p
);
1225 *first
= htole64(q
);
1227 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1231 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1234 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1235 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1237 *idx
= htole64(hidx
+ 1);
1242 static int link_entry_into_array_plus_one(JournalFile
*f
,
1257 *extra
= htole64(p
);
1261 i
= htole64(le64toh(*idx
) - 1);
1262 r
= link_entry_into_array(f
, first
, &i
, p
);
1267 *idx
= htole64(le64toh(*idx
) + 1);
1271 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1278 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1282 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1286 return link_entry_into_array_plus_one(f
,
1287 &o
->data
.entry_offset
,
1288 &o
->data
.entry_array_offset
,
1293 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1301 if (o
->object
.type
!= OBJECT_ENTRY
)
1304 __sync_synchronize();
1306 /* Link up the entry itself */
1307 r
= link_entry_into_array(f
,
1308 &f
->header
->entry_array_offset
,
1309 &f
->header
->n_entries
,
1314 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1316 if (f
->header
->head_entry_realtime
== 0)
1317 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1319 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1320 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1322 f
->tail_entry_monotonic_valid
= true;
1324 /* Link up the items */
1325 n
= journal_file_entry_n_items(o
);
1326 for (i
= 0; i
< n
; i
++) {
1327 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1335 static int journal_file_append_entry_internal(
1337 const dual_timestamp
*ts
,
1339 const EntryItem items
[], unsigned n_items
,
1341 Object
**ret
, uint64_t *offset
) {
1348 assert(items
|| n_items
== 0);
1351 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1353 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1357 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1358 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1359 o
->entry
.realtime
= htole64(ts
->realtime
);
1360 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1361 o
->entry
.xor_hash
= htole64(xor_hash
);
1362 o
->entry
.boot_id
= f
->header
->boot_id
;
1365 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1370 r
= journal_file_link_entry(f
, o
, np
);
1383 void journal_file_post_change(JournalFile
*f
) {
1386 /* inotify() does not receive IN_MODIFY events from file
1387 * accesses done via mmap(). After each access we hence
1388 * trigger IN_MODIFY by truncating the journal file to its
1389 * current size which triggers IN_MODIFY. */
1391 __sync_synchronize();
1393 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1394 log_error_errno(errno
, "Failed to truncate file to its own size: %m");
1397 static int entry_item_cmp(const void *_a
, const void *_b
) {
1398 const EntryItem
*a
= _a
, *b
= _b
;
1400 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1402 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1407 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1411 uint64_t xor_hash
= 0;
1412 struct dual_timestamp _ts
;
1415 assert(iovec
|| n_iovec
== 0);
1418 dual_timestamp_get(&_ts
);
1422 if (f
->tail_entry_monotonic_valid
&&
1423 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
1427 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1432 /* alloca() can't take 0, hence let's allocate at least one */
1433 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1435 for (i
= 0; i
< n_iovec
; i
++) {
1439 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1443 xor_hash
^= le64toh(o
->data
.hash
);
1444 items
[i
].object_offset
= htole64(p
);
1445 items
[i
].hash
= o
->data
.hash
;
1448 /* Order by the position on disk, in order to improve seek
1449 * times for rotating media. */
1450 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
1452 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
1454 /* If the memory mapping triggered a SIGBUS then we return an
1455 * IO error and ignore the error code passed down to us, since
1456 * it is very likely just an effect of a nullified replacement
1459 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
1462 journal_file_post_change(f
);
1467 typedef struct ChainCacheItem
{
1468 uint64_t first
; /* the array at the beginning of the chain */
1469 uint64_t array
; /* the cached array */
1470 uint64_t begin
; /* the first item in the cached array */
1471 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
1472 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
1475 static void chain_cache_put(
1482 uint64_t last_index
) {
1485 /* If the chain item to cache for this chain is the
1486 * first one it's not worth caching anything */
1490 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
1491 ci
= ordered_hashmap_steal_first(h
);
1494 ci
= new(ChainCacheItem
, 1);
1501 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
1506 assert(ci
->first
== first
);
1511 ci
->last_index
= last_index
;
1514 static int generic_array_get(
1518 Object
**ret
, uint64_t *offset
) {
1521 uint64_t p
= 0, a
, t
= 0;
1529 /* Try the chain cache first */
1530 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1531 if (ci
&& i
> ci
->total
) {
1540 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1544 k
= journal_file_entry_array_n_items(o
);
1546 p
= le64toh(o
->entry_array
.items
[i
]);
1552 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1558 /* Let's cache this item for the next invocation */
1559 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
1561 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1574 static int generic_array_get_plus_one(
1579 Object
**ret
, uint64_t *offset
) {
1588 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1601 return generic_array_get(f
, first
, i
-1, ret
, offset
);
1610 static int generic_array_bisect(
1615 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1616 direction_t direction
,
1621 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
1622 bool subtract_one
= false;
1623 Object
*o
, *array
= NULL
;
1628 assert(test_object
);
1630 /* Start with the first array in the chain */
1633 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1634 if (ci
&& n
> ci
->total
) {
1635 /* Ah, we have iterated this bisection array chain
1636 * previously! Let's see if we can skip ahead in the
1637 * chain, as far as the last time. But we can't jump
1638 * backwards in the chain, so let's check that
1641 r
= test_object(f
, ci
->begin
, needle
);
1645 if (r
== TEST_LEFT
) {
1646 /* OK, what we are looking for is right of the
1647 * begin of this EntryArray, so let's jump
1648 * straight to previously cached array in the
1654 last_index
= ci
->last_index
;
1659 uint64_t left
, right
, k
, lp
;
1661 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1665 k
= journal_file_entry_array_n_items(array
);
1671 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1675 r
= test_object(f
, p
, needle
);
1679 if (r
== TEST_FOUND
)
1680 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1682 if (r
== TEST_RIGHT
) {
1686 if (last_index
!= (uint64_t) -1) {
1687 assert(last_index
<= right
);
1689 /* If we cached the last index we
1690 * looked at, let's try not to jump
1691 * too wildly around and see if we can
1692 * limit the range to look at early to
1693 * the immediate neighbors of the last
1694 * index we looked at. */
1696 if (last_index
> 0) {
1697 uint64_t x
= last_index
- 1;
1699 p
= le64toh(array
->entry_array
.items
[x
]);
1703 r
= test_object(f
, p
, needle
);
1707 if (r
== TEST_FOUND
)
1708 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1710 if (r
== TEST_RIGHT
)
1716 if (last_index
< right
) {
1717 uint64_t y
= last_index
+ 1;
1719 p
= le64toh(array
->entry_array
.items
[y
]);
1723 r
= test_object(f
, p
, needle
);
1727 if (r
== TEST_FOUND
)
1728 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1730 if (r
== TEST_RIGHT
)
1738 if (left
== right
) {
1739 if (direction
== DIRECTION_UP
)
1740 subtract_one
= true;
1746 assert(left
< right
);
1747 i
= (left
+ right
) / 2;
1749 p
= le64toh(array
->entry_array
.items
[i
]);
1753 r
= test_object(f
, p
, needle
);
1757 if (r
== TEST_FOUND
)
1758 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1760 if (r
== TEST_RIGHT
)
1768 if (direction
== DIRECTION_UP
) {
1770 subtract_one
= true;
1781 last_index
= (uint64_t) -1;
1782 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1788 if (subtract_one
&& t
== 0 && i
== 0)
1791 /* Let's cache this item for the next invocation */
1792 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
1794 if (subtract_one
&& i
== 0)
1796 else if (subtract_one
)
1797 p
= le64toh(array
->entry_array
.items
[i
-1]);
1799 p
= le64toh(array
->entry_array
.items
[i
]);
1801 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1812 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
1817 static int generic_array_bisect_plus_one(
1823 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1824 direction_t direction
,
1830 bool step_back
= false;
1834 assert(test_object
);
1839 /* This bisects the array in object 'first', but first checks
1841 r
= test_object(f
, extra
, needle
);
1845 if (r
== TEST_FOUND
)
1846 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1848 /* if we are looking with DIRECTION_UP then we need to first
1849 see if in the actual array there is a matching entry, and
1850 return the last one of that. But if there isn't any we need
1851 to return this one. Hence remember this, and return it
1854 step_back
= direction
== DIRECTION_UP
;
1856 if (r
== TEST_RIGHT
) {
1857 if (direction
== DIRECTION_DOWN
)
1863 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1865 if (r
== 0 && step_back
)
1874 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1890 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1896 else if (p
< needle
)
1902 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1909 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1913 if (le64toh(o
->entry
.seqnum
) == needle
)
1915 else if (le64toh(o
->entry
.seqnum
) < needle
)
1921 int journal_file_move_to_entry_by_seqnum(
1924 direction_t direction
,
1928 return generic_array_bisect(f
,
1929 le64toh(f
->header
->entry_array_offset
),
1930 le64toh(f
->header
->n_entries
),
1937 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1944 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1948 if (le64toh(o
->entry
.realtime
) == needle
)
1950 else if (le64toh(o
->entry
.realtime
) < needle
)
1956 int journal_file_move_to_entry_by_realtime(
1959 direction_t direction
,
1963 return generic_array_bisect(f
,
1964 le64toh(f
->header
->entry_array_offset
),
1965 le64toh(f
->header
->n_entries
),
1967 test_object_realtime
,
1972 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1979 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1983 if (le64toh(o
->entry
.monotonic
) == needle
)
1985 else if (le64toh(o
->entry
.monotonic
) < needle
)
1991 static int find_data_object_by_boot_id(
1997 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1999 sd_id128_to_string(boot_id
, t
+ 9);
2000 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2003 int journal_file_move_to_entry_by_monotonic(
2007 direction_t direction
,
2016 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2022 return generic_array_bisect_plus_one(f
,
2023 le64toh(o
->data
.entry_offset
),
2024 le64toh(o
->data
.entry_array_offset
),
2025 le64toh(o
->data
.n_entries
),
2027 test_object_monotonic
,
/* Reset the location remembered for journal file f: location_type is set
 * to LOCATION_HEAD and the cached offset, seqnum, realtime, monotonic and
 * xor_hash values are set to 0; the cached boot ID is cleared via zero(). */
2032 void journal_file_reset_location(JournalFile
*f
) {
2033 f
->location_type
= LOCATION_HEAD
;
2034 f
->current_offset
= 0;
2035 f
->current_seqnum
= 0;
2036 f
->current_realtime
= 0;
2037 f
->current_monotonic
= 0;
2038 zero(f
->current_boot_id
);
2039 f
->current_xor_hash
= 0;
/* Remember entry object o, located at file offset 'offset', as the current
 * location of journal file f: location_type becomes LOCATION_SEEK and the
 * entry's seqnum, realtime, monotonic, boot_id and xor_hash fields are
 * copied into the corresponding f->current_* members. */
2042 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2043 f
->location_type
= LOCATION_SEEK
;
2044 f
->current_offset
= offset
;
/* The 64-bit entry fields are passed through le64toh() before being cached. */
2045 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2046 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2047 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
/* boot_id is copied as-is, without any byte-order conversion. */
2048 f
->current_boot_id
= o
->entry
.boot_id
;
2049 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2052 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2055 assert(af
->location_type
== LOCATION_SEEK
);
2056 assert(bf
->location_type
== LOCATION_SEEK
);
2058 /* If contents and timestamps match, these entries are
2059 * identical, even if the seqnum does not match */
2060 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2061 af
->current_monotonic
== bf
->current_monotonic
&&
2062 af
->current_realtime
== bf
->current_realtime
&&
2063 af
->current_xor_hash
== bf
->current_xor_hash
)
2066 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2068 /* If this is from the same seqnum source, compare
2070 if (af
->current_seqnum
< bf
->current_seqnum
)
2072 if (af
->current_seqnum
> bf
->current_seqnum
)
2075 /* Wow! This is weird, different data but the same
2076 * seqnums? Something is borked, but let's make the
2077 * best of it and compare by time. */
2080 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2082 /* If the boot id matches, compare monotonic time */
2083 if (af
->current_monotonic
< bf
->current_monotonic
)
2085 if (af
->current_monotonic
> bf
->current_monotonic
)
2089 /* Otherwise, compare UTC time */
2090 if (af
->current_realtime
< bf
->current_realtime
)
2092 if (af
->current_realtime
> bf
->current_realtime
)
2095 /* Finally, compare by contents */
2096 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2098 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2104 int journal_file_next_entry(
2107 direction_t direction
,
2108 Object
**ret
, uint64_t *offset
) {
2115 n
= le64toh(f
->header
->n_entries
);
2120 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2122 r
= generic_array_bisect(f
,
2123 le64toh(f
->header
->entry_array_offset
),
2124 le64toh(f
->header
->n_entries
),
2133 if (direction
== DIRECTION_DOWN
) {
2146 /* And jump to it */
2147 r
= generic_array_get(f
,
2148 le64toh(f
->header
->entry_array_offset
),
2155 (direction
== DIRECTION_DOWN
? ofs
<= p
: ofs
>= p
)) {
2156 log_debug("%s: entry array corrupted at entry %"PRIu64
,
2167 int journal_file_next_entry_for_data(
2169 Object
*o
, uint64_t p
,
2170 uint64_t data_offset
,
2171 direction_t direction
,
2172 Object
**ret
, uint64_t *offset
) {
2179 assert(p
> 0 || !o
);
2181 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2185 n
= le64toh(d
->data
.n_entries
);
2190 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2192 if (o
->object
.type
!= OBJECT_ENTRY
)
2195 r
= generic_array_bisect_plus_one(f
,
2196 le64toh(d
->data
.entry_offset
),
2197 le64toh(d
->data
.entry_array_offset
),
2198 le64toh(d
->data
.n_entries
),
2208 if (direction
== DIRECTION_DOWN
) {
2222 return generic_array_get_plus_one(f
,
2223 le64toh(d
->data
.entry_offset
),
2224 le64toh(d
->data
.entry_array_offset
),
2229 int journal_file_move_to_entry_by_offset_for_data(
2231 uint64_t data_offset
,
2233 direction_t direction
,
2234 Object
**ret
, uint64_t *offset
) {
2241 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2245 return generic_array_bisect_plus_one(f
,
2246 le64toh(d
->data
.entry_offset
),
2247 le64toh(d
->data
.entry_array_offset
),
2248 le64toh(d
->data
.n_entries
),
2255 int journal_file_move_to_entry_by_monotonic_for_data(
2257 uint64_t data_offset
,
2260 direction_t direction
,
2261 Object
**ret
, uint64_t *offset
) {
2269 /* First, seek by time */
2270 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2276 r
= generic_array_bisect_plus_one(f
,
2277 le64toh(o
->data
.entry_offset
),
2278 le64toh(o
->data
.entry_array_offset
),
2279 le64toh(o
->data
.n_entries
),
2281 test_object_monotonic
,
2287 /* And now, continue seeking until we find an entry that
2288 * exists in both bisection arrays */
2294 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2298 r
= generic_array_bisect_plus_one(f
,
2299 le64toh(d
->data
.entry_offset
),
2300 le64toh(d
->data
.entry_array_offset
),
2301 le64toh(d
->data
.n_entries
),
2309 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2313 r
= generic_array_bisect_plus_one(f
,
2314 le64toh(o
->data
.entry_offset
),
2315 le64toh(o
->data
.entry_array_offset
),
2316 le64toh(o
->data
.n_entries
),
2338 int journal_file_move_to_entry_by_seqnum_for_data(
2340 uint64_t data_offset
,
2342 direction_t direction
,
2343 Object
**ret
, uint64_t *offset
) {
2350 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2354 return generic_array_bisect_plus_one(f
,
2355 le64toh(d
->data
.entry_offset
),
2356 le64toh(d
->data
.entry_array_offset
),
2357 le64toh(d
->data
.n_entries
),
2364 int journal_file_move_to_entry_by_realtime_for_data(
2366 uint64_t data_offset
,
2368 direction_t direction
,
2369 Object
**ret
, uint64_t *offset
) {
2376 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2380 return generic_array_bisect_plus_one(f
,
2381 le64toh(d
->data
.entry_offset
),
2382 le64toh(d
->data
.entry_array_offset
),
2383 le64toh(d
->data
.n_entries
),
2385 test_object_realtime
,
2390 void journal_file_dump(JournalFile
*f
) {
2397 journal_file_print_header(f
);
2399 p
= le64toh(f
->header
->header_size
);
2401 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
2405 switch (o
->object
.type
) {
2408 printf("Type: OBJECT_UNUSED\n");
2412 printf("Type: OBJECT_DATA\n");
2416 printf("Type: OBJECT_FIELD\n");
2420 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
2421 le64toh(o
->entry
.seqnum
),
2422 le64toh(o
->entry
.monotonic
),
2423 le64toh(o
->entry
.realtime
));
2426 case OBJECT_FIELD_HASH_TABLE
:
2427 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2430 case OBJECT_DATA_HASH_TABLE
:
2431 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2434 case OBJECT_ENTRY_ARRAY
:
2435 printf("Type: OBJECT_ENTRY_ARRAY\n");
2439 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
2440 le64toh(o
->tag
.seqnum
),
2441 le64toh(o
->tag
.epoch
));
2445 printf("Type: unknown (%i)\n", o
->object
.type
);
2449 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
2450 printf("Flags: %s\n",
2451 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
2453 if (p
== le64toh(f
->header
->tail_object_offset
))
2456 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
2461 log_error("File corrupt");
2464 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
2467 x
= format_timestamp(buf
, l
, t
);
2473 void journal_file_print_header(JournalFile
*f
) {
2474 char a
[33], b
[33], c
[33], d
[33];
2475 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
2477 char bytes
[FORMAT_BYTES_MAX
];
2481 printf("File Path: %s\n"
2485 "Sequential Number ID: %s\n"
2487 "Compatible Flags:%s%s\n"
2488 "Incompatible Flags:%s%s%s\n"
2489 "Header size: %"PRIu64
"\n"
2490 "Arena size: %"PRIu64
"\n"
2491 "Data Hash Table Size: %"PRIu64
"\n"
2492 "Field Hash Table Size: %"PRIu64
"\n"
2493 "Rotate Suggested: %s\n"
2494 "Head Sequential Number: %"PRIu64
"\n"
2495 "Tail Sequential Number: %"PRIu64
"\n"
2496 "Head Realtime Timestamp: %s\n"
2497 "Tail Realtime Timestamp: %s\n"
2498 "Tail Monotonic Timestamp: %s\n"
2499 "Objects: %"PRIu64
"\n"
2500 "Entry Objects: %"PRIu64
"\n",
2502 sd_id128_to_string(f
->header
->file_id
, a
),
2503 sd_id128_to_string(f
->header
->machine_id
, b
),
2504 sd_id128_to_string(f
->header
->boot_id
, c
),
2505 sd_id128_to_string(f
->header
->seqnum_id
, d
),
2506 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
2507 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
2508 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
2509 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
2510 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
2511 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
2512 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
2513 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
2514 le64toh(f
->header
->header_size
),
2515 le64toh(f
->header
->arena_size
),
2516 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
2517 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
2518 yes_no(journal_file_rotate_suggested(f
, 0)),
2519 le64toh(f
->header
->head_entry_seqnum
),
2520 le64toh(f
->header
->tail_entry_seqnum
),
2521 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)),
2522 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)),
2523 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
),
2524 le64toh(f
->header
->n_objects
),
2525 le64toh(f
->header
->n_entries
));
2527 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
2528 printf("Data Objects: %"PRIu64
"\n"
2529 "Data Hash Table Fill: %.1f%%\n",
2530 le64toh(f
->header
->n_data
),
2531 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
2533 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
2534 printf("Field Objects: %"PRIu64
"\n"
2535 "Field Hash Table Fill: %.1f%%\n",
2536 le64toh(f
->header
->n_fields
),
2537 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
2539 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
2540 printf("Tag Objects: %"PRIu64
"\n",
2541 le64toh(f
->header
->n_tags
));
2542 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
2543 printf("Entry Array Objects: %"PRIu64
"\n",
2544 le64toh(f
->header
->n_entry_arrays
));
2546 if (fstat(f
->fd
, &st
) >= 0)
2547 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
2550 static int journal_file_warn_btrfs(JournalFile
*f
) {
2556 /* Before we write anything, check if the COW logic is turned
2557 * off on btrfs. Given our write pattern that is quite
2558 * unfriendly to COW file systems this should greatly improve
2559 * performance on COW file systems, such as btrfs, at the
2560 * expense of data integrity features (which shouldn't be too
2561 * bad, given that we do our own checksumming). */
2563 r
= btrfs_is_filesystem(f
->fd
);
2565 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
2569 r
= read_attr_fd(f
->fd
, &attrs
);
2571 return log_warning_errno(r
, "Failed to read file attributes: %m");
2573 if (attrs
& FS_NOCOW_FL
) {
2574 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2578 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2579 "This is likely to slow down journal access substantially, please consider turning "
2580 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
2585 int journal_file_open(
2591 JournalMetrics
*metrics
,
2592 MMapCache
*mmap_cache
,
2593 JournalFile
*template,
2594 JournalFile
**ret
) {
2596 bool newly_created
= false;
2604 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
2605 (flags
& O_ACCMODE
) != O_RDWR
)
2608 if (!endswith(fname
, ".journal") &&
2609 !endswith(fname
, ".journal~"))
2612 f
= new0(JournalFile
, 1);
2620 f
->prot
= prot_from_flags(flags
);
2621 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
2622 #if defined(HAVE_LZ4)
2623 f
->compress_lz4
= compress
;
2624 #elif defined(HAVE_XZ)
2625 f
->compress_xz
= compress
;
2632 f
->mmap
= mmap_cache_ref(mmap_cache
);
2634 f
->mmap
= mmap_cache_new();
2641 f
->path
= strdup(fname
);
2647 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
2648 if (!f
->chain_cache
) {
2653 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
2659 r
= journal_file_fstat(f
);
2663 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
2665 (void) journal_file_warn_btrfs(f
);
2667 /* Let's attach the creation time to the journal file,
2668 * so that the vacuuming code knows the age of this
2669 * file even if the file might end up corrupted one
2670 * day... Ideally we'd just use the creation time many
2671 * file systems maintain for each file, but there is
2672 * currently no usable API to query this, hence let's
2673 * emulate this via extended attributes. If extended
2674 * attributes are not supported we'll just skip this,
2675 * and rely solely on mtime/atime/ctime of the file. */
2677 fd_setcrtime(f
->fd
, 0);
2680 /* Try to load the FSPRG state, and if we can't, then
2681 * just don't do sealing */
2683 r
= journal_file_fss_load(f
);
2689 r
= journal_file_init_header(f
, template);
2693 r
= journal_file_fstat(f
);
2697 newly_created
= true;
2700 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
2705 r
= mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
);
2711 if (!newly_created
) {
2712 r
= journal_file_verify_header(f
);
2718 if (!newly_created
&& f
->writable
) {
2719 r
= journal_file_fss_load(f
);
2727 journal_default_metrics(metrics
, f
->fd
);
2728 f
->metrics
= *metrics
;
2729 } else if (template)
2730 f
->metrics
= template->metrics
;
2732 r
= journal_file_refresh_header(f
);
2738 r
= journal_file_hmac_setup(f
);
2743 if (newly_created
) {
2744 r
= journal_file_setup_field_hash_table(f
);
2748 r
= journal_file_setup_data_hash_table(f
);
2753 r
= journal_file_append_first_tag(f
);
2759 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
)) {
2768 if (f
->fd
>= 0 && mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
2771 journal_file_close(f
);
2776 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
) {
2777 _cleanup_free_
char *p
= NULL
;
2779 JournalFile
*old_file
, *new_file
= NULL
;
2787 if (!old_file
->writable
)
2790 if (!endswith(old_file
->path
, ".journal"))
2793 l
= strlen(old_file
->path
);
2794 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
2795 (int) l
- 8, old_file
->path
,
2796 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
2797 le64toh((*f
)->header
->head_entry_seqnum
),
2798 le64toh((*f
)->header
->head_entry_realtime
));
2802 /* Try to rename the file to the archived version. If the file
2803 * already was deleted, we'll get ENOENT, let's ignore that
2805 r
= rename(old_file
->path
, p
);
2806 if (r
< 0 && errno
!= ENOENT
)
2809 old_file
->header
->state
= STATE_ARCHIVED
;
2811 /* Currently, btrfs is not very good with our write patterns
2812 * and fragments heavily. Let's defrag our journal files when
2813 * we archive them */
2814 old_file
->defrag_on_close
= true;
2816 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, old_file
, &new_file
);
2817 journal_file_close(old_file
);
2823 int journal_file_open_reliably(
2829 JournalMetrics
*metrics
,
2830 MMapCache
*mmap_cache
,
2831 JournalFile
*template,
2832 JournalFile
**ret
) {
2836 _cleanup_free_
char *p
= NULL
;
2838 r
= journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2840 -EBADMSG
, /* corrupted */
2841 -ENODATA
, /* truncated */
2842 -EHOSTDOWN
, /* other machine */
2843 -EPROTONOSUPPORT
, /* incompatible feature */
2844 -EBUSY
, /* unclean shutdown */
2845 -ESHUTDOWN
, /* already archived */
2846 -EIO
, /* IO error, including SIGBUS on mmap */
2847 -EIDRM
/* File has been deleted */))
2850 if ((flags
& O_ACCMODE
) == O_RDONLY
)
2853 if (!(flags
& O_CREAT
))
2856 if (!endswith(fname
, ".journal"))
2859 /* The file is corrupted. Rotate it away and try it again (but only once) */
2862 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
2864 now(CLOCK_REALTIME
),
2868 if (rename(fname
, p
) < 0)
2871 /* btrfs doesn't cope well with our write pattern and
2872 * fragments heavily. Let's defrag all files we rotate */
2874 (void) chattr_path(p
, false, FS_NOCOW_FL
);
2875 (void) btrfs_defrag(p
);
2877 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
2879 return journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2882 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
2884 uint64_t q
, xor_hash
= 0;
2897 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
2898 ts
.realtime
= le64toh(o
->entry
.realtime
);
2900 n
= journal_file_entry_n_items(o
);
2901 /* alloca() can't take 0, hence let's allocate at least one */
2902 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
2904 for (i
= 0; i
< n
; i
++) {
2911 q
= le64toh(o
->entry
.items
[i
].object_offset
);
2912 le_hash
= o
->entry
.items
[i
].hash
;
2914 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
2918 if (le_hash
!= o
->data
.hash
)
2921 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
2924 /* We hit the limit on 32bit machines */
2925 if ((uint64_t) t
!= l
)
2928 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
2929 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2932 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
2933 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
2937 data
= from
->compress_buffer
;
2940 return -EPROTONOSUPPORT
;
2943 data
= o
->data
.payload
;
2945 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
2949 xor_hash
^= le64toh(u
->data
.hash
);
2950 items
[i
].object_offset
= htole64(h
);
2951 items
[i
].hash
= u
->data
.hash
;
2953 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
2958 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
2960 if (mmap_cache_got_sigbus(to
->mmap
, to
->fd
))
2966 void journal_reset_metrics(JournalMetrics
*m
) {
2969 /* Set everything to "pick automatic values". */
2971 *m
= (JournalMetrics
) {
2972 .min_use
= (uint64_t) -1,
2973 .max_use
= (uint64_t) -1,
2974 .min_size
= (uint64_t) -1,
2975 .max_size
= (uint64_t) -1,
2976 .keep_free
= (uint64_t) -1,
2977 .n_max_files
= (uint64_t) -1,
2981 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
2982 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
2989 if (fstatvfs(fd
, &ss
) >= 0)
2990 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
/* Debug-level only; %m expands to the error text for the errno passed in. */
2992 log_debug_errno(errno
, "Failed to determine disk size: %m");
2996 if (m
->max_use
== (uint64_t) -1) {
2999 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3001 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3002 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3004 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3005 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3007 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3009 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3011 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3012 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3015 if (m
->min_use
== (uint64_t) -1)
3016 m
->min_use
= DEFAULT_MIN_USE
;
3018 if (m
->min_use
> m
->max_use
)
3019 m
->min_use
= m
->max_use
;
3021 if (m
->max_size
== (uint64_t) -1) {
3022 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3024 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3025 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3027 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3029 if (m
->max_size
!= 0) {
3030 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3031 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3033 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3034 m
->max_use
= m
->max_size
*2;
3037 if (m
->min_size
== (uint64_t) -1)
3038 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3040 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3042 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3043 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3045 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3046 m
->max_size
= m
->min_size
;
3049 if (m
->keep_free
== (uint64_t) -1) {
3052 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3054 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3055 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3058 m
->keep_free
= DEFAULT_KEEP_FREE
;
3061 if (m
->n_max_files
== (uint64_t) -1)
3062 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3064 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3065 format_bytes(a
, sizeof(a
), m
->min_use
),
3066 format_bytes(b
, sizeof(b
), m
->max_use
),
3067 format_bytes(c
, sizeof(c
), m
->max_size
),
3068 format_bytes(d
, sizeof(d
), m
->min_size
),
3069 format_bytes(e
, sizeof(e
), m
->keep_free
),
3073 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3078 if (f
->header
->head_entry_realtime
== 0)
3081 *from
= le64toh(f
->header
->head_entry_realtime
);
3085 if (f
->header
->tail_entry_realtime
== 0)
3088 *to
= le64toh(f
->header
->tail_entry_realtime
);
3094 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3102 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3106 if (le64toh(o
->data
.n_entries
) <= 0)
3110 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3114 *from
= le64toh(o
->entry
.monotonic
);
3118 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3122 r
= generic_array_get_plus_one(f
,
3123 le64toh(o
->data
.entry_offset
),
3124 le64toh(o
->data
.entry_array_offset
),
3125 le64toh(o
->data
.n_entries
)-1,
3130 *to
= le64toh(o
->entry
.monotonic
);
3136 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3139 /* If we gained new header fields we gained new features,
3140 * hence suggest a rotation */
3141 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3142 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3146 /* Let's check if the hash tables grew over a certain fill
3147 * level (75%, borrowing this value from Java's hash table
3148 * implementation), and if so suggest a rotation. To calculate
3149 * the fill level we need the n_data field, which only exists
3150 * in newer versions. */
3152 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3153 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3154 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3156 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3157 le64toh(f
->header
->n_data
),
3158 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3159 (unsigned long long) f
->last_stat
.st_size
,
3160 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3164 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3165 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3166 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3168 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3169 le64toh(f
->header
->n_fields
),
3170 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3174 /* Are the data objects properly indexed by field objects? */
3175 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3176 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3177 le64toh(f
->header
->n_data
) > 0 &&
3178 le64toh(f
->header
->n_fields
) == 0)
3181 if (max_file_usec
> 0) {
3184 h
= le64toh(f
->header
->head_entry_realtime
);
3185 t
= now(CLOCK_REALTIME
);
3187 if (h
> 0 && t
> h
+ max_file_usec
)