1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
36 #include "journal-authenticate.h"
37 #include "journal-def.h"
38 #include "journal-file.h"
40 #include "parse-util.h"
41 #include "random-util.h"
43 #include "string-util.h"
44 #include "xattr-util.h"
46 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
47 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
49 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
51 /* This is the minimum journal file size */
52 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
54 /* These are the lower and upper bounds if we deduce the max_use value
55 * from the file system size */
56 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
57 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
59 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
60 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
62 /* This is the upper bound if we deduce max_size from max_use */
63 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
65 /* This is the upper bound if we deduce the keep_free value from the
67 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
69 /* This is the keep_free value when we can't determine the system
71 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
73 /* This is the default maximum number of journal files to keep around. */
74 #define DEFAULT_N_MAX_FILES (100)
76 /* n_data was the first entry we added after the initial file format design */
77 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
79 /* How many entries to keep in the entry array chain cache at max */
80 #define CHAIN_CACHE_MAX 20
82 /* How much to increase the journal file size at once each time we allocate something new. */
83 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
85 /* Reread fstat() of the file for detecting deletions at least this often */
86 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
88 /* The mmap context to use for the header: we pick one above the last defined object type */
89 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
91 static int journal_file_set_online(JournalFile
*f
) {
97 if (!(f
->fd
>= 0 && f
->header
))
100 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
103 switch(f
->header
->state
) {
108 f
->header
->state
= STATE_ONLINE
;
117 int journal_file_set_offline(JournalFile
*f
) {
123 if (!(f
->fd
>= 0 && f
->header
))
126 if (f
->header
->state
!= STATE_ONLINE
)
131 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
134 f
->header
->state
= STATE_OFFLINE
;
136 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
144 JournalFile
* journal_file_close(JournalFile
*f
) {
148 /* Write the final tag */
149 if (f
->seal
&& f
->writable
)
150 journal_file_append_tag(f
);
153 if (f
->post_change_timer
) {
156 if (sd_event_source_get_enabled(f
->post_change_timer
, &enabled
) >= 0)
157 if (enabled
== SD_EVENT_ONESHOT
)
158 journal_file_post_change(f
);
160 sd_event_source_set_enabled(f
->post_change_timer
, SD_EVENT_OFF
);
161 sd_event_source_unref(f
->post_change_timer
);
164 journal_file_set_offline(f
);
166 if (f
->mmap
&& f
->fd
>= 0)
167 mmap_cache_close_fd(f
->mmap
, f
->fd
);
169 if (f
->fd
>= 0 && f
->defrag_on_close
) {
171 /* Be friendly to btrfs: turn COW back on again now,
172 * and defragment the file. We won't write to the file
173 * ever again, hence remove all fragmentation, and
174 * reenable all the good bits COW usually provides
175 * (such as data checksumming). */
177 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
178 (void) btrfs_defrag_fd(f
->fd
);
184 mmap_cache_unref(f
->mmap
);
186 ordered_hashmap_free_free(f
->chain_cache
);
188 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
189 free(f
->compress_buffer
);
194 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
196 free(f
->fsprg_state
);
201 gcry_md_close(f
->hmac
);
208 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
215 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
216 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
218 h
.incompatible_flags
|= htole32(
219 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
220 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
222 h
.compatible_flags
= htole32(
223 f
->seal
* HEADER_COMPATIBLE_SEALED
);
225 r
= sd_id128_randomize(&h
.file_id
);
230 h
.seqnum_id
= template->header
->seqnum_id
;
231 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
233 h
.seqnum_id
= h
.file_id
;
235 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
245 static int journal_file_refresh_header(JournalFile
*f
) {
251 r
= sd_id128_get_machine(&f
->header
->machine_id
);
255 r
= sd_id128_get_boot(&boot_id
);
259 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
260 f
->tail_entry_monotonic_valid
= true;
262 f
->header
->boot_id
= boot_id
;
264 r
= journal_file_set_online(f
);
266 /* Sync the online state to disk */
272 static int journal_file_verify_header(JournalFile
*f
) {
277 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
280 /* In both read and write mode we refuse to open files with
281 * incompatible flags we don't know */
282 flags
= le32toh(f
->header
->incompatible_flags
);
283 if (flags
& ~HEADER_INCOMPATIBLE_SUPPORTED
) {
284 if (flags
& ~HEADER_INCOMPATIBLE_ANY
)
285 log_debug("Journal file %s has unknown incompatible flags %"PRIx32
,
286 f
->path
, flags
& ~HEADER_INCOMPATIBLE_ANY
);
287 flags
= (flags
& HEADER_INCOMPATIBLE_ANY
) & ~HEADER_INCOMPATIBLE_SUPPORTED
;
289 log_debug("Journal file %s uses incompatible flags %"PRIx32
290 " disabled at compilation time.", f
->path
, flags
);
291 return -EPROTONOSUPPORT
;
294 /* When open for writing we also refuse to open files with
295 * compatible flags we don't support */
296 flags
= le32toh(f
->header
->compatible_flags
);
297 if (f
->writable
&& (flags
& ~HEADER_COMPATIBLE_SUPPORTED
)) {
298 if (flags
& ~HEADER_COMPATIBLE_ANY
)
299 log_debug("Journal file %s has unknown compatible flags %"PRIx32
,
300 f
->path
, flags
& ~HEADER_COMPATIBLE_ANY
);
301 flags
= (flags
& HEADER_COMPATIBLE_ANY
) & ~HEADER_COMPATIBLE_SUPPORTED
;
303 log_debug("Journal file %s uses compatible flags %"PRIx32
304 " disabled at compilation time.", f
->path
, flags
);
305 return -EPROTONOSUPPORT
;
308 if (f
->header
->state
>= _STATE_MAX
)
311 /* The first addition was n_data, so check that we are at least this large */
312 if (le64toh(f
->header
->header_size
) < HEADER_SIZE_MIN
)
315 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
318 if ((le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)) > (uint64_t) f
->last_stat
.st_size
)
321 if (le64toh(f
->header
->tail_object_offset
) > (le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)))
324 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
325 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
326 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
327 !VALID64(le64toh(f
->header
->entry_array_offset
)))
332 sd_id128_t machine_id
;
335 r
= sd_id128_get_machine(&machine_id
);
339 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
342 state
= f
->header
->state
;
344 if (state
== STATE_ONLINE
) {
345 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
347 } else if (state
== STATE_ARCHIVED
)
349 else if (state
!= STATE_OFFLINE
) {
350 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
355 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
356 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
358 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
363 static int journal_file_fstat(JournalFile
*f
) {
367 if (fstat(f
->fd
, &f
->last_stat
) < 0)
370 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
372 /* Refuse appending to files that are already deleted */
373 if (f
->last_stat
.st_nlink
<= 0)
379 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
380 uint64_t old_size
, new_size
;
385 /* We assume that this file is not sparse, and we know that
386 * for sure, since we always call posix_fallocate()
389 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
393 le64toh(f
->header
->header_size
) +
394 le64toh(f
->header
->arena_size
);
396 new_size
= PAGE_ALIGN(offset
+ size
);
397 if (new_size
< le64toh(f
->header
->header_size
))
398 new_size
= le64toh(f
->header
->header_size
);
400 if (new_size
<= old_size
) {
402 /* We already pre-allocated enough space, but before
403 * we write to it, let's check with fstat() if the
404 * file got deleted, in order to make sure we don't throw
405 * away the data immediately. Don't check fstat() for
406 * all writes though, but only once every LAST_STAT_REFRESH_USEC. */
408 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
411 return journal_file_fstat(f
);
414 /* Allocate more space. */
416 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
419 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
422 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
425 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
427 if (new_size
- old_size
> available
)
432 /* Increase by larger blocks at once */
433 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
434 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
435 new_size
= f
->metrics
.max_size
;
437 /* Note that the glibc fallocate() fallback is very
438 inefficient, hence we try to minimize the allocation area
440 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
444 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
446 return journal_file_fstat(f
);
449 static unsigned type_to_context(ObjectType type
) {
450 /* One context for each type, plus one catch-all for the rest */
451 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
452 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
453 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
456 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
) {
465 /* Avoid SIGBUS on invalid accesses */
466 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
467 /* Hmm, out of range? Let's refresh the fstat() data
468 * first, before we trust that check. */
470 r
= journal_file_fstat(f
);
474 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
475 return -EADDRNOTAVAIL
;
478 return mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
481 static uint64_t minimum_header_size(Object
*o
) {
483 static const uint64_t table
[] = {
484 [OBJECT_DATA
] = sizeof(DataObject
),
485 [OBJECT_FIELD
] = sizeof(FieldObject
),
486 [OBJECT_ENTRY
] = sizeof(EntryObject
),
487 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
488 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
489 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
490 [OBJECT_TAG
] = sizeof(TagObject
),
493 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
494 return sizeof(ObjectHeader
);
496 return table
[o
->object
.type
];
499 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
508 /* Objects may only be located at multiples of 64 bit */
509 if (!VALID64(offset
))
512 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
517 s
= le64toh(o
->object
.size
);
519 if (s
< sizeof(ObjectHeader
))
522 if (o
->object
.type
<= OBJECT_UNUSED
)
525 if (s
< minimum_header_size(o
))
528 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
531 if (s
> sizeof(ObjectHeader
)) {
532 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
543 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
548 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
551 /* If an external seqnum counter was passed, we update
552 * both the local and the external one, and set it to
553 * the maximum of both */
561 f
->header
->tail_entry_seqnum
= htole64(r
);
563 if (f
->header
->head_entry_seqnum
== 0)
564 f
->header
->head_entry_seqnum
= htole64(r
);
569 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
576 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
577 assert(size
>= sizeof(ObjectHeader
));
581 r
= journal_file_set_online(f
);
585 p
= le64toh(f
->header
->tail_object_offset
);
587 p
= le64toh(f
->header
->header_size
);
589 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
593 p
+= ALIGN64(le64toh(tail
->object
.size
));
596 r
= journal_file_allocate(f
, p
, size
);
600 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
607 o
->object
.type
= type
;
608 o
->object
.size
= htole64(size
);
610 f
->header
->tail_object_offset
= htole64(p
);
611 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
619 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
626 /* We estimate that we need 1 hash table entry per 768 bytes
627 of journal file and we want to make sure we never get
628 beyond 75% fill level. Calculate the hash table size for
629 the maximum file size based on these metrics. */
631 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
632 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
633 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
635 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
637 r
= journal_file_append_object(f
,
638 OBJECT_DATA_HASH_TABLE
,
639 offsetof(Object
, hash_table
.items
) + s
,
644 memzero(o
->hash_table
.items
, s
);
646 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
647 f
->header
->data_hash_table_size
= htole64(s
);
652 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
659 /* We use a fixed size hash table for the fields as this
660 * number should grow very slowly only */
662 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
663 r
= journal_file_append_object(f
,
664 OBJECT_FIELD_HASH_TABLE
,
665 offsetof(Object
, hash_table
.items
) + s
,
670 memzero(o
->hash_table
.items
, s
);
672 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
673 f
->header
->field_hash_table_size
= htole64(s
);
678 int journal_file_map_data_hash_table(JournalFile
*f
) {
685 if (f
->data_hash_table
)
688 p
= le64toh(f
->header
->data_hash_table_offset
);
689 s
= le64toh(f
->header
->data_hash_table_size
);
691 r
= journal_file_move_to(f
,
692 OBJECT_DATA_HASH_TABLE
,
699 f
->data_hash_table
= t
;
703 int journal_file_map_field_hash_table(JournalFile
*f
) {
710 if (f
->field_hash_table
)
713 p
= le64toh(f
->header
->field_hash_table_offset
);
714 s
= le64toh(f
->header
->field_hash_table_size
);
716 r
= journal_file_move_to(f
,
717 OBJECT_FIELD_HASH_TABLE
,
724 f
->field_hash_table
= t
;
728 static int journal_file_link_field(
741 if (o
->object
.type
!= OBJECT_FIELD
)
744 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
748 /* This might alter the window we are looking at */
749 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
752 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
754 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
756 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
760 o
->field
.next_hash_offset
= htole64(offset
);
763 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
765 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
766 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
771 static int journal_file_link_data(
784 if (o
->object
.type
!= OBJECT_DATA
)
787 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
791 /* This might alter the window we are looking at */
792 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
793 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
794 o
->data
.n_entries
= 0;
797 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
799 /* Only entry in the hash table is easy */
800 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
802 /* Move back to the previous data object, to patch in
805 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
809 o
->data
.next_hash_offset
= htole64(offset
);
812 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
814 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
815 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
820 int journal_file_find_field_object_with_hash(
822 const void *field
, uint64_t size
, uint64_t hash
,
823 Object
**ret
, uint64_t *offset
) {
825 uint64_t p
, osize
, h
, m
;
829 assert(field
&& size
> 0);
831 /* If the field hash table is empty, we can't find anything */
832 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
835 /* Map the field hash table, if it isn't mapped yet. */
836 r
= journal_file_map_field_hash_table(f
);
840 osize
= offsetof(Object
, field
.payload
) + size
;
842 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
847 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
852 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
856 if (le64toh(o
->field
.hash
) == hash
&&
857 le64toh(o
->object
.size
) == osize
&&
858 memcmp(o
->field
.payload
, field
, size
) == 0) {
868 p
= le64toh(o
->field
.next_hash_offset
);
874 int journal_file_find_field_object(
876 const void *field
, uint64_t size
,
877 Object
**ret
, uint64_t *offset
) {
882 assert(field
&& size
> 0);
884 hash
= hash64(field
, size
);
886 return journal_file_find_field_object_with_hash(f
,
891 int journal_file_find_data_object_with_hash(
893 const void *data
, uint64_t size
, uint64_t hash
,
894 Object
**ret
, uint64_t *offset
) {
896 uint64_t p
, osize
, h
, m
;
900 assert(data
|| size
== 0);
902 /* If there's no data hash table, then there's no entry. */
903 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
906 /* Map the data hash table, if it isn't mapped yet. */
907 r
= journal_file_map_data_hash_table(f
);
911 osize
= offsetof(Object
, data
.payload
) + size
;
913 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
918 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
923 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
927 if (le64toh(o
->data
.hash
) != hash
)
930 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
931 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
935 l
= le64toh(o
->object
.size
);
936 if (l
<= offsetof(Object
, data
.payload
))
939 l
-= offsetof(Object
, data
.payload
);
941 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
942 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
947 memcmp(f
->compress_buffer
, data
, size
) == 0) {
958 return -EPROTONOSUPPORT
;
960 } else if (le64toh(o
->object
.size
) == osize
&&
961 memcmp(o
->data
.payload
, data
, size
) == 0) {
973 p
= le64toh(o
->data
.next_hash_offset
);
979 int journal_file_find_data_object(
981 const void *data
, uint64_t size
,
982 Object
**ret
, uint64_t *offset
) {
987 assert(data
|| size
== 0);
989 hash
= hash64(data
, size
);
991 return journal_file_find_data_object_with_hash(f
,
996 static int journal_file_append_field(
998 const void *field
, uint64_t size
,
999 Object
**ret
, uint64_t *offset
) {
1007 assert(field
&& size
> 0);
1009 hash
= hash64(field
, size
);
1011 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1025 osize
= offsetof(Object
, field
.payload
) + size
;
1026 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1030 o
->field
.hash
= htole64(hash
);
1031 memcpy(o
->field
.payload
, field
, size
);
1033 r
= journal_file_link_field(f
, o
, p
, hash
);
1037 /* The linking might have altered the window, so let's
1038 * refresh our pointer */
1039 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1044 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1058 static int journal_file_append_data(
1060 const void *data
, uint64_t size
,
1061 Object
**ret
, uint64_t *offset
) {
1066 int r
, compression
= 0;
1070 assert(data
|| size
== 0);
1072 hash
= hash64(data
, size
);
1074 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1088 osize
= offsetof(Object
, data
.payload
) + size
;
1089 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1093 o
->data
.hash
= htole64(hash
);
1095 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1096 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1099 compression
= compress_blob(data
, size
, o
->data
.payload
, size
- 1, &rsize
);
1101 if (compression
>= 0) {
1102 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1103 o
->object
.flags
|= compression
;
1105 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1106 size
, rsize
, object_compressed_to_string(compression
));
1108 /* Compression didn't work, we don't really care why, let's continue without compression */
1113 if (compression
== 0 && size
> 0)
1114 memcpy(o
->data
.payload
, data
, size
);
1116 r
= journal_file_link_data(f
, o
, p
, hash
);
1120 /* The linking might have altered the window, so let's
1121 * refresh our pointer */
1122 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1129 eq
= memchr(data
, '=', size
);
1130 if (eq
&& eq
> data
) {
1134 /* Create field object ... */
1135 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1139 /* ... and link it in. */
1140 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1141 fo
->field
.head_data_offset
= le64toh(p
);
1145 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1159 uint64_t journal_file_entry_n_items(Object
*o
) {
1162 if (o
->object
.type
!= OBJECT_ENTRY
)
1165 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1168 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1171 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1174 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1177 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1180 if (o
->object
.type
!= OBJECT_DATA_HASH_TABLE
&&
1181 o
->object
.type
!= OBJECT_FIELD_HASH_TABLE
)
1184 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1187 static int link_entry_into_array(JournalFile
*f
,
1192 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1200 a
= le64toh(*first
);
1201 i
= hidx
= le64toh(*idx
);
1204 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1208 n
= journal_file_entry_array_n_items(o
);
1210 o
->entry_array
.items
[i
] = htole64(p
);
1211 *idx
= htole64(hidx
+ 1);
1217 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1228 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1229 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1235 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1240 o
->entry_array
.items
[i
] = htole64(p
);
1243 *first
= htole64(q
);
1245 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1249 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1252 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1253 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1255 *idx
= htole64(hidx
+ 1);
1260 static int link_entry_into_array_plus_one(JournalFile
*f
,
1275 *extra
= htole64(p
);
1279 i
= htole64(le64toh(*idx
) - 1);
1280 r
= link_entry_into_array(f
, first
, &i
, p
);
1285 *idx
= htole64(le64toh(*idx
) + 1);
1289 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1296 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1300 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1304 return link_entry_into_array_plus_one(f
,
1305 &o
->data
.entry_offset
,
1306 &o
->data
.entry_array_offset
,
1311 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1319 if (o
->object
.type
!= OBJECT_ENTRY
)
1322 __sync_synchronize();
1324 /* Link up the entry itself */
1325 r
= link_entry_into_array(f
,
1326 &f
->header
->entry_array_offset
,
1327 &f
->header
->n_entries
,
1332 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1334 if (f
->header
->head_entry_realtime
== 0)
1335 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1337 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1338 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1340 f
->tail_entry_monotonic_valid
= true;
1342 /* Link up the items */
1343 n
= journal_file_entry_n_items(o
);
1344 for (i
= 0; i
< n
; i
++) {
1345 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1353 static int journal_file_append_entry_internal(
1355 const dual_timestamp
*ts
,
1357 const EntryItem items
[], unsigned n_items
,
1359 Object
**ret
, uint64_t *offset
) {
1366 assert(items
|| n_items
== 0);
1369 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1371 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1375 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1376 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1377 o
->entry
.realtime
= htole64(ts
->realtime
);
1378 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1379 o
->entry
.xor_hash
= htole64(xor_hash
);
1380 o
->entry
.boot_id
= f
->header
->boot_id
;
1383 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1388 r
= journal_file_link_entry(f
, o
, np
);
1401 void journal_file_post_change(JournalFile
*f
) {
1404 /* inotify() does not receive IN_MODIFY events from file
1405 * accesses done via mmap(). After each access we hence
1406 * trigger IN_MODIFY explicitly by truncating the journal file to its
1407 * current size. */
1409 __sync_synchronize();
1411 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1412 log_error_errno(errno
, "Failed to truncate file to its own size: %m");
1415 static int post_change_thunk(sd_event_source
*timer
, uint64_t usec
, void *userdata
) {
1418 journal_file_post_change(userdata
);
1423 static void schedule_post_change(JournalFile
*f
) {
1424 sd_event_source
*timer
;
1429 assert(f
->post_change_timer
);
1431 timer
= f
->post_change_timer
;
1433 r
= sd_event_source_get_enabled(timer
, &enabled
);
1435 log_error_errno(-r
, "Failed to get ftruncate timer state: %m");
1439 if (enabled
== SD_EVENT_ONESHOT
)
1442 r
= sd_event_now(sd_event_source_get_event(timer
), CLOCK_MONOTONIC
, &now
);
1444 log_error_errno(-r
, "Failed to get clock's now for scheduling ftruncate: %m");
1448 r
= sd_event_source_set_time(timer
, now
+f
->post_change_timer_period
);
1450 log_error_errno(-r
, "Failed to set time for scheduling ftruncate: %m");
1454 r
= sd_event_source_set_enabled(timer
, SD_EVENT_ONESHOT
);
1456 log_error_errno(-r
, "Failed to enable scheduled ftruncate: %m");
1461 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1462 int journal_file_enable_post_change_timer(JournalFile
*f
, sd_event
*e
, usec_t t
) {
1463 _cleanup_(sd_event_source_unrefp
) sd_event_source
*timer
= NULL
;
1467 assert_return(!f
->post_change_timer
, -EINVAL
);
1471 r
= sd_event_add_time(e
, &timer
, CLOCK_MONOTONIC
, 0, 0, post_change_thunk
, f
);
1475 r
= sd_event_source_set_enabled(timer
, SD_EVENT_OFF
);
1479 f
->post_change_timer
= timer
;
1481 f
->post_change_timer_period
= t
;
1486 static int entry_item_cmp(const void *_a
, const void *_b
) {
1487 const EntryItem
*a
= _a
, *b
= _b
;
1489 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1491 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1496 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1500 uint64_t xor_hash
= 0;
1501 struct dual_timestamp _ts
;
1504 assert(iovec
|| n_iovec
== 0);
1507 dual_timestamp_get(&_ts
);
1511 if (f
->tail_entry_monotonic_valid
&&
1512 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
1516 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1521 /* alloca() can't take 0, hence let's allocate at least one */
1522 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1524 for (i
= 0; i
< n_iovec
; i
++) {
1528 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1532 xor_hash
^= le64toh(o
->data
.hash
);
1533 items
[i
].object_offset
= htole64(p
);
1534 items
[i
].hash
= o
->data
.hash
;
1537 /* Order by the position on disk, in order to improve seek
1538 * times for rotating media. */
1539 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
1541 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
1543 /* If the memory mapping triggered a SIGBUS then we return an
1544 * IO error and ignore the error code passed down to us, since
1545 * it is very likely just an effect of a nullified replacement
1548 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
1551 if (f
->post_change_timer
)
1552 schedule_post_change(f
);
1554 journal_file_post_change(f
);
1559 typedef struct ChainCacheItem
{
1560 uint64_t first
; /* the array at the beginning of the chain */
1561 uint64_t array
; /* the cached array */
1562 uint64_t begin
; /* the first item in the cached array */
1563 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
1564 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
1567 static void chain_cache_put(
1574 uint64_t last_index
) {
1577 /* If the chain item to cache for this chain is the
1578 * first one it's not worth caching anything */
1582 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
1583 ci
= ordered_hashmap_steal_first(h
);
1586 ci
= new(ChainCacheItem
, 1);
1593 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
1598 assert(ci
->first
== first
);
1603 ci
->last_index
= last_index
;
1606 static int generic_array_get(
1610 Object
**ret
, uint64_t *offset
) {
1613 uint64_t p
= 0, a
, t
= 0;
1621 /* Try the chain cache first */
1622 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1623 if (ci
&& i
> ci
->total
) {
1632 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1636 k
= journal_file_entry_array_n_items(o
);
1638 p
= le64toh(o
->entry_array
.items
[i
]);
1644 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1650 /* Let's cache this item for the next invocation */
1651 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
1653 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1666 static int generic_array_get_plus_one(
1671 Object
**ret
, uint64_t *offset
) {
1680 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1693 return generic_array_get(f
, first
, i
-1, ret
, offset
);
1702 static int generic_array_bisect(
1707 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1708 direction_t direction
,
1713 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
1714 bool subtract_one
= false;
1715 Object
*o
, *array
= NULL
;
1720 assert(test_object
);
1722 /* Start with the first array in the chain */
1725 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1726 if (ci
&& n
> ci
->total
) {
1727 /* Ah, we have iterated this bisection array chain
1728 * previously! Let's see if we can skip ahead in the
1729 * chain, as far as the last time. But we can't jump
1730 * backwards in the chain, so let's check that
1733 r
= test_object(f
, ci
->begin
, needle
);
1737 if (r
== TEST_LEFT
) {
1738 /* OK, what we are looking for is right of the
1739 * begin of this EntryArray, so let's jump
1740 * straight to previously cached array in the
1746 last_index
= ci
->last_index
;
1751 uint64_t left
, right
, k
, lp
;
1753 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1757 k
= journal_file_entry_array_n_items(array
);
1763 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1767 r
= test_object(f
, p
, needle
);
1771 if (r
== TEST_FOUND
)
1772 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1774 if (r
== TEST_RIGHT
) {
1778 if (last_index
!= (uint64_t) -1) {
1779 assert(last_index
<= right
);
1781 /* If we cached the last index we
1782 * looked at, let's try not to jump
1783 * too wildly around and see if we can
1784 * limit the range to look at early to
1785 * the immediate neighbors of the last
1786 * index we looked at. */
1788 if (last_index
> 0) {
1789 uint64_t x
= last_index
- 1;
1791 p
= le64toh(array
->entry_array
.items
[x
]);
1795 r
= test_object(f
, p
, needle
);
1799 if (r
== TEST_FOUND
)
1800 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1802 if (r
== TEST_RIGHT
)
1808 if (last_index
< right
) {
1809 uint64_t y
= last_index
+ 1;
1811 p
= le64toh(array
->entry_array
.items
[y
]);
1815 r
= test_object(f
, p
, needle
);
1819 if (r
== TEST_FOUND
)
1820 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1822 if (r
== TEST_RIGHT
)
1830 if (left
== right
) {
1831 if (direction
== DIRECTION_UP
)
1832 subtract_one
= true;
1838 assert(left
< right
);
1839 i
= (left
+ right
) / 2;
1841 p
= le64toh(array
->entry_array
.items
[i
]);
1845 r
= test_object(f
, p
, needle
);
1849 if (r
== TEST_FOUND
)
1850 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1852 if (r
== TEST_RIGHT
)
1860 if (direction
== DIRECTION_UP
) {
1862 subtract_one
= true;
1873 last_index
= (uint64_t) -1;
1874 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1880 if (subtract_one
&& t
== 0 && i
== 0)
1883 /* Let's cache this item for the next invocation */
1884 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
1886 if (subtract_one
&& i
== 0)
1888 else if (subtract_one
)
1889 p
= le64toh(array
->entry_array
.items
[i
-1]);
1891 p
= le64toh(array
->entry_array
.items
[i
]);
1893 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1904 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
1909 static int generic_array_bisect_plus_one(
1915 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1916 direction_t direction
,
1922 bool step_back
= false;
1926 assert(test_object
);
1931 /* This bisects the array in object 'first', but first checks
1933 r
= test_object(f
, extra
, needle
);
1937 if (r
== TEST_FOUND
)
1938 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1940 /* if we are looking with DIRECTION_UP then we need to first
1941 see if in the actual array there is a matching entry, and
1942 return the last one of that. But if there isn't any we need
1943 to return this one. Hence remember this, and return it
1946 step_back
= direction
== DIRECTION_UP
;
1948 if (r
== TEST_RIGHT
) {
1949 if (direction
== DIRECTION_DOWN
)
1955 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1957 if (r
== 0 && step_back
)
1966 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1982 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1988 else if (p
< needle
)
1994 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2001 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2005 if (le64toh(o
->entry
.seqnum
) == needle
)
2007 else if (le64toh(o
->entry
.seqnum
) < needle
)
2013 int journal_file_move_to_entry_by_seqnum(
2016 direction_t direction
,
2020 return generic_array_bisect(f
,
2021 le64toh(f
->header
->entry_array_offset
),
2022 le64toh(f
->header
->n_entries
),
2029 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2036 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2040 if (le64toh(o
->entry
.realtime
) == needle
)
2042 else if (le64toh(o
->entry
.realtime
) < needle
)
2048 int journal_file_move_to_entry_by_realtime(
2051 direction_t direction
,
2055 return generic_array_bisect(f
,
2056 le64toh(f
->header
->entry_array_offset
),
2057 le64toh(f
->header
->n_entries
),
2059 test_object_realtime
,
2064 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2071 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2075 if (le64toh(o
->entry
.monotonic
) == needle
)
2077 else if (le64toh(o
->entry
.monotonic
) < needle
)
2083 static int find_data_object_by_boot_id(
2089 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2091 sd_id128_to_string(boot_id
, t
+ 9);
2092 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2095 int journal_file_move_to_entry_by_monotonic(
2099 direction_t direction
,
2108 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2114 return generic_array_bisect_plus_one(f
,
2115 le64toh(o
->data
.entry_offset
),
2116 le64toh(o
->data
.entry_array_offset
),
2117 le64toh(o
->data
.n_entries
),
2119 test_object_monotonic
,
2124 void journal_file_reset_location(JournalFile
*f
) {
2125 f
->location_type
= LOCATION_HEAD
;
2126 f
->current_offset
= 0;
2127 f
->current_seqnum
= 0;
2128 f
->current_realtime
= 0;
2129 f
->current_monotonic
= 0;
2130 zero(f
->current_boot_id
);
2131 f
->current_xor_hash
= 0;
2134 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2135 f
->location_type
= LOCATION_SEEK
;
2136 f
->current_offset
= offset
;
2137 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2138 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2139 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2140 f
->current_boot_id
= o
->entry
.boot_id
;
2141 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2144 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2147 assert(af
->location_type
== LOCATION_SEEK
);
2148 assert(bf
->location_type
== LOCATION_SEEK
);
2150 /* If contents and timestamps match, these entries are
2151 * identical, even if the seqnum does not match */
2152 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2153 af
->current_monotonic
== bf
->current_monotonic
&&
2154 af
->current_realtime
== bf
->current_realtime
&&
2155 af
->current_xor_hash
== bf
->current_xor_hash
)
2158 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2160 /* If this is from the same seqnum source, compare
2162 if (af
->current_seqnum
< bf
->current_seqnum
)
2164 if (af
->current_seqnum
> bf
->current_seqnum
)
2167 /* Wow! This is weird, different data but the same
2168 * seqnums? Something is borked, but let's make the
2169 * best of it and compare by time. */
2172 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2174 /* If the boot id matches, compare monotonic time */
2175 if (af
->current_monotonic
< bf
->current_monotonic
)
2177 if (af
->current_monotonic
> bf
->current_monotonic
)
2181 /* Otherwise, compare UTC time */
2182 if (af
->current_realtime
< bf
->current_realtime
)
2184 if (af
->current_realtime
> bf
->current_realtime
)
2187 /* Finally, compare by contents */
2188 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2190 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2196 int journal_file_next_entry(
2199 direction_t direction
,
2200 Object
**ret
, uint64_t *offset
) {
2207 n
= le64toh(f
->header
->n_entries
);
2212 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2214 r
= generic_array_bisect(f
,
2215 le64toh(f
->header
->entry_array_offset
),
2216 le64toh(f
->header
->n_entries
),
2225 if (direction
== DIRECTION_DOWN
) {
2238 /* And jump to it */
2239 r
= generic_array_get(f
,
2240 le64toh(f
->header
->entry_array_offset
),
2247 (direction
== DIRECTION_DOWN
? ofs
<= p
: ofs
>= p
)) {
2248 log_debug("%s: entry array corrupted at entry %"PRIu64
,
2259 int journal_file_next_entry_for_data(
2261 Object
*o
, uint64_t p
,
2262 uint64_t data_offset
,
2263 direction_t direction
,
2264 Object
**ret
, uint64_t *offset
) {
2271 assert(p
> 0 || !o
);
2273 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2277 n
= le64toh(d
->data
.n_entries
);
2282 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2284 if (o
->object
.type
!= OBJECT_ENTRY
)
2287 r
= generic_array_bisect_plus_one(f
,
2288 le64toh(d
->data
.entry_offset
),
2289 le64toh(d
->data
.entry_array_offset
),
2290 le64toh(d
->data
.n_entries
),
2300 if (direction
== DIRECTION_DOWN
) {
2314 return generic_array_get_plus_one(f
,
2315 le64toh(d
->data
.entry_offset
),
2316 le64toh(d
->data
.entry_array_offset
),
2321 int journal_file_move_to_entry_by_offset_for_data(
2323 uint64_t data_offset
,
2325 direction_t direction
,
2326 Object
**ret
, uint64_t *offset
) {
2333 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2337 return generic_array_bisect_plus_one(f
,
2338 le64toh(d
->data
.entry_offset
),
2339 le64toh(d
->data
.entry_array_offset
),
2340 le64toh(d
->data
.n_entries
),
2347 int journal_file_move_to_entry_by_monotonic_for_data(
2349 uint64_t data_offset
,
2352 direction_t direction
,
2353 Object
**ret
, uint64_t *offset
) {
2361 /* First, seek by time */
2362 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2368 r
= generic_array_bisect_plus_one(f
,
2369 le64toh(o
->data
.entry_offset
),
2370 le64toh(o
->data
.entry_array_offset
),
2371 le64toh(o
->data
.n_entries
),
2373 test_object_monotonic
,
2379 /* And now, continue seeking until we find an entry that
2380 * exists in both bisection arrays */
2386 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2390 r
= generic_array_bisect_plus_one(f
,
2391 le64toh(d
->data
.entry_offset
),
2392 le64toh(d
->data
.entry_array_offset
),
2393 le64toh(d
->data
.n_entries
),
2401 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2405 r
= generic_array_bisect_plus_one(f
,
2406 le64toh(o
->data
.entry_offset
),
2407 le64toh(o
->data
.entry_array_offset
),
2408 le64toh(o
->data
.n_entries
),
2430 int journal_file_move_to_entry_by_seqnum_for_data(
2432 uint64_t data_offset
,
2434 direction_t direction
,
2435 Object
**ret
, uint64_t *offset
) {
2442 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2446 return generic_array_bisect_plus_one(f
,
2447 le64toh(d
->data
.entry_offset
),
2448 le64toh(d
->data
.entry_array_offset
),
2449 le64toh(d
->data
.n_entries
),
2456 int journal_file_move_to_entry_by_realtime_for_data(
2458 uint64_t data_offset
,
2460 direction_t direction
,
2461 Object
**ret
, uint64_t *offset
) {
2468 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2472 return generic_array_bisect_plus_one(f
,
2473 le64toh(d
->data
.entry_offset
),
2474 le64toh(d
->data
.entry_array_offset
),
2475 le64toh(d
->data
.n_entries
),
2477 test_object_realtime
,
2482 void journal_file_dump(JournalFile
*f
) {
2489 journal_file_print_header(f
);
2491 p
= le64toh(f
->header
->header_size
);
2493 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
2497 switch (o
->object
.type
) {
2500 printf("Type: OBJECT_UNUSED\n");
2504 printf("Type: OBJECT_DATA\n");
2508 printf("Type: OBJECT_FIELD\n");
2512 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
2513 le64toh(o
->entry
.seqnum
),
2514 le64toh(o
->entry
.monotonic
),
2515 le64toh(o
->entry
.realtime
));
2518 case OBJECT_FIELD_HASH_TABLE
:
2519 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2522 case OBJECT_DATA_HASH_TABLE
:
2523 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2526 case OBJECT_ENTRY_ARRAY
:
2527 printf("Type: OBJECT_ENTRY_ARRAY\n");
2531 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
2532 le64toh(o
->tag
.seqnum
),
2533 le64toh(o
->tag
.epoch
));
2537 printf("Type: unknown (%i)\n", o
->object
.type
);
2541 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
2542 printf("Flags: %s\n",
2543 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
2545 if (p
== le64toh(f
->header
->tail_object_offset
))
2548 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
2553 log_error("File corrupt");
2556 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
2559 x
= format_timestamp(buf
, l
, t
);
2565 void journal_file_print_header(JournalFile
*f
) {
2566 char a
[33], b
[33], c
[33], d
[33];
2567 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
2569 char bytes
[FORMAT_BYTES_MAX
];
2573 printf("File Path: %s\n"
2577 "Sequential Number ID: %s\n"
2579 "Compatible Flags:%s%s\n"
2580 "Incompatible Flags:%s%s%s\n"
2581 "Header size: %"PRIu64
"\n"
2582 "Arena size: %"PRIu64
"\n"
2583 "Data Hash Table Size: %"PRIu64
"\n"
2584 "Field Hash Table Size: %"PRIu64
"\n"
2585 "Rotate Suggested: %s\n"
2586 "Head Sequential Number: %"PRIu64
"\n"
2587 "Tail Sequential Number: %"PRIu64
"\n"
2588 "Head Realtime Timestamp: %s\n"
2589 "Tail Realtime Timestamp: %s\n"
2590 "Tail Monotonic Timestamp: %s\n"
2591 "Objects: %"PRIu64
"\n"
2592 "Entry Objects: %"PRIu64
"\n",
2594 sd_id128_to_string(f
->header
->file_id
, a
),
2595 sd_id128_to_string(f
->header
->machine_id
, b
),
2596 sd_id128_to_string(f
->header
->boot_id
, c
),
2597 sd_id128_to_string(f
->header
->seqnum_id
, d
),
2598 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
2599 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
2600 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
2601 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
2602 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
2603 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
2604 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
2605 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
2606 le64toh(f
->header
->header_size
),
2607 le64toh(f
->header
->arena_size
),
2608 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
2609 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
2610 yes_no(journal_file_rotate_suggested(f
, 0)),
2611 le64toh(f
->header
->head_entry_seqnum
),
2612 le64toh(f
->header
->tail_entry_seqnum
),
2613 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)),
2614 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)),
2615 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
),
2616 le64toh(f
->header
->n_objects
),
2617 le64toh(f
->header
->n_entries
));
2619 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
2620 printf("Data Objects: %"PRIu64
"\n"
2621 "Data Hash Table Fill: %.1f%%\n",
2622 le64toh(f
->header
->n_data
),
2623 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
2625 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
2626 printf("Field Objects: %"PRIu64
"\n"
2627 "Field Hash Table Fill: %.1f%%\n",
2628 le64toh(f
->header
->n_fields
),
2629 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
2631 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
2632 printf("Tag Objects: %"PRIu64
"\n",
2633 le64toh(f
->header
->n_tags
));
2634 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
2635 printf("Entry Array Objects: %"PRIu64
"\n",
2636 le64toh(f
->header
->n_entry_arrays
));
2638 if (fstat(f
->fd
, &st
) >= 0)
2639 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
2642 static int journal_file_warn_btrfs(JournalFile
*f
) {
2648 /* Before we write anything, check if the COW logic is turned
2649 * off on btrfs. Given our write pattern that is quite
2650 * unfriendly to COW file systems this should greatly improve
2651 * performance on COW file systems, such as btrfs, at the
2652 * expense of data integrity features (which shouldn't be too
2653 * bad, given that we do our own checksumming). */
2655 r
= btrfs_is_filesystem(f
->fd
);
2657 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
2661 r
= read_attr_fd(f
->fd
, &attrs
);
2663 return log_warning_errno(r
, "Failed to read file attributes: %m");
2665 if (attrs
& FS_NOCOW_FL
) {
2666 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2670 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2671 "This is likely to slow down journal access substantially, please consider turning "
2672 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
2677 int journal_file_open(
2683 JournalMetrics
*metrics
,
2684 MMapCache
*mmap_cache
,
2685 JournalFile
*template,
2686 JournalFile
**ret
) {
2688 bool newly_created
= false;
2696 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
2697 (flags
& O_ACCMODE
) != O_RDWR
)
2700 if (!endswith(fname
, ".journal") &&
2701 !endswith(fname
, ".journal~"))
2704 f
= new0(JournalFile
, 1);
2712 f
->prot
= prot_from_flags(flags
);
2713 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
2714 #if defined(HAVE_LZ4)
2715 f
->compress_lz4
= compress
;
2716 #elif defined(HAVE_XZ)
2717 f
->compress_xz
= compress
;
2724 f
->mmap
= mmap_cache_ref(mmap_cache
);
2726 f
->mmap
= mmap_cache_new();
2733 f
->path
= strdup(fname
);
2739 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
2740 if (!f
->chain_cache
) {
2745 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
2751 r
= journal_file_fstat(f
);
2755 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
2757 (void) journal_file_warn_btrfs(f
);
2759 /* Let's attach the creation time to the journal file,
2760 * so that the vacuuming code knows the age of this
2761 * file even if the file might end up corrupted one
2762 * day... Ideally we'd just use the creation time many
2763 * file systems maintain for each file, but there is
2764 * currently no usable API to query this, hence let's
2765 * emulate this via extended attributes. If extended
2766 * attributes are not supported we'll just skip this,
2767 * and rely solely on mtime/atime/ctime of the file. */
2769 fd_setcrtime(f
->fd
, 0);
2772 /* Try to load the FSPRG state, and if we can't, then
2773 * just don't do sealing */
2775 r
= journal_file_fss_load(f
);
2781 r
= journal_file_init_header(f
, template);
2785 r
= journal_file_fstat(f
);
2789 newly_created
= true;
2792 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
2797 r
= mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
);
2803 if (!newly_created
) {
2804 r
= journal_file_verify_header(f
);
2810 if (!newly_created
&& f
->writable
) {
2811 r
= journal_file_fss_load(f
);
2819 journal_default_metrics(metrics
, f
->fd
);
2820 f
->metrics
= *metrics
;
2821 } else if (template)
2822 f
->metrics
= template->metrics
;
2824 r
= journal_file_refresh_header(f
);
2830 r
= journal_file_hmac_setup(f
);
2835 if (newly_created
) {
2836 r
= journal_file_setup_field_hash_table(f
);
2840 r
= journal_file_setup_data_hash_table(f
);
2845 r
= journal_file_append_first_tag(f
);
2851 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
)) {
2856 if (template && template->post_change_timer
) {
2857 sd_event
*e
= sd_event_source_get_event(template->post_change_timer
);
2859 r
= journal_file_enable_post_change_timer(f
, e
, template->post_change_timer_period
);
2868 if (f
->fd
>= 0 && mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
2871 journal_file_close(f
);
2876 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
) {
2877 _cleanup_free_
char *p
= NULL
;
2879 JournalFile
*old_file
, *new_file
= NULL
;
2887 if (!old_file
->writable
)
2890 if (!endswith(old_file
->path
, ".journal"))
2893 l
= strlen(old_file
->path
);
2894 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
2895 (int) l
- 8, old_file
->path
,
2896 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
2897 le64toh((*f
)->header
->head_entry_seqnum
),
2898 le64toh((*f
)->header
->head_entry_realtime
));
2902 /* Try to rename the file to the archived version. If the file
2903 * already was deleted, we'll get ENOENT, let's ignore that
2905 r
= rename(old_file
->path
, p
);
2906 if (r
< 0 && errno
!= ENOENT
)
2909 old_file
->header
->state
= STATE_ARCHIVED
;
2911 /* Currently, btrfs is not very good with our write patterns
2912 * and fragments heavily. Let's defrag our journal files when
2913 * we archive them */
2914 old_file
->defrag_on_close
= true;
2916 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, old_file
, &new_file
);
2917 journal_file_close(old_file
);
2923 int journal_file_open_reliably(
2929 JournalMetrics
*metrics
,
2930 MMapCache
*mmap_cache
,
2931 JournalFile
*template,
2932 JournalFile
**ret
) {
2936 _cleanup_free_
char *p
= NULL
;
2938 r
= journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2940 -EBADMSG
, /* corrupted */
2941 -ENODATA
, /* truncated */
2942 -EHOSTDOWN
, /* other machine */
2943 -EPROTONOSUPPORT
, /* incompatible feature */
2944 -EBUSY
, /* unclean shutdown */
2945 -ESHUTDOWN
, /* already archived */
2946 -EIO
, /* IO error, including SIGBUS on mmap */
2947 -EIDRM
/* File has been deleted */))
2950 if ((flags
& O_ACCMODE
) == O_RDONLY
)
2953 if (!(flags
& O_CREAT
))
2956 if (!endswith(fname
, ".journal"))
2959 /* The file is corrupted. Rotate it away and try it again (but only once) */
2962 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
2964 now(CLOCK_REALTIME
),
2968 if (rename(fname
, p
) < 0)
2971 /* btrfs doesn't cope well with our write pattern and
2972 * fragments heavily. Let's defrag all files we rotate */
2974 (void) chattr_path(p
, false, FS_NOCOW_FL
);
2975 (void) btrfs_defrag(p
);
2977 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
2979 return journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2982 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
2984 uint64_t q
, xor_hash
= 0;
2997 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
2998 ts
.realtime
= le64toh(o
->entry
.realtime
);
3000 n
= journal_file_entry_n_items(o
);
3001 /* alloca() can't take 0, hence let's allocate at least one */
3002 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
3004 for (i
= 0; i
< n
; i
++) {
3011 q
= le64toh(o
->entry
.items
[i
].object_offset
);
3012 le_hash
= o
->entry
.items
[i
].hash
;
3014 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
3018 if (le_hash
!= o
->data
.hash
)
3021 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
3024 /* We hit the limit on 32bit machines */
3025 if ((uint64_t) t
!= l
)
3028 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
3029 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
3032 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
3033 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
3037 data
= from
->compress_buffer
;
3040 return -EPROTONOSUPPORT
;
3043 data
= o
->data
.payload
;
3045 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
3049 xor_hash
^= le64toh(u
->data
.hash
);
3050 items
[i
].object_offset
= htole64(h
);
3051 items
[i
].hash
= u
->data
.hash
;
3053 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
3058 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
3060 if (mmap_cache_got_sigbus(to
->mmap
, to
->fd
))
3066 void journal_reset_metrics(JournalMetrics
*m
) {
3069 /* Set everything to "pick automatic values". */
3071 *m
= (JournalMetrics
) {
3072 .min_use
= (uint64_t) -1,
3073 .max_use
= (uint64_t) -1,
3074 .min_size
= (uint64_t) -1,
3075 .max_size
= (uint64_t) -1,
3076 .keep_free
= (uint64_t) -1,
3077 .n_max_files
= (uint64_t) -1,
3081 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
3082 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
3089 if (fstatvfs(fd
, &ss
) >= 0)
3090 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
3092 log_debug_errno(errno
, "Failed to detremine disk size: %m");
3096 if (m
->max_use
== (uint64_t) -1) {
3099 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3101 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3102 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3104 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3105 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3107 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3109 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3111 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3112 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3115 if (m
->min_use
== (uint64_t) -1)
3116 m
->min_use
= DEFAULT_MIN_USE
;
3118 if (m
->min_use
> m
->max_use
)
3119 m
->min_use
= m
->max_use
;
3121 if (m
->max_size
== (uint64_t) -1) {
3122 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3124 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3125 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3127 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3129 if (m
->max_size
!= 0) {
3130 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3131 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3133 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3134 m
->max_use
= m
->max_size
*2;
3137 if (m
->min_size
== (uint64_t) -1)
3138 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3140 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3142 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3143 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3145 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3146 m
->max_size
= m
->min_size
;
3149 if (m
->keep_free
== (uint64_t) -1) {
3152 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3154 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3155 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3158 m
->keep_free
= DEFAULT_KEEP_FREE
;
3161 if (m
->n_max_files
== (uint64_t) -1)
3162 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3164 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3165 format_bytes(a
, sizeof(a
), m
->min_use
),
3166 format_bytes(b
, sizeof(b
), m
->max_use
),
3167 format_bytes(c
, sizeof(c
), m
->max_size
),
3168 format_bytes(d
, sizeof(d
), m
->min_size
),
3169 format_bytes(e
, sizeof(e
), m
->keep_free
),
3173 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3178 if (f
->header
->head_entry_realtime
== 0)
3181 *from
= le64toh(f
->header
->head_entry_realtime
);
3185 if (f
->header
->tail_entry_realtime
== 0)
3188 *to
= le64toh(f
->header
->tail_entry_realtime
);
3194 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3202 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3206 if (le64toh(o
->data
.n_entries
) <= 0)
3210 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3214 *from
= le64toh(o
->entry
.monotonic
);
3218 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3222 r
= generic_array_get_plus_one(f
,
3223 le64toh(o
->data
.entry_offset
),
3224 le64toh(o
->data
.entry_array_offset
),
3225 le64toh(o
->data
.n_entries
)-1,
3230 *to
= le64toh(o
->entry
.monotonic
);
3236 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3239 /* If we gained new header fields we gained new features,
3240 * hence suggest a rotation */
3241 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3242 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3246 /* Let's check if the hash tables grew over a certain fill
3247 * level (75%, borrowing this value from Java's hash table
3248 * implementation), and if so suggest a rotation. To calculate
3249 * the fill level we need the n_data field, which only exists
3250 * in newer versions. */
3252 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3253 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3254 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3256 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3257 le64toh(f
->header
->n_data
),
3258 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3259 (unsigned long long) f
->last_stat
.st_size
,
3260 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3264 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3265 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3266 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3268 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3269 le64toh(f
->header
->n_fields
),
3270 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3274 /* Are the data objects properly indexed by field objects? */
3275 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3276 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3277 le64toh(f
->header
->n_data
) > 0 &&
3278 le64toh(f
->header
->n_fields
) == 0)
3281 if (max_file_usec
> 0) {
3284 h
= le64toh(f
->header
->head_entry_realtime
);
3285 t
= now(CLOCK_REALTIME
);
3287 if (h
> 0 && t
> h
+ max_file_usec
)