1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
36 #include "journal-authenticate.h"
37 #include "journal-def.h"
38 #include "journal-file.h"
40 #include "parse-util.h"
41 #include "random-util.h"
43 #include "string-util.h"
44 #include "xattr-util.h"
46 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
47 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
49 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
51 /* This is the minimum journal file size */
52 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
54 /* These are the lower and upper bounds if we deduce the max_use value
55 * from the file system size */
56 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
57 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
59 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
60 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
62 /* This is the upper bound if we deduce max_size from max_use */
63 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
65 /* This is the upper bound if we deduce the keep_free value from the
67 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
69 /* This is the keep_free value when we can't determine the system
71 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
73 /* This is the default maximum number of journal files to keep around. */
74 #define DEFAULT_N_MAX_FILES (100)
76 /* n_data was the first entry we added after the initial file format design */
77 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
79 /* How many entries to keep in the entry array chain cache at max */
80 #define CHAIN_CACHE_MAX 20
82 /* How much to increase the journal file size at once each time we allocate something new. */
83 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
85 /* Reread fstat() of the file for detecting deletions at least this often */
86 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
88 /* The mmap context to use for the header we pick as one above the last defined type */
89 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
91 static int journal_file_set_online(JournalFile
*f
) {
97 if (!(f
->fd
>= 0 && f
->header
))
100 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
103 switch(f
->header
->state
) {
108 f
->header
->state
= STATE_ONLINE
;
117 int journal_file_set_offline(JournalFile
*f
) {
123 if (!(f
->fd
>= 0 && f
->header
))
126 if (f
->header
->state
!= STATE_ONLINE
)
131 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
134 f
->header
->state
= STATE_OFFLINE
;
136 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
144 JournalFile
* journal_file_close(JournalFile
*f
) {
148 /* Write the final tag */
149 if (f
->seal
&& f
->writable
)
150 journal_file_append_tag(f
);
153 if (f
->post_change_timer
) {
156 if (sd_event_source_get_enabled(f
->post_change_timer
, &enabled
) >= 0)
157 if (enabled
== SD_EVENT_ONESHOT
)
158 journal_file_post_change(f
);
160 (void) sd_event_source_set_enabled(f
->post_change_timer
, SD_EVENT_OFF
);
161 sd_event_source_unref(f
->post_change_timer
);
164 journal_file_set_offline(f
);
166 if (f
->mmap
&& f
->fd
>= 0)
167 mmap_cache_close_fd(f
->mmap
, f
->fd
);
169 if (f
->fd
>= 0 && f
->defrag_on_close
) {
171 /* Be friendly to btrfs: turn COW back on again now,
172 * and defragment the file. We won't write to the file
173 * ever again, hence remove all fragmentation, and
174 * reenable all the good bits COW usually provides
175 * (such as data checksumming). */
177 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
178 (void) btrfs_defrag_fd(f
->fd
);
184 mmap_cache_unref(f
->mmap
);
186 ordered_hashmap_free_free(f
->chain_cache
);
188 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
189 free(f
->compress_buffer
);
194 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
196 free(f
->fsprg_state
);
201 gcry_md_close(f
->hmac
);
208 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
215 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
216 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
218 h
.incompatible_flags
|= htole32(
219 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
220 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
222 h
.compatible_flags
= htole32(
223 f
->seal
* HEADER_COMPATIBLE_SEALED
);
225 r
= sd_id128_randomize(&h
.file_id
);
230 h
.seqnum_id
= template->header
->seqnum_id
;
231 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
233 h
.seqnum_id
= h
.file_id
;
235 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
245 static int journal_file_refresh_header(JournalFile
*f
) {
251 r
= sd_id128_get_machine(&f
->header
->machine_id
);
255 r
= sd_id128_get_boot(&boot_id
);
259 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
260 f
->tail_entry_monotonic_valid
= true;
262 f
->header
->boot_id
= boot_id
;
264 r
= journal_file_set_online(f
);
266 /* Sync the online state to disk */
272 static int journal_file_verify_header(JournalFile
*f
) {
277 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
280 /* In both read and write mode we refuse to open files with
281 * incompatible flags we don't know */
282 flags
= le32toh(f
->header
->incompatible_flags
);
283 if (flags
& ~HEADER_INCOMPATIBLE_SUPPORTED
) {
284 if (flags
& ~HEADER_INCOMPATIBLE_ANY
)
285 log_debug("Journal file %s has unknown incompatible flags %"PRIx32
,
286 f
->path
, flags
& ~HEADER_INCOMPATIBLE_ANY
);
287 flags
= (flags
& HEADER_INCOMPATIBLE_ANY
) & ~HEADER_INCOMPATIBLE_SUPPORTED
;
289 log_debug("Journal file %s uses incompatible flags %"PRIx32
290 " disabled at compilation time.", f
->path
, flags
);
291 return -EPROTONOSUPPORT
;
294 /* When open for writing we refuse to open files with
295 * compatible flags we don't know, too */
296 flags
= le32toh(f
->header
->compatible_flags
);
297 if (f
->writable
&& (flags
& ~HEADER_COMPATIBLE_SUPPORTED
)) {
298 if (flags
& ~HEADER_COMPATIBLE_ANY
)
299 log_debug("Journal file %s has unknown compatible flags %"PRIx32
,
300 f
->path
, flags
& ~HEADER_COMPATIBLE_ANY
);
301 flags
= (flags
& HEADER_COMPATIBLE_ANY
) & ~HEADER_COMPATIBLE_SUPPORTED
;
303 log_debug("Journal file %s uses compatible flags %"PRIx32
304 " disabled at compilation time.", f
->path
, flags
);
305 return -EPROTONOSUPPORT
;
308 if (f
->header
->state
>= _STATE_MAX
)
311 /* The first addition was n_data, so check that we are at least this large */
312 if (le64toh(f
->header
->header_size
) < HEADER_SIZE_MIN
)
315 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
318 if ((le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)) > (uint64_t) f
->last_stat
.st_size
)
321 if (le64toh(f
->header
->tail_object_offset
) > (le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)))
324 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
325 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
326 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
327 !VALID64(le64toh(f
->header
->entry_array_offset
)))
332 sd_id128_t machine_id
;
335 r
= sd_id128_get_machine(&machine_id
);
339 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
342 state
= f
->header
->state
;
344 if (state
== STATE_ONLINE
) {
345 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
347 } else if (state
== STATE_ARCHIVED
)
349 else if (state
!= STATE_OFFLINE
) {
350 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
355 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
356 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
358 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
363 static int journal_file_fstat(JournalFile
*f
) {
367 if (fstat(f
->fd
, &f
->last_stat
) < 0)
370 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
372 /* Refuse appending to files that are already deleted */
373 if (f
->last_stat
.st_nlink
<= 0)
379 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
380 uint64_t old_size
, new_size
;
385 /* We assume that this file is not sparse, and we know that
386 * for sure, since we always call posix_fallocate()
389 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
393 le64toh(f
->header
->header_size
) +
394 le64toh(f
->header
->arena_size
);
396 new_size
= PAGE_ALIGN(offset
+ size
);
397 if (new_size
< le64toh(f
->header
->header_size
))
398 new_size
= le64toh(f
->header
->header_size
);
400 if (new_size
<= old_size
) {
402 /* We already pre-allocated enough space, but before
403 * we write to it, let's check with fstat() if the
404 * file got deleted, in order to make sure we don't throw
405 * away the data immediately. Don't check fstat() for
406 * all writes though, but only once every 5s (LAST_STAT_REFRESH_USEC). */
408 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
411 return journal_file_fstat(f
);
414 /* Allocate more space. */
416 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
419 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
422 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
425 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
427 if (new_size
- old_size
> available
)
432 /* Increase by larger blocks at once */
433 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
434 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
435 new_size
= f
->metrics
.max_size
;
437 /* Note that the glibc fallocate() fallback is very
438 inefficient, hence we try to minimize the allocation area
440 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
444 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
446 return journal_file_fstat(f
);
449 static unsigned type_to_context(ObjectType type
) {
450 /* One context for each type, plus one catch-all for the rest */
451 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
452 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
453 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
456 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
) {
465 /* Avoid SIGBUS on invalid accesses */
466 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
467 /* Hmm, out of range? Let's refresh the fstat() data
468 * first, before we trust that check. */
470 r
= journal_file_fstat(f
);
474 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
475 return -EADDRNOTAVAIL
;
478 return mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
481 static uint64_t minimum_header_size(Object
*o
) {
483 static const uint64_t table
[] = {
484 [OBJECT_DATA
] = sizeof(DataObject
),
485 [OBJECT_FIELD
] = sizeof(FieldObject
),
486 [OBJECT_ENTRY
] = sizeof(EntryObject
),
487 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
488 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
489 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
490 [OBJECT_TAG
] = sizeof(TagObject
),
493 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
494 return sizeof(ObjectHeader
);
496 return table
[o
->object
.type
];
499 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
508 /* Objects may only be located at multiple of 64 bit */
509 if (!VALID64(offset
))
512 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
517 s
= le64toh(o
->object
.size
);
519 if (s
< sizeof(ObjectHeader
))
522 if (o
->object
.type
<= OBJECT_UNUSED
)
525 if (s
< minimum_header_size(o
))
528 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
531 if (s
> sizeof(ObjectHeader
)) {
532 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
543 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
548 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
551 /* If an external seqnum counter was passed, we update
552 * both the local and the external one, and set it to
553 * the maximum of both */
561 f
->header
->tail_entry_seqnum
= htole64(r
);
563 if (f
->header
->head_entry_seqnum
== 0)
564 f
->header
->head_entry_seqnum
= htole64(r
);
569 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
576 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
577 assert(size
>= sizeof(ObjectHeader
));
581 r
= journal_file_set_online(f
);
585 p
= le64toh(f
->header
->tail_object_offset
);
587 p
= le64toh(f
->header
->header_size
);
589 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
593 p
+= ALIGN64(le64toh(tail
->object
.size
));
596 r
= journal_file_allocate(f
, p
, size
);
600 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
607 o
->object
.type
= type
;
608 o
->object
.size
= htole64(size
);
610 f
->header
->tail_object_offset
= htole64(p
);
611 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
619 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
626 /* We estimate that we need 1 hash table entry per 768 bytes
627 of journal file and we want to make sure we never get
628 beyond 75% fill level. Calculate the hash table size for
629 the maximum file size based on these metrics. */
631 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
632 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
633 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
635 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
637 r
= journal_file_append_object(f
,
638 OBJECT_DATA_HASH_TABLE
,
639 offsetof(Object
, hash_table
.items
) + s
,
644 memzero(o
->hash_table
.items
, s
);
646 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
647 f
->header
->data_hash_table_size
= htole64(s
);
652 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
659 /* We use a fixed size hash table for the fields as this
660 * number should grow very slowly only */
662 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
663 r
= journal_file_append_object(f
,
664 OBJECT_FIELD_HASH_TABLE
,
665 offsetof(Object
, hash_table
.items
) + s
,
670 memzero(o
->hash_table
.items
, s
);
672 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
673 f
->header
->field_hash_table_size
= htole64(s
);
678 int journal_file_map_data_hash_table(JournalFile
*f
) {
685 if (f
->data_hash_table
)
688 p
= le64toh(f
->header
->data_hash_table_offset
);
689 s
= le64toh(f
->header
->data_hash_table_size
);
691 r
= journal_file_move_to(f
,
692 OBJECT_DATA_HASH_TABLE
,
699 f
->data_hash_table
= t
;
703 int journal_file_map_field_hash_table(JournalFile
*f
) {
710 if (f
->field_hash_table
)
713 p
= le64toh(f
->header
->field_hash_table_offset
);
714 s
= le64toh(f
->header
->field_hash_table_size
);
716 r
= journal_file_move_to(f
,
717 OBJECT_FIELD_HASH_TABLE
,
724 f
->field_hash_table
= t
;
728 static int journal_file_link_field(
741 if (o
->object
.type
!= OBJECT_FIELD
)
744 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
748 /* This might alter the window we are looking at */
749 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
752 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
754 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
756 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
760 o
->field
.next_hash_offset
= htole64(offset
);
763 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
765 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
766 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
771 static int journal_file_link_data(
784 if (o
->object
.type
!= OBJECT_DATA
)
787 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
791 /* This might alter the window we are looking at */
792 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
793 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
794 o
->data
.n_entries
= 0;
797 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
799 /* Only entry in the hash table is easy */
800 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
802 /* Move back to the previous data object, to patch in
805 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
809 o
->data
.next_hash_offset
= htole64(offset
);
812 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
814 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
815 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
820 int journal_file_find_field_object_with_hash(
822 const void *field
, uint64_t size
, uint64_t hash
,
823 Object
**ret
, uint64_t *offset
) {
825 uint64_t p
, osize
, h
, m
;
829 assert(field
&& size
> 0);
831 /* If the field hash table is empty, we can't find anything */
832 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
835 /* Map the field hash table, if it isn't mapped yet. */
836 r
= journal_file_map_field_hash_table(f
);
840 osize
= offsetof(Object
, field
.payload
) + size
;
842 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
847 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
852 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
856 if (le64toh(o
->field
.hash
) == hash
&&
857 le64toh(o
->object
.size
) == osize
&&
858 memcmp(o
->field
.payload
, field
, size
) == 0) {
868 p
= le64toh(o
->field
.next_hash_offset
);
874 int journal_file_find_field_object(
876 const void *field
, uint64_t size
,
877 Object
**ret
, uint64_t *offset
) {
882 assert(field
&& size
> 0);
884 hash
= hash64(field
, size
);
886 return journal_file_find_field_object_with_hash(f
,
891 int journal_file_find_data_object_with_hash(
893 const void *data
, uint64_t size
, uint64_t hash
,
894 Object
**ret
, uint64_t *offset
) {
896 uint64_t p
, osize
, h
, m
;
900 assert(data
|| size
== 0);
902 /* If there's no data hash table, then there's no entry. */
903 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
906 /* Map the data hash table, if it isn't mapped yet. */
907 r
= journal_file_map_data_hash_table(f
);
911 osize
= offsetof(Object
, data
.payload
) + size
;
913 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
918 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
923 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
927 if (le64toh(o
->data
.hash
) != hash
)
930 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
931 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
935 l
= le64toh(o
->object
.size
);
936 if (l
<= offsetof(Object
, data
.payload
))
939 l
-= offsetof(Object
, data
.payload
);
941 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
942 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
947 memcmp(f
->compress_buffer
, data
, size
) == 0) {
958 return -EPROTONOSUPPORT
;
960 } else if (le64toh(o
->object
.size
) == osize
&&
961 memcmp(o
->data
.payload
, data
, size
) == 0) {
973 p
= le64toh(o
->data
.next_hash_offset
);
979 int journal_file_find_data_object(
981 const void *data
, uint64_t size
,
982 Object
**ret
, uint64_t *offset
) {
987 assert(data
|| size
== 0);
989 hash
= hash64(data
, size
);
991 return journal_file_find_data_object_with_hash(f
,
996 static int journal_file_append_field(
998 const void *field
, uint64_t size
,
999 Object
**ret
, uint64_t *offset
) {
1007 assert(field
&& size
> 0);
1009 hash
= hash64(field
, size
);
1011 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1025 osize
= offsetof(Object
, field
.payload
) + size
;
1026 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1030 o
->field
.hash
= htole64(hash
);
1031 memcpy(o
->field
.payload
, field
, size
);
1033 r
= journal_file_link_field(f
, o
, p
, hash
);
1037 /* The linking might have altered the window, so let's
1038 * refresh our pointer */
1039 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1044 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1058 static int journal_file_append_data(
1060 const void *data
, uint64_t size
,
1061 Object
**ret
, uint64_t *offset
) {
1066 int r
, compression
= 0;
1070 assert(data
|| size
== 0);
1072 hash
= hash64(data
, size
);
1074 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1088 osize
= offsetof(Object
, data
.payload
) + size
;
1089 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1093 o
->data
.hash
= htole64(hash
);
1095 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1096 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1099 compression
= compress_blob(data
, size
, o
->data
.payload
, size
- 1, &rsize
);
1101 if (compression
>= 0) {
1102 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1103 o
->object
.flags
|= compression
;
1105 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1106 size
, rsize
, object_compressed_to_string(compression
));
1108 /* Compression didn't work, we don't really care why, let's continue without compression */
1113 if (compression
== 0 && size
> 0)
1114 memcpy(o
->data
.payload
, data
, size
);
1116 r
= journal_file_link_data(f
, o
, p
, hash
);
1120 /* The linking might have altered the window, so let's
1121 * refresh our pointer */
1122 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1129 eq
= memchr(data
, '=', size
);
1130 if (eq
&& eq
> data
) {
1134 /* Create field object ... */
1135 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1139 /* ... and link it in. */
1140 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1141 fo
->field
.head_data_offset
= le64toh(p
);
1145 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1159 uint64_t journal_file_entry_n_items(Object
*o
) {
1162 if (o
->object
.type
!= OBJECT_ENTRY
)
1165 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1168 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1171 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1174 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1177 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1180 if (o
->object
.type
!= OBJECT_DATA_HASH_TABLE
&&
1181 o
->object
.type
!= OBJECT_FIELD_HASH_TABLE
)
1184 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1187 static int link_entry_into_array(JournalFile
*f
,
1192 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1200 a
= le64toh(*first
);
1201 i
= hidx
= le64toh(*idx
);
1204 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1208 n
= journal_file_entry_array_n_items(o
);
1210 o
->entry_array
.items
[i
] = htole64(p
);
1211 *idx
= htole64(hidx
+ 1);
1217 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1228 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1229 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1235 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1240 o
->entry_array
.items
[i
] = htole64(p
);
1243 *first
= htole64(q
);
1245 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1249 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1252 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1253 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1255 *idx
= htole64(hidx
+ 1);
1260 static int link_entry_into_array_plus_one(JournalFile
*f
,
1275 *extra
= htole64(p
);
1279 i
= htole64(le64toh(*idx
) - 1);
1280 r
= link_entry_into_array(f
, first
, &i
, p
);
1285 *idx
= htole64(le64toh(*idx
) + 1);
1289 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1296 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1300 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1304 return link_entry_into_array_plus_one(f
,
1305 &o
->data
.entry_offset
,
1306 &o
->data
.entry_array_offset
,
1311 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1319 if (o
->object
.type
!= OBJECT_ENTRY
)
1322 __sync_synchronize();
1324 /* Link up the entry itself */
1325 r
= link_entry_into_array(f
,
1326 &f
->header
->entry_array_offset
,
1327 &f
->header
->n_entries
,
1332 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1334 if (f
->header
->head_entry_realtime
== 0)
1335 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1337 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1338 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1340 f
->tail_entry_monotonic_valid
= true;
1342 /* Link up the items */
1343 n
= journal_file_entry_n_items(o
);
1344 for (i
= 0; i
< n
; i
++) {
1345 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1353 static int journal_file_append_entry_internal(
1355 const dual_timestamp
*ts
,
1357 const EntryItem items
[], unsigned n_items
,
1359 Object
**ret
, uint64_t *offset
) {
1366 assert(items
|| n_items
== 0);
1369 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1371 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1375 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1376 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1377 o
->entry
.realtime
= htole64(ts
->realtime
);
1378 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1379 o
->entry
.xor_hash
= htole64(xor_hash
);
1380 o
->entry
.boot_id
= f
->header
->boot_id
;
1383 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1388 r
= journal_file_link_entry(f
, o
, np
);
1401 void journal_file_post_change(JournalFile
*f
) {
1404 /* inotify() does not receive IN_MODIFY events from file
1405 * accesses done via mmap(). After each access we hence
1406 * trigger IN_MODIFY by truncating the journal file to its
1407 * current size which triggers IN_MODIFY. */
1409 __sync_synchronize();
1411 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1412 log_debug_errno(errno
, "Failed to truncate file to its own size: %m");
1415 static int post_change_thunk(sd_event_source
*timer
, uint64_t usec
, void *userdata
) {
1418 journal_file_post_change(userdata
);
1423 static void schedule_post_change(JournalFile
*f
) {
1424 sd_event_source
*timer
;
1429 assert(f
->post_change_timer
);
1431 timer
= f
->post_change_timer
;
1433 r
= sd_event_source_get_enabled(timer
, &enabled
);
1435 log_debug_errno(r
, "Failed to get ftruncate timer state: %m");
1439 if (enabled
== SD_EVENT_ONESHOT
)
1442 r
= sd_event_now(sd_event_source_get_event(timer
), CLOCK_MONOTONIC
, &now
);
1444 log_debug_errno(r
, "Failed to get clock's now for scheduling ftruncate: %m");
1448 r
= sd_event_source_set_time(timer
, now
+f
->post_change_timer_period
);
1450 log_debug_errno(r
, "Failed to set time for scheduling ftruncate: %m");
1454 r
= sd_event_source_set_enabled(timer
, SD_EVENT_ONESHOT
);
1456 log_debug_errno(r
, "Failed to enable scheduled ftruncate: %m");
1463 /* On failure, let's simply post the change immediately. */
1464 journal_file_post_change(f
);
1467 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1468 int journal_file_enable_post_change_timer(JournalFile
*f
, sd_event
*e
, usec_t t
) {
1469 _cleanup_(sd_event_source_unrefp
) sd_event_source
*timer
= NULL
;
1473 assert_return(!f
->post_change_timer
, -EINVAL
);
1477 r
= sd_event_add_time(e
, &timer
, CLOCK_MONOTONIC
, 0, 0, post_change_thunk
, f
);
1481 r
= sd_event_source_set_enabled(timer
, SD_EVENT_OFF
);
1485 f
->post_change_timer
= timer
;
1487 f
->post_change_timer_period
= t
;
1492 static int entry_item_cmp(const void *_a
, const void *_b
) {
1493 const EntryItem
*a
= _a
, *b
= _b
;
1495 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1497 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1502 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1506 uint64_t xor_hash
= 0;
1507 struct dual_timestamp _ts
;
1510 assert(iovec
|| n_iovec
== 0);
1513 dual_timestamp_get(&_ts
);
1517 if (f
->tail_entry_monotonic_valid
&&
1518 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
1522 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1527 /* alloca() can't take 0, hence let's allocate at least one */
1528 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1530 for (i
= 0; i
< n_iovec
; i
++) {
1534 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1538 xor_hash
^= le64toh(o
->data
.hash
);
1539 items
[i
].object_offset
= htole64(p
);
1540 items
[i
].hash
= o
->data
.hash
;
1543 /* Order by the position on disk, in order to improve seek
1544 * times for rotating media. */
1545 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
1547 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
1549 /* If the memory mapping triggered a SIGBUS then we return an
1550 * IO error and ignore the error code passed down to us, since
1551 * it is very likely just an effect of a nullified replacement
1554 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
1557 if (f
->post_change_timer
)
1558 schedule_post_change(f
);
1560 journal_file_post_change(f
);
1565 typedef struct ChainCacheItem
{
1566 uint64_t first
; /* the array at the beginning of the chain */
1567 uint64_t array
; /* the cached array */
1568 uint64_t begin
; /* the first item in the cached array */
1569 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
1570 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
1573 static void chain_cache_put(
1580 uint64_t last_index
) {
1583 /* If the chain item to cache for this chain is the
1584 * first one it's not worth caching anything */
1588 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
1589 ci
= ordered_hashmap_steal_first(h
);
1592 ci
= new(ChainCacheItem
, 1);
1599 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
1604 assert(ci
->first
== first
);
1609 ci
->last_index
= last_index
;
/* Looks up item i of the entry array chain that starts at first and moves
 * to the entry object it refers to.
 *
 * The chain cache of f is consulted first; a cached item is only considered
 * when i is larger than the number of items counted before the cached array
 * (ci->total). Each array object is loaded as OBJECT_ENTRY_ARRAY, its item
 * count is taken from journal_file_entry_array_n_items(), and the walk
 * continues through next_entry_array_offset. The array that was reached is
 * put back into the chain cache before the entry at offset p is loaded as
 * OBJECT_ENTRY.
 *
 * NOTE(review): loop structure, error checks and the assignments to *ret
 * and *offset are on lines not included in this listing. */
1612 static int generic_array_get(
1616 Object
**ret
, uint64_t *offset
) {
1619 uint64_t p
= 0, a
, t
= 0;
1627 /* Try the chain cache first */
1628 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1629 if (ci
&& i
> ci
->total
) {
1638 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1642 k
= journal_file_entry_array_n_items(o
);
1644 p
= le64toh(o
->entry_array
.items
[i
]);
1650 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1656 /* Let's cache this item for the next invocation */
1657 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
1659 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
/* Variant of generic_array_get() for sequences whose first element is kept
 * outside the array chain, at offset extra. Two paths are visible: the
 * entry at extra is loaded directly as OBJECT_ENTRY, and other lookups are
 * forwarded to generic_array_get() with the index reduced by one.
 *
 * NOTE(review): the condition that selects between the two paths is not
 * part of this listing (presumably i == 0 selects extra). */
1672 static int generic_array_get_plus_one(
1677 Object
**ret
, uint64_t *offset
) {
1686 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1699 return generic_array_get(f
, first
, i
-1, ret
, offset
);
/* Bisects the entry array chain starting at first for needle.
 *
 * test_object() classifies a candidate entry offset p relative to needle;
 * the results compared below are TEST_FOUND, TEST_LEFT and TEST_RIGHT.
 * Wherever a TEST_FOUND result is seen it is mapped to TEST_RIGHT for
 * DIRECTION_DOWN and to TEST_LEFT otherwise. With DIRECTION_UP the result
 * is stepped back by one item (subtract_one).
 *
 * Two shortcuts are tried before a plain bisection of an array: the chain
 * cache may allow skipping ahead to a previously visited array, and the
 * cached last_index is probed together with its immediate neighbours to
 * narrow the range early. When moving on to the next array in the chain
 * last_index is reset to (uint64_t) -1.
 *
 * At the end the array reached is stored in the chain cache, the selected
 * entry is loaded as OBJECT_ENTRY, and *idx receives t + i (minus one when
 * stepping back).
 *
 * NOTE(review): parts of the parameter list, the loop statements, the
 * range updates and all return statements are not part of this listing. */
1708 static int generic_array_bisect(
1713 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1714 direction_t direction
,
1719 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
1720 bool subtract_one
= false;
1721 Object
*o
, *array
= NULL
;
1726 assert(test_object
);
1728 /* Start with the first array in the chain */
1731 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1732 if (ci
&& n
> ci
->total
) {
1733 /* Ah, we have iterated this bisection array chain
1734 * previously! Let's see if we can skip ahead in the
1735 * chain, as far as the last time. But we can't jump
1736 * backwards in the chain, so let's check that
1739 r
= test_object(f
, ci
->begin
, needle
);
1743 if (r
== TEST_LEFT
) {
1744 /* OK, what we are looking for is right of the
1745 * begin of this EntryArray, so let's jump
1746 * straight to previously cached array in the
1752 last_index
= ci
->last_index
;
1757 uint64_t left
, right
, k
, lp
;
1759 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1763 k
= journal_file_entry_array_n_items(array
);
1769 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1773 r
= test_object(f
, p
, needle
);
1777 if (r
== TEST_FOUND
)
1778 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1780 if (r
== TEST_RIGHT
) {
1784 if (last_index
!= (uint64_t) -1) {
1785 assert(last_index
<= right
);
1787 /* If we cached the last index we
1788 * looked at, let's try not to jump
1789 * too wildly around and see if we can
1790 * limit the range to look at early to
1791 * the immediate neighbors of the last
1792 * index we looked at. */
1794 if (last_index
> 0) {
1795 uint64_t x
= last_index
- 1;
1797 p
= le64toh(array
->entry_array
.items
[x
]);
1801 r
= test_object(f
, p
, needle
);
1805 if (r
== TEST_FOUND
)
1806 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1808 if (r
== TEST_RIGHT
)
1814 if (last_index
< right
) {
1815 uint64_t y
= last_index
+ 1;
1817 p
= le64toh(array
->entry_array
.items
[y
]);
1821 r
= test_object(f
, p
, needle
);
1825 if (r
== TEST_FOUND
)
1826 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1828 if (r
== TEST_RIGHT
)
1836 if (left
== right
) {
1837 if (direction
== DIRECTION_UP
)
1838 subtract_one
= true;
1844 assert(left
< right
);
1845 i
= (left
+ right
) / 2;
1847 p
= le64toh(array
->entry_array
.items
[i
]);
1851 r
= test_object(f
, p
, needle
);
1855 if (r
== TEST_FOUND
)
1856 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1858 if (r
== TEST_RIGHT
)
1866 if (direction
== DIRECTION_UP
) {
1868 subtract_one
= true;
1879 last_index
= (uint64_t) -1;
1880 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1886 if (subtract_one
&& t
== 0 && i
== 0)
1889 /* Let's cache this item for the next invocation */
1890 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
1892 if (subtract_one
&& i
== 0)
1894 else if (subtract_one
)
1895 p
= le64toh(array
->entry_array
.items
[i
-1]);
1897 p
= le64toh(array
->entry_array
.items
[i
]);
1899 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1910 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
/* Variant of generic_array_bisect() for sequences whose first entry is kept
 * outside the array chain, at offset extra.
 *
 * The extra entry is tested first (TEST_FOUND is mapped to TEST_RIGHT for
 * DIRECTION_DOWN and to TEST_LEFT otherwise). The chain itself is then
 * bisected with n-1 items. For DIRECTION_UP the extra entry is remembered
 * as a fallback (step_back) and is used when the bisection of the chain
 * returns 0; in that case, and when the extra entry is the result, it is
 * loaded as OBJECT_ENTRY.
 *
 * NOTE(review): parts of the parameter list, the return statements and the
 * adjustment of the resulting index are not part of this listing. */
1915 static int generic_array_bisect_plus_one(
1921 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1922 direction_t direction
,
1928 bool step_back
= false;
1932 assert(test_object
);
1937 /* This bisects the array in object 'first', but first checks
1939 r
= test_object(f
, extra
, needle
);
1943 if (r
== TEST_FOUND
)
1944 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1946 /* if we are looking with DIRECTION_UP then we need to first
1947 see if in the actual array there is a matching entry, and
1948 return the last one of that. But if there isn't any we need
1949 to return this one. Hence remember this, and return it
1952 step_back
= direction
== DIRECTION_UP
;
1954 if (r
== TEST_RIGHT
) {
1955 if (direction
== DIRECTION_DOWN
)
1961 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1963 if (r
== 0 && step_back
)
1972 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
/* Bisection predicate that compares the candidate offset p with needle.
 * Only the p < needle comparison is visible in this listing; the TEST_*
 * values returned by the individual branches are on lines not shown. */
1988 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1994 else if (p
< needle
)
/* Bisection predicate: loads the entry object at offset p and compares its
 * sequence number (converted from little endian) with needle, first for
 * equality and then for being smaller. The TEST_* values returned by the
 * branches are on lines not shown in this listing. */
2000 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2007 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2011 if (le64toh(o
->entry
.seqnum
) == needle
)
2013 else if (le64toh(o
->entry
.seqnum
) < needle
)
/* Seeks in the global entry array of the file, described by the header
 * fields entry_array_offset and n_entries, using generic_array_bisect().
 * NOTE(review): the needle and predicate arguments of the call are not
 * part of this listing; going by the function name the search key is the
 * entry sequence number. */
2019 int journal_file_move_to_entry_by_seqnum(
2022 direction_t direction
,
2026 return generic_array_bisect(f
,
2027 le64toh(f
->header
->entry_array_offset
),
2028 le64toh(f
->header
->n_entries
),
/* Bisection predicate: loads the entry object at offset p and compares its
 * realtime timestamp (converted from little endian) with needle, first for
 * equality and then for being smaller. The TEST_* values returned by the
 * branches are on lines not shown in this listing. */
2035 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2042 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2046 if (le64toh(o
->entry
.realtime
) == needle
)
2048 else if (le64toh(o
->entry
.realtime
) < needle
)
/* Seeks to an entry by realtime timestamp: bisects the global entry array
 * of the file (header fields entry_array_offset and n_entries) with
 * test_object_realtime() as the predicate. */
2054 int journal_file_move_to_entry_by_realtime(
2057 direction_t direction
,
2061 return generic_array_bisect(f
,
2062 le64toh(f
->header
->entry_array_offset
),
2063 le64toh(f
->header
->n_entries
),
2065 test_object_realtime
,
/* Bisection predicate: loads the entry object at offset p and compares its
 * monotonic timestamp (converted from little endian) with needle, first for
 * equality and then for being smaller. The TEST_* values returned by the
 * branches are on lines not shown in this listing. */
2070 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2077 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2081 if (le64toh(o
->entry
.monotonic
) == needle
)
2083 else if (le64toh(o
->entry
.monotonic
) < needle
)
/* Looks up the data object for the _BOOT_ID= field of the given boot ID.
 * The buffer t is sized for the 9 character prefix, 32 characters for the
 * ID string and a trailing NUL byte; sd_id128_to_string() writes the ID
 * right after the prefix (t + 9) and the NUL byte is excluded from the
 * length handed to journal_file_find_data_object(). The out parameters o
 * and b are passed through unchanged. */
2089 static int find_data_object_by_boot_id(
2095 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2097 sd_id128_to_string(boot_id
, t
+ 9);
2098 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
/* Seeks to an entry by monotonic timestamp within one boot. The data
 * object of the boot ID is located with find_data_object_by_boot_id(), and
 * the entries referencing it (entry_offset plus the array chain at
 * entry_array_offset, n_entries items in total) are bisected with
 * test_object_monotonic() as the predicate. */
2101 int journal_file_move_to_entry_by_monotonic(
2105 direction_t direction
,
2114 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2120 return generic_array_bisect_plus_one(f
,
2121 le64toh(o
->data
.entry_offset
),
2122 le64toh(o
->data
.entry_array_offset
),
2123 le64toh(o
->data
.n_entries
),
2125 test_object_monotonic
,
/* Resets the saved read position of f: the location type becomes
 * LOCATION_HEAD and the saved offset, sequence number, realtime and
 * monotonic timestamps, boot ID and XOR hash are all cleared to zero. */
2130 void journal_file_reset_location(JournalFile
*f
) {
2131 f
->location_type
= LOCATION_HEAD
;
2132 f
->current_offset
= 0;
2133 f
->current_seqnum
= 0;
2134 f
->current_realtime
= 0;
2135 f
->current_monotonic
= 0;
2136 zero(f
->current_boot_id
);
2137 f
->current_xor_hash
= 0;
/* Records the entry object o, located at offset, as the current read
 * position of f. The location type becomes LOCATION_SEEK; seqnum, realtime,
 * monotonic and xor_hash are converted from little endian while the boot ID
 * is copied as is. o is read through its entry member, so it has to be an
 * entry object. */
2140 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2141 f
->location_type
= LOCATION_SEEK
;
2142 f
->current_offset
= offset
;
2143 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2144 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2145 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2146 f
->current_boot_id
= o
->entry
.boot_id
;
2147 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
/* Orders the saved read positions of two journal files. Both files must be
 * at LOCATION_SEEK (asserted).
 *
 * The comparison proceeds in the stages visible below:
 *   1. same boot ID, monotonic time, realtime and XOR hash: the entries
 *      are considered identical;
 *   2. same seqnum_id in both headers: compare sequence numbers;
 *   3. same boot ID: compare monotonic time;
 *   4. compare realtime;
 *   5. compare XOR hash.
 *
 * NOTE(review): the values returned by the individual branches are on
 * lines not part of this listing. */
2150 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2153 assert(af
->location_type
== LOCATION_SEEK
);
2154 assert(bf
->location_type
== LOCATION_SEEK
);
2156 /* If contents and timestamps match, these entries are
2157 * identical, even if the seqnum does not match */
2158 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2159 af
->current_monotonic
== bf
->current_monotonic
&&
2160 af
->current_realtime
== bf
->current_realtime
&&
2161 af
->current_xor_hash
== bf
->current_xor_hash
)
2164 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2166 /* If this is from the same seqnum source, compare
2168 if (af
->current_seqnum
< bf
->current_seqnum
)
2170 if (af
->current_seqnum
> bf
->current_seqnum
)
2173 /* Wow! This is weird, different data but the same
2174 * seqnums? Something is borked, but let's make the
2175 * best of it and compare by time. */
2178 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2180 /* If the boot id matches, compare monotonic time */
2181 if (af
->current_monotonic
< bf
->current_monotonic
)
2183 if (af
->current_monotonic
> bf
->current_monotonic
)
2187 /* Otherwise, compare UTC time */
2188 if (af
->current_realtime
< bf
->current_realtime
)
2190 if (af
->current_realtime
> bf
->current_realtime
)
2193 /* Finally, compare by contents */
2194 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2196 if (af
->current_xor_hash
> bf
->current_xor_hash
)
/* Moves to a neighbouring entry in the global entry array of the file.
 *
 * Visible steps: the number of entries is read from the header; an index is
 * either chosen as the first (DIRECTION_DOWN) or last item, or found by
 * bisecting the global entry array; the entry at the resulting index is
 * loaded with generic_array_get(); and a debug message about a corrupted
 * entry array is logged when the offset found does not move in the
 * requested direction (ofs <= p going down, ofs >= p otherwise).
 *
 * NOTE(review): the conditions that select between the first two cases, the
 * index adjustment inside the direction check and all return statements are
 * not part of this listing. */
2202 int journal_file_next_entry(
2205 direction_t direction
,
2206 Object
**ret
, uint64_t *offset
) {
2213 n
= le64toh(f
->header
->n_entries
);
2218 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2220 r
= generic_array_bisect(f
,
2221 le64toh(f
->header
->entry_array_offset
),
2222 le64toh(f
->header
->n_entries
),
2231 if (direction
== DIRECTION_DOWN
) {
2244 /* And jump to it */
2245 r
= generic_array_get(f
,
2246 le64toh(f
->header
->entry_array_offset
),
2253 (direction
== DIRECTION_DOWN
? ofs
<= p
: ofs
>= p
)) {
2254 log_debug("%s: entry array corrupted at entry %"PRIu64
,
/* Moves to a neighbouring entry among the entries that reference the data
 * object at data_offset.
 *
 * Visible steps: p must be non-zero whenever o is given (asserted); the
 * data object is loaded and its n_entries read; an index is either chosen
 * as the first (DIRECTION_DOWN) or last item, or found by bisecting the
 * entries of the data object after checking that o is an OBJECT_ENTRY; the
 * result is fetched with generic_array_get_plus_one().
 *
 * NOTE(review): the conditions that select between the cases, the index
 * adjustment inside the direction check and most return statements are not
 * part of this listing. */
2265 int journal_file_next_entry_for_data(
2267 Object
*o
, uint64_t p
,
2268 uint64_t data_offset
,
2269 direction_t direction
,
2270 Object
**ret
, uint64_t *offset
) {
2277 assert(p
> 0 || !o
);
2279 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2283 n
= le64toh(d
->data
.n_entries
);
2288 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2290 if (o
->object
.type
!= OBJECT_ENTRY
)
2293 r
= generic_array_bisect_plus_one(f
,
2294 le64toh(d
->data
.entry_offset
),
2295 le64toh(d
->data
.entry_array_offset
),
2296 le64toh(d
->data
.n_entries
),
2306 if (direction
== DIRECTION_DOWN
) {
2320 return generic_array_get_plus_one(f
,
2321 le64toh(d
->data
.entry_offset
),
2322 le64toh(d
->data
.entry_array_offset
),
/* Loads the data object at data_offset and bisects the entries referencing
 * it (entry_offset plus the array chain at entry_array_offset, n_entries
 * items) with generic_array_bisect_plus_one().
 * NOTE(review): the needle and predicate arguments of the call are not part
 * of this listing; going by the function name the search key is the entry
 * offset. */
2327 int journal_file_move_to_entry_by_offset_for_data(
2329 uint64_t data_offset
,
2331 direction_t direction
,
2332 Object
**ret
, uint64_t *offset
) {
2339 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2343 return generic_array_bisect_plus_one(f
,
2344 le64toh(d
->data
.entry_offset
),
2345 le64toh(d
->data
.entry_array_offset
),
2346 le64toh(d
->data
.n_entries
),
/* Seeks by monotonic timestamp within one boot, restricted to entries that
 * reference the data object at data_offset.
 *
 * Step one locates the data object of the boot ID (its offset is kept in
 * b) and bisects its entries with test_object_monotonic(). Step two, per
 * the comment below, keeps seeking until an entry is found that exists in
 * both bisection arrays: the data object at data_offset and the boot ID
 * data object at b are re-loaded and bisected in turn.
 *
 * NOTE(review): the loop statement, the needles of the later bisections,
 * the termination test and the return statements are not part of this
 * listing. */
2353 int journal_file_move_to_entry_by_monotonic_for_data(
2355 uint64_t data_offset
,
2358 direction_t direction
,
2359 Object
**ret
, uint64_t *offset
) {
2367 /* First, seek by time */
2368 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2374 r
= generic_array_bisect_plus_one(f
,
2375 le64toh(o
->data
.entry_offset
),
2376 le64toh(o
->data
.entry_array_offset
),
2377 le64toh(o
->data
.n_entries
),
2379 test_object_monotonic
,
2385 /* And now, continue seeking until we find an entry that
2386 * exists in both bisection arrays */
2392 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2396 r
= generic_array_bisect_plus_one(f
,
2397 le64toh(d
->data
.entry_offset
),
2398 le64toh(d
->data
.entry_array_offset
),
2399 le64toh(d
->data
.n_entries
),
2407 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2411 r
= generic_array_bisect_plus_one(f
,
2412 le64toh(o
->data
.entry_offset
),
2413 le64toh(o
->data
.entry_array_offset
),
2414 le64toh(o
->data
.n_entries
),
/* Loads the data object at data_offset and bisects the entries referencing
 * it (entry_offset plus the array chain at entry_array_offset, n_entries
 * items) with generic_array_bisect_plus_one().
 * NOTE(review): the needle and predicate arguments of the call are not part
 * of this listing; going by the function name the search key is the entry
 * sequence number. */
2436 int journal_file_move_to_entry_by_seqnum_for_data(
2438 uint64_t data_offset
,
2440 direction_t direction
,
2441 Object
**ret
, uint64_t *offset
) {
2448 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2452 return generic_array_bisect_plus_one(f
,
2453 le64toh(d
->data
.entry_offset
),
2454 le64toh(d
->data
.entry_array_offset
),
2455 le64toh(d
->data
.n_entries
),
/* Seeks by realtime timestamp among the entries that reference the data
 * object at data_offset: the data object is loaded and its entries
 * (entry_offset plus the array chain at entry_array_offset, n_entries
 * items) are bisected with test_object_realtime() as the predicate. */
2462 int journal_file_move_to_entry_by_realtime_for_data(
2464 uint64_t data_offset
,
2466 direction_t direction
,
2467 Object
**ret
, uint64_t *offset
) {
2474 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2478 return generic_array_bisect_plus_one(f
,
2479 le64toh(d
->data
.entry_offset
),
2480 le64toh(d
->data
.entry_array_offset
),
2481 le64toh(d
->data
.n_entries
),
2483 test_object_realtime
,
/* Debugging aid that prints the file header followed by one line per
 * object to stdout.
 *
 * The walk starts at header_size. Every object is loaded with
 * OBJECT_UNUSED as the requested type and printed according to its type
 * field; entries additionally show seqnum, monotonic and realtime, tags
 * show seqnum and epoch, and unrecognised types are printed numerically.
 * The compression flags are printed when any bit of
 * OBJECT_COMPRESSION_MASK is set. The position is compared against
 * tail_object_offset and otherwise advanced by the object size rounded up
 * with ALIGN64(). An error message is logged for a corrupt file.
 *
 * NOTE(review): the loop statement, most case labels, the break out of the
 * walk and the jump to the error message are not part of this listing. */
2488 void journal_file_dump(JournalFile
*f
) {
2495 journal_file_print_header(f
);
2497 p
= le64toh(f
->header
->header_size
);
2499 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
2503 switch (o
->object
.type
) {
2506 printf("Type: OBJECT_UNUSED\n");
2510 printf("Type: OBJECT_DATA\n");
2514 printf("Type: OBJECT_FIELD\n");
2518 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
2519 le64toh(o
->entry
.seqnum
),
2520 le64toh(o
->entry
.monotonic
),
2521 le64toh(o
->entry
.realtime
));
2524 case OBJECT_FIELD_HASH_TABLE
:
2525 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2528 case OBJECT_DATA_HASH_TABLE
:
2529 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2532 case OBJECT_ENTRY_ARRAY
:
2533 printf("Type: OBJECT_ENTRY_ARRAY\n");
2537 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
2538 le64toh(o
->tag
.seqnum
),
2539 le64toh(o
->tag
.epoch
));
2543 printf("Type: unknown (%i)\n", o
->object
.type
);
2547 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
2548 printf("Flags: %s\n",
2549 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
2551 if (p
== le64toh(f
->header
->tail_object_offset
))
2554 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
2559 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (l bytes).
 * NOTE(review): what is done with the result x is on lines not part of
 * this listing; presumably a placeholder string is returned when
 * format_timestamp() yields no result, so the value can be handed to
 * printf() safely. Confirm against the full source. */
2562 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
2565 x
= format_timestamp(buf
, l
, t
);
/* Prints a human readable summary of the file header to stdout.
 *
 * Printed values: the four 128-bit IDs as strings, the state (OFFLINE,
 * ONLINE, ARCHIVED or UNKNOWN), compatible and incompatible flags (with
 * ??? appended when bits outside HEADER_COMPATIBLE_ANY respectively
 * HEADER_INCOMPATIBLE_ANY are set), header and arena size, both hash table
 * sizes expressed in items (byte size divided by sizeof(HashItem)), whether
 * rotation is suggested, head and tail sequence numbers and timestamps, and
 * the object and entry counts.
 *
 * The counters n_data, n_fields, n_tags and n_entry_arrays are printed only
 * when JOURNAL_HEADER_CONTAINS() reports that the header has them; for the
 * first two a hash table fill level in percent is shown as well. Disk usage
 * is computed as st_blocks * 512 and printed only when fstat() succeeds.
 *
 * NOTE(review): a few format lines and arguments of the first printf() are
 * not part of this listing. */
2571 void journal_file_print_header(JournalFile
*f
) {
2572 char a
[33], b
[33], c
[33], d
[33];
2573 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
2575 char bytes
[FORMAT_BYTES_MAX
];
2579 printf("File Path: %s\n"
2583 "Sequential Number ID: %s\n"
2585 "Compatible Flags:%s%s\n"
2586 "Incompatible Flags:%s%s%s\n"
2587 "Header size: %"PRIu64
"\n"
2588 "Arena size: %"PRIu64
"\n"
2589 "Data Hash Table Size: %"PRIu64
"\n"
2590 "Field Hash Table Size: %"PRIu64
"\n"
2591 "Rotate Suggested: %s\n"
2592 "Head Sequential Number: %"PRIu64
"\n"
2593 "Tail Sequential Number: %"PRIu64
"\n"
2594 "Head Realtime Timestamp: %s\n"
2595 "Tail Realtime Timestamp: %s\n"
2596 "Tail Monotonic Timestamp: %s\n"
2597 "Objects: %"PRIu64
"\n"
2598 "Entry Objects: %"PRIu64
"\n",
2600 sd_id128_to_string(f
->header
->file_id
, a
),
2601 sd_id128_to_string(f
->header
->machine_id
, b
),
2602 sd_id128_to_string(f
->header
->boot_id
, c
),
2603 sd_id128_to_string(f
->header
->seqnum_id
, d
),
2604 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
2605 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
2606 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
2607 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
2608 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
2609 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
2610 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
2611 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
2612 le64toh(f
->header
->header_size
),
2613 le64toh(f
->header
->arena_size
),
2614 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
2615 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
2616 yes_no(journal_file_rotate_suggested(f
, 0)),
2617 le64toh(f
->header
->head_entry_seqnum
),
2618 le64toh(f
->header
->tail_entry_seqnum
),
2619 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)),
2620 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)),
2621 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
),
2622 le64toh(f
->header
->n_objects
),
2623 le64toh(f
->header
->n_entries
));
2625 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
2626 printf("Data Objects: %"PRIu64
"\n"
2627 "Data Hash Table Fill: %.1f%%\n",
2628 le64toh(f
->header
->n_data
),
2629 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
2631 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
2632 printf("Field Objects: %"PRIu64
"\n"
2633 "Field Hash Table Fill: %.1f%%\n",
2634 le64toh(f
->header
->n_fields
),
2635 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
2637 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
2638 printf("Tag Objects: %"PRIu64
"\n",
2639 le64toh(f
->header
->n_tags
));
2640 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
2641 printf("Entry Array Objects: %"PRIu64
"\n",
2642 le64toh(f
->header
->n_entry_arrays
));
2644 if (fstat(f
->fd
, &st
) >= 0)
2645 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
/* Emits a notice when a journal file is being created on btrfs while
 * copy-on-write is still enabled for it.
 *
 * The file descriptor is checked with btrfs_is_filesystem() and the file
 * attributes are read with read_attr_fd(); a failure of either is logged
 * as a warning and returned. When FS_NOCOW_FL is set only a debug message
 * is logged, otherwise the notice recommending chattr +C is logged.
 *
 * NOTE(review): the checks on r and the plain return statements are not
 * part of this listing. */
2648 static int journal_file_warn_btrfs(JournalFile
*f
) {
2654 /* Before we write anything, check if the COW logic is turned
2655 * off on btrfs. Given our write pattern that is quite
2656 * unfriendly to COW file systems this should greatly improve
2657 * performance on COW file systems, such as btrfs, at the
2658 * expense of data integrity features (which shouldn't be too
2659 * bad, given that we do our own checksumming). */
2661 r
= btrfs_is_filesystem(f
->fd
);
2663 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
2667 r
= read_attr_fd(f
->fd
, &attrs
);
2669 return log_warning_errno(r
, "Failed to read file attributes: %m");
2671 if (attrs
& FS_NOCOW_FL
) {
2672 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2676 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2677 "This is likely to slow down journal access substantially, please consider turning "
2678 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
/* Opens or creates a journal file and returns it via ret.
 *
 * Steps visible in this listing, in order:
 *   - only O_RDONLY and O_RDWR access modes are accepted, and fname has to
 *     end in .journal or .journal~;
 *   - a JournalFile is allocated; prot and writable are derived from flags;
 *     the compress argument goes to compress_lz4 when HAVE_LZ4 is defined,
 *     else to compress_xz when HAVE_XZ is defined;
 *   - the mmap cache is either a new reference to mmap_cache or a freshly
 *     created one; the path is duplicated and the chain cache created;
 *   - the file is opened with O_CLOEXEC added and stat-ed;
 *   - an empty, writable file is treated as newly created: the btrfs
 *     warning is issued (result ignored), the creation time is attached
 *     with fd_setcrtime(), FSPRG state is loaded and the header initialised
 *     from template;
 *   - a file smaller than HEADER_SIZE_MIN is handled specially, the header
 *     is mapped, and for files that already existed the header is verified
 *     and, when writable, FSPRG state is loaded;
 *   - metrics are filled in via journal_default_metrics() or copied from
 *     template; the header is refreshed and HMAC set up;
 *   - for a new file both hash tables and the first tag are set up;
 *   - a SIGBUS recorded by the mmap cache is checked for;
 *   - the post-change timer of template, if any, is enabled on the new
 *     file as well;
 *   - on the failure path the SIGBUS state is checked once more and the
 *     file is closed.
 *
 * NOTE(review): the leading parameters, the conditions guarding several of
 * these steps and all error codes are on lines not part of this listing. */
2683 int journal_file_open(
2689 JournalMetrics
*metrics
,
2690 MMapCache
*mmap_cache
,
2691 JournalFile
*template,
2692 JournalFile
**ret
) {
2694 bool newly_created
= false;
2702 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
2703 (flags
& O_ACCMODE
) != O_RDWR
)
2706 if (!endswith(fname
, ".journal") &&
2707 !endswith(fname
, ".journal~"))
2710 f
= new0(JournalFile
, 1);
2718 f
->prot
= prot_from_flags(flags
);
2719 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
2720 #if defined(HAVE_LZ4)
2721 f
->compress_lz4
= compress
;
2722 #elif defined(HAVE_XZ)
2723 f
->compress_xz
= compress
;
2730 f
->mmap
= mmap_cache_ref(mmap_cache
);
2732 f
->mmap
= mmap_cache_new();
2739 f
->path
= strdup(fname
);
2745 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
2746 if (!f
->chain_cache
) {
2751 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
2757 r
= journal_file_fstat(f
);
2761 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
2763 (void) journal_file_warn_btrfs(f
);
2765 /* Let's attach the creation time to the journal file,
2766 * so that the vacuuming code knows the age of this
2767 * file even if the file might end up corrupted one
2768 * day... Ideally we'd just use the creation time many
2769 * file systems maintain for each file, but there is
2770 * currently no usable API to query this, hence let's
2771 * emulate this via extended attributes. If extended
2772 * attributes are not supported we'll just skip this,
2773 * and rely solely on mtime/atime/ctime of the file. */
2775 fd_setcrtime(f
->fd
, 0);
2778 /* Try to load the FSPRG state, and if we can't, then
2779 * just don't do sealing */
2781 r
= journal_file_fss_load(f
);
2787 r
= journal_file_init_header(f
, template);
2791 r
= journal_file_fstat(f
);
2795 newly_created
= true;
2798 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
2803 r
= mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
);
2809 if (!newly_created
) {
2810 r
= journal_file_verify_header(f
);
2816 if (!newly_created
&& f
->writable
) {
2817 r
= journal_file_fss_load(f
);
2825 journal_default_metrics(metrics
, f
->fd
);
2826 f
->metrics
= *metrics
;
2827 } else if (template)
2828 f
->metrics
= template->metrics
;
2830 r
= journal_file_refresh_header(f
);
2836 r
= journal_file_hmac_setup(f
);
2841 if (newly_created
) {
2842 r
= journal_file_setup_field_hash_table(f
);
2846 r
= journal_file_setup_data_hash_table(f
);
2851 r
= journal_file_append_first_tag(f
);
2857 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
)) {
2862 if (template && template->post_change_timer
) {
2863 r
= journal_file_enable_post_change_timer(
2865 sd_event_source_get_event(template->post_change_timer
),
2866 template->post_change_timer_period
);
2876 if (f
->fd
>= 0 && mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
2879 journal_file_close(f
);
/* Archives the journal file *f and opens a fresh file under the same path.
 *
 * Visible steps: the old file must be writable and its path must end in
 * .journal; the archive name is built from the path without that 8
 * character suffix, followed by @, the seqnum_id, the head entry sequence
 * number and the head entry realtime timestamp, plus .journal; the file is
 * renamed (a rename failure with ENOENT is tolerated); the old header is
 * marked STATE_ARCHIVED and defragmentation on close is requested; a new
 * file is opened under the old path with the old flags, mode and mmap
 * cache, using the old file as template; finally the old file is closed.
 *
 * NOTE(review): the return values of the early exits and the assignment of
 * the new file to *f are on lines not part of this listing. */
2884 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
) {
2885 _cleanup_free_
char *p
= NULL
;
2887 JournalFile
*old_file
, *new_file
= NULL
;
2895 if (!old_file
->writable
)
2898 if (!endswith(old_file
->path
, ".journal"))
2901 l
= strlen(old_file
->path
);
2902 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
2903 (int) l
- 8, old_file
->path
,
2904 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
2905 le64toh((*f
)->header
->head_entry_seqnum
),
2906 le64toh((*f
)->header
->head_entry_realtime
));
2910 /* Try to rename the file to the archived version. If the file
2911 * already was deleted, we'll get ENOENT, let's ignore that
2913 r
= rename(old_file
->path
, p
);
2914 if (r
< 0 && errno
!= ENOENT
)
2917 old_file
->header
->state
= STATE_ARCHIVED
;
2919 /* Currently, btrfs is not very good with our write patterns
2920 * and fragments heavily. Let's defrag our journal files when
2921 * we archive them */
2922 old_file
->defrag_on_close
= true;
2924 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, old_file
, &new_file
);
2925 journal_file_close(old_file
);
/* Opens a journal file like journal_file_open(), but copes with a file
 * that cannot be used as it is.
 *
 * The result of the first open attempt is matched against the error codes
 * listed below (corrupted, truncated, other machine, incompatible feature,
 * unclean shutdown, already archived, IO error, deleted). Three further
 * checks are visible: read-only access, a missing O_CREAT flag, and a file
 * name that does not end in .journal. After them the file is renamed to a
 * name ending in .journal~ that embeds the current realtime clock, the
 * renamed file gets a best-effort chattr_path() and btrfs_defrag() call
 * (results ignored), a warning is logged, and journal_file_open() is called
 * once more with the same arguments.
 *
 * NOTE(review): the statement that tests r against the list, the early
 * returns and part of the asprintf() arguments are not part of this
 * listing. */
2931 int journal_file_open_reliably(
2937 JournalMetrics
*metrics
,
2938 MMapCache
*mmap_cache
,
2939 JournalFile
*template,
2940 JournalFile
**ret
) {
2944 _cleanup_free_
char *p
= NULL
;
2946 r
= journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2948 -EBADMSG
, /* corrupted */
2949 -ENODATA
, /* truncated */
2950 -EHOSTDOWN
, /* other machine */
2951 -EPROTONOSUPPORT
, /* incompatible feature */
2952 -EBUSY
, /* unclean shutdown */
2953 -ESHUTDOWN
, /* already archived */
2954 -EIO
, /* IO error, including SIGBUS on mmap */
2955 -EIDRM
/* File has been deleted */))
2958 if ((flags
& O_ACCMODE
) == O_RDONLY
)
2961 if (!(flags
& O_CREAT
))
2964 if (!endswith(fname
, ".journal"))
2967 /* The file is corrupted. Rotate it away and try it again (but only once) */
2970 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
2972 now(CLOCK_REALTIME
),
2976 if (rename(fname
, p
) < 0)
2979 /* btrfs doesn't cope well with our write pattern and
2980 * fragments heavily. Let's defrag all files we rotate */
2982 (void) chattr_path(p
, false, FS_NOCOW_FL
);
2983 (void) btrfs_defrag(p
);
2985 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
2987 return journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
/* Copies the entry object o, located at offset p in from, into the file to.
 *
 * The timestamps of the entry are carried over. For each of its items the
 * referenced data object is loaded from the source file, the hash stored in
 * the item is compared with the hash of the data object, and the payload
 * length is derived from the object size. A compressed payload is
 * decompressed into the compress buffer of from when XZ or LZ4 support is
 * compiled in; the other preprocessor branch returns -EPROTONOSUPPORT. The
 * payload is appended to the destination with journal_file_append_data(),
 * the new offset and hash are stored in a stack array obtained from
 * alloca(), and the hashes are XOR-ed together. Because o is reused for
 * the data objects, the entry is loaded again from p. Finally the entry is
 * appended to the destination and the mmap cache is checked for a SIGBUS on
 * the destination file.
 *
 * NOTE(review): error checks, return values and the size_t conversion
 * behind the 32bit limit check are on lines not part of this listing. The
 * alloca() size depends on the item count read from the entry object. */
2990 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
2992 uint64_t q
, xor_hash
= 0;
3005 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
3006 ts
.realtime
= le64toh(o
->entry
.realtime
);
3008 n
= journal_file_entry_n_items(o
);
3009 /* alloca() can't take 0, hence let's allocate at least one */
3010 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
3012 for (i
= 0; i
< n
; i
++) {
3019 q
= le64toh(o
->entry
.items
[i
].object_offset
);
3020 le_hash
= o
->entry
.items
[i
].hash
;
3022 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
3026 if (le_hash
!= o
->data
.hash
)
3029 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
3032 /* We hit the limit on 32bit machines */
3033 if ((uint64_t) t
!= l
)
3036 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
3037 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
3040 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
3041 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
3045 data
= from
->compress_buffer
;
3048 return -EPROTONOSUPPORT
;
3051 data
= o
->data
.payload
;
3053 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
3057 xor_hash
^= le64toh(u
->data
.hash
);
3058 items
[i
].object_offset
= htole64(h
);
3059 items
[i
].hash
= u
->data
.hash
;
3061 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
3066 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
3068 if (mmap_cache_got_sigbus(to
->mmap
, to
->fd
))
/* Resets *m so that every limit is chosen automatically: each of the six
 * fields is set to (uint64_t) -1, which is the value that
 * journal_default_metrics() tests for before filling in a default. */
3074 void journal_reset_metrics(JournalMetrics
*m
) {
3077 /* Set everything to "pick automatic values". */
3079 *m
= (JournalMetrics
) {
3080 .min_use
= (uint64_t) -1,
3081 .max_use
= (uint64_t) -1,
3082 .min_size
= (uint64_t) -1,
3083 .max_size
= (uint64_t) -1,
3084 .keep_free
= (uint64_t) -1,
3085 .n_max_files
= (uint64_t) -1,
/* Fills in every field of *m that is still set to (uint64_t) -1 and brings
 * the remaining values into a consistent range. fd refers to a file on the
 * file system whose size is used as the basis for the defaults.
 *
 * Visible rules:
 *   - the file system size is f_frsize * f_blocks from fstatvfs(); a
 *     failure is only logged at debug level;
 *   - max_use defaults to 10% of the file system size, page aligned and
 *     kept between DEFAULT_MAX_USE_LOWER and DEFAULT_MAX_USE_UPPER, or to
 *     DEFAULT_MAX_USE_LOWER; an explicit non-zero value is page aligned and
 *     raised to at least twice JOURNAL_FILE_SIZE_MIN;
 *   - min_use defaults to DEFAULT_MIN_USE and never exceeds max_use;
 *   - max_size defaults to an eighth of max_use, page aligned and capped at
 *     DEFAULT_MAX_SIZE_UPPER; a non-zero max_size is at least
 *     JOURNAL_FILE_SIZE_MIN, and a non-zero max_use is raised to twice
 *     max_size if it is smaller;
 *   - min_size defaults to JOURNAL_FILE_SIZE_MIN, is page aligned and never
 *     smaller than that minimum; a non-zero max_size is raised to min_size
 *     if it is smaller;
 *   - keep_free defaults to 15% of the file system size, page aligned and
 *     capped at DEFAULT_KEEP_FREE_UPPER, or to DEFAULT_KEEP_FREE;
 *   - n_max_files defaults to DEFAULT_N_MAX_FILES.
 * The resulting values are logged at debug level.
 *
 * NOTE(review): the else keywords and the tests on the file system size
 * that choose between the alternatives above are on lines not part of this
 * listing. The only code change here is the spelling of the debug message
 * emitted when the disk size cannot be determined. */
3089 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
3090 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
3097 if (fstatvfs(fd
, &ss
) >= 0)
3098 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
3100 log_debug_errno(errno
, "Failed to determine disk size: %m");
3104 if (m
->max_use
== (uint64_t) -1) {
3107 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3109 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3110 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3112 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3113 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3115 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3117 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3119 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3120 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3123 if (m
->min_use
== (uint64_t) -1)
3124 m
->min_use
= DEFAULT_MIN_USE
;
3126 if (m
->min_use
> m
->max_use
)
3127 m
->min_use
= m
->max_use
;
3129 if (m
->max_size
== (uint64_t) -1) {
3130 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3132 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3133 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3135 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3137 if (m
->max_size
!= 0) {
3138 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3139 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3141 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3142 m
->max_use
= m
->max_size
*2;
3145 if (m
->min_size
== (uint64_t) -1)
3146 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3148 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3150 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3151 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3153 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3154 m
->max_size
= m
->min_size
;
3157 if (m
->keep_free
== (uint64_t) -1) {
3160 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3162 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3163 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3166 m
->keep_free
= DEFAULT_KEEP_FREE
;
3169 if (m
->n_max_files
== (uint64_t) -1)
3170 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3172 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3173 format_bytes(a
, sizeof(a
), m
->min_use
),
3174 format_bytes(b
, sizeof(b
), m
->max_use
),
3175 format_bytes(c
, sizeof(c
), m
->max_size
),
3176 format_bytes(d
, sizeof(d
), m
->min_size
),
3177 format_bytes(e
, sizeof(e
), m
->keep_free
),
/* Reports the realtime range covered by the file: *from receives
 * head_entry_realtime and *to receives tail_entry_realtime, both converted
 * from little endian. Each header field is compared with zero first.
 * NOTE(review): what is returned when a field is zero, and the guards
 * around the two halves, are on lines not part of this listing. */
3181 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3186 if (f
->header
->head_entry_realtime
== 0)
3189 *from
= le64toh(f
->header
->head_entry_realtime
);
3193 if (f
->header
->tail_entry_realtime
== 0)
3196 *to
= le64toh(f
->header
->tail_entry_realtime
);
/* Reports the monotonic time range that the file covers for one boot.
 *
 * The data object of the boot ID is located first (its offset is kept in
 * p) and its n_entries is checked. *from is the monotonic timestamp of the
 * first entry referencing it (data.entry_offset). Since o has been reused
 * for that entry, the data object is loaded again from p before the last
 * entry is fetched with generic_array_get_plus_one() at index
 * n_entries - 1; its monotonic timestamp goes to *to.
 *
 * NOTE(review): error checks, the guards around the two halves and the
 * return values are on lines not part of this listing. */
3202 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3210 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3214 if (le64toh(o
->data
.n_entries
) <= 0)
3218 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3222 *from
= le64toh(o
->entry
.monotonic
);
3226 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3230 r
= generic_array_get_plus_one(f
,
3231 le64toh(o
->data
.entry_offset
),
3232 le64toh(o
->data
.entry_array_offset
),
3233 le64toh(o
->data
.n_entries
)-1,
3238 *to
= le64toh(o
->entry
.monotonic
);
3244 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3247 /* If we gained new header fields we gained new features,
3248 * hence suggest a rotation */
3249 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3250 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3254 /* Let's check if the hash tables grew over a certain fill
3255 * level (75%, borrowing this value from Java's hash table
3256 * implementation), and if so suggest a rotation. To calculate
3257 * the fill level we need the n_data field, which only exists
3258 * in newer versions. */
3260 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3261 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3262 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3264 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3265 le64toh(f
->header
->n_data
),
3266 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3267 (unsigned long long) f
->last_stat
.st_size
,
3268 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3272 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3273 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3274 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3276 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3277 le64toh(f
->header
->n_fields
),
3278 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3282 /* Are the data objects properly indexed by field objects? */
3283 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3284 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3285 le64toh(f
->header
->n_data
) > 0 &&
3286 le64toh(f
->header
->n_fields
) == 0)
3289 if (max_file_usec
> 0) {
3292 h
= le64toh(f
->header
->head_entry_realtime
);
3293 t
= now(CLOCK_REALTIME
);
3295 if (h
> 0 && t
> h
+ max_file_usec
)