1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
36 #include "journal-authenticate.h"
37 #include "journal-def.h"
38 #include "journal-file.h"
40 #include "parse-util.h"
41 #include "random-util.h"
42 #include "string-util.h"
43 #include "xattr-util.h"
45 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
46 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
48 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
50 /* This is the minimum journal file size */
51 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
53 /* These are the lower and upper bounds if we deduce the max_use value
54 * from the file system size */
55 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
56 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
59 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
61 /* This is the upper bound if we deduce max_size from max_use */
62 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
64 /* This is the upper bound if we deduce the keep_free value from the
66 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
68 /* This is the keep_free value when we can't determine the system
70 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
72 /* This is the default maximum number of journal files to keep around. */
73 #define DEFAULT_N_MAX_FILES (100)
75 /* n_data was the first entry we added after the initial file format design */
76 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
78 /* How many entries to keep in the entry array chain cache at max */
79 #define CHAIN_CACHE_MAX 20
81 /* How much to increase the journal file size at once each time we allocate something new. */
82 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
84 /* Reread fstat() of the file for detecting deletions at least this often */
85 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
87 /* The mmap context to use for the header we pick as one above the last defined typed */
88 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
90 static int journal_file_set_online(JournalFile
*f
) {
96 if (!(f
->fd
>= 0 && f
->header
))
99 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
102 switch(f
->header
->state
) {
107 f
->header
->state
= STATE_ONLINE
;
116 int journal_file_set_offline(JournalFile
*f
) {
122 if (!(f
->fd
>= 0 && f
->header
))
125 if (f
->header
->state
!= STATE_ONLINE
)
130 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
133 f
->header
->state
= STATE_OFFLINE
;
135 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
143 JournalFile
* journal_file_close(JournalFile
*f
) {
147 /* Write the final tag */
148 if (f
->seal
&& f
->writable
)
149 journal_file_append_tag(f
);
152 journal_file_set_offline(f
);
154 if (f
->mmap
&& f
->fd
>= 0)
155 mmap_cache_close_fd(f
->mmap
, f
->fd
);
157 if (f
->fd
>= 0 && f
->defrag_on_close
) {
159 /* Be friendly to btrfs: turn COW back on again now,
160 * and defragment the file. We won't write to the file
161 * ever again, hence remove all fragmentation, and
162 * reenable all the good bits COW usually provides
163 * (such as data checksumming). */
165 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
166 (void) btrfs_defrag_fd(f
->fd
);
173 mmap_cache_unref(f
->mmap
);
175 ordered_hashmap_free_free(f
->chain_cache
);
177 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
178 free(f
->compress_buffer
);
183 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
185 free(f
->fsprg_state
);
190 gcry_md_close(f
->hmac
);
197 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
204 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
205 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
207 h
.incompatible_flags
|= htole32(
208 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
209 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
211 h
.compatible_flags
= htole32(
212 f
->seal
* HEADER_COMPATIBLE_SEALED
);
214 r
= sd_id128_randomize(&h
.file_id
);
219 h
.seqnum_id
= template->header
->seqnum_id
;
220 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
222 h
.seqnum_id
= h
.file_id
;
224 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
234 static int journal_file_refresh_header(JournalFile
*f
) {
240 r
= sd_id128_get_machine(&f
->header
->machine_id
);
244 r
= sd_id128_get_boot(&boot_id
);
248 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
249 f
->tail_entry_monotonic_valid
= true;
251 f
->header
->boot_id
= boot_id
;
253 r
= journal_file_set_online(f
);
255 /* Sync the online state to disk */
261 static int journal_file_verify_header(JournalFile
*f
) {
266 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
269 /* In both read and write mode we refuse to open files with
270 * incompatible flags we don't know */
271 flags
= le32toh(f
->header
->incompatible_flags
);
272 if (flags
& ~HEADER_INCOMPATIBLE_SUPPORTED
) {
273 if (flags
& ~HEADER_INCOMPATIBLE_ANY
)
274 log_debug("Journal file %s has unknown incompatible flags %"PRIx32
,
275 f
->path
, flags
& ~HEADER_INCOMPATIBLE_ANY
);
276 flags
= (flags
& HEADER_INCOMPATIBLE_ANY
) & ~HEADER_INCOMPATIBLE_SUPPORTED
;
278 log_debug("Journal file %s uses incompatible flags %"PRIx32
279 " disabled at compilation time.", f
->path
, flags
);
280 return -EPROTONOSUPPORT
;
283 /* When open for writing we refuse to open files with
284 * compatible flags, too */
285 flags
= le32toh(f
->header
->compatible_flags
);
286 if (f
->writable
&& (flags
& ~HEADER_COMPATIBLE_SUPPORTED
)) {
287 if (flags
& ~HEADER_COMPATIBLE_ANY
)
288 log_debug("Journal file %s has unknown compatible flags %"PRIx32
,
289 f
->path
, flags
& ~HEADER_COMPATIBLE_ANY
);
290 flags
= (flags
& HEADER_COMPATIBLE_ANY
) & ~HEADER_COMPATIBLE_SUPPORTED
;
292 log_debug("Journal file %s uses compatible flags %"PRIx32
293 " disabled at compilation time.", f
->path
, flags
);
294 return -EPROTONOSUPPORT
;
297 if (f
->header
->state
>= _STATE_MAX
)
300 /* The first addition was n_data, so check that we are at least this large */
301 if (le64toh(f
->header
->header_size
) < HEADER_SIZE_MIN
)
304 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
307 if ((le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)) > (uint64_t) f
->last_stat
.st_size
)
310 if (le64toh(f
->header
->tail_object_offset
) > (le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)))
313 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
314 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
315 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
316 !VALID64(le64toh(f
->header
->entry_array_offset
)))
321 sd_id128_t machine_id
;
324 r
= sd_id128_get_machine(&machine_id
);
328 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
331 state
= f
->header
->state
;
333 if (state
== STATE_ONLINE
) {
334 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
336 } else if (state
== STATE_ARCHIVED
)
338 else if (state
!= STATE_OFFLINE
) {
339 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
344 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
345 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
347 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
352 static int journal_file_fstat(JournalFile
*f
) {
356 if (fstat(f
->fd
, &f
->last_stat
) < 0)
359 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
361 /* Refuse appending to files that are already deleted */
362 if (f
->last_stat
.st_nlink
<= 0)
368 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
369 uint64_t old_size
, new_size
;
374 /* We assume that this file is not sparse, and we know that
375 * for sure, since we always call posix_fallocate()
378 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
382 le64toh(f
->header
->header_size
) +
383 le64toh(f
->header
->arena_size
);
385 new_size
= PAGE_ALIGN(offset
+ size
);
386 if (new_size
< le64toh(f
->header
->header_size
))
387 new_size
= le64toh(f
->header
->header_size
);
389 if (new_size
<= old_size
) {
391 /* We already pre-allocated enough space, but before
392 * we write to it, let's check with fstat() if the
393 * file got deleted, in order make sure we don't throw
394 * away the data immediately. Don't check fstat() for
395 * all writes though, but only once ever 10s. */
397 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
400 return journal_file_fstat(f
);
403 /* Allocate more space. */
405 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
408 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
411 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
414 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
416 if (new_size
- old_size
> available
)
421 /* Increase by larger blocks at once */
422 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
423 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
424 new_size
= f
->metrics
.max_size
;
426 /* Note that the glibc fallocate() fallback is very
427 inefficient, hence we try to minimize the allocation area
429 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
433 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
435 return journal_file_fstat(f
);
438 static unsigned type_to_context(ObjectType type
) {
439 /* One context for each type, plus one catch-all for the rest */
440 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
441 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
442 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
445 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
) {
454 /* Avoid SIGBUS on invalid accesses */
455 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
456 /* Hmm, out of range? Let's refresh the fstat() data
457 * first, before we trust that check. */
459 r
= journal_file_fstat(f
);
463 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
464 return -EADDRNOTAVAIL
;
467 return mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
470 static uint64_t minimum_header_size(Object
*o
) {
472 static const uint64_t table
[] = {
473 [OBJECT_DATA
] = sizeof(DataObject
),
474 [OBJECT_FIELD
] = sizeof(FieldObject
),
475 [OBJECT_ENTRY
] = sizeof(EntryObject
),
476 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
477 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
478 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
479 [OBJECT_TAG
] = sizeof(TagObject
),
482 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
483 return sizeof(ObjectHeader
);
485 return table
[o
->object
.type
];
488 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
497 /* Objects may only be located at multiple of 64 bit */
498 if (!VALID64(offset
))
501 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
506 s
= le64toh(o
->object
.size
);
508 if (s
< sizeof(ObjectHeader
))
511 if (o
->object
.type
<= OBJECT_UNUSED
)
514 if (s
< minimum_header_size(o
))
517 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
520 if (s
> sizeof(ObjectHeader
)) {
521 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
532 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
537 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
540 /* If an external seqnum counter was passed, we update
541 * both the local and the external one, and set it to
542 * the maximum of both */
550 f
->header
->tail_entry_seqnum
= htole64(r
);
552 if (f
->header
->head_entry_seqnum
== 0)
553 f
->header
->head_entry_seqnum
= htole64(r
);
558 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
565 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
566 assert(size
>= sizeof(ObjectHeader
));
570 r
= journal_file_set_online(f
);
574 p
= le64toh(f
->header
->tail_object_offset
);
576 p
= le64toh(f
->header
->header_size
);
578 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
582 p
+= ALIGN64(le64toh(tail
->object
.size
));
585 r
= journal_file_allocate(f
, p
, size
);
589 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
596 o
->object
.type
= type
;
597 o
->object
.size
= htole64(size
);
599 f
->header
->tail_object_offset
= htole64(p
);
600 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
608 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
615 /* We estimate that we need 1 hash table entry per 768 bytes
616 of journal file and we want to make sure we never get
617 beyond 75% fill level. Calculate the hash table size for
618 the maximum file size based on these metrics. */
620 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
621 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
622 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
624 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
626 r
= journal_file_append_object(f
,
627 OBJECT_DATA_HASH_TABLE
,
628 offsetof(Object
, hash_table
.items
) + s
,
633 memzero(o
->hash_table
.items
, s
);
635 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
636 f
->header
->data_hash_table_size
= htole64(s
);
641 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
648 /* We use a fixed size hash table for the fields as this
649 * number should grow very slowly only */
651 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
652 r
= journal_file_append_object(f
,
653 OBJECT_FIELD_HASH_TABLE
,
654 offsetof(Object
, hash_table
.items
) + s
,
659 memzero(o
->hash_table
.items
, s
);
661 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
662 f
->header
->field_hash_table_size
= htole64(s
);
667 int journal_file_map_data_hash_table(JournalFile
*f
) {
674 if (f
->data_hash_table
)
677 p
= le64toh(f
->header
->data_hash_table_offset
);
678 s
= le64toh(f
->header
->data_hash_table_size
);
680 r
= journal_file_move_to(f
,
681 OBJECT_DATA_HASH_TABLE
,
688 f
->data_hash_table
= t
;
692 int journal_file_map_field_hash_table(JournalFile
*f
) {
699 if (f
->field_hash_table
)
702 p
= le64toh(f
->header
->field_hash_table_offset
);
703 s
= le64toh(f
->header
->field_hash_table_size
);
705 r
= journal_file_move_to(f
,
706 OBJECT_FIELD_HASH_TABLE
,
713 f
->field_hash_table
= t
;
717 static int journal_file_link_field(
730 if (o
->object
.type
!= OBJECT_FIELD
)
733 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
737 /* This might alter the window we are looking at */
738 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
741 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
743 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
745 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
749 o
->field
.next_hash_offset
= htole64(offset
);
752 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
754 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
755 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
760 static int journal_file_link_data(
773 if (o
->object
.type
!= OBJECT_DATA
)
776 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
780 /* This might alter the window we are looking at */
781 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
782 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
783 o
->data
.n_entries
= 0;
786 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
788 /* Only entry in the hash table is easy */
789 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
791 /* Move back to the previous data object, to patch in
794 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
798 o
->data
.next_hash_offset
= htole64(offset
);
801 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
803 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
804 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
809 int journal_file_find_field_object_with_hash(
811 const void *field
, uint64_t size
, uint64_t hash
,
812 Object
**ret
, uint64_t *offset
) {
814 uint64_t p
, osize
, h
, m
;
818 assert(field
&& size
> 0);
820 /* If the field hash table is empty, we can't find anything */
821 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
824 /* Map the field hash table, if it isn't mapped yet. */
825 r
= journal_file_map_field_hash_table(f
);
829 osize
= offsetof(Object
, field
.payload
) + size
;
831 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
836 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
841 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
845 if (le64toh(o
->field
.hash
) == hash
&&
846 le64toh(o
->object
.size
) == osize
&&
847 memcmp(o
->field
.payload
, field
, size
) == 0) {
857 p
= le64toh(o
->field
.next_hash_offset
);
863 int journal_file_find_field_object(
865 const void *field
, uint64_t size
,
866 Object
**ret
, uint64_t *offset
) {
871 assert(field
&& size
> 0);
873 hash
= hash64(field
, size
);
875 return journal_file_find_field_object_with_hash(f
,
880 int journal_file_find_data_object_with_hash(
882 const void *data
, uint64_t size
, uint64_t hash
,
883 Object
**ret
, uint64_t *offset
) {
885 uint64_t p
, osize
, h
, m
;
889 assert(data
|| size
== 0);
891 /* If there's no data hash table, then there's no entry. */
892 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
895 /* Map the data hash table, if it isn't mapped yet. */
896 r
= journal_file_map_data_hash_table(f
);
900 osize
= offsetof(Object
, data
.payload
) + size
;
902 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
907 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
912 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
916 if (le64toh(o
->data
.hash
) != hash
)
919 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
920 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
924 l
= le64toh(o
->object
.size
);
925 if (l
<= offsetof(Object
, data
.payload
))
928 l
-= offsetof(Object
, data
.payload
);
930 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
931 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
936 memcmp(f
->compress_buffer
, data
, size
) == 0) {
947 return -EPROTONOSUPPORT
;
949 } else if (le64toh(o
->object
.size
) == osize
&&
950 memcmp(o
->data
.payload
, data
, size
) == 0) {
962 p
= le64toh(o
->data
.next_hash_offset
);
968 int journal_file_find_data_object(
970 const void *data
, uint64_t size
,
971 Object
**ret
, uint64_t *offset
) {
976 assert(data
|| size
== 0);
978 hash
= hash64(data
, size
);
980 return journal_file_find_data_object_with_hash(f
,
985 static int journal_file_append_field(
987 const void *field
, uint64_t size
,
988 Object
**ret
, uint64_t *offset
) {
996 assert(field
&& size
> 0);
998 hash
= hash64(field
, size
);
1000 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1014 osize
= offsetof(Object
, field
.payload
) + size
;
1015 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1019 o
->field
.hash
= htole64(hash
);
1020 memcpy(o
->field
.payload
, field
, size
);
1022 r
= journal_file_link_field(f
, o
, p
, hash
);
1026 /* The linking might have altered the window, so let's
1027 * refresh our pointer */
1028 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1033 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1047 static int journal_file_append_data(
1049 const void *data
, uint64_t size
,
1050 Object
**ret
, uint64_t *offset
) {
1055 int r
, compression
= 0;
1059 assert(data
|| size
== 0);
1061 hash
= hash64(data
, size
);
1063 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1077 osize
= offsetof(Object
, data
.payload
) + size
;
1078 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1082 o
->data
.hash
= htole64(hash
);
1084 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1085 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1088 compression
= compress_blob(data
, size
, o
->data
.payload
, &rsize
);
1090 if (compression
>= 0) {
1091 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1092 o
->object
.flags
|= compression
;
1094 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1095 size
, rsize
, object_compressed_to_string(compression
));
1097 /* Compression didn't work, we don't really care why, let's continue without compression */
1102 if (compression
== 0 && size
> 0)
1103 memcpy(o
->data
.payload
, data
, size
);
1105 r
= journal_file_link_data(f
, o
, p
, hash
);
1109 /* The linking might have altered the window, so let's
1110 * refresh our pointer */
1111 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1118 eq
= memchr(data
, '=', size
);
1119 if (eq
&& eq
> data
) {
1123 /* Create field object ... */
1124 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1128 /* ... and link it in. */
1129 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1130 fo
->field
.head_data_offset
= le64toh(p
);
1134 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1148 uint64_t journal_file_entry_n_items(Object
*o
) {
1151 if (o
->object
.type
!= OBJECT_ENTRY
)
1154 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1157 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1160 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1163 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1166 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1169 if (o
->object
.type
!= OBJECT_DATA_HASH_TABLE
&&
1170 o
->object
.type
!= OBJECT_FIELD_HASH_TABLE
)
1173 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1176 static int link_entry_into_array(JournalFile
*f
,
1181 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1189 a
= le64toh(*first
);
1190 i
= hidx
= le64toh(*idx
);
1193 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1197 n
= journal_file_entry_array_n_items(o
);
1199 o
->entry_array
.items
[i
] = htole64(p
);
1200 *idx
= htole64(hidx
+ 1);
1206 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1217 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1218 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1224 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1229 o
->entry_array
.items
[i
] = htole64(p
);
1232 *first
= htole64(q
);
1234 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1238 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1241 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1242 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1244 *idx
= htole64(hidx
+ 1);
1249 static int link_entry_into_array_plus_one(JournalFile
*f
,
1264 *extra
= htole64(p
);
1268 i
= htole64(le64toh(*idx
) - 1);
1269 r
= link_entry_into_array(f
, first
, &i
, p
);
1274 *idx
= htole64(le64toh(*idx
) + 1);
1278 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1285 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1289 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1293 return link_entry_into_array_plus_one(f
,
1294 &o
->data
.entry_offset
,
1295 &o
->data
.entry_array_offset
,
1300 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1308 if (o
->object
.type
!= OBJECT_ENTRY
)
1311 __sync_synchronize();
1313 /* Link up the entry itself */
1314 r
= link_entry_into_array(f
,
1315 &f
->header
->entry_array_offset
,
1316 &f
->header
->n_entries
,
1321 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1323 if (f
->header
->head_entry_realtime
== 0)
1324 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1326 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1327 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1329 f
->tail_entry_monotonic_valid
= true;
1331 /* Link up the items */
1332 n
= journal_file_entry_n_items(o
);
1333 for (i
= 0; i
< n
; i
++) {
1334 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1342 static int journal_file_append_entry_internal(
1344 const dual_timestamp
*ts
,
1346 const EntryItem items
[], unsigned n_items
,
1348 Object
**ret
, uint64_t *offset
) {
1355 assert(items
|| n_items
== 0);
1358 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1360 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1364 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1365 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1366 o
->entry
.realtime
= htole64(ts
->realtime
);
1367 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1368 o
->entry
.xor_hash
= htole64(xor_hash
);
1369 o
->entry
.boot_id
= f
->header
->boot_id
;
1372 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1377 r
= journal_file_link_entry(f
, o
, np
);
1390 void journal_file_post_change(JournalFile
*f
) {
1393 /* inotify() does not receive IN_MODIFY events from file
1394 * accesses done via mmap(). After each access we hence
1395 * trigger IN_MODIFY by truncating the journal file to its
1396 * current size which triggers IN_MODIFY. */
1398 __sync_synchronize();
1400 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1401 log_error_errno(errno
, "Failed to truncate file to its own size: %m");
1404 static int entry_item_cmp(const void *_a
, const void *_b
) {
1405 const EntryItem
*a
= _a
, *b
= _b
;
1407 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1409 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1414 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1418 uint64_t xor_hash
= 0;
1419 struct dual_timestamp _ts
;
1422 assert(iovec
|| n_iovec
== 0);
1425 dual_timestamp_get(&_ts
);
1429 if (f
->tail_entry_monotonic_valid
&&
1430 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
1434 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1439 /* alloca() can't take 0, hence let's allocate at least one */
1440 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1442 for (i
= 0; i
< n_iovec
; i
++) {
1446 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1450 xor_hash
^= le64toh(o
->data
.hash
);
1451 items
[i
].object_offset
= htole64(p
);
1452 items
[i
].hash
= o
->data
.hash
;
1455 /* Order by the position on disk, in order to improve seek
1456 * times for rotating media. */
1457 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
1459 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
1461 /* If the memory mapping triggered a SIGBUS then we return an
1462 * IO error and ignore the error code passed down to us, since
1463 * it is very likely just an effect of a nullified replacement
1466 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
1469 journal_file_post_change(f
);
1474 typedef struct ChainCacheItem
{
1475 uint64_t first
; /* the array at the beginning of the chain */
1476 uint64_t array
; /* the cached array */
1477 uint64_t begin
; /* the first item in the cached array */
1478 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
1479 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
1482 static void chain_cache_put(
1489 uint64_t last_index
) {
1492 /* If the chain item to cache for this chain is the
1493 * first one it's not worth caching anything */
1497 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
1498 ci
= ordered_hashmap_steal_first(h
);
1501 ci
= new(ChainCacheItem
, 1);
1508 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
1513 assert(ci
->first
== first
);
1518 ci
->last_index
= last_index
;
1521 static int generic_array_get(
1525 Object
**ret
, uint64_t *offset
) {
1528 uint64_t p
= 0, a
, t
= 0;
1536 /* Try the chain cache first */
1537 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1538 if (ci
&& i
> ci
->total
) {
1547 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1551 k
= journal_file_entry_array_n_items(o
);
1553 p
= le64toh(o
->entry_array
.items
[i
]);
1559 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1565 /* Let's cache this item for the next invocation */
1566 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
1568 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1581 static int generic_array_get_plus_one(
1586 Object
**ret
, uint64_t *offset
) {
1595 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1608 return generic_array_get(f
, first
, i
-1, ret
, offset
);
1617 static int generic_array_bisect(
1622 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1623 direction_t direction
,
1628 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
1629 bool subtract_one
= false;
1630 Object
*o
, *array
= NULL
;
1635 assert(test_object
);
1637 /* Start with the first array in the chain */
1640 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1641 if (ci
&& n
> ci
->total
) {
1642 /* Ah, we have iterated this bisection array chain
1643 * previously! Let's see if we can skip ahead in the
1644 * chain, as far as the last time. But we can't jump
1645 * backwards in the chain, so let's check that
1648 r
= test_object(f
, ci
->begin
, needle
);
1652 if (r
== TEST_LEFT
) {
1653 /* OK, what we are looking for is right of the
1654 * begin of this EntryArray, so let's jump
1655 * straight to previously cached array in the
1661 last_index
= ci
->last_index
;
1666 uint64_t left
, right
, k
, lp
;
1668 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1672 k
= journal_file_entry_array_n_items(array
);
1678 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1682 r
= test_object(f
, p
, needle
);
1686 if (r
== TEST_FOUND
)
1687 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1689 if (r
== TEST_RIGHT
) {
1693 if (last_index
!= (uint64_t) -1) {
1694 assert(last_index
<= right
);
1696 /* If we cached the last index we
1697 * looked at, let's try to not to jump
1698 * too wildly around and see if we can
1699 * limit the range to look at early to
1700 * the immediate neighbors of the last
1701 * index we looked at. */
1703 if (last_index
> 0) {
1704 uint64_t x
= last_index
- 1;
1706 p
= le64toh(array
->entry_array
.items
[x
]);
1710 r
= test_object(f
, p
, needle
);
1714 if (r
== TEST_FOUND
)
1715 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1717 if (r
== TEST_RIGHT
)
1723 if (last_index
< right
) {
1724 uint64_t y
= last_index
+ 1;
1726 p
= le64toh(array
->entry_array
.items
[y
]);
1730 r
= test_object(f
, p
, needle
);
1734 if (r
== TEST_FOUND
)
1735 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1737 if (r
== TEST_RIGHT
)
1745 if (left
== right
) {
1746 if (direction
== DIRECTION_UP
)
1747 subtract_one
= true;
1753 assert(left
< right
);
1754 i
= (left
+ right
) / 2;
1756 p
= le64toh(array
->entry_array
.items
[i
]);
1760 r
= test_object(f
, p
, needle
);
1764 if (r
== TEST_FOUND
)
1765 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1767 if (r
== TEST_RIGHT
)
1775 if (direction
== DIRECTION_UP
) {
1777 subtract_one
= true;
1788 last_index
= (uint64_t) -1;
1789 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1795 if (subtract_one
&& t
== 0 && i
== 0)
1798 /* Let's cache this item for the next invocation */
1799 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
1801 if (subtract_one
&& i
== 0)
1803 else if (subtract_one
)
1804 p
= le64toh(array
->entry_array
.items
[i
-1]);
1806 p
= le64toh(array
->entry_array
.items
[i
]);
1808 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1819 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
1824 static int generic_array_bisect_plus_one(
1830 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1831 direction_t direction
,
1837 bool step_back
= false;
1841 assert(test_object
);
1846 /* This bisects the array in object 'first', but first checks
1848 r
= test_object(f
, extra
, needle
);
1852 if (r
== TEST_FOUND
)
1853 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1855 /* if we are looking with DIRECTION_UP then we need to first
1856 see if in the actual array there is a matching entry, and
1857 return the last one of that. But if there isn't any we need
1858 to return this one. Hence remember this, and return it
1861 step_back
= direction
== DIRECTION_UP
;
1863 if (r
== TEST_RIGHT
) {
1864 if (direction
== DIRECTION_DOWN
)
1870 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1872 if (r
== 0 && step_back
)
1881 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1897 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1903 else if (p
< needle
)
1909 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1916 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1920 if (le64toh(o
->entry
.seqnum
) == needle
)
1922 else if (le64toh(o
->entry
.seqnum
) < needle
)
1928 int journal_file_move_to_entry_by_seqnum(
1931 direction_t direction
,
1935 return generic_array_bisect(f
,
1936 le64toh(f
->header
->entry_array_offset
),
1937 le64toh(f
->header
->n_entries
),
1944 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1951 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1955 if (le64toh(o
->entry
.realtime
) == needle
)
1957 else if (le64toh(o
->entry
.realtime
) < needle
)
1963 int journal_file_move_to_entry_by_realtime(
1966 direction_t direction
,
1970 return generic_array_bisect(f
,
1971 le64toh(f
->header
->entry_array_offset
),
1972 le64toh(f
->header
->n_entries
),
1974 test_object_realtime
,
1979 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1986 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1990 if (le64toh(o
->entry
.monotonic
) == needle
)
1992 else if (le64toh(o
->entry
.monotonic
) < needle
)
1998 static int find_data_object_by_boot_id(
2004 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2006 sd_id128_to_string(boot_id
, t
+ 9);
2007 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2010 int journal_file_move_to_entry_by_monotonic(
2014 direction_t direction
,
2023 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2029 return generic_array_bisect_plus_one(f
,
2030 le64toh(o
->data
.entry_offset
),
2031 le64toh(o
->data
.entry_array_offset
),
2032 le64toh(o
->data
.n_entries
),
2034 test_object_monotonic
,
2039 void journal_file_reset_location(JournalFile
*f
) {
2040 f
->location_type
= LOCATION_HEAD
;
2041 f
->current_offset
= 0;
2042 f
->current_seqnum
= 0;
2043 f
->current_realtime
= 0;
2044 f
->current_monotonic
= 0;
2045 zero(f
->current_boot_id
);
2046 f
->current_xor_hash
= 0;
2049 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2050 f
->location_type
= LOCATION_SEEK
;
2051 f
->current_offset
= offset
;
2052 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2053 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2054 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2055 f
->current_boot_id
= o
->entry
.boot_id
;
2056 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2059 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2062 assert(af
->location_type
== LOCATION_SEEK
);
2063 assert(bf
->location_type
== LOCATION_SEEK
);
2065 /* If contents and timestamps match, these entries are
2066 * identical, even if the seqnum does not match */
2067 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2068 af
->current_monotonic
== bf
->current_monotonic
&&
2069 af
->current_realtime
== bf
->current_realtime
&&
2070 af
->current_xor_hash
== bf
->current_xor_hash
)
2073 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2075 /* If this is from the same seqnum source, compare
2077 if (af
->current_seqnum
< bf
->current_seqnum
)
2079 if (af
->current_seqnum
> bf
->current_seqnum
)
2082 /* Wow! This is weird, different data but the same
2083 * seqnums? Something is borked, but let's make the
2084 * best of it and compare by time. */
2087 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2089 /* If the boot id matches, compare monotonic time */
2090 if (af
->current_monotonic
< bf
->current_monotonic
)
2092 if (af
->current_monotonic
> bf
->current_monotonic
)
2096 /* Otherwise, compare UTC time */
2097 if (af
->current_realtime
< bf
->current_realtime
)
2099 if (af
->current_realtime
> bf
->current_realtime
)
2102 /* Finally, compare by contents */
2103 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2105 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2111 int journal_file_next_entry(
2114 direction_t direction
,
2115 Object
**ret
, uint64_t *offset
) {
2122 n
= le64toh(f
->header
->n_entries
);
2127 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2129 r
= generic_array_bisect(f
,
2130 le64toh(f
->header
->entry_array_offset
),
2131 le64toh(f
->header
->n_entries
),
2140 if (direction
== DIRECTION_DOWN
) {
2153 /* And jump to it */
2154 r
= generic_array_get(f
,
2155 le64toh(f
->header
->entry_array_offset
),
2162 (direction
== DIRECTION_DOWN
? ofs
<= p
: ofs
>= p
)) {
2163 log_debug("%s: entry array corrupted at entry %"PRIu64
,
2174 int journal_file_next_entry_for_data(
2176 Object
*o
, uint64_t p
,
2177 uint64_t data_offset
,
2178 direction_t direction
,
2179 Object
**ret
, uint64_t *offset
) {
2186 assert(p
> 0 || !o
);
2188 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2192 n
= le64toh(d
->data
.n_entries
);
2197 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2199 if (o
->object
.type
!= OBJECT_ENTRY
)
2202 r
= generic_array_bisect_plus_one(f
,
2203 le64toh(d
->data
.entry_offset
),
2204 le64toh(d
->data
.entry_array_offset
),
2205 le64toh(d
->data
.n_entries
),
2215 if (direction
== DIRECTION_DOWN
) {
2229 return generic_array_get_plus_one(f
,
2230 le64toh(d
->data
.entry_offset
),
2231 le64toh(d
->data
.entry_array_offset
),
2236 int journal_file_move_to_entry_by_offset_for_data(
2238 uint64_t data_offset
,
2240 direction_t direction
,
2241 Object
**ret
, uint64_t *offset
) {
2248 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2252 return generic_array_bisect_plus_one(f
,
2253 le64toh(d
->data
.entry_offset
),
2254 le64toh(d
->data
.entry_array_offset
),
2255 le64toh(d
->data
.n_entries
),
2262 int journal_file_move_to_entry_by_monotonic_for_data(
2264 uint64_t data_offset
,
2267 direction_t direction
,
2268 Object
**ret
, uint64_t *offset
) {
2276 /* First, seek by time */
2277 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2283 r
= generic_array_bisect_plus_one(f
,
2284 le64toh(o
->data
.entry_offset
),
2285 le64toh(o
->data
.entry_array_offset
),
2286 le64toh(o
->data
.n_entries
),
2288 test_object_monotonic
,
2294 /* And now, continue seeking until we find an entry that
2295 * exists in both bisection arrays */
2301 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2305 r
= generic_array_bisect_plus_one(f
,
2306 le64toh(d
->data
.entry_offset
),
2307 le64toh(d
->data
.entry_array_offset
),
2308 le64toh(d
->data
.n_entries
),
2316 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2320 r
= generic_array_bisect_plus_one(f
,
2321 le64toh(o
->data
.entry_offset
),
2322 le64toh(o
->data
.entry_array_offset
),
2323 le64toh(o
->data
.n_entries
),
2345 int journal_file_move_to_entry_by_seqnum_for_data(
2347 uint64_t data_offset
,
2349 direction_t direction
,
2350 Object
**ret
, uint64_t *offset
) {
2357 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2361 return generic_array_bisect_plus_one(f
,
2362 le64toh(d
->data
.entry_offset
),
2363 le64toh(d
->data
.entry_array_offset
),
2364 le64toh(d
->data
.n_entries
),
2371 int journal_file_move_to_entry_by_realtime_for_data(
2373 uint64_t data_offset
,
2375 direction_t direction
,
2376 Object
**ret
, uint64_t *offset
) {
2383 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2387 return generic_array_bisect_plus_one(f
,
2388 le64toh(d
->data
.entry_offset
),
2389 le64toh(d
->data
.entry_array_offset
),
2390 le64toh(d
->data
.n_entries
),
2392 test_object_realtime
,
2397 void journal_file_dump(JournalFile
*f
) {
2404 journal_file_print_header(f
);
2406 p
= le64toh(f
->header
->header_size
);
2408 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
2412 switch (o
->object
.type
) {
2415 printf("Type: OBJECT_UNUSED\n");
2419 printf("Type: OBJECT_DATA\n");
2423 printf("Type: OBJECT_FIELD\n");
2427 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
2428 le64toh(o
->entry
.seqnum
),
2429 le64toh(o
->entry
.monotonic
),
2430 le64toh(o
->entry
.realtime
));
2433 case OBJECT_FIELD_HASH_TABLE
:
2434 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2437 case OBJECT_DATA_HASH_TABLE
:
2438 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2441 case OBJECT_ENTRY_ARRAY
:
2442 printf("Type: OBJECT_ENTRY_ARRAY\n");
2446 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
2447 le64toh(o
->tag
.seqnum
),
2448 le64toh(o
->tag
.epoch
));
2452 printf("Type: unknown (%i)\n", o
->object
.type
);
2456 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
2457 printf("Flags: %s\n",
2458 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
2460 if (p
== le64toh(f
->header
->tail_object_offset
))
2463 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
2468 log_error("File corrupt");
2471 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
2474 x
= format_timestamp(buf
, l
, t
);
2480 void journal_file_print_header(JournalFile
*f
) {
2481 char a
[33], b
[33], c
[33], d
[33];
2482 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
2484 char bytes
[FORMAT_BYTES_MAX
];
2488 printf("File Path: %s\n"
2492 "Sequential Number ID: %s\n"
2494 "Compatible Flags:%s%s\n"
2495 "Incompatible Flags:%s%s%s\n"
2496 "Header size: %"PRIu64
"\n"
2497 "Arena size: %"PRIu64
"\n"
2498 "Data Hash Table Size: %"PRIu64
"\n"
2499 "Field Hash Table Size: %"PRIu64
"\n"
2500 "Rotate Suggested: %s\n"
2501 "Head Sequential Number: %"PRIu64
"\n"
2502 "Tail Sequential Number: %"PRIu64
"\n"
2503 "Head Realtime Timestamp: %s\n"
2504 "Tail Realtime Timestamp: %s\n"
2505 "Tail Monotonic Timestamp: %s\n"
2506 "Objects: %"PRIu64
"\n"
2507 "Entry Objects: %"PRIu64
"\n",
2509 sd_id128_to_string(f
->header
->file_id
, a
),
2510 sd_id128_to_string(f
->header
->machine_id
, b
),
2511 sd_id128_to_string(f
->header
->boot_id
, c
),
2512 sd_id128_to_string(f
->header
->seqnum_id
, d
),
2513 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
2514 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
2515 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
2516 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
2517 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
2518 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
2519 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
2520 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
2521 le64toh(f
->header
->header_size
),
2522 le64toh(f
->header
->arena_size
),
2523 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
2524 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
2525 yes_no(journal_file_rotate_suggested(f
, 0)),
2526 le64toh(f
->header
->head_entry_seqnum
),
2527 le64toh(f
->header
->tail_entry_seqnum
),
2528 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)),
2529 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)),
2530 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
),
2531 le64toh(f
->header
->n_objects
),
2532 le64toh(f
->header
->n_entries
));
2534 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
2535 printf("Data Objects: %"PRIu64
"\n"
2536 "Data Hash Table Fill: %.1f%%\n",
2537 le64toh(f
->header
->n_data
),
2538 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
2540 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
2541 printf("Field Objects: %"PRIu64
"\n"
2542 "Field Hash Table Fill: %.1f%%\n",
2543 le64toh(f
->header
->n_fields
),
2544 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
2546 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
2547 printf("Tag Objects: %"PRIu64
"\n",
2548 le64toh(f
->header
->n_tags
));
2549 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
2550 printf("Entry Array Objects: %"PRIu64
"\n",
2551 le64toh(f
->header
->n_entry_arrays
));
2553 if (fstat(f
->fd
, &st
) >= 0)
2554 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
2557 static int journal_file_warn_btrfs(JournalFile
*f
) {
2563 /* Before we write anything, check if the COW logic is turned
2564 * off on btrfs. Given our write pattern that is quite
2565 * unfriendly to COW file systems this should greatly improve
2566 * performance on COW file systems, such as btrfs, at the
2567 * expense of data integrity features (which shouldn't be too
2568 * bad, given that we do our own checksumming). */
2570 r
= btrfs_is_filesystem(f
->fd
);
2572 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
2576 r
= read_attr_fd(f
->fd
, &attrs
);
2578 return log_warning_errno(r
, "Failed to read file attributes: %m");
2580 if (attrs
& FS_NOCOW_FL
) {
2581 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2585 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2586 "This is likely to slow down journal access substantially, please consider turning "
2587 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
2592 int journal_file_open(
2598 JournalMetrics
*metrics
,
2599 MMapCache
*mmap_cache
,
2600 JournalFile
*template,
2601 JournalFile
**ret
) {
2603 bool newly_created
= false;
2611 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
2612 (flags
& O_ACCMODE
) != O_RDWR
)
2615 if (!endswith(fname
, ".journal") &&
2616 !endswith(fname
, ".journal~"))
2619 f
= new0(JournalFile
, 1);
2627 f
->prot
= prot_from_flags(flags
);
2628 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
2629 #if defined(HAVE_LZ4)
2630 f
->compress_lz4
= compress
;
2631 #elif defined(HAVE_XZ)
2632 f
->compress_xz
= compress
;
2639 f
->mmap
= mmap_cache_ref(mmap_cache
);
2641 f
->mmap
= mmap_cache_new();
2648 f
->path
= strdup(fname
);
2654 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
2655 if (!f
->chain_cache
) {
2660 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
2666 r
= journal_file_fstat(f
);
2670 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
2672 (void) journal_file_warn_btrfs(f
);
2674 /* Let's attach the creation time to the journal file,
2675 * so that the vacuuming code knows the age of this
2676 * file even if the file might end up corrupted one
2677 * day... Ideally we'd just use the creation time many
2678 * file systems maintain for each file, but there is
2679 * currently no usable API to query this, hence let's
2680 * emulate this via extended attributes. If extended
2681 * attributes are not supported we'll just skip this,
2682 * and rely solely on mtime/atime/ctime of the file. */
2684 fd_setcrtime(f
->fd
, 0);
2687 /* Try to load the FSPRG state, and if we can't, then
2688 * just don't do sealing */
2690 r
= journal_file_fss_load(f
);
2696 r
= journal_file_init_header(f
, template);
2700 r
= journal_file_fstat(f
);
2704 newly_created
= true;
2707 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
2712 r
= mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
);
2718 if (!newly_created
) {
2719 r
= journal_file_verify_header(f
);
2725 if (!newly_created
&& f
->writable
) {
2726 r
= journal_file_fss_load(f
);
2734 journal_default_metrics(metrics
, f
->fd
);
2735 f
->metrics
= *metrics
;
2736 } else if (template)
2737 f
->metrics
= template->metrics
;
2739 r
= journal_file_refresh_header(f
);
2745 r
= journal_file_hmac_setup(f
);
2750 if (newly_created
) {
2751 r
= journal_file_setup_field_hash_table(f
);
2755 r
= journal_file_setup_data_hash_table(f
);
2760 r
= journal_file_append_first_tag(f
);
2766 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
)) {
2775 if (f
->fd
>= 0 && mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
2778 journal_file_close(f
);
2783 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
) {
2784 _cleanup_free_
char *p
= NULL
;
2786 JournalFile
*old_file
, *new_file
= NULL
;
2794 if (!old_file
->writable
)
2797 if (!endswith(old_file
->path
, ".journal"))
2800 l
= strlen(old_file
->path
);
2801 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
2802 (int) l
- 8, old_file
->path
,
2803 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
2804 le64toh((*f
)->header
->head_entry_seqnum
),
2805 le64toh((*f
)->header
->head_entry_realtime
));
2809 /* Try to rename the file to the archived version. If the file
2810 * already was deleted, we'll get ENOENT, let's ignore that
2812 r
= rename(old_file
->path
, p
);
2813 if (r
< 0 && errno
!= ENOENT
)
2816 old_file
->header
->state
= STATE_ARCHIVED
;
/* Currently, btrfs is not very good with our write patterns
 * and fragments heavily. Let's defrag our journal files when
 * we archive them */
2821 old_file
->defrag_on_close
= true;
2823 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, old_file
, &new_file
);
2824 journal_file_close(old_file
);
2830 int journal_file_open_reliably(
2836 JournalMetrics
*metrics
,
2837 MMapCache
*mmap_cache
,
2838 JournalFile
*template,
2839 JournalFile
**ret
) {
2843 _cleanup_free_
char *p
= NULL
;
2845 r
= journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2847 -EBADMSG
, /* corrupted */
2848 -ENODATA
, /* truncated */
2849 -EHOSTDOWN
, /* other machine */
2850 -EPROTONOSUPPORT
, /* incompatible feature */
2851 -EBUSY
, /* unclean shutdown */
2852 -ESHUTDOWN
, /* already archived */
2853 -EIO
, /* IO error, including SIGBUS on mmap */
2854 -EIDRM
/* File has been deleted */))
2857 if ((flags
& O_ACCMODE
) == O_RDONLY
)
2860 if (!(flags
& O_CREAT
))
2863 if (!endswith(fname
, ".journal"))
2866 /* The file is corrupted. Rotate it away and try it again (but only once) */
2869 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
2871 now(CLOCK_REALTIME
),
2875 if (rename(fname
, p
) < 0)
2878 /* btrfs doesn't cope well with our write pattern and
2879 * fragments heavily. Let's defrag all files we rotate */
2881 (void) chattr_path(p
, false, FS_NOCOW_FL
);
2882 (void) btrfs_defrag(p
);
2884 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
2886 return journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2889 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
2891 uint64_t q
, xor_hash
= 0;
2904 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
2905 ts
.realtime
= le64toh(o
->entry
.realtime
);
2907 n
= journal_file_entry_n_items(o
);
2908 /* alloca() can't take 0, hence let's allocate at least one */
2909 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
2911 for (i
= 0; i
< n
; i
++) {
2918 q
= le64toh(o
->entry
.items
[i
].object_offset
);
2919 le_hash
= o
->entry
.items
[i
].hash
;
2921 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
2925 if (le_hash
!= o
->data
.hash
)
2928 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
2931 /* We hit the limit on 32bit machines */
2932 if ((uint64_t) t
!= l
)
2935 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
2936 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2939 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
2940 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
2944 data
= from
->compress_buffer
;
2947 return -EPROTONOSUPPORT
;
2950 data
= o
->data
.payload
;
2952 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
2956 xor_hash
^= le64toh(u
->data
.hash
);
2957 items
[i
].object_offset
= htole64(h
);
2958 items
[i
].hash
= u
->data
.hash
;
2960 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
2965 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
2967 if (mmap_cache_got_sigbus(to
->mmap
, to
->fd
))
2973 void journal_reset_metrics(JournalMetrics
*m
) {
2976 /* Set everything to "pick automatic values". */
2978 *m
= (JournalMetrics
) {
2979 .min_use
= (uint64_t) -1,
2980 .max_use
= (uint64_t) -1,
2981 .min_size
= (uint64_t) -1,
2982 .max_size
= (uint64_t) -1,
2983 .keep_free
= (uint64_t) -1,
2984 .n_max_files
= (uint64_t) -1,
2988 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
2989 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
2996 if (fstatvfs(fd
, &ss
) >= 0)
2997 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
2999 log_debug_errno(errno
, "Failed to detremine disk size: %m");
3003 if (m
->max_use
== (uint64_t) -1) {
3006 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3008 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3009 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3011 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3012 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3014 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3016 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3018 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3019 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3022 if (m
->min_use
== (uint64_t) -1)
3023 m
->min_use
= DEFAULT_MIN_USE
;
3025 if (m
->min_use
> m
->max_use
)
3026 m
->min_use
= m
->max_use
;
3028 if (m
->max_size
== (uint64_t) -1) {
3029 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3031 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3032 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3034 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3036 if (m
->max_size
!= 0) {
3037 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3038 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3040 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3041 m
->max_use
= m
->max_size
*2;
3044 if (m
->min_size
== (uint64_t) -1)
3045 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3047 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3049 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3050 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3052 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3053 m
->max_size
= m
->min_size
;
3056 if (m
->keep_free
== (uint64_t) -1) {
3059 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3061 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3062 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3065 m
->keep_free
= DEFAULT_KEEP_FREE
;
3068 if (m
->n_max_files
== (uint64_t) -1)
3069 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3071 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3072 format_bytes(a
, sizeof(a
), m
->min_use
),
3073 format_bytes(b
, sizeof(b
), m
->max_use
),
3074 format_bytes(c
, sizeof(c
), m
->max_size
),
3075 format_bytes(d
, sizeof(d
), m
->min_size
),
3076 format_bytes(e
, sizeof(e
), m
->keep_free
),
3080 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3085 if (f
->header
->head_entry_realtime
== 0)
3088 *from
= le64toh(f
->header
->head_entry_realtime
);
3092 if (f
->header
->tail_entry_realtime
== 0)
3095 *to
= le64toh(f
->header
->tail_entry_realtime
);
3101 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3109 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3113 if (le64toh(o
->data
.n_entries
) <= 0)
3117 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3121 *from
= le64toh(o
->entry
.monotonic
);
3125 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3129 r
= generic_array_get_plus_one(f
,
3130 le64toh(o
->data
.entry_offset
),
3131 le64toh(o
->data
.entry_array_offset
),
3132 le64toh(o
->data
.n_entries
)-1,
3137 *to
= le64toh(o
->entry
.monotonic
);
3143 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3146 /* If we gained new header fields we gained new features,
3147 * hence suggest a rotation */
3148 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3149 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3153 /* Let's check if the hash tables grew over a certain fill
3154 * level (75%, borrowing this value from Java's hash table
3155 * implementation), and if so suggest a rotation. To calculate
3156 * the fill level we need the n_data field, which only exists
3157 * in newer versions. */
3159 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3160 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3161 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3163 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3164 le64toh(f
->header
->n_data
),
3165 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3166 (unsigned long long) f
->last_stat
.st_size
,
3167 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3171 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3172 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3173 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3175 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3176 le64toh(f
->header
->n_fields
),
3177 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3181 /* Are the data objects properly indexed by field objects? */
3182 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3183 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3184 le64toh(f
->header
->n_data
) > 0 &&
3185 le64toh(f
->header
->n_fields
) == 0)
3188 if (max_file_usec
> 0) {
3191 h
= le64toh(f
->header
->head_entry_realtime
);
3192 t
= now(CLOCK_REALTIME
);
3194 if (h
> 0 && t
> h
+ max_file_usec
)