1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "btrfs-util.h"
34 #include "journal-authenticate.h"
35 #include "journal-def.h"
36 #include "journal-file.h"
38 #include "random-util.h"
39 #include "string-util.h"
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
55 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
57 /* This is the upper bound if we deduce max_size from max_use */
58 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
60 /* This is the upper bound if we deduce the keep_free value from the file system size */
62 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
64 /* This is the keep_free value when we can't determine the system size */
66 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
68 /* This is the default maximum number of journal files to keep around. */
69 #define DEFAULT_N_MAX_FILES (100)
71 /* n_data was the first entry we added after the initial file format design */
72 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
74 /* How many entries to keep in the entry array chain cache at max */
75 #define CHAIN_CACHE_MAX 20
77 /* How much to increase the journal file size at once each time we allocate something new. */
78 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
80 /* Reread fstat() of the file for detecting deletions at least this often */
81 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
83 /* The mmap context to use for the header: we pick the one directly above the last defined object type */
84 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
86 static int journal_file_set_online(JournalFile
*f
) {
92 if (!(f
->fd
>= 0 && f
->header
))
95 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
98 switch(f
->header
->state
) {
103 f
->header
->state
= STATE_ONLINE
;
112 int journal_file_set_offline(JournalFile
*f
) {
118 if (!(f
->fd
>= 0 && f
->header
))
121 if (f
->header
->state
!= STATE_ONLINE
)
126 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
129 f
->header
->state
= STATE_OFFLINE
;
131 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
139 JournalFile
* journal_file_close(JournalFile
*f
) {
143 /* Write the final tag */
144 if (f
->seal
&& f
->writable
)
145 journal_file_append_tag(f
);
148 journal_file_set_offline(f
);
150 if (f
->mmap
&& f
->fd
>= 0)
151 mmap_cache_close_fd(f
->mmap
, f
->fd
);
153 if (f
->fd
>= 0 && f
->defrag_on_close
) {
155 /* Be friendly to btrfs: turn COW back on again now,
156 * and defragment the file. We won't write to the file
157 * ever again, hence remove all fragmentation, and
158 * reenable all the good bits COW usually provides
159 * (such as data checksumming). */
161 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
162 (void) btrfs_defrag_fd(f
->fd
);
169 mmap_cache_unref(f
->mmap
);
171 ordered_hashmap_free_free(f
->chain_cache
);
173 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
174 free(f
->compress_buffer
);
179 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
181 free(f
->fsprg_state
);
186 gcry_md_close(f
->hmac
);
193 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
200 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
201 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
203 h
.incompatible_flags
|= htole32(
204 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
205 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
207 h
.compatible_flags
= htole32(
208 f
->seal
* HEADER_COMPATIBLE_SEALED
);
210 r
= sd_id128_randomize(&h
.file_id
);
215 h
.seqnum_id
= template->header
->seqnum_id
;
216 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
218 h
.seqnum_id
= h
.file_id
;
220 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
230 static int journal_file_refresh_header(JournalFile
*f
) {
236 r
= sd_id128_get_machine(&f
->header
->machine_id
);
240 r
= sd_id128_get_boot(&boot_id
);
244 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
245 f
->tail_entry_monotonic_valid
= true;
247 f
->header
->boot_id
= boot_id
;
249 r
= journal_file_set_online(f
);
251 /* Sync the online state to disk */
257 static int journal_file_verify_header(JournalFile
*f
) {
262 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
265 /* In both read and write mode we refuse to open files with
266 * incompatible flags we don't know */
267 flags
= le32toh(f
->header
->incompatible_flags
);
268 if (flags
& ~HEADER_INCOMPATIBLE_SUPPORTED
) {
269 if (flags
& ~HEADER_INCOMPATIBLE_ANY
)
270 log_debug("Journal file %s has unknown incompatible flags %"PRIx32
,
271 f
->path
, flags
& ~HEADER_INCOMPATIBLE_ANY
);
272 flags
= (flags
& HEADER_INCOMPATIBLE_ANY
) & ~HEADER_INCOMPATIBLE_SUPPORTED
;
274 log_debug("Journal file %s uses incompatible flags %"PRIx32
275 " disabled at compilation time.", f
->path
, flags
);
276 return -EPROTONOSUPPORT
;
279 /* When open for writing we refuse to open files with
280 * compatible flags, too */
281 flags
= le32toh(f
->header
->compatible_flags
);
282 if (f
->writable
&& (flags
& ~HEADER_COMPATIBLE_SUPPORTED
)) {
283 if (flags
& ~HEADER_COMPATIBLE_ANY
)
284 log_debug("Journal file %s has unknown compatible flags %"PRIx32
,
285 f
->path
, flags
& ~HEADER_COMPATIBLE_ANY
);
286 flags
= (flags
& HEADER_COMPATIBLE_ANY
) & ~HEADER_COMPATIBLE_SUPPORTED
;
288 log_debug("Journal file %s uses compatible flags %"PRIx32
289 " disabled at compilation time.", f
->path
, flags
);
290 return -EPROTONOSUPPORT
;
293 if (f
->header
->state
>= _STATE_MAX
)
296 /* The first addition was n_data, so check that we are at least this large */
297 if (le64toh(f
->header
->header_size
) < HEADER_SIZE_MIN
)
300 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
303 if ((le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)) > (uint64_t) f
->last_stat
.st_size
)
306 if (le64toh(f
->header
->tail_object_offset
) > (le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)))
309 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
310 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
311 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
312 !VALID64(le64toh(f
->header
->entry_array_offset
)))
317 sd_id128_t machine_id
;
320 r
= sd_id128_get_machine(&machine_id
);
324 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
327 state
= f
->header
->state
;
329 if (state
== STATE_ONLINE
) {
330 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
332 } else if (state
== STATE_ARCHIVED
)
334 else if (state
!= STATE_OFFLINE
) {
335 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
340 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
341 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
343 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
/* Refresh the cached stat data of the journal file.
 *
 * Stores the fstat() result for f->fd in f->last_stat and records the
 * CLOCK_MONOTONIC time of the refresh in f->last_stat_usec. Afterwards the
 * link count is inspected so that callers do not keep appending to a file
 * that has already been unlinked.
 *
 * NOTE(review): the statements taken when fstat() fails and when st_nlink is
 * zero (presumably negative errno-style returns), as well as the final return,
 * are not visible in this excerpt -- confirm against the complete file. */
348 static int journal_file_fstat(JournalFile
*f
) {
352 if (fstat(f
->fd
, &f
->last_stat
) < 0)
355 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
357 /* Refuse appending to files that are already deleted */
358 if (f
->last_stat
.st_nlink
<= 0)
364 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
365 uint64_t old_size
, new_size
;
370 /* We assume that this file is not sparse, and we know that
371 * for sure, since we always call posix_fallocate()
374 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
378 le64toh(f
->header
->header_size
) +
379 le64toh(f
->header
->arena_size
);
381 new_size
= PAGE_ALIGN(offset
+ size
);
382 if (new_size
< le64toh(f
->header
->header_size
))
383 new_size
= le64toh(f
->header
->header_size
);
385 if (new_size
<= old_size
) {
387 /* We already pre-allocated enough space, but before
388 * we write to it, let's check with fstat() if the
389 * file got deleted, in order to make sure we don't throw
390 * away the data immediately. Don't check fstat() for
391 * all writes though, but at most once every LAST_STAT_REFRESH_USEC (5s). */
393 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
396 return journal_file_fstat(f
);
399 /* Allocate more space. */
401 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
404 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
407 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
410 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
412 if (new_size
- old_size
> available
)
417 /* Increase by larger blocks at once */
418 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
419 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
420 new_size
= f
->metrics
.max_size
;
422 /* Note that the glibc fallocate() fallback is very
423 inefficient, hence we try to minimize the allocation area
425 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
429 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
431 return journal_file_fstat(f
);
/* Translate an object type into the mmap cache context number to use for it.
 *
 * Types strictly between OBJECT_UNUSED and _OBJECT_TYPE_MAX are used as the
 * context number directly; every other value maps to context 0. */
434 static unsigned type_to_context(ObjectType type
) {
435 /* One context for each type, plus one catch-all for the rest */
/* Compile-time checks: all per-type contexts, and CONTEXT_HEADER (defined as
 * _OBJECT_TYPE_MAX), must be valid mmap cache context numbers. */
436 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
437 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
438 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
/* Obtain a memory window covering `size` bytes at `offset` of the journal file.
 *
 * The request is first checked against the cached file size in f->last_stat;
 * if it lies beyond that size the stat data is refreshed once and the check is
 * repeated. A range that is still out of bounds yields -EADDRNOTAVAIL.
 * Otherwise the call is forwarded to mmap_cache_get() with the context derived
 * from `type`, and its result is returned; `ret` is handed through unchanged.
 *
 * NOTE(review): `offset + size` is computed in uint64_t and no wrap-around
 * guard is visible in this excerpt; a huge `size` read from a corrupted file
 * could wrap and pass the bounds check -- confirm whether the elided lines
 * validate the arguments.
 * NOTE(review): the handling of the journal_file_fstat() result `r` is not
 * visible here (presumably an early return on error). */
441 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
) {
450 /* Avoid SIGBUS on invalid accesses */
451 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
452 /* Hmm, out of range? Let's refresh the fstat() data
453 * first, before we trust that check. */
455 r
= journal_file_fstat(f
);
459 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
460 return -EADDRNOTAVAIL
;
463 return mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
/* Return the smallest size an object of o's type may have.
 *
 * The size is looked up by o->object.type in a static table of the fixed
 * structure sizes. Types that lie outside the table, or that have no entry
 * (the slot is zero), fall back to sizeof(ObjectHeader). */
466 static uint64_t minimum_header_size(Object
*o
) {
/* Indexed by object type; slots not listed here are implicitly zero. */
468 static const uint64_t table
[] = {
469 [OBJECT_DATA
] = sizeof(DataObject
),
470 [OBJECT_FIELD
] = sizeof(FieldObject
),
471 [OBJECT_ENTRY
] = sizeof(EntryObject
),
472 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
473 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
474 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
475 [OBJECT_TAG
] = sizeof(TagObject
),
/* Unknown or unlisted type: only the generic object header is required. */
478 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
479 return sizeof(ObjectHeader
);
481 return table
[o
->object
.type
];
484 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
493 /* Objects may only be located at multiple of 64 bit */
494 if (!VALID64(offset
))
497 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
502 s
= le64toh(o
->object
.size
);
504 if (s
< sizeof(ObjectHeader
))
507 if (o
->object
.type
<= OBJECT_UNUSED
)
510 if (s
< minimum_header_size(o
))
513 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
516 if (s
> sizeof(ObjectHeader
)) {
517 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
528 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
533 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
536 /* If an external seqnum counter was passed, we update
537 * both the local and the external one, and set it to
538 * the maximum of both */
546 f
->header
->tail_entry_seqnum
= htole64(r
);
548 if (f
->header
->head_entry_seqnum
== 0)
549 f
->header
->head_entry_seqnum
= htole64(r
);
554 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
561 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
562 assert(size
>= sizeof(ObjectHeader
));
566 r
= journal_file_set_online(f
);
570 p
= le64toh(f
->header
->tail_object_offset
);
572 p
= le64toh(f
->header
->header_size
);
574 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
578 p
+= ALIGN64(le64toh(tail
->object
.size
));
581 r
= journal_file_allocate(f
, p
, size
);
585 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
592 o
->object
.type
= type
;
593 o
->object
.size
= htole64(size
);
595 f
->header
->tail_object_offset
= htole64(p
);
596 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
604 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
611 /* We estimate that we need 1 hash table entry per 768 bytes
612 of journal file and we want to make sure we never get
613 beyond 75% fill level. Calculate the hash table size for
614 the maximum file size based on these metrics. */
616 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
617 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
618 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
620 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
622 r
= journal_file_append_object(f
,
623 OBJECT_DATA_HASH_TABLE
,
624 offsetof(Object
, hash_table
.items
) + s
,
629 memzero(o
->hash_table
.items
, s
);
631 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
632 f
->header
->data_hash_table_size
= htole64(s
);
637 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
644 /* We use a fixed size hash table for the fields as this
645 * number should grow very slowly only */
647 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
648 r
= journal_file_append_object(f
,
649 OBJECT_FIELD_HASH_TABLE
,
650 offsetof(Object
, hash_table
.items
) + s
,
655 memzero(o
->hash_table
.items
, s
);
657 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
658 f
->header
->field_hash_table_size
= htole64(s
);
663 int journal_file_map_data_hash_table(JournalFile
*f
) {
670 if (f
->data_hash_table
)
673 p
= le64toh(f
->header
->data_hash_table_offset
);
674 s
= le64toh(f
->header
->data_hash_table_size
);
676 r
= journal_file_move_to(f
,
677 OBJECT_DATA_HASH_TABLE
,
684 f
->data_hash_table
= t
;
688 int journal_file_map_field_hash_table(JournalFile
*f
) {
695 if (f
->field_hash_table
)
698 p
= le64toh(f
->header
->field_hash_table_offset
);
699 s
= le64toh(f
->header
->field_hash_table_size
);
701 r
= journal_file_move_to(f
,
702 OBJECT_FIELD_HASH_TABLE
,
709 f
->field_hash_table
= t
;
713 static int journal_file_link_field(
726 if (o
->object
.type
!= OBJECT_FIELD
)
729 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
733 /* This might alter the window we are looking at */
734 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
737 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
739 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
741 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
745 o
->field
.next_hash_offset
= htole64(offset
);
748 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
750 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
751 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
756 static int journal_file_link_data(
769 if (o
->object
.type
!= OBJECT_DATA
)
772 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
776 /* This might alter the window we are looking at */
777 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
778 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
779 o
->data
.n_entries
= 0;
782 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
784 /* Only entry in the hash table is easy */
785 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
787 /* Move back to the previous data object, to patch in
790 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
794 o
->data
.next_hash_offset
= htole64(offset
);
797 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
799 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
800 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
805 int journal_file_find_field_object_with_hash(
807 const void *field
, uint64_t size
, uint64_t hash
,
808 Object
**ret
, uint64_t *offset
) {
810 uint64_t p
, osize
, h
, m
;
814 assert(field
&& size
> 0);
816 /* If the field hash table is empty, we can't find anything */
817 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
820 /* Map the field hash table, if it isn't mapped yet. */
821 r
= journal_file_map_field_hash_table(f
);
825 osize
= offsetof(Object
, field
.payload
) + size
;
827 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
832 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
837 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
841 if (le64toh(o
->field
.hash
) == hash
&&
842 le64toh(o
->object
.size
) == osize
&&
843 memcmp(o
->field
.payload
, field
, size
) == 0) {
853 p
= le64toh(o
->field
.next_hash_offset
);
859 int journal_file_find_field_object(
861 const void *field
, uint64_t size
,
862 Object
**ret
, uint64_t *offset
) {
867 assert(field
&& size
> 0);
869 hash
= hash64(field
, size
);
871 return journal_file_find_field_object_with_hash(f
,
876 int journal_file_find_data_object_with_hash(
878 const void *data
, uint64_t size
, uint64_t hash
,
879 Object
**ret
, uint64_t *offset
) {
881 uint64_t p
, osize
, h
, m
;
885 assert(data
|| size
== 0);
887 /* If there's no data hash table, then there's no entry. */
888 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
891 /* Map the data hash table, if it isn't mapped yet. */
892 r
= journal_file_map_data_hash_table(f
);
896 osize
= offsetof(Object
, data
.payload
) + size
;
898 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
903 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
908 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
912 if (le64toh(o
->data
.hash
) != hash
)
915 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
916 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
920 l
= le64toh(o
->object
.size
);
921 if (l
<= offsetof(Object
, data
.payload
))
924 l
-= offsetof(Object
, data
.payload
);
926 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
927 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
932 memcmp(f
->compress_buffer
, data
, size
) == 0) {
943 return -EPROTONOSUPPORT
;
945 } else if (le64toh(o
->object
.size
) == osize
&&
946 memcmp(o
->data
.payload
, data
, size
) == 0) {
958 p
= le64toh(o
->data
.next_hash_offset
);
964 int journal_file_find_data_object(
966 const void *data
, uint64_t size
,
967 Object
**ret
, uint64_t *offset
) {
972 assert(data
|| size
== 0);
974 hash
= hash64(data
, size
);
976 return journal_file_find_data_object_with_hash(f
,
981 static int journal_file_append_field(
983 const void *field
, uint64_t size
,
984 Object
**ret
, uint64_t *offset
) {
992 assert(field
&& size
> 0);
994 hash
= hash64(field
, size
);
996 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1010 osize
= offsetof(Object
, field
.payload
) + size
;
1011 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1015 o
->field
.hash
= htole64(hash
);
1016 memcpy(o
->field
.payload
, field
, size
);
1018 r
= journal_file_link_field(f
, o
, p
, hash
);
1022 /* The linking might have altered the window, so let's
1023 * refresh our pointer */
1024 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1029 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1043 static int journal_file_append_data(
1045 const void *data
, uint64_t size
,
1046 Object
**ret
, uint64_t *offset
) {
1051 int r
, compression
= 0;
1055 assert(data
|| size
== 0);
1057 hash
= hash64(data
, size
);
1059 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1073 osize
= offsetof(Object
, data
.payload
) + size
;
1074 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1078 o
->data
.hash
= htole64(hash
);
1080 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1081 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1084 compression
= compress_blob(data
, size
, o
->data
.payload
, &rsize
);
1086 if (compression
>= 0) {
1087 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1088 o
->object
.flags
|= compression
;
1090 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1091 size
, rsize
, object_compressed_to_string(compression
));
1093 /* Compression didn't work, we don't really care why, let's continue without compression */
1098 if (compression
== 0 && size
> 0)
1099 memcpy(o
->data
.payload
, data
, size
);
1101 r
= journal_file_link_data(f
, o
, p
, hash
);
1105 /* The linking might have altered the window, so let's
1106 * refresh our pointer */
1107 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1114 eq
= memchr(data
, '=', size
);
1115 if (eq
&& eq
> data
) {
1119 /* Create field object ... */
1120 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1124 /* ... and link it in. */
1125 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1126 fo
->field
.head_data_offset
= le64toh(p
);
1130 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
/* Number of EntryItem slots stored in an entry object.
 *
 * Computed from the little-endian object size: the bytes following the fixed
 * part of the object (up to entry.items) divided by sizeof(EntryItem).
 *
 * NOTE(review): the statement executed when o is not an OBJECT_ENTRY is not
 * visible in this excerpt (presumably `return 0;`) -- confirm.
 * NOTE(review): the subtraction assumes object.size is at least
 * offsetof(Object, entry.items); no such check is visible here. */
1144 uint64_t journal_file_entry_n_items(Object
*o
) {
1147 if (o
->object
.type
!= OBJECT_ENTRY
)
1150 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
/* Number of 64-bit offset slots stored in an entry array object.
 *
 * Computed from the little-endian object size: the bytes following the fixed
 * part of the object (up to entry_array.items) divided by sizeof(uint64_t).
 *
 * NOTE(review): the statement executed when o is not an OBJECT_ENTRY_ARRAY is
 * not visible in this excerpt (presumably `return 0;`) -- confirm.
 * NOTE(review): the subtraction assumes object.size is at least
 * offsetof(Object, entry_array.items); no such check is visible here. */
1153 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1156 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1159 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
/* Number of HashItem buckets stored in a hash table object.
 *
 * Applies to both data and field hash tables. Computed from the little-endian
 * object size: the bytes following the fixed part of the object (up to
 * hash_table.items) divided by sizeof(HashItem).
 *
 * NOTE(review): the statement executed when o is neither kind of hash table is
 * not visible in this excerpt (presumably `return 0;`) -- confirm.
 * NOTE(review): the subtraction assumes object.size is at least
 * offsetof(Object, hash_table.items); no such check is visible here. */
1162 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1165 if (o
->object
.type
!= OBJECT_DATA_HASH_TABLE
&&
1166 o
->object
.type
!= OBJECT_FIELD_HASH_TABLE
)
1169 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1172 static int link_entry_into_array(JournalFile
*f
,
1177 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1185 a
= le64toh(*first
);
1186 i
= hidx
= le64toh(*idx
);
1189 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1193 n
= journal_file_entry_array_n_items(o
);
1195 o
->entry_array
.items
[i
] = htole64(p
);
1196 *idx
= htole64(hidx
+ 1);
1202 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1213 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1214 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1220 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1225 o
->entry_array
.items
[i
] = htole64(p
);
1228 *first
= htole64(q
);
1230 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1234 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1237 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1238 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1240 *idx
= htole64(hidx
+ 1);
1245 static int link_entry_into_array_plus_one(JournalFile
*f
,
1260 *extra
= htole64(p
);
1264 i
= htole64(le64toh(*idx
) - 1);
1265 r
= link_entry_into_array(f
, first
, &i
, p
);
1270 *idx
= htole64(le64toh(*idx
) + 1);
1274 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1281 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1285 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1289 return link_entry_into_array_plus_one(f
,
1290 &o
->data
.entry_offset
,
1291 &o
->data
.entry_array_offset
,
1296 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1304 if (o
->object
.type
!= OBJECT_ENTRY
)
1307 __sync_synchronize();
1309 /* Link up the entry itself */
1310 r
= link_entry_into_array(f
,
1311 &f
->header
->entry_array_offset
,
1312 &f
->header
->n_entries
,
1317 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1319 if (f
->header
->head_entry_realtime
== 0)
1320 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1322 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1323 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1325 f
->tail_entry_monotonic_valid
= true;
1327 /* Link up the items */
1328 n
= journal_file_entry_n_items(o
);
1329 for (i
= 0; i
< n
; i
++) {
1330 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1338 static int journal_file_append_entry_internal(
1340 const dual_timestamp
*ts
,
1342 const EntryItem items
[], unsigned n_items
,
1344 Object
**ret
, uint64_t *offset
) {
1351 assert(items
|| n_items
== 0);
1354 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1356 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1360 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1361 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1362 o
->entry
.realtime
= htole64(ts
->realtime
);
1363 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1364 o
->entry
.xor_hash
= htole64(xor_hash
);
1365 o
->entry
.boot_id
= f
->header
->boot_id
;
1368 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1373 r
= journal_file_link_entry(f
, o
, np
);
/* Notify file watchers that the journal file was modified.
 *
 * Truncates the file to the size recorded in f->last_stat.st_size, i.e. to the
 * size it already has, purely for the notification side effect described
 * below. A failing ftruncate() is only logged; nothing is returned. */
1386 void journal_file_post_change(JournalFile
*f
) {
1389 /* inotify() does not receive IN_MODIFY events from file
1390 * accesses done via mmap(). After each access we hence
1391 * trigger IN_MODIFY by truncating the journal file to its
1392 * current size which triggers IN_MODIFY. */
/* Full memory barrier before the notification is triggered; presumably so
 * that all writes through the mapping are issued first -- TODO confirm. */
1394 __sync_synchronize();
1396 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1397 log_error_errno(errno
, "Failed to truncate file to its own size: %m");
/* Comparison callback for sorting EntryItem elements by their object offset.
 *
 * Both arguments point to EntryItem structures; the little-endian
 * object_offset fields are converted to host order before being compared.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably negative/positive/zero for less/greater/equal as expected of a
 * qsort()-style comparator -- confirm. */
1400 static int entry_item_cmp(const void *_a
, const void *_b
) {
1401 const EntryItem
*a
= _a
, *b
= _b
;
1403 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1405 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1410 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1414 uint64_t xor_hash
= 0;
1415 struct dual_timestamp _ts
;
1418 assert(iovec
|| n_iovec
== 0);
1421 dual_timestamp_get(&_ts
);
1425 if (f
->tail_entry_monotonic_valid
&&
1426 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
1430 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1435 /* alloca() can't take 0, hence let's allocate at least one */
1436 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1438 for (i
= 0; i
< n_iovec
; i
++) {
1442 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1446 xor_hash
^= le64toh(o
->data
.hash
);
1447 items
[i
].object_offset
= htole64(p
);
1448 items
[i
].hash
= o
->data
.hash
;
1451 /* Order by the position on disk, in order to improve seek
1452 * times for rotating media. */
1453 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
1455 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
1457 /* If the memory mapping triggered a SIGBUS then we return an
1458 * IO error and ignore the error code passed down to us, since
1459 * it is very likely just an effect of a nullified replacement
1462 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
1465 journal_file_post_change(f
);
1470 typedef struct ChainCacheItem
{
1471 uint64_t first
; /* the array at the beginning of the chain */
1472 uint64_t array
; /* the cached array */
1473 uint64_t begin
; /* the first item in the cached array */
1474 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
1475 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
1478 static void chain_cache_put(
1485 uint64_t last_index
) {
1488 /* If the chain item to cache for this chain is the
1489 * first one it's not worth caching anything */
1493 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
1494 ci
= ordered_hashmap_steal_first(h
);
1497 ci
= new(ChainCacheItem
, 1);
1504 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
1509 assert(ci
->first
== first
);
1514 ci
->last_index
= last_index
;
1517 static int generic_array_get(
1521 Object
**ret
, uint64_t *offset
) {
1524 uint64_t p
= 0, a
, t
= 0;
1532 /* Try the chain cache first */
1533 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1534 if (ci
&& i
> ci
->total
) {
1543 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1547 k
= journal_file_entry_array_n_items(o
);
1549 p
= le64toh(o
->entry_array
.items
[i
]);
1555 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1561 /* Let's cache this item for the next invocation */
1562 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
1564 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1577 static int generic_array_get_plus_one(
1582 Object
**ret
, uint64_t *offset
) {
1591 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1604 return generic_array_get(f
, first
, i
-1, ret
, offset
);
1613 static int generic_array_bisect(
1618 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1619 direction_t direction
,
1624 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
1625 bool subtract_one
= false;
1626 Object
*o
, *array
= NULL
;
1631 assert(test_object
);
1633 /* Start with the first array in the chain */
1636 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1637 if (ci
&& n
> ci
->total
) {
1638 /* Ah, we have iterated this bisection array chain
1639 * previously! Let's see if we can skip ahead in the
1640 * chain, as far as the last time. But we can't jump
1641 * backwards in the chain, so let's check that
1644 r
= test_object(f
, ci
->begin
, needle
);
1648 if (r
== TEST_LEFT
) {
1649 /* OK, what we are looking for is right of the
1650 * begin of this EntryArray, so let's jump
1651 * straight to previously cached array in the
1657 last_index
= ci
->last_index
;
1662 uint64_t left
, right
, k
, lp
;
1664 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1668 k
= journal_file_entry_array_n_items(array
);
1674 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1678 r
= test_object(f
, p
, needle
);
1682 if (r
== TEST_FOUND
)
1683 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1685 if (r
== TEST_RIGHT
) {
1689 if (last_index
!= (uint64_t) -1) {
1690 assert(last_index
<= right
);
1692 /* If we cached the last index we
1693 * looked at, let's try to not to jump
1694 * too wildly around and see if we can
1695 * limit the range to look at early to
1696 * the immediate neighbors of the last
1697 * index we looked at. */
1699 if (last_index
> 0) {
1700 uint64_t x
= last_index
- 1;
1702 p
= le64toh(array
->entry_array
.items
[x
]);
1706 r
= test_object(f
, p
, needle
);
1710 if (r
== TEST_FOUND
)
1711 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1713 if (r
== TEST_RIGHT
)
1719 if (last_index
< right
) {
1720 uint64_t y
= last_index
+ 1;
1722 p
= le64toh(array
->entry_array
.items
[y
]);
1726 r
= test_object(f
, p
, needle
);
1730 if (r
== TEST_FOUND
)
1731 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1733 if (r
== TEST_RIGHT
)
1741 if (left
== right
) {
1742 if (direction
== DIRECTION_UP
)
1743 subtract_one
= true;
1749 assert(left
< right
);
1750 i
= (left
+ right
) / 2;
1752 p
= le64toh(array
->entry_array
.items
[i
]);
1756 r
= test_object(f
, p
, needle
);
1760 if (r
== TEST_FOUND
)
1761 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1763 if (r
== TEST_RIGHT
)
1771 if (direction
== DIRECTION_UP
) {
1773 subtract_one
= true;
1784 last_index
= (uint64_t) -1;
1785 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1791 if (subtract_one
&& t
== 0 && i
== 0)
1794 /* Let's cache this item for the next invocation */
1795 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
1797 if (subtract_one
&& i
== 0)
1799 else if (subtract_one
)
1800 p
= le64toh(array
->entry_array
.items
[i
-1]);
1802 p
= le64toh(array
->entry_array
.items
[i
]);
1804 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1815 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
1820 static int generic_array_bisect_plus_one(
1826 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1827 direction_t direction
,
1833 bool step_back
= false;
1837 assert(test_object
);
1842 /* This bisects the array in object 'first', but first checks
1844 r
= test_object(f
, extra
, needle
);
1848 if (r
== TEST_FOUND
)
1849 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1851 /* if we are looking with DIRECTION_UP then we need to first
1852 see if in the actual array there is a matching entry, and
1853 return the last one of that. But if there isn't any we need
1854 to return this one. Hence remember this, and return it
1857 step_back
= direction
== DIRECTION_UP
;
1859 if (r
== TEST_RIGHT
) {
1860 if (direction
== DIRECTION_DOWN
)
1866 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1868 if (r
== 0 && step_back
)
1877 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1893 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1899 else if (p
< needle
)
1905 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1912 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1916 if (le64toh(o
->entry
.seqnum
) == needle
)
1918 else if (le64toh(o
->entry
.seqnum
) < needle
)
1924 int journal_file_move_to_entry_by_seqnum(
1927 direction_t direction
,
1931 return generic_array_bisect(f
,
1932 le64toh(f
->header
->entry_array_offset
),
1933 le64toh(f
->header
->n_entries
),
1940 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1947 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1951 if (le64toh(o
->entry
.realtime
) == needle
)
1953 else if (le64toh(o
->entry
.realtime
) < needle
)
1959 int journal_file_move_to_entry_by_realtime(
1962 direction_t direction
,
1966 return generic_array_bisect(f
,
1967 le64toh(f
->header
->entry_array_offset
),
1968 le64toh(f
->header
->n_entries
),
1970 test_object_realtime
,
1975 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1982 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1986 if (le64toh(o
->entry
.monotonic
) == needle
)
1988 else if (le64toh(o
->entry
.monotonic
) < needle
)
1994 static int find_data_object_by_boot_id(
2000 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2002 sd_id128_to_string(boot_id
, t
+ 9);
2003 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2006 int journal_file_move_to_entry_by_monotonic(
2010 direction_t direction
,
2019 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2025 return generic_array_bisect_plus_one(f
,
2026 le64toh(o
->data
.entry_offset
),
2027 le64toh(o
->data
.entry_array_offset
),
2028 le64toh(o
->data
.n_entries
),
2030 test_object_monotonic
,
2035 void journal_file_reset_location(JournalFile
*f
) {
2036 f
->location_type
= LOCATION_HEAD
;
2037 f
->current_offset
= 0;
2038 f
->current_seqnum
= 0;
2039 f
->current_realtime
= 0;
2040 f
->current_monotonic
= 0;
2041 zero(f
->current_boot_id
);
2042 f
->current_xor_hash
= 0;
2045 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2046 f
->location_type
= LOCATION_SEEK
;
2047 f
->current_offset
= offset
;
2048 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2049 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2050 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2051 f
->current_boot_id
= o
->entry
.boot_id
;
2052 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
/* Orders the saved locations of two journal files 'af' and 'bf'. Both must
 * currently have location_type == LOCATION_SEEK (asserted below).
 *
 * The comparison is a cascade over the current_* fields:
 *   1. same boot ID, monotonic time, realtime and XOR hash: the entries are
 *      treated as identical, regardless of seqnum;
 *   2. same header seqnum_id: compare current_seqnum;
 *   3. same boot ID: compare current_monotonic;
 *   4. compare current_realtime;
 *   5. compare current_xor_hash.
 *
 * NOTE(review): the return statements of the individual branches are not part
 * of this excerpt; presumably a negative/zero/positive ordering result in the
 * style of a comparison callback - confirm against the full source. */
2055 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2058 assert(af
->location_type
== LOCATION_SEEK
);
2059 assert(bf
->location_type
== LOCATION_SEEK
);
2061 /* If contents and timestamps match, these entries are
2062 * identical, even if the seqnum does not match */
2063 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2064 af
->current_monotonic
== bf
->current_monotonic
&&
2065 af
->current_realtime
== bf
->current_realtime
&&
2066 af
->current_xor_hash
== bf
->current_xor_hash
)
2069 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2071 /* If this is from the same seqnum source, compare
2073 if (af
->current_seqnum
< bf
->current_seqnum
)
2075 if (af
->current_seqnum
> bf
->current_seqnum
)
2078 /* Wow! This is weird, different data but the same
2079 * seqnums? Something is borked, but let's make the
2080 * best of it and compare by time. */
2083 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2085 /* If the boot id matches, compare monotonic time */
2086 if (af
->current_monotonic
< bf
->current_monotonic
)
2088 if (af
->current_monotonic
> bf
->current_monotonic
)
2092 /* Otherwise, compare UTC time */
2093 if (af
->current_realtime
< bf
->current_realtime
)
2095 if (af
->current_realtime
> bf
->current_realtime
)
2098 /* Finally, compare by contents */
2099 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2101 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2107 int journal_file_next_entry(
2110 direction_t direction
,
2111 Object
**ret
, uint64_t *offset
) {
2118 n
= le64toh(f
->header
->n_entries
);
2123 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2125 r
= generic_array_bisect(f
,
2126 le64toh(f
->header
->entry_array_offset
),
2127 le64toh(f
->header
->n_entries
),
2136 if (direction
== DIRECTION_DOWN
) {
2149 /* And jump to it */
2150 r
= generic_array_get(f
,
2151 le64toh(f
->header
->entry_array_offset
),
2158 (direction
== DIRECTION_DOWN
? ofs
<= p
: ofs
>= p
)) {
2159 log_debug("%s: entry array corrupted at entry %"PRIu64
,
2170 int journal_file_next_entry_for_data(
2172 Object
*o
, uint64_t p
,
2173 uint64_t data_offset
,
2174 direction_t direction
,
2175 Object
**ret
, uint64_t *offset
) {
2182 assert(p
> 0 || !o
);
2184 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2188 n
= le64toh(d
->data
.n_entries
);
2193 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2195 if (o
->object
.type
!= OBJECT_ENTRY
)
2198 r
= generic_array_bisect_plus_one(f
,
2199 le64toh(d
->data
.entry_offset
),
2200 le64toh(d
->data
.entry_array_offset
),
2201 le64toh(d
->data
.n_entries
),
2211 if (direction
== DIRECTION_DOWN
) {
2225 return generic_array_get_plus_one(f
,
2226 le64toh(d
->data
.entry_offset
),
2227 le64toh(d
->data
.entry_array_offset
),
2232 int journal_file_move_to_entry_by_offset_for_data(
2234 uint64_t data_offset
,
2236 direction_t direction
,
2237 Object
**ret
, uint64_t *offset
) {
2244 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2248 return generic_array_bisect_plus_one(f
,
2249 le64toh(d
->data
.entry_offset
),
2250 le64toh(d
->data
.entry_array_offset
),
2251 le64toh(d
->data
.n_entries
),
2258 int journal_file_move_to_entry_by_monotonic_for_data(
2260 uint64_t data_offset
,
2263 direction_t direction
,
2264 Object
**ret
, uint64_t *offset
) {
2272 /* First, seek by time */
2273 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2279 r
= generic_array_bisect_plus_one(f
,
2280 le64toh(o
->data
.entry_offset
),
2281 le64toh(o
->data
.entry_array_offset
),
2282 le64toh(o
->data
.n_entries
),
2284 test_object_monotonic
,
2290 /* And now, continue seeking until we find an entry that
2291 * exists in both bisection arrays */
2297 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2301 r
= generic_array_bisect_plus_one(f
,
2302 le64toh(d
->data
.entry_offset
),
2303 le64toh(d
->data
.entry_array_offset
),
2304 le64toh(d
->data
.n_entries
),
2312 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2316 r
= generic_array_bisect_plus_one(f
,
2317 le64toh(o
->data
.entry_offset
),
2318 le64toh(o
->data
.entry_array_offset
),
2319 le64toh(o
->data
.n_entries
),
2341 int journal_file_move_to_entry_by_seqnum_for_data(
2343 uint64_t data_offset
,
2345 direction_t direction
,
2346 Object
**ret
, uint64_t *offset
) {
2353 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2357 return generic_array_bisect_plus_one(f
,
2358 le64toh(d
->data
.entry_offset
),
2359 le64toh(d
->data
.entry_array_offset
),
2360 le64toh(d
->data
.n_entries
),
2367 int journal_file_move_to_entry_by_realtime_for_data(
2369 uint64_t data_offset
,
2371 direction_t direction
,
2372 Object
**ret
, uint64_t *offset
) {
2379 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2383 return generic_array_bisect_plus_one(f
,
2384 le64toh(d
->data
.entry_offset
),
2385 le64toh(d
->data
.entry_array_offset
),
2386 le64toh(d
->data
.n_entries
),
2388 test_object_realtime
,
2393 void journal_file_dump(JournalFile
*f
) {
2400 journal_file_print_header(f
);
2402 p
= le64toh(f
->header
->header_size
);
2404 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
2408 switch (o
->object
.type
) {
2411 printf("Type: OBJECT_UNUSED\n");
2415 printf("Type: OBJECT_DATA\n");
2419 printf("Type: OBJECT_FIELD\n");
2423 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
2424 le64toh(o
->entry
.seqnum
),
2425 le64toh(o
->entry
.monotonic
),
2426 le64toh(o
->entry
.realtime
));
2429 case OBJECT_FIELD_HASH_TABLE
:
2430 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2433 case OBJECT_DATA_HASH_TABLE
:
2434 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2437 case OBJECT_ENTRY_ARRAY
:
2438 printf("Type: OBJECT_ENTRY_ARRAY\n");
2442 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
2443 le64toh(o
->tag
.seqnum
),
2444 le64toh(o
->tag
.epoch
));
2448 printf("Type: unknown (%i)\n", o
->object
.type
);
2452 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
2453 printf("Flags: %s\n",
2454 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
2456 if (p
== le64toh(f
->header
->tail_object_offset
))
2459 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
2464 log_error("File corrupt");
2467 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
2470 x
= format_timestamp(buf
, l
, t
);
2476 void journal_file_print_header(JournalFile
*f
) {
2477 char a
[33], b
[33], c
[33], d
[33];
2478 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
2480 char bytes
[FORMAT_BYTES_MAX
];
2484 printf("File Path: %s\n"
2488 "Sequential Number ID: %s\n"
2490 "Compatible Flags:%s%s\n"
2491 "Incompatible Flags:%s%s%s\n"
2492 "Header size: %"PRIu64
"\n"
2493 "Arena size: %"PRIu64
"\n"
2494 "Data Hash Table Size: %"PRIu64
"\n"
2495 "Field Hash Table Size: %"PRIu64
"\n"
2496 "Rotate Suggested: %s\n"
2497 "Head Sequential Number: %"PRIu64
"\n"
2498 "Tail Sequential Number: %"PRIu64
"\n"
2499 "Head Realtime Timestamp: %s\n"
2500 "Tail Realtime Timestamp: %s\n"
2501 "Tail Monotonic Timestamp: %s\n"
2502 "Objects: %"PRIu64
"\n"
2503 "Entry Objects: %"PRIu64
"\n",
2505 sd_id128_to_string(f
->header
->file_id
, a
),
2506 sd_id128_to_string(f
->header
->machine_id
, b
),
2507 sd_id128_to_string(f
->header
->boot_id
, c
),
2508 sd_id128_to_string(f
->header
->seqnum_id
, d
),
2509 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
2510 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
2511 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
2512 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
2513 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
2514 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
2515 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
2516 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
2517 le64toh(f
->header
->header_size
),
2518 le64toh(f
->header
->arena_size
),
2519 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
2520 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
2521 yes_no(journal_file_rotate_suggested(f
, 0)),
2522 le64toh(f
->header
->head_entry_seqnum
),
2523 le64toh(f
->header
->tail_entry_seqnum
),
2524 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)),
2525 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)),
2526 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
),
2527 le64toh(f
->header
->n_objects
),
2528 le64toh(f
->header
->n_entries
));
2530 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
2531 printf("Data Objects: %"PRIu64
"\n"
2532 "Data Hash Table Fill: %.1f%%\n",
2533 le64toh(f
->header
->n_data
),
2534 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
2536 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
2537 printf("Field Objects: %"PRIu64
"\n"
2538 "Field Hash Table Fill: %.1f%%\n",
2539 le64toh(f
->header
->n_fields
),
2540 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
2542 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
2543 printf("Tag Objects: %"PRIu64
"\n",
2544 le64toh(f
->header
->n_tags
));
2545 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
2546 printf("Entry Array Objects: %"PRIu64
"\n",
2547 le64toh(f
->header
->n_entry_arrays
));
2549 if (fstat(f
->fd
, &st
) >= 0)
2550 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
2553 static int journal_file_warn_btrfs(JournalFile
*f
) {
2559 /* Before we write anything, check if the COW logic is turned
2560 * off on btrfs. Given our write pattern that is quite
2561 * unfriendly to COW file systems this should greatly improve
2562 * performance on COW file systems, such as btrfs, at the
2563 * expense of data integrity features (which shouldn't be too
2564 * bad, given that we do our own checksumming). */
2566 r
= btrfs_is_filesystem(f
->fd
);
2568 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
2572 r
= read_attr_fd(f
->fd
, &attrs
);
2574 return log_warning_errno(r
, "Failed to read file attributes: %m");
2576 if (attrs
& FS_NOCOW_FL
) {
2577 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2581 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2582 "This is likely to slow down journal access substantially, please consider turning "
2583 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
2588 int journal_file_open(
2594 JournalMetrics
*metrics
,
2595 MMapCache
*mmap_cache
,
2596 JournalFile
*template,
2597 JournalFile
**ret
) {
2599 bool newly_created
= false;
2607 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
2608 (flags
& O_ACCMODE
) != O_RDWR
)
2611 if (!endswith(fname
, ".journal") &&
2612 !endswith(fname
, ".journal~"))
2615 f
= new0(JournalFile
, 1);
2623 f
->prot
= prot_from_flags(flags
);
2624 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
2625 #if defined(HAVE_LZ4)
2626 f
->compress_lz4
= compress
;
2627 #elif defined(HAVE_XZ)
2628 f
->compress_xz
= compress
;
2635 f
->mmap
= mmap_cache_ref(mmap_cache
);
2637 f
->mmap
= mmap_cache_new();
2644 f
->path
= strdup(fname
);
2650 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
2651 if (!f
->chain_cache
) {
2656 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
2662 r
= journal_file_fstat(f
);
2666 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
2668 (void) journal_file_warn_btrfs(f
);
2670 /* Let's attach the creation time to the journal file,
2671 * so that the vacuuming code knows the age of this
2672 * file even if the file might end up corrupted one
2673 * day... Ideally we'd just use the creation time many
2674 * file systems maintain for each file, but there is
2675 * currently no usable API to query this, hence let's
2676 * emulate this via extended attributes. If extended
2677 * attributes are not supported we'll just skip this,
2678 * and rely solely on mtime/atime/ctime of the file. */
2680 fd_setcrtime(f
->fd
, 0);
2683 /* Try to load the FSPRG state, and if we can't, then
2684 * just don't do sealing */
2686 r
= journal_file_fss_load(f
);
2692 r
= journal_file_init_header(f
, template);
2696 r
= journal_file_fstat(f
);
2700 newly_created
= true;
2703 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
2708 r
= mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
);
2714 if (!newly_created
) {
2715 r
= journal_file_verify_header(f
);
2721 if (!newly_created
&& f
->writable
) {
2722 r
= journal_file_fss_load(f
);
2730 journal_default_metrics(metrics
, f
->fd
);
2731 f
->metrics
= *metrics
;
2732 } else if (template)
2733 f
->metrics
= template->metrics
;
2735 r
= journal_file_refresh_header(f
);
2741 r
= journal_file_hmac_setup(f
);
2746 if (newly_created
) {
2747 r
= journal_file_setup_field_hash_table(f
);
2751 r
= journal_file_setup_data_hash_table(f
);
2756 r
= journal_file_append_first_tag(f
);
2762 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
)) {
2771 if (f
->fd
>= 0 && mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
2774 journal_file_close(f
);
2779 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
) {
2780 _cleanup_free_
char *p
= NULL
;
2782 JournalFile
*old_file
, *new_file
= NULL
;
2790 if (!old_file
->writable
)
2793 if (!endswith(old_file
->path
, ".journal"))
2796 l
= strlen(old_file
->path
);
2797 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
2798 (int) l
- 8, old_file
->path
,
2799 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
2800 le64toh((*f
)->header
->head_entry_seqnum
),
2801 le64toh((*f
)->header
->head_entry_realtime
));
2805 /* Try to rename the file to the archived version. If the file
2806 * already was deleted, we'll get ENOENT, let's ignore that
2808 r
= rename(old_file
->path
, p
);
2809 if (r
< 0 && errno
!= ENOENT
)
2812 old_file
->header
->state
= STATE_ARCHIVED
;
2814 /* Currently, btrfs is not very good with our write patterns
2815 * and fragments heavily. Let's defrag our journal files when
2816 * we archive them */
2817 old_file
->defrag_on_close
= true;
2819 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, old_file
, &new_file
);
2820 journal_file_close(old_file
);
2826 int journal_file_open_reliably(
2832 JournalMetrics
*metrics
,
2833 MMapCache
*mmap_cache
,
2834 JournalFile
*template,
2835 JournalFile
**ret
) {
2839 _cleanup_free_
char *p
= NULL
;
2841 r
= journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2843 -EBADMSG
, /* corrupted */
2844 -ENODATA
, /* truncated */
2845 -EHOSTDOWN
, /* other machine */
2846 -EPROTONOSUPPORT
, /* incompatible feature */
2847 -EBUSY
, /* unclean shutdown */
2848 -ESHUTDOWN
, /* already archived */
2849 -EIO
, /* IO error, including SIGBUS on mmap */
2850 -EIDRM
/* File has been deleted */))
2853 if ((flags
& O_ACCMODE
) == O_RDONLY
)
2856 if (!(flags
& O_CREAT
))
2859 if (!endswith(fname
, ".journal"))
2862 /* The file is corrupted. Rotate it away and try it again (but only once) */
2865 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
2867 now(CLOCK_REALTIME
),
2871 if (rename(fname
, p
) < 0)
2874 /* btrfs doesn't cope well with our write pattern and
2875 * fragments heavily. Let's defrag all files we rotate */
2877 (void) chattr_path(p
, false, FS_NOCOW_FL
);
2878 (void) btrfs_defrag(p
);
2880 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
2882 return journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2885 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
2887 uint64_t q
, xor_hash
= 0;
2900 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
2901 ts
.realtime
= le64toh(o
->entry
.realtime
);
2903 n
= journal_file_entry_n_items(o
);
2904 /* alloca() can't take 0, hence let's allocate at least one */
2905 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
2907 for (i
= 0; i
< n
; i
++) {
2914 q
= le64toh(o
->entry
.items
[i
].object_offset
);
2915 le_hash
= o
->entry
.items
[i
].hash
;
2917 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
2921 if (le_hash
!= o
->data
.hash
)
2924 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
2927 /* We hit the limit on 32bit machines */
2928 if ((uint64_t) t
!= l
)
2931 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
2932 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2935 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
2936 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
2940 data
= from
->compress_buffer
;
2943 return -EPROTONOSUPPORT
;
2946 data
= o
->data
.payload
;
2948 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
2952 xor_hash
^= le64toh(u
->data
.hash
);
2953 items
[i
].object_offset
= htole64(h
);
2954 items
[i
].hash
= u
->data
.hash
;
2956 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
2961 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
2963 if (mmap_cache_got_sigbus(to
->mmap
, to
->fd
))
2969 void journal_reset_metrics(JournalMetrics
*m
) {
2972 /* Set everything to "pick automatic values". */
2974 *m
= (JournalMetrics
) {
2975 .min_use
= (uint64_t) -1,
2976 .max_use
= (uint64_t) -1,
2977 .min_size
= (uint64_t) -1,
2978 .max_size
= (uint64_t) -1,
2979 .keep_free
= (uint64_t) -1,
2980 .n_max_files
= (uint64_t) -1,
2984 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
2985 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
2992 if (fstatvfs(fd
, &ss
) >= 0)
2993 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
2995 log_debug_errno(errno
, "Failed to detremine disk size: %m");
2999 if (m
->max_use
== (uint64_t) -1) {
3002 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3004 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3005 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3007 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3008 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3010 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3012 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3014 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3015 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3018 if (m
->min_use
== (uint64_t) -1)
3019 m
->min_use
= DEFAULT_MIN_USE
;
3021 if (m
->min_use
> m
->max_use
)
3022 m
->min_use
= m
->max_use
;
3024 if (m
->max_size
== (uint64_t) -1) {
3025 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3027 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3028 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3030 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3032 if (m
->max_size
!= 0) {
3033 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3034 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3036 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3037 m
->max_use
= m
->max_size
*2;
3040 if (m
->min_size
== (uint64_t) -1)
3041 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3043 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3045 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3046 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3048 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3049 m
->max_size
= m
->min_size
;
3052 if (m
->keep_free
== (uint64_t) -1) {
3055 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3057 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3058 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3061 m
->keep_free
= DEFAULT_KEEP_FREE
;
3064 if (m
->n_max_files
== (uint64_t) -1)
3065 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3067 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3068 format_bytes(a
, sizeof(a
), m
->min_use
),
3069 format_bytes(b
, sizeof(b
), m
->max_use
),
3070 format_bytes(c
, sizeof(c
), m
->max_size
),
3071 format_bytes(d
, sizeof(d
), m
->min_size
),
3072 format_bytes(e
, sizeof(e
), m
->keep_free
),
3076 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3081 if (f
->header
->head_entry_realtime
== 0)
3084 *from
= le64toh(f
->header
->head_entry_realtime
);
3088 if (f
->header
->tail_entry_realtime
== 0)
3091 *to
= le64toh(f
->header
->tail_entry_realtime
);
3097 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3105 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3109 if (le64toh(o
->data
.n_entries
) <= 0)
3113 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3117 *from
= le64toh(o
->entry
.monotonic
);
3121 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3125 r
= generic_array_get_plus_one(f
,
3126 le64toh(o
->data
.entry_offset
),
3127 le64toh(o
->data
.entry_array_offset
),
3128 le64toh(o
->data
.n_entries
)-1,
3133 *to
= le64toh(o
->entry
.monotonic
);
3139 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3142 /* If we gained new header fields we gained new features,
3143 * hence suggest a rotation */
3144 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3145 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3149 /* Let's check if the hash tables grew over a certain fill
3150 * level (75%, borrowing this value from Java's hash table
3151 * implementation), and if so suggest a rotation. To calculate
3152 * the fill level we need the n_data field, which only exists
3153 * in newer versions. */
3155 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3156 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3157 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3159 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3160 le64toh(f
->header
->n_data
),
3161 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3162 (unsigned long long) f
->last_stat
.st_size
,
3163 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3167 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3168 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3169 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3171 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3172 le64toh(f
->header
->n_fields
),
3173 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3177 /* Are the data objects properly indexed by field objects? */
3178 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3179 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3180 le64toh(f
->header
->n_data
) > 0 &&
3181 le64toh(f
->header
->n_fields
) == 0)
3184 if (max_file_usec
> 0) {
3187 h
= le64toh(f
->header
->head_entry_realtime
);
3188 t
= now(CLOCK_REALTIME
);
3190 if (h
> 0 && t
> h
+ max_file_usec
)