1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "btrfs-util.h"
32 #include "chattr-util.h"
35 #include "journal-authenticate.h"
36 #include "journal-def.h"
37 #include "journal-file.h"
39 #include "parse-util.h"
40 #include "random-util.h"
41 #include "string-util.h"
42 #include "xattr-util.h"
/* Default size (in bytes) of the data hash table. Used as the lower bound
 * when the table is sized from the maximum file size. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))

/* Size (in bytes) of the field hash table, which is always allocated with
 * this fixed size. */
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for compression. */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)                   /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)                   /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)           /* 4 GiB */

/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                         /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)                /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)         /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                            /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)                      /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header: we pick one above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
89 static int journal_file_set_online(JournalFile
*f
) {
95 if (!(f
->fd
>= 0 && f
->header
))
98 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
101 switch(f
->header
->state
) {
106 f
->header
->state
= STATE_ONLINE
;
115 int journal_file_set_offline(JournalFile
*f
) {
121 if (!(f
->fd
>= 0 && f
->header
))
124 if (f
->header
->state
!= STATE_ONLINE
)
129 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
132 f
->header
->state
= STATE_OFFLINE
;
134 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
142 JournalFile
* journal_file_close(JournalFile
*f
) {
146 /* Write the final tag */
147 if (f
->seal
&& f
->writable
)
148 journal_file_append_tag(f
);
151 journal_file_set_offline(f
);
153 if (f
->mmap
&& f
->fd
>= 0)
154 mmap_cache_close_fd(f
->mmap
, f
->fd
);
156 if (f
->fd
>= 0 && f
->defrag_on_close
) {
158 /* Be friendly to btrfs: turn COW back on again now,
159 * and defragment the file. We won't write to the file
160 * ever again, hence remove all fragmentation, and
161 * reenable all the good bits COW usually provides
162 * (such as data checksumming). */
164 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
165 (void) btrfs_defrag_fd(f
->fd
);
172 mmap_cache_unref(f
->mmap
);
174 ordered_hashmap_free_free(f
->chain_cache
);
176 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
177 free(f
->compress_buffer
);
182 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
184 free(f
->fsprg_state
);
189 gcry_md_close(f
->hmac
);
196 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
203 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
204 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
206 h
.incompatible_flags
|= htole32(
207 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
208 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
210 h
.compatible_flags
= htole32(
211 f
->seal
* HEADER_COMPATIBLE_SEALED
);
213 r
= sd_id128_randomize(&h
.file_id
);
218 h
.seqnum_id
= template->header
->seqnum_id
;
219 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
221 h
.seqnum_id
= h
.file_id
;
223 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
233 static int journal_file_refresh_header(JournalFile
*f
) {
239 r
= sd_id128_get_machine(&f
->header
->machine_id
);
243 r
= sd_id128_get_boot(&boot_id
);
247 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
248 f
->tail_entry_monotonic_valid
= true;
250 f
->header
->boot_id
= boot_id
;
252 r
= journal_file_set_online(f
);
254 /* Sync the online state to disk */
260 static int journal_file_verify_header(JournalFile
*f
) {
265 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
268 /* In both read and write mode we refuse to open files with
269 * incompatible flags we don't know */
270 flags
= le32toh(f
->header
->incompatible_flags
);
271 if (flags
& ~HEADER_INCOMPATIBLE_SUPPORTED
) {
272 if (flags
& ~HEADER_INCOMPATIBLE_ANY
)
273 log_debug("Journal file %s has unknown incompatible flags %"PRIx32
,
274 f
->path
, flags
& ~HEADER_INCOMPATIBLE_ANY
);
275 flags
= (flags
& HEADER_INCOMPATIBLE_ANY
) & ~HEADER_INCOMPATIBLE_SUPPORTED
;
277 log_debug("Journal file %s uses incompatible flags %"PRIx32
278 " disabled at compilation time.", f
->path
, flags
);
279 return -EPROTONOSUPPORT
;
282 /* When open for writing we refuse to open files with
283 * compatible flags, too */
284 flags
= le32toh(f
->header
->compatible_flags
);
285 if (f
->writable
&& (flags
& ~HEADER_COMPATIBLE_SUPPORTED
)) {
286 if (flags
& ~HEADER_COMPATIBLE_ANY
)
287 log_debug("Journal file %s has unknown compatible flags %"PRIx32
,
288 f
->path
, flags
& ~HEADER_COMPATIBLE_ANY
);
289 flags
= (flags
& HEADER_COMPATIBLE_ANY
) & ~HEADER_COMPATIBLE_SUPPORTED
;
291 log_debug("Journal file %s uses compatible flags %"PRIx32
292 " disabled at compilation time.", f
->path
, flags
);
293 return -EPROTONOSUPPORT
;
296 if (f
->header
->state
>= _STATE_MAX
)
299 /* The first addition was n_data, so check that we are at least this large */
300 if (le64toh(f
->header
->header_size
) < HEADER_SIZE_MIN
)
303 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
306 if ((le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)) > (uint64_t) f
->last_stat
.st_size
)
309 if (le64toh(f
->header
->tail_object_offset
) > (le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)))
312 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
313 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
314 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
315 !VALID64(le64toh(f
->header
->entry_array_offset
)))
320 sd_id128_t machine_id
;
323 r
= sd_id128_get_machine(&machine_id
);
327 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
330 state
= f
->header
->state
;
332 if (state
== STATE_ONLINE
) {
333 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
335 } else if (state
== STATE_ARCHIVED
)
337 else if (state
!= STATE_OFFLINE
) {
338 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
343 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
344 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
346 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
351 static int journal_file_fstat(JournalFile
*f
) {
355 if (fstat(f
->fd
, &f
->last_stat
) < 0)
358 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
360 /* Refuse appending to files that are already deleted */
361 if (f
->last_stat
.st_nlink
<= 0)
367 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
368 uint64_t old_size
, new_size
;
/* We assume that this file is not sparse, and we know that
 * for sure, since we always call posix_fallocate()
 * ourselves */
377 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
381 le64toh(f
->header
->header_size
) +
382 le64toh(f
->header
->arena_size
);
384 new_size
= PAGE_ALIGN(offset
+ size
);
385 if (new_size
< le64toh(f
->header
->header_size
))
386 new_size
= le64toh(f
->header
->header_size
);
388 if (new_size
<= old_size
) {
/* We already pre-allocated enough space, but before
 * we write to it, let's check with fstat() if the
 * file got deleted, in order to make sure we don't throw
 * away the data immediately. Don't check fstat() for
 * all writes though, but only once every
 * LAST_STAT_REFRESH_USEC (5s). */
396 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
399 return journal_file_fstat(f
);
402 /* Allocate more space. */
404 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
407 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
410 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
413 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
415 if (new_size
- old_size
> available
)
420 /* Increase by larger blocks at once */
421 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
422 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
423 new_size
= f
->metrics
.max_size
;
/* Note that the glibc fallocate() fallback is very
 * inefficient, hence we try to minimize the allocation area
 * as much as we can. */
428 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
432 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
434 return journal_file_fstat(f
);
437 static unsigned type_to_context(ObjectType type
) {
438 /* One context for each type, plus one catch-all for the rest */
439 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
440 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
441 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
444 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
) {
453 /* Avoid SIGBUS on invalid accesses */
454 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
455 /* Hmm, out of range? Let's refresh the fstat() data
456 * first, before we trust that check. */
458 r
= journal_file_fstat(f
);
462 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
463 return -EADDRNOTAVAIL
;
466 return mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
469 static uint64_t minimum_header_size(Object
*o
) {
471 static const uint64_t table
[] = {
472 [OBJECT_DATA
] = sizeof(DataObject
),
473 [OBJECT_FIELD
] = sizeof(FieldObject
),
474 [OBJECT_ENTRY
] = sizeof(EntryObject
),
475 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
476 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
477 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
478 [OBJECT_TAG
] = sizeof(TagObject
),
481 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
482 return sizeof(ObjectHeader
);
484 return table
[o
->object
.type
];
487 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
496 /* Objects may only be located at multiple of 64 bit */
497 if (!VALID64(offset
))
500 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
505 s
= le64toh(o
->object
.size
);
507 if (s
< sizeof(ObjectHeader
))
510 if (o
->object
.type
<= OBJECT_UNUSED
)
513 if (s
< minimum_header_size(o
))
516 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
519 if (s
> sizeof(ObjectHeader
)) {
520 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
531 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
536 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
539 /* If an external seqnum counter was passed, we update
540 * both the local and the external one, and set it to
541 * the maximum of both */
549 f
->header
->tail_entry_seqnum
= htole64(r
);
551 if (f
->header
->head_entry_seqnum
== 0)
552 f
->header
->head_entry_seqnum
= htole64(r
);
557 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
564 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
565 assert(size
>= sizeof(ObjectHeader
));
569 r
= journal_file_set_online(f
);
573 p
= le64toh(f
->header
->tail_object_offset
);
575 p
= le64toh(f
->header
->header_size
);
577 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
581 p
+= ALIGN64(le64toh(tail
->object
.size
));
584 r
= journal_file_allocate(f
, p
, size
);
588 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
595 o
->object
.type
= type
;
596 o
->object
.size
= htole64(size
);
598 f
->header
->tail_object_offset
= htole64(p
);
599 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
607 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
614 /* We estimate that we need 1 hash table entry per 768 bytes
615 of journal file and we want to make sure we never get
616 beyond 75% fill level. Calculate the hash table size for
617 the maximum file size based on these metrics. */
619 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
620 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
621 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
623 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
625 r
= journal_file_append_object(f
,
626 OBJECT_DATA_HASH_TABLE
,
627 offsetof(Object
, hash_table
.items
) + s
,
632 memzero(o
->hash_table
.items
, s
);
634 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
635 f
->header
->data_hash_table_size
= htole64(s
);
640 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
647 /* We use a fixed size hash table for the fields as this
648 * number should grow very slowly only */
650 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
651 r
= journal_file_append_object(f
,
652 OBJECT_FIELD_HASH_TABLE
,
653 offsetof(Object
, hash_table
.items
) + s
,
658 memzero(o
->hash_table
.items
, s
);
660 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
661 f
->header
->field_hash_table_size
= htole64(s
);
666 int journal_file_map_data_hash_table(JournalFile
*f
) {
673 if (f
->data_hash_table
)
676 p
= le64toh(f
->header
->data_hash_table_offset
);
677 s
= le64toh(f
->header
->data_hash_table_size
);
679 r
= journal_file_move_to(f
,
680 OBJECT_DATA_HASH_TABLE
,
687 f
->data_hash_table
= t
;
691 int journal_file_map_field_hash_table(JournalFile
*f
) {
698 if (f
->field_hash_table
)
701 p
= le64toh(f
->header
->field_hash_table_offset
);
702 s
= le64toh(f
->header
->field_hash_table_size
);
704 r
= journal_file_move_to(f
,
705 OBJECT_FIELD_HASH_TABLE
,
712 f
->field_hash_table
= t
;
716 static int journal_file_link_field(
729 if (o
->object
.type
!= OBJECT_FIELD
)
732 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
736 /* This might alter the window we are looking at */
737 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
740 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
742 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
744 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
748 o
->field
.next_hash_offset
= htole64(offset
);
751 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
753 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
754 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
759 static int journal_file_link_data(
772 if (o
->object
.type
!= OBJECT_DATA
)
775 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
779 /* This might alter the window we are looking at */
780 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
781 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
782 o
->data
.n_entries
= 0;
785 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
787 /* Only entry in the hash table is easy */
788 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
/* Move back to the previous data object, to patch in
 * the pointer to the new one */
793 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
797 o
->data
.next_hash_offset
= htole64(offset
);
800 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
802 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
803 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
808 int journal_file_find_field_object_with_hash(
810 const void *field
, uint64_t size
, uint64_t hash
,
811 Object
**ret
, uint64_t *offset
) {
813 uint64_t p
, osize
, h
, m
;
817 assert(field
&& size
> 0);
819 /* If the field hash table is empty, we can't find anything */
820 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
823 /* Map the field hash table, if it isn't mapped yet. */
824 r
= journal_file_map_field_hash_table(f
);
828 osize
= offsetof(Object
, field
.payload
) + size
;
830 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
835 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
840 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
844 if (le64toh(o
->field
.hash
) == hash
&&
845 le64toh(o
->object
.size
) == osize
&&
846 memcmp(o
->field
.payload
, field
, size
) == 0) {
856 p
= le64toh(o
->field
.next_hash_offset
);
862 int journal_file_find_field_object(
864 const void *field
, uint64_t size
,
865 Object
**ret
, uint64_t *offset
) {
870 assert(field
&& size
> 0);
872 hash
= hash64(field
, size
);
874 return journal_file_find_field_object_with_hash(f
,
879 int journal_file_find_data_object_with_hash(
881 const void *data
, uint64_t size
, uint64_t hash
,
882 Object
**ret
, uint64_t *offset
) {
884 uint64_t p
, osize
, h
, m
;
888 assert(data
|| size
== 0);
890 /* If there's no data hash table, then there's no entry. */
891 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
894 /* Map the data hash table, if it isn't mapped yet. */
895 r
= journal_file_map_data_hash_table(f
);
899 osize
= offsetof(Object
, data
.payload
) + size
;
901 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
906 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
911 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
915 if (le64toh(o
->data
.hash
) != hash
)
918 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
919 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
923 l
= le64toh(o
->object
.size
);
924 if (l
<= offsetof(Object
, data
.payload
))
927 l
-= offsetof(Object
, data
.payload
);
929 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
930 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
935 memcmp(f
->compress_buffer
, data
, size
) == 0) {
946 return -EPROTONOSUPPORT
;
948 } else if (le64toh(o
->object
.size
) == osize
&&
949 memcmp(o
->data
.payload
, data
, size
) == 0) {
961 p
= le64toh(o
->data
.next_hash_offset
);
967 int journal_file_find_data_object(
969 const void *data
, uint64_t size
,
970 Object
**ret
, uint64_t *offset
) {
975 assert(data
|| size
== 0);
977 hash
= hash64(data
, size
);
979 return journal_file_find_data_object_with_hash(f
,
984 static int journal_file_append_field(
986 const void *field
, uint64_t size
,
987 Object
**ret
, uint64_t *offset
) {
995 assert(field
&& size
> 0);
997 hash
= hash64(field
, size
);
999 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1013 osize
= offsetof(Object
, field
.payload
) + size
;
1014 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1018 o
->field
.hash
= htole64(hash
);
1019 memcpy(o
->field
.payload
, field
, size
);
1021 r
= journal_file_link_field(f
, o
, p
, hash
);
1025 /* The linking might have altered the window, so let's
1026 * refresh our pointer */
1027 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1032 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1046 static int journal_file_append_data(
1048 const void *data
, uint64_t size
,
1049 Object
**ret
, uint64_t *offset
) {
1054 int r
, compression
= 0;
1058 assert(data
|| size
== 0);
1060 hash
= hash64(data
, size
);
1062 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1076 osize
= offsetof(Object
, data
.payload
) + size
;
1077 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1081 o
->data
.hash
= htole64(hash
);
1083 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1084 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1087 compression
= compress_blob(data
, size
, o
->data
.payload
, &rsize
);
1089 if (compression
>= 0) {
1090 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1091 o
->object
.flags
|= compression
;
1093 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1094 size
, rsize
, object_compressed_to_string(compression
));
1096 /* Compression didn't work, we don't really care why, let's continue without compression */
1101 if (compression
== 0 && size
> 0)
1102 memcpy(o
->data
.payload
, data
, size
);
1104 r
= journal_file_link_data(f
, o
, p
, hash
);
1108 /* The linking might have altered the window, so let's
1109 * refresh our pointer */
1110 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1117 eq
= memchr(data
, '=', size
);
1118 if (eq
&& eq
> data
) {
1122 /* Create field object ... */
1123 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1127 /* ... and link it in. */
1128 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1129 fo
->field
.head_data_offset
= le64toh(p
);
1133 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1147 uint64_t journal_file_entry_n_items(Object
*o
) {
1150 if (o
->object
.type
!= OBJECT_ENTRY
)
1153 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1156 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1159 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1162 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1165 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1168 if (o
->object
.type
!= OBJECT_DATA_HASH_TABLE
&&
1169 o
->object
.type
!= OBJECT_FIELD_HASH_TABLE
)
1172 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1175 static int link_entry_into_array(JournalFile
*f
,
1180 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1188 a
= le64toh(*first
);
1189 i
= hidx
= le64toh(*idx
);
1192 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1196 n
= journal_file_entry_array_n_items(o
);
1198 o
->entry_array
.items
[i
] = htole64(p
);
1199 *idx
= htole64(hidx
+ 1);
1205 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1216 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1217 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1223 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1228 o
->entry_array
.items
[i
] = htole64(p
);
1231 *first
= htole64(q
);
1233 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1237 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1240 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1241 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1243 *idx
= htole64(hidx
+ 1);
1248 static int link_entry_into_array_plus_one(JournalFile
*f
,
1263 *extra
= htole64(p
);
1267 i
= htole64(le64toh(*idx
) - 1);
1268 r
= link_entry_into_array(f
, first
, &i
, p
);
1273 *idx
= htole64(le64toh(*idx
) + 1);
1277 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1284 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1288 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1292 return link_entry_into_array_plus_one(f
,
1293 &o
->data
.entry_offset
,
1294 &o
->data
.entry_array_offset
,
1299 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1307 if (o
->object
.type
!= OBJECT_ENTRY
)
1310 __sync_synchronize();
1312 /* Link up the entry itself */
1313 r
= link_entry_into_array(f
,
1314 &f
->header
->entry_array_offset
,
1315 &f
->header
->n_entries
,
1320 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1322 if (f
->header
->head_entry_realtime
== 0)
1323 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1325 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1326 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1328 f
->tail_entry_monotonic_valid
= true;
1330 /* Link up the items */
1331 n
= journal_file_entry_n_items(o
);
1332 for (i
= 0; i
< n
; i
++) {
1333 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1341 static int journal_file_append_entry_internal(
1343 const dual_timestamp
*ts
,
1345 const EntryItem items
[], unsigned n_items
,
1347 Object
**ret
, uint64_t *offset
) {
1354 assert(items
|| n_items
== 0);
1357 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1359 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1363 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1364 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1365 o
->entry
.realtime
= htole64(ts
->realtime
);
1366 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1367 o
->entry
.xor_hash
= htole64(xor_hash
);
1368 o
->entry
.boot_id
= f
->header
->boot_id
;
1371 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1376 r
= journal_file_link_entry(f
, o
, np
);
1389 void journal_file_post_change(JournalFile
*f
) {
1392 /* inotify() does not receive IN_MODIFY events from file
1393 * accesses done via mmap(). After each access we hence
1394 * trigger IN_MODIFY by truncating the journal file to its
1395 * current size which triggers IN_MODIFY. */
1397 __sync_synchronize();
1399 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1400 log_error_errno(errno
, "Failed to truncate file to its own size: %m");
1403 static int entry_item_cmp(const void *_a
, const void *_b
) {
1404 const EntryItem
*a
= _a
, *b
= _b
;
1406 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1408 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1413 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1417 uint64_t xor_hash
= 0;
1418 struct dual_timestamp _ts
;
1421 assert(iovec
|| n_iovec
== 0);
1424 dual_timestamp_get(&_ts
);
1428 if (f
->tail_entry_monotonic_valid
&&
1429 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
1433 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1438 /* alloca() can't take 0, hence let's allocate at least one */
1439 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1441 for (i
= 0; i
< n_iovec
; i
++) {
1445 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1449 xor_hash
^= le64toh(o
->data
.hash
);
1450 items
[i
].object_offset
= htole64(p
);
1451 items
[i
].hash
= o
->data
.hash
;
1454 /* Order by the position on disk, in order to improve seek
1455 * times for rotating media. */
1456 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
1458 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
/* If the memory mapping triggered a SIGBUS then we return an
 * IO error and ignore the error code passed down to us, since
 * it is very likely just an effect of a nullified replacement
 * mapping page */
1465 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
1468 journal_file_post_change(f
);
/* One entry of the per-file entry array chain cache. Entries are stored in
 * an ordered hashmap keyed by 'first' and let later lookups in the same chain
 * resume from the cached array instead of walking from the chain's start. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1481 static void chain_cache_put(
1488 uint64_t last_index
) {
1491 /* If the chain item to cache for this chain is the
1492 * first one it's not worth caching anything */
1496 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
1497 ci
= ordered_hashmap_steal_first(h
);
1500 ci
= new(ChainCacheItem
, 1);
1507 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
1512 assert(ci
->first
== first
);
1517 ci
->last_index
= last_index
;
1520 static int generic_array_get(
1524 Object
**ret
, uint64_t *offset
) {
1527 uint64_t p
= 0, a
, t
= 0;
1535 /* Try the chain cache first */
1536 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1537 if (ci
&& i
> ci
->total
) {
1546 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1550 k
= journal_file_entry_array_n_items(o
);
1552 p
= le64toh(o
->entry_array
.items
[i
]);
1558 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1564 /* Let's cache this item for the next invocation */
1565 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
1567 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1580 static int generic_array_get_plus_one(
1585 Object
**ret
, uint64_t *offset
) {
1594 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1607 return generic_array_get(f
, first
, i
-1, ret
, offset
);
1616 static int generic_array_bisect(
1621 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1622 direction_t direction
,
1627 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
1628 bool subtract_one
= false;
1629 Object
*o
, *array
= NULL
;
1634 assert(test_object
);
1636 /* Start with the first array in the chain */
1639 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1640 if (ci
&& n
> ci
->total
) {
/* Ah, we have iterated this bisection array chain
 * previously! Let's see if we can skip ahead in the
 * chain, as far as the last time. But we can't jump
 * backwards in the chain, so let's check that
 * first */
1647 r
= test_object(f
, ci
->begin
, needle
);
1651 if (r
== TEST_LEFT
) {
/* OK, what we are looking for is right of the
 * begin of this EntryArray, so let's jump
 * straight to previously cached array in the
 * chain */
1660 last_index
= ci
->last_index
;
1665 uint64_t left
, right
, k
, lp
;
1667 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1671 k
= journal_file_entry_array_n_items(array
);
1677 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1681 r
= test_object(f
, p
, needle
);
1685 if (r
== TEST_FOUND
)
1686 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1688 if (r
== TEST_RIGHT
) {
1692 if (last_index
!= (uint64_t) -1) {
1693 assert(last_index
<= right
);
1695 /* If we cached the last index we
1696 * looked at, let's try to not to jump
1697 * too wildly around and see if we can
1698 * limit the range to look at early to
1699 * the immediate neighbors of the last
1700 * index we looked at. */
1702 if (last_index
> 0) {
1703 uint64_t x
= last_index
- 1;
1705 p
= le64toh(array
->entry_array
.items
[x
]);
1709 r
= test_object(f
, p
, needle
);
1713 if (r
== TEST_FOUND
)
1714 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1716 if (r
== TEST_RIGHT
)
1722 if (last_index
< right
) {
1723 uint64_t y
= last_index
+ 1;
1725 p
= le64toh(array
->entry_array
.items
[y
]);
1729 r
= test_object(f
, p
, needle
);
1733 if (r
== TEST_FOUND
)
1734 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1736 if (r
== TEST_RIGHT
)
1744 if (left
== right
) {
1745 if (direction
== DIRECTION_UP
)
1746 subtract_one
= true;
1752 assert(left
< right
);
1753 i
= (left
+ right
) / 2;
1755 p
= le64toh(array
->entry_array
.items
[i
]);
1759 r
= test_object(f
, p
, needle
);
1763 if (r
== TEST_FOUND
)
1764 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1766 if (r
== TEST_RIGHT
)
1774 if (direction
== DIRECTION_UP
) {
1776 subtract_one
= true;
1787 last_index
= (uint64_t) -1;
1788 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1794 if (subtract_one
&& t
== 0 && i
== 0)
1797 /* Let's cache this item for the next invocation */
1798 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
1800 if (subtract_one
&& i
== 0)
1802 else if (subtract_one
)
1803 p
= le64toh(array
->entry_array
.items
[i
-1]);
1805 p
= le64toh(array
->entry_array
.items
[i
]);
1807 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1818 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
1823 static int generic_array_bisect_plus_one(
1829 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1830 direction_t direction
,
1836 bool step_back
= false;
1840 assert(test_object
);
/* This bisects the array in object 'first', but first checks
 * the separate 'extra' entry */
1847 r
= test_object(f
, extra
, needle
);
1851 if (r
== TEST_FOUND
)
1852 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
/* If we are looking with DIRECTION_UP then we need to first
   see if in the actual array there is a matching entry, and
   return the last one of that. But if there isn't any we need
   to return this one. Hence remember this, and return it
   below. */
1860 step_back
= direction
== DIRECTION_UP
;
1862 if (r
== TEST_RIGHT
) {
1863 if (direction
== DIRECTION_DOWN
)
1869 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1871 if (r
== 0 && step_back
)
1880 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1896 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1902 else if (p
< needle
)
1908 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1915 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1919 if (le64toh(o
->entry
.seqnum
) == needle
)
1921 else if (le64toh(o
->entry
.seqnum
) < needle
)
1927 int journal_file_move_to_entry_by_seqnum(
1930 direction_t direction
,
1934 return generic_array_bisect(f
,
1935 le64toh(f
->header
->entry_array_offset
),
1936 le64toh(f
->header
->n_entries
),
1943 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1950 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1954 if (le64toh(o
->entry
.realtime
) == needle
)
1956 else if (le64toh(o
->entry
.realtime
) < needle
)
1962 int journal_file_move_to_entry_by_realtime(
1965 direction_t direction
,
1969 return generic_array_bisect(f
,
1970 le64toh(f
->header
->entry_array_offset
),
1971 le64toh(f
->header
->n_entries
),
1973 test_object_realtime
,
1978 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1985 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1989 if (le64toh(o
->entry
.monotonic
) == needle
)
1991 else if (le64toh(o
->entry
.monotonic
) < needle
)
1997 static int find_data_object_by_boot_id(
2003 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2005 sd_id128_to_string(boot_id
, t
+ 9);
2006 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2009 int journal_file_move_to_entry_by_monotonic(
2013 direction_t direction
,
2022 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2028 return generic_array_bisect_plus_one(f
,
2029 le64toh(o
->data
.entry_offset
),
2030 le64toh(o
->data
.entry_array_offset
),
2031 le64toh(o
->data
.n_entries
),
2033 test_object_monotonic
,
2038 void journal_file_reset_location(JournalFile
*f
) {
2039 f
->location_type
= LOCATION_HEAD
;
2040 f
->current_offset
= 0;
2041 f
->current_seqnum
= 0;
2042 f
->current_realtime
= 0;
2043 f
->current_monotonic
= 0;
2044 zero(f
->current_boot_id
);
2045 f
->current_xor_hash
= 0;
2048 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2049 f
->location_type
= LOCATION_SEEK
;
2050 f
->current_offset
= offset
;
2051 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2052 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2053 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2054 f
->current_boot_id
= o
->entry
.boot_id
;
2055 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2058 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2061 assert(af
->location_type
== LOCATION_SEEK
);
2062 assert(bf
->location_type
== LOCATION_SEEK
);
2064 /* If contents and timestamps match, these entries are
2065 * identical, even if the seqnum does not match */
2066 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2067 af
->current_monotonic
== bf
->current_monotonic
&&
2068 af
->current_realtime
== bf
->current_realtime
&&
2069 af
->current_xor_hash
== bf
->current_xor_hash
)
2072 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2074 /* If this is from the same seqnum source, compare
2076 if (af
->current_seqnum
< bf
->current_seqnum
)
2078 if (af
->current_seqnum
> bf
->current_seqnum
)
2081 /* Wow! This is weird, different data but the same
2082 * seqnums? Something is borked, but let's make the
2083 * best of it and compare by time. */
2086 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2088 /* If the boot id matches, compare monotonic time */
2089 if (af
->current_monotonic
< bf
->current_monotonic
)
2091 if (af
->current_monotonic
> bf
->current_monotonic
)
2095 /* Otherwise, compare UTC time */
2096 if (af
->current_realtime
< bf
->current_realtime
)
2098 if (af
->current_realtime
> bf
->current_realtime
)
2101 /* Finally, compare by contents */
2102 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2104 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2110 int journal_file_next_entry(
2113 direction_t direction
,
2114 Object
**ret
, uint64_t *offset
) {
2121 n
= le64toh(f
->header
->n_entries
);
2126 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2128 r
= generic_array_bisect(f
,
2129 le64toh(f
->header
->entry_array_offset
),
2130 le64toh(f
->header
->n_entries
),
2139 if (direction
== DIRECTION_DOWN
) {
2152 /* And jump to it */
2153 r
= generic_array_get(f
,
2154 le64toh(f
->header
->entry_array_offset
),
2161 (direction
== DIRECTION_DOWN
? ofs
<= p
: ofs
>= p
)) {
2162 log_debug("%s: entry array corrupted at entry %"PRIu64
,
2173 int journal_file_next_entry_for_data(
2175 Object
*o
, uint64_t p
,
2176 uint64_t data_offset
,
2177 direction_t direction
,
2178 Object
**ret
, uint64_t *offset
) {
2185 assert(p
> 0 || !o
);
2187 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2191 n
= le64toh(d
->data
.n_entries
);
2196 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2198 if (o
->object
.type
!= OBJECT_ENTRY
)
2201 r
= generic_array_bisect_plus_one(f
,
2202 le64toh(d
->data
.entry_offset
),
2203 le64toh(d
->data
.entry_array_offset
),
2204 le64toh(d
->data
.n_entries
),
2214 if (direction
== DIRECTION_DOWN
) {
2228 return generic_array_get_plus_one(f
,
2229 le64toh(d
->data
.entry_offset
),
2230 le64toh(d
->data
.entry_array_offset
),
2235 int journal_file_move_to_entry_by_offset_for_data(
2237 uint64_t data_offset
,
2239 direction_t direction
,
2240 Object
**ret
, uint64_t *offset
) {
2247 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2251 return generic_array_bisect_plus_one(f
,
2252 le64toh(d
->data
.entry_offset
),
2253 le64toh(d
->data
.entry_array_offset
),
2254 le64toh(d
->data
.n_entries
),
2261 int journal_file_move_to_entry_by_monotonic_for_data(
2263 uint64_t data_offset
,
2266 direction_t direction
,
2267 Object
**ret
, uint64_t *offset
) {
2275 /* First, seek by time */
2276 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2282 r
= generic_array_bisect_plus_one(f
,
2283 le64toh(o
->data
.entry_offset
),
2284 le64toh(o
->data
.entry_array_offset
),
2285 le64toh(o
->data
.n_entries
),
2287 test_object_monotonic
,
2293 /* And now, continue seeking until we find an entry that
2294 * exists in both bisection arrays */
2300 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2304 r
= generic_array_bisect_plus_one(f
,
2305 le64toh(d
->data
.entry_offset
),
2306 le64toh(d
->data
.entry_array_offset
),
2307 le64toh(d
->data
.n_entries
),
2315 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2319 r
= generic_array_bisect_plus_one(f
,
2320 le64toh(o
->data
.entry_offset
),
2321 le64toh(o
->data
.entry_array_offset
),
2322 le64toh(o
->data
.n_entries
),
2344 int journal_file_move_to_entry_by_seqnum_for_data(
2346 uint64_t data_offset
,
2348 direction_t direction
,
2349 Object
**ret
, uint64_t *offset
) {
2356 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2360 return generic_array_bisect_plus_one(f
,
2361 le64toh(d
->data
.entry_offset
),
2362 le64toh(d
->data
.entry_array_offset
),
2363 le64toh(d
->data
.n_entries
),
2370 int journal_file_move_to_entry_by_realtime_for_data(
2372 uint64_t data_offset
,
2374 direction_t direction
,
2375 Object
**ret
, uint64_t *offset
) {
2382 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2386 return generic_array_bisect_plus_one(f
,
2387 le64toh(d
->data
.entry_offset
),
2388 le64toh(d
->data
.entry_array_offset
),
2389 le64toh(d
->data
.n_entries
),
2391 test_object_realtime
,
2396 void journal_file_dump(JournalFile
*f
) {
2403 journal_file_print_header(f
);
2405 p
= le64toh(f
->header
->header_size
);
2407 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
2411 switch (o
->object
.type
) {
2414 printf("Type: OBJECT_UNUSED\n");
2418 printf("Type: OBJECT_DATA\n");
2422 printf("Type: OBJECT_FIELD\n");
2426 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
2427 le64toh(o
->entry
.seqnum
),
2428 le64toh(o
->entry
.monotonic
),
2429 le64toh(o
->entry
.realtime
));
2432 case OBJECT_FIELD_HASH_TABLE
:
2433 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2436 case OBJECT_DATA_HASH_TABLE
:
2437 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2440 case OBJECT_ENTRY_ARRAY
:
2441 printf("Type: OBJECT_ENTRY_ARRAY\n");
2445 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
2446 le64toh(o
->tag
.seqnum
),
2447 le64toh(o
->tag
.epoch
));
2451 printf("Type: unknown (%i)\n", o
->object
.type
);
2455 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
2456 printf("Flags: %s\n",
2457 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
2459 if (p
== le64toh(f
->header
->tail_object_offset
))
2462 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
2467 log_error("File corrupt");
2470 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
2473 x
= format_timestamp(buf
, l
, t
);
2479 void journal_file_print_header(JournalFile
*f
) {
2480 char a
[33], b
[33], c
[33], d
[33];
2481 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
2483 char bytes
[FORMAT_BYTES_MAX
];
2487 printf("File Path: %s\n"
2491 "Sequential Number ID: %s\n"
2493 "Compatible Flags:%s%s\n"
2494 "Incompatible Flags:%s%s%s\n"
2495 "Header size: %"PRIu64
"\n"
2496 "Arena size: %"PRIu64
"\n"
2497 "Data Hash Table Size: %"PRIu64
"\n"
2498 "Field Hash Table Size: %"PRIu64
"\n"
2499 "Rotate Suggested: %s\n"
2500 "Head Sequential Number: %"PRIu64
"\n"
2501 "Tail Sequential Number: %"PRIu64
"\n"
2502 "Head Realtime Timestamp: %s\n"
2503 "Tail Realtime Timestamp: %s\n"
2504 "Tail Monotonic Timestamp: %s\n"
2505 "Objects: %"PRIu64
"\n"
2506 "Entry Objects: %"PRIu64
"\n",
2508 sd_id128_to_string(f
->header
->file_id
, a
),
2509 sd_id128_to_string(f
->header
->machine_id
, b
),
2510 sd_id128_to_string(f
->header
->boot_id
, c
),
2511 sd_id128_to_string(f
->header
->seqnum_id
, d
),
2512 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
2513 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
2514 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
2515 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
2516 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
2517 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
2518 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
2519 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
2520 le64toh(f
->header
->header_size
),
2521 le64toh(f
->header
->arena_size
),
2522 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
2523 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
2524 yes_no(journal_file_rotate_suggested(f
, 0)),
2525 le64toh(f
->header
->head_entry_seqnum
),
2526 le64toh(f
->header
->tail_entry_seqnum
),
2527 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)),
2528 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)),
2529 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
),
2530 le64toh(f
->header
->n_objects
),
2531 le64toh(f
->header
->n_entries
));
2533 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
2534 printf("Data Objects: %"PRIu64
"\n"
2535 "Data Hash Table Fill: %.1f%%\n",
2536 le64toh(f
->header
->n_data
),
2537 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
2539 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
2540 printf("Field Objects: %"PRIu64
"\n"
2541 "Field Hash Table Fill: %.1f%%\n",
2542 le64toh(f
->header
->n_fields
),
2543 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
2545 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
2546 printf("Tag Objects: %"PRIu64
"\n",
2547 le64toh(f
->header
->n_tags
));
2548 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
2549 printf("Entry Array Objects: %"PRIu64
"\n",
2550 le64toh(f
->header
->n_entry_arrays
));
2552 if (fstat(f
->fd
, &st
) >= 0)
2553 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
2556 static int journal_file_warn_btrfs(JournalFile
*f
) {
2562 /* Before we write anything, check if the COW logic is turned
2563 * off on btrfs. Given our write pattern that is quite
2564 * unfriendly to COW file systems this should greatly improve
2565 * performance on COW file systems, such as btrfs, at the
2566 * expense of data integrity features (which shouldn't be too
2567 * bad, given that we do our own checksumming). */
2569 r
= btrfs_is_filesystem(f
->fd
);
2571 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
2575 r
= read_attr_fd(f
->fd
, &attrs
);
2577 return log_warning_errno(r
, "Failed to read file attributes: %m");
2579 if (attrs
& FS_NOCOW_FL
) {
2580 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2584 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2585 "This is likely to slow down journal access substantially, please consider turning "
2586 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
2591 int journal_file_open(
2597 JournalMetrics
*metrics
,
2598 MMapCache
*mmap_cache
,
2599 JournalFile
*template,
2600 JournalFile
**ret
) {
2602 bool newly_created
= false;
2610 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
2611 (flags
& O_ACCMODE
) != O_RDWR
)
2614 if (!endswith(fname
, ".journal") &&
2615 !endswith(fname
, ".journal~"))
2618 f
= new0(JournalFile
, 1);
2626 f
->prot
= prot_from_flags(flags
);
2627 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
2628 #if defined(HAVE_LZ4)
2629 f
->compress_lz4
= compress
;
2630 #elif defined(HAVE_XZ)
2631 f
->compress_xz
= compress
;
2638 f
->mmap
= mmap_cache_ref(mmap_cache
);
2640 f
->mmap
= mmap_cache_new();
2647 f
->path
= strdup(fname
);
2653 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
2654 if (!f
->chain_cache
) {
2659 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
2665 r
= journal_file_fstat(f
);
2669 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
2671 (void) journal_file_warn_btrfs(f
);
2673 /* Let's attach the creation time to the journal file,
2674 * so that the vacuuming code knows the age of this
2675 * file even if the file might end up corrupted one
2676 * day... Ideally we'd just use the creation time many
2677 * file systems maintain for each file, but there is
2678 * currently no usable API to query this, hence let's
2679 * emulate this via extended attributes. If extended
2680 * attributes are not supported we'll just skip this,
2681 * and rely solely on mtime/atime/ctime of the file. */
2683 fd_setcrtime(f
->fd
, 0);
2686 /* Try to load the FSPRG state, and if we can't, then
2687 * just don't do sealing */
2689 r
= journal_file_fss_load(f
);
2695 r
= journal_file_init_header(f
, template);
2699 r
= journal_file_fstat(f
);
2703 newly_created
= true;
2706 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
2711 r
= mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
);
2717 if (!newly_created
) {
2718 r
= journal_file_verify_header(f
);
2724 if (!newly_created
&& f
->writable
) {
2725 r
= journal_file_fss_load(f
);
2733 journal_default_metrics(metrics
, f
->fd
);
2734 f
->metrics
= *metrics
;
2735 } else if (template)
2736 f
->metrics
= template->metrics
;
2738 r
= journal_file_refresh_header(f
);
2744 r
= journal_file_hmac_setup(f
);
2749 if (newly_created
) {
2750 r
= journal_file_setup_field_hash_table(f
);
2754 r
= journal_file_setup_data_hash_table(f
);
2759 r
= journal_file_append_first_tag(f
);
2765 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
)) {
2774 if (f
->fd
>= 0 && mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
2777 journal_file_close(f
);
2782 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
) {
2783 _cleanup_free_
char *p
= NULL
;
2785 JournalFile
*old_file
, *new_file
= NULL
;
2793 if (!old_file
->writable
)
2796 if (!endswith(old_file
->path
, ".journal"))
2799 l
= strlen(old_file
->path
);
2800 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
2801 (int) l
- 8, old_file
->path
,
2802 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
2803 le64toh((*f
)->header
->head_entry_seqnum
),
2804 le64toh((*f
)->header
->head_entry_realtime
));
2808 /* Try to rename the file to the archived version. If the file
2809 * already was deleted, we'll get ENOENT, let's ignore that
2811 r
= rename(old_file
->path
, p
);
2812 if (r
< 0 && errno
!= ENOENT
)
2815 old_file
->header
->state
= STATE_ARCHIVED
;
2817 /* Currently, btrfs is not very good with our write patterns
2818 * and fragments heavily. Let's defrag our journal files when
2819 * we archive them */
2820 old_file
->defrag_on_close
= true;
2822 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, old_file
, &new_file
);
2823 journal_file_close(old_file
);
2829 int journal_file_open_reliably(
2835 JournalMetrics
*metrics
,
2836 MMapCache
*mmap_cache
,
2837 JournalFile
*template,
2838 JournalFile
**ret
) {
2842 _cleanup_free_
char *p
= NULL
;
2844 r
= journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2846 -EBADMSG
, /* corrupted */
2847 -ENODATA
, /* truncated */
2848 -EHOSTDOWN
, /* other machine */
2849 -EPROTONOSUPPORT
, /* incompatible feature */
2850 -EBUSY
, /* unclean shutdown */
2851 -ESHUTDOWN
, /* already archived */
2852 -EIO
, /* IO error, including SIGBUS on mmap */
2853 -EIDRM
/* File has been deleted */))
2856 if ((flags
& O_ACCMODE
) == O_RDONLY
)
2859 if (!(flags
& O_CREAT
))
2862 if (!endswith(fname
, ".journal"))
2865 /* The file is corrupted. Rotate it away and try it again (but only once) */
2868 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
2870 now(CLOCK_REALTIME
),
2874 if (rename(fname
, p
) < 0)
2877 /* btrfs doesn't cope well with our write pattern and
2878 * fragments heavily. Let's defrag all files we rotate */
2880 (void) chattr_path(p
, false, FS_NOCOW_FL
);
2881 (void) btrfs_defrag(p
);
2883 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
2885 return journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2888 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
2890 uint64_t q
, xor_hash
= 0;
2903 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
2904 ts
.realtime
= le64toh(o
->entry
.realtime
);
2906 n
= journal_file_entry_n_items(o
);
2907 /* alloca() can't take 0, hence let's allocate at least one */
2908 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
2910 for (i
= 0; i
< n
; i
++) {
2917 q
= le64toh(o
->entry
.items
[i
].object_offset
);
2918 le_hash
= o
->entry
.items
[i
].hash
;
2920 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
2924 if (le_hash
!= o
->data
.hash
)
2927 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
2930 /* We hit the limit on 32bit machines */
2931 if ((uint64_t) t
!= l
)
2934 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
2935 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2938 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
2939 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
2943 data
= from
->compress_buffer
;
2946 return -EPROTONOSUPPORT
;
2949 data
= o
->data
.payload
;
2951 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
2955 xor_hash
^= le64toh(u
->data
.hash
);
2956 items
[i
].object_offset
= htole64(h
);
2957 items
[i
].hash
= u
->data
.hash
;
2959 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
2964 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
2966 if (mmap_cache_got_sigbus(to
->mmap
, to
->fd
))
2972 void journal_reset_metrics(JournalMetrics
*m
) {
2975 /* Set everything to "pick automatic values". */
2977 *m
= (JournalMetrics
) {
2978 .min_use
= (uint64_t) -1,
2979 .max_use
= (uint64_t) -1,
2980 .min_size
= (uint64_t) -1,
2981 .max_size
= (uint64_t) -1,
2982 .keep_free
= (uint64_t) -1,
2983 .n_max_files
= (uint64_t) -1,
2987 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
2988 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
2995 if (fstatvfs(fd
, &ss
) >= 0)
2996 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
2998 log_debug_errno(errno
, "Failed to detremine disk size: %m");
3002 if (m
->max_use
== (uint64_t) -1) {
3005 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3007 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3008 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3010 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3011 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3013 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3015 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3017 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3018 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3021 if (m
->min_use
== (uint64_t) -1)
3022 m
->min_use
= DEFAULT_MIN_USE
;
3024 if (m
->min_use
> m
->max_use
)
3025 m
->min_use
= m
->max_use
;
3027 if (m
->max_size
== (uint64_t) -1) {
3028 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3030 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3031 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3033 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3035 if (m
->max_size
!= 0) {
3036 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3037 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3039 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3040 m
->max_use
= m
->max_size
*2;
3043 if (m
->min_size
== (uint64_t) -1)
3044 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3046 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3048 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3049 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3051 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3052 m
->max_size
= m
->min_size
;
3055 if (m
->keep_free
== (uint64_t) -1) {
3058 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3060 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3061 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3064 m
->keep_free
= DEFAULT_KEEP_FREE
;
3067 if (m
->n_max_files
== (uint64_t) -1)
3068 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3070 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3071 format_bytes(a
, sizeof(a
), m
->min_use
),
3072 format_bytes(b
, sizeof(b
), m
->max_use
),
3073 format_bytes(c
, sizeof(c
), m
->max_size
),
3074 format_bytes(d
, sizeof(d
), m
->min_size
),
3075 format_bytes(e
, sizeof(e
), m
->keep_free
),
3079 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3084 if (f
->header
->head_entry_realtime
== 0)
3087 *from
= le64toh(f
->header
->head_entry_realtime
);
3091 if (f
->header
->tail_entry_realtime
== 0)
3094 *to
= le64toh(f
->header
->tail_entry_realtime
);
3100 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3108 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3112 if (le64toh(o
->data
.n_entries
) <= 0)
3116 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3120 *from
= le64toh(o
->entry
.monotonic
);
3124 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3128 r
= generic_array_get_plus_one(f
,
3129 le64toh(o
->data
.entry_offset
),
3130 le64toh(o
->data
.entry_array_offset
),
3131 le64toh(o
->data
.n_entries
)-1,
3136 *to
= le64toh(o
->entry
.monotonic
);
3142 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3145 /* If we gained new header fields we gained new features,
3146 * hence suggest a rotation */
3147 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3148 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3152 /* Let's check if the hash tables grew over a certain fill
3153 * level (75%, borrowing this value from Java's hash table
3154 * implementation), and if so suggest a rotation. To calculate
3155 * the fill level we need the n_data field, which only exists
3156 * in newer versions. */
3158 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3159 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3160 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3162 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3163 le64toh(f
->header
->n_data
),
3164 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3165 (unsigned long long) f
->last_stat
.st_size
,
3166 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3170 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3171 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3172 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3174 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3175 le64toh(f
->header
->n_fields
),
3176 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3180 /* Are the data objects properly indexed by field objects? */
3181 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3182 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3183 le64toh(f
->header
->n_data
) > 0 &&
3184 le64toh(f
->header
->n_fields
) == 0)
3187 if (max_file_usec
> 0) {
3190 h
= le64toh(f
->header
->head_entry_realtime
);
3191 t
= now(CLOCK_REALTIME
);
3193 if (h
> 0 && t
> h
+ max_file_usec
)