1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "btrfs-util.h"
34 #include "journal-authenticate.h"
35 #include "journal-def.h"
36 #include "journal-file.h"
38 #include "parse-util.h"
39 #include "random-util.h"
40 #include "string-util.h"
42 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
43 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
45 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
47 /* This is the minimum journal file size */
48 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
50 /* These are the lower and upper bounds if we deduce the max_use value
51 * from the file system size */
52 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
53 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
55 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
56 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
58 /* This is the upper bound if we deduce max_size from max_use */
59 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
61 /* This is the upper bound if we deduce the keep_free value from the file system size */
63 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
65 /* This is the keep_free value when we can't determine the system size */
67 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
69 /* This is the default maximum number of journal files to keep around. */
70 #define DEFAULT_N_MAX_FILES (100)
72 /* n_data was the first entry we added after the initial file format design */
73 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
75 /* How many entries to keep in the entry array chain cache at max */
76 #define CHAIN_CACHE_MAX 20
78 /* How much to increase the journal file size at once each time we allocate something new. */
79 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
81 /* Reread fstat() of the file for detecting deletions at least this often */
82 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
84 /* The mmap context to use for the header we pick as one above the last defined typed */
85 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
87 static int journal_file_set_online(JournalFile
*f
) {
93 if (!(f
->fd
>= 0 && f
->header
))
96 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
99 switch(f
->header
->state
) {
104 f
->header
->state
= STATE_ONLINE
;
113 int journal_file_set_offline(JournalFile
*f
) {
119 if (!(f
->fd
>= 0 && f
->header
))
122 if (f
->header
->state
!= STATE_ONLINE
)
127 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
130 f
->header
->state
= STATE_OFFLINE
;
132 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
140 JournalFile
* journal_file_close(JournalFile
*f
) {
144 /* Write the final tag */
145 if (f
->seal
&& f
->writable
)
146 journal_file_append_tag(f
);
149 journal_file_set_offline(f
);
151 if (f
->mmap
&& f
->fd
>= 0)
152 mmap_cache_close_fd(f
->mmap
, f
->fd
);
154 if (f
->fd
>= 0 && f
->defrag_on_close
) {
156 /* Be friendly to btrfs: turn COW back on again now,
157 * and defragment the file. We won't write to the file
158 * ever again, hence remove all fragmentation, and
159 * reenable all the good bits COW usually provides
160 * (such as data checksumming). */
162 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
163 (void) btrfs_defrag_fd(f
->fd
);
170 mmap_cache_unref(f
->mmap
);
172 ordered_hashmap_free_free(f
->chain_cache
);
174 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
175 free(f
->compress_buffer
);
180 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
182 free(f
->fsprg_state
);
187 gcry_md_close(f
->hmac
);
194 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
201 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
202 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
204 h
.incompatible_flags
|= htole32(
205 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
206 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
208 h
.compatible_flags
= htole32(
209 f
->seal
* HEADER_COMPATIBLE_SEALED
);
211 r
= sd_id128_randomize(&h
.file_id
);
216 h
.seqnum_id
= template->header
->seqnum_id
;
217 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
219 h
.seqnum_id
= h
.file_id
;
221 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
231 static int journal_file_refresh_header(JournalFile
*f
) {
237 r
= sd_id128_get_machine(&f
->header
->machine_id
);
241 r
= sd_id128_get_boot(&boot_id
);
245 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
246 f
->tail_entry_monotonic_valid
= true;
248 f
->header
->boot_id
= boot_id
;
250 r
= journal_file_set_online(f
);
252 /* Sync the online state to disk */
258 static int journal_file_verify_header(JournalFile
*f
) {
263 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
266 /* In both read and write mode we refuse to open files with
267 * incompatible flags we don't know */
268 flags
= le32toh(f
->header
->incompatible_flags
);
269 if (flags
& ~HEADER_INCOMPATIBLE_SUPPORTED
) {
270 if (flags
& ~HEADER_INCOMPATIBLE_ANY
)
271 log_debug("Journal file %s has unknown incompatible flags %"PRIx32
,
272 f
->path
, flags
& ~HEADER_INCOMPATIBLE_ANY
);
273 flags
= (flags
& HEADER_INCOMPATIBLE_ANY
) & ~HEADER_INCOMPATIBLE_SUPPORTED
;
275 log_debug("Journal file %s uses incompatible flags %"PRIx32
276 " disabled at compilation time.", f
->path
, flags
);
277 return -EPROTONOSUPPORT
;
280 /* When open for writing we refuse to open files with
281 * compatible flags, too */
282 flags
= le32toh(f
->header
->compatible_flags
);
283 if (f
->writable
&& (flags
& ~HEADER_COMPATIBLE_SUPPORTED
)) {
284 if (flags
& ~HEADER_COMPATIBLE_ANY
)
285 log_debug("Journal file %s has unknown compatible flags %"PRIx32
,
286 f
->path
, flags
& ~HEADER_COMPATIBLE_ANY
);
287 flags
= (flags
& HEADER_COMPATIBLE_ANY
) & ~HEADER_COMPATIBLE_SUPPORTED
;
289 log_debug("Journal file %s uses compatible flags %"PRIx32
290 " disabled at compilation time.", f
->path
, flags
);
291 return -EPROTONOSUPPORT
;
294 if (f
->header
->state
>= _STATE_MAX
)
297 /* The first addition was n_data, so check that we are at least this large */
298 if (le64toh(f
->header
->header_size
) < HEADER_SIZE_MIN
)
301 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
304 if ((le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)) > (uint64_t) f
->last_stat
.st_size
)
307 if (le64toh(f
->header
->tail_object_offset
) > (le64toh(f
->header
->header_size
) + le64toh(f
->header
->arena_size
)))
310 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
311 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
312 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
313 !VALID64(le64toh(f
->header
->entry_array_offset
)))
318 sd_id128_t machine_id
;
321 r
= sd_id128_get_machine(&machine_id
);
325 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
328 state
= f
->header
->state
;
330 if (state
== STATE_ONLINE
) {
331 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
333 } else if (state
== STATE_ARCHIVED
)
335 else if (state
!= STATE_OFFLINE
) {
336 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
341 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
342 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
344 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
349 static int journal_file_fstat(JournalFile
*f
) {
353 if (fstat(f
->fd
, &f
->last_stat
) < 0)
356 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
358 /* Refuse appending to files that are already deleted */
359 if (f
->last_stat
.st_nlink
<= 0)
365 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
366 uint64_t old_size
, new_size
;
371 /* We assume that this file is not sparse, and we know that
372 * for sure, since we always call posix_fallocate()
375 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
379 le64toh(f
->header
->header_size
) +
380 le64toh(f
->header
->arena_size
);
382 new_size
= PAGE_ALIGN(offset
+ size
);
383 if (new_size
< le64toh(f
->header
->header_size
))
384 new_size
= le64toh(f
->header
->header_size
);
386 if (new_size
<= old_size
) {
388 /* We already pre-allocated enough space, but before
389 * we write to it, let's check with fstat() if the
390 * file got deleted, in order make sure we don't throw
391 * away the data immediately. Don't check fstat() for
392 * all writes though, but only once ever 10s. */
394 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
397 return journal_file_fstat(f
);
400 /* Allocate more space. */
402 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
405 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
408 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
411 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
413 if (new_size
- old_size
> available
)
418 /* Increase by larger blocks at once */
419 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
420 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
421 new_size
= f
->metrics
.max_size
;
423 /* Note that the glibc fallocate() fallback is very
424 inefficient, hence we try to minimize the allocation area
426 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
430 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
432 return journal_file_fstat(f
);
435 static unsigned type_to_context(ObjectType type
) {
436 /* One context for each type, plus one catch-all for the rest */
437 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
438 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
439 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
442 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
) {
451 /* Avoid SIGBUS on invalid accesses */
452 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
453 /* Hmm, out of range? Let's refresh the fstat() data
454 * first, before we trust that check. */
456 r
= journal_file_fstat(f
);
460 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
461 return -EADDRNOTAVAIL
;
464 return mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
467 static uint64_t minimum_header_size(Object
*o
) {
469 static const uint64_t table
[] = {
470 [OBJECT_DATA
] = sizeof(DataObject
),
471 [OBJECT_FIELD
] = sizeof(FieldObject
),
472 [OBJECT_ENTRY
] = sizeof(EntryObject
),
473 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
474 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
475 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
476 [OBJECT_TAG
] = sizeof(TagObject
),
479 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
480 return sizeof(ObjectHeader
);
482 return table
[o
->object
.type
];
485 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
494 /* Objects may only be located at multiple of 64 bit */
495 if (!VALID64(offset
))
498 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
503 s
= le64toh(o
->object
.size
);
505 if (s
< sizeof(ObjectHeader
))
508 if (o
->object
.type
<= OBJECT_UNUSED
)
511 if (s
< minimum_header_size(o
))
514 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
517 if (s
> sizeof(ObjectHeader
)) {
518 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
529 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
534 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
537 /* If an external seqnum counter was passed, we update
538 * both the local and the external one, and set it to
539 * the maximum of both */
547 f
->header
->tail_entry_seqnum
= htole64(r
);
549 if (f
->header
->head_entry_seqnum
== 0)
550 f
->header
->head_entry_seqnum
= htole64(r
);
555 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
562 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
563 assert(size
>= sizeof(ObjectHeader
));
567 r
= journal_file_set_online(f
);
571 p
= le64toh(f
->header
->tail_object_offset
);
573 p
= le64toh(f
->header
->header_size
);
575 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
579 p
+= ALIGN64(le64toh(tail
->object
.size
));
582 r
= journal_file_allocate(f
, p
, size
);
586 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
593 o
->object
.type
= type
;
594 o
->object
.size
= htole64(size
);
596 f
->header
->tail_object_offset
= htole64(p
);
597 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
605 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
612 /* We estimate that we need 1 hash table entry per 768 bytes
613 of journal file and we want to make sure we never get
614 beyond 75% fill level. Calculate the hash table size for
615 the maximum file size based on these metrics. */
617 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
618 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
619 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
621 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
623 r
= journal_file_append_object(f
,
624 OBJECT_DATA_HASH_TABLE
,
625 offsetof(Object
, hash_table
.items
) + s
,
630 memzero(o
->hash_table
.items
, s
);
632 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
633 f
->header
->data_hash_table_size
= htole64(s
);
638 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
645 /* We use a fixed size hash table for the fields as this
646 * number should grow very slowly only */
648 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
649 r
= journal_file_append_object(f
,
650 OBJECT_FIELD_HASH_TABLE
,
651 offsetof(Object
, hash_table
.items
) + s
,
656 memzero(o
->hash_table
.items
, s
);
658 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
659 f
->header
->field_hash_table_size
= htole64(s
);
664 int journal_file_map_data_hash_table(JournalFile
*f
) {
671 if (f
->data_hash_table
)
674 p
= le64toh(f
->header
->data_hash_table_offset
);
675 s
= le64toh(f
->header
->data_hash_table_size
);
677 r
= journal_file_move_to(f
,
678 OBJECT_DATA_HASH_TABLE
,
685 f
->data_hash_table
= t
;
689 int journal_file_map_field_hash_table(JournalFile
*f
) {
696 if (f
->field_hash_table
)
699 p
= le64toh(f
->header
->field_hash_table_offset
);
700 s
= le64toh(f
->header
->field_hash_table_size
);
702 r
= journal_file_move_to(f
,
703 OBJECT_FIELD_HASH_TABLE
,
710 f
->field_hash_table
= t
;
714 static int journal_file_link_field(
727 if (o
->object
.type
!= OBJECT_FIELD
)
730 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
734 /* This might alter the window we are looking at */
735 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
738 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
740 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
742 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
746 o
->field
.next_hash_offset
= htole64(offset
);
749 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
751 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
752 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
757 static int journal_file_link_data(
770 if (o
->object
.type
!= OBJECT_DATA
)
773 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
777 /* This might alter the window we are looking at */
778 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
779 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
780 o
->data
.n_entries
= 0;
783 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
785 /* Only entry in the hash table is easy */
786 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
788 /* Move back to the previous data object, to patch in
791 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
795 o
->data
.next_hash_offset
= htole64(offset
);
798 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
800 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
801 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
806 int journal_file_find_field_object_with_hash(
808 const void *field
, uint64_t size
, uint64_t hash
,
809 Object
**ret
, uint64_t *offset
) {
811 uint64_t p
, osize
, h
, m
;
815 assert(field
&& size
> 0);
817 /* If the field hash table is empty, we can't find anything */
818 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
821 /* Map the field hash table, if it isn't mapped yet. */
822 r
= journal_file_map_field_hash_table(f
);
826 osize
= offsetof(Object
, field
.payload
) + size
;
828 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
833 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
838 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
842 if (le64toh(o
->field
.hash
) == hash
&&
843 le64toh(o
->object
.size
) == osize
&&
844 memcmp(o
->field
.payload
, field
, size
) == 0) {
854 p
= le64toh(o
->field
.next_hash_offset
);
860 int journal_file_find_field_object(
862 const void *field
, uint64_t size
,
863 Object
**ret
, uint64_t *offset
) {
868 assert(field
&& size
> 0);
870 hash
= hash64(field
, size
);
872 return journal_file_find_field_object_with_hash(f
,
877 int journal_file_find_data_object_with_hash(
879 const void *data
, uint64_t size
, uint64_t hash
,
880 Object
**ret
, uint64_t *offset
) {
882 uint64_t p
, osize
, h
, m
;
886 assert(data
|| size
== 0);
888 /* If there's no data hash table, then there's no entry. */
889 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
892 /* Map the data hash table, if it isn't mapped yet. */
893 r
= journal_file_map_data_hash_table(f
);
897 osize
= offsetof(Object
, data
.payload
) + size
;
899 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
904 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
909 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
913 if (le64toh(o
->data
.hash
) != hash
)
916 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
917 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
921 l
= le64toh(o
->object
.size
);
922 if (l
<= offsetof(Object
, data
.payload
))
925 l
-= offsetof(Object
, data
.payload
);
927 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
928 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
933 memcmp(f
->compress_buffer
, data
, size
) == 0) {
944 return -EPROTONOSUPPORT
;
946 } else if (le64toh(o
->object
.size
) == osize
&&
947 memcmp(o
->data
.payload
, data
, size
) == 0) {
959 p
= le64toh(o
->data
.next_hash_offset
);
965 int journal_file_find_data_object(
967 const void *data
, uint64_t size
,
968 Object
**ret
, uint64_t *offset
) {
973 assert(data
|| size
== 0);
975 hash
= hash64(data
, size
);
977 return journal_file_find_data_object_with_hash(f
,
982 static int journal_file_append_field(
984 const void *field
, uint64_t size
,
985 Object
**ret
, uint64_t *offset
) {
993 assert(field
&& size
> 0);
995 hash
= hash64(field
, size
);
997 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1011 osize
= offsetof(Object
, field
.payload
) + size
;
1012 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1016 o
->field
.hash
= htole64(hash
);
1017 memcpy(o
->field
.payload
, field
, size
);
1019 r
= journal_file_link_field(f
, o
, p
, hash
);
1023 /* The linking might have altered the window, so let's
1024 * refresh our pointer */
1025 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1030 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1044 static int journal_file_append_data(
1046 const void *data
, uint64_t size
,
1047 Object
**ret
, uint64_t *offset
) {
1052 int r
, compression
= 0;
1056 assert(data
|| size
== 0);
1058 hash
= hash64(data
, size
);
1060 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1074 osize
= offsetof(Object
, data
.payload
) + size
;
1075 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1079 o
->data
.hash
= htole64(hash
);
1081 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1082 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1085 compression
= compress_blob(data
, size
, o
->data
.payload
, &rsize
);
1087 if (compression
>= 0) {
1088 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1089 o
->object
.flags
|= compression
;
1091 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1092 size
, rsize
, object_compressed_to_string(compression
));
1094 /* Compression didn't work, we don't really care why, let's continue without compression */
1099 if (compression
== 0 && size
> 0)
1100 memcpy(o
->data
.payload
, data
, size
);
1102 r
= journal_file_link_data(f
, o
, p
, hash
);
1106 /* The linking might have altered the window, so let's
1107 * refresh our pointer */
1108 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1115 eq
= memchr(data
, '=', size
);
1116 if (eq
&& eq
> data
) {
1120 /* Create field object ... */
1121 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1125 /* ... and link it in. */
1126 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1127 fo
->field
.head_data_offset
= le64toh(p
);
1131 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1145 uint64_t journal_file_entry_n_items(Object
*o
) {
1148 if (o
->object
.type
!= OBJECT_ENTRY
)
1151 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1154 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1157 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1160 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1163 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1166 if (o
->object
.type
!= OBJECT_DATA_HASH_TABLE
&&
1167 o
->object
.type
!= OBJECT_FIELD_HASH_TABLE
)
1170 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1173 static int link_entry_into_array(JournalFile
*f
,
1178 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1186 a
= le64toh(*first
);
1187 i
= hidx
= le64toh(*idx
);
1190 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1194 n
= journal_file_entry_array_n_items(o
);
1196 o
->entry_array
.items
[i
] = htole64(p
);
1197 *idx
= htole64(hidx
+ 1);
1203 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1214 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1215 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1221 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1226 o
->entry_array
.items
[i
] = htole64(p
);
1229 *first
= htole64(q
);
1231 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1235 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1238 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1239 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1241 *idx
= htole64(hidx
+ 1);
1246 static int link_entry_into_array_plus_one(JournalFile
*f
,
1261 *extra
= htole64(p
);
1265 i
= htole64(le64toh(*idx
) - 1);
1266 r
= link_entry_into_array(f
, first
, &i
, p
);
1271 *idx
= htole64(le64toh(*idx
) + 1);
1275 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1282 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1286 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1290 return link_entry_into_array_plus_one(f
,
1291 &o
->data
.entry_offset
,
1292 &o
->data
.entry_array_offset
,
1297 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1305 if (o
->object
.type
!= OBJECT_ENTRY
)
1308 __sync_synchronize();
1310 /* Link up the entry itself */
1311 r
= link_entry_into_array(f
,
1312 &f
->header
->entry_array_offset
,
1313 &f
->header
->n_entries
,
1318 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1320 if (f
->header
->head_entry_realtime
== 0)
1321 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1323 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1324 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1326 f
->tail_entry_monotonic_valid
= true;
1328 /* Link up the items */
1329 n
= journal_file_entry_n_items(o
);
1330 for (i
= 0; i
< n
; i
++) {
1331 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1339 static int journal_file_append_entry_internal(
1341 const dual_timestamp
*ts
,
1343 const EntryItem items
[], unsigned n_items
,
1345 Object
**ret
, uint64_t *offset
) {
1352 assert(items
|| n_items
== 0);
1355 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1357 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1361 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1362 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1363 o
->entry
.realtime
= htole64(ts
->realtime
);
1364 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1365 o
->entry
.xor_hash
= htole64(xor_hash
);
1366 o
->entry
.boot_id
= f
->header
->boot_id
;
1369 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1374 r
= journal_file_link_entry(f
, o
, np
);
1387 void journal_file_post_change(JournalFile
*f
) {
1390 /* inotify() does not receive IN_MODIFY events from file
1391 * accesses done via mmap(). After each access we hence
1392 * trigger IN_MODIFY by truncating the journal file to its
1393 * current size which triggers IN_MODIFY. */
1395 __sync_synchronize();
1397 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1398 log_error_errno(errno
, "Failed to truncate file to its own size: %m");
1401 static int entry_item_cmp(const void *_a
, const void *_b
) {
1402 const EntryItem
*a
= _a
, *b
= _b
;
1404 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1406 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1411 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1415 uint64_t xor_hash
= 0;
1416 struct dual_timestamp _ts
;
1419 assert(iovec
|| n_iovec
== 0);
1422 dual_timestamp_get(&_ts
);
1426 if (f
->tail_entry_monotonic_valid
&&
1427 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
1431 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1436 /* alloca() can't take 0, hence let's allocate at least one */
1437 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1439 for (i
= 0; i
< n_iovec
; i
++) {
1443 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1447 xor_hash
^= le64toh(o
->data
.hash
);
1448 items
[i
].object_offset
= htole64(p
);
1449 items
[i
].hash
= o
->data
.hash
;
1452 /* Order by the position on disk, in order to improve seek
1453 * times for rotating media. */
1454 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
1456 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
1458 /* If the memory mapping triggered a SIGBUS then we return an
1459 * IO error and ignore the error code passed down to us, since
1460 * it is very likely just an effect of a nullified replacement
1463 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
1466 journal_file_post_change(f
);
/* One cached position inside an entry-array chain, used to avoid
 * re-walking the chain from its head on every lookup. */
typedef struct ChainCacheItem {
        uint64_t first;      /* the array at the beginning of the chain */
        uint64_t array;      /* the cached array */
        uint64_t begin;      /* the first item in the cached array */
        uint64_t total;      /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1479 static void chain_cache_put(
1486 uint64_t last_index
) {
1489 /* If the chain item to cache for this chain is the
1490 * first one it's not worth caching anything */
1494 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
1495 ci
= ordered_hashmap_steal_first(h
);
1498 ci
= new(ChainCacheItem
, 1);
1505 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
1510 assert(ci
->first
== first
);
1515 ci
->last_index
= last_index
;
1518 static int generic_array_get(
1522 Object
**ret
, uint64_t *offset
) {
1525 uint64_t p
= 0, a
, t
= 0;
1533 /* Try the chain cache first */
1534 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1535 if (ci
&& i
> ci
->total
) {
1544 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1548 k
= journal_file_entry_array_n_items(o
);
1550 p
= le64toh(o
->entry_array
.items
[i
]);
1556 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1562 /* Let's cache this item for the next invocation */
1563 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
1565 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1578 static int generic_array_get_plus_one(
1583 Object
**ret
, uint64_t *offset
) {
1592 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1605 return generic_array_get(f
, first
, i
-1, ret
, offset
);
1614 static int generic_array_bisect(
1619 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1620 direction_t direction
,
1625 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
1626 bool subtract_one
= false;
1627 Object
*o
, *array
= NULL
;
1632 assert(test_object
);
1634 /* Start with the first array in the chain */
1637 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
1638 if (ci
&& n
> ci
->total
) {
1639 /* Ah, we have iterated this bisection array chain
1640 * previously! Let's see if we can skip ahead in the
1641 * chain, as far as the last time. But we can't jump
1642 * backwards in the chain, so let's check that
1645 r
= test_object(f
, ci
->begin
, needle
);
1649 if (r
== TEST_LEFT
) {
1650 /* OK, what we are looking for is right of the
1651 * begin of this EntryArray, so let's jump
1652 * straight to previously cached array in the
1658 last_index
= ci
->last_index
;
1663 uint64_t left
, right
, k
, lp
;
1665 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1669 k
= journal_file_entry_array_n_items(array
);
1675 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1679 r
= test_object(f
, p
, needle
);
1683 if (r
== TEST_FOUND
)
1684 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1686 if (r
== TEST_RIGHT
) {
1690 if (last_index
!= (uint64_t) -1) {
1691 assert(last_index
<= right
);
1693 /* If we cached the last index we
1694 * looked at, let's try to not to jump
1695 * too wildly around and see if we can
1696 * limit the range to look at early to
1697 * the immediate neighbors of the last
1698 * index we looked at. */
1700 if (last_index
> 0) {
1701 uint64_t x
= last_index
- 1;
1703 p
= le64toh(array
->entry_array
.items
[x
]);
1707 r
= test_object(f
, p
, needle
);
1711 if (r
== TEST_FOUND
)
1712 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1714 if (r
== TEST_RIGHT
)
1720 if (last_index
< right
) {
1721 uint64_t y
= last_index
+ 1;
1723 p
= le64toh(array
->entry_array
.items
[y
]);
1727 r
= test_object(f
, p
, needle
);
1731 if (r
== TEST_FOUND
)
1732 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1734 if (r
== TEST_RIGHT
)
1742 if (left
== right
) {
1743 if (direction
== DIRECTION_UP
)
1744 subtract_one
= true;
1750 assert(left
< right
);
1751 i
= (left
+ right
) / 2;
1753 p
= le64toh(array
->entry_array
.items
[i
]);
1757 r
= test_object(f
, p
, needle
);
1761 if (r
== TEST_FOUND
)
1762 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1764 if (r
== TEST_RIGHT
)
1772 if (direction
== DIRECTION_UP
) {
1774 subtract_one
= true;
1785 last_index
= (uint64_t) -1;
1786 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1792 if (subtract_one
&& t
== 0 && i
== 0)
1795 /* Let's cache this item for the next invocation */
1796 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
1798 if (subtract_one
&& i
== 0)
1800 else if (subtract_one
)
1801 p
= le64toh(array
->entry_array
.items
[i
-1]);
1803 p
= le64toh(array
->entry_array
.items
[i
]);
1805 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1816 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
1821 static int generic_array_bisect_plus_one(
1827 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1828 direction_t direction
,
1834 bool step_back
= false;
1838 assert(test_object
);
1843 /* This bisects the array in object 'first', but first checks
1845 r
= test_object(f
, extra
, needle
);
1849 if (r
== TEST_FOUND
)
1850 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1852 /* if we are looking with DIRECTION_UP then we need to first
1853 see if in the actual array there is a matching entry, and
1854 return the last one of that. But if there isn't any we need
1855 to return this one. Hence remember this, and return it
1858 step_back
= direction
== DIRECTION_UP
;
1860 if (r
== TEST_RIGHT
) {
1861 if (direction
== DIRECTION_DOWN
)
1867 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1869 if (r
== 0 && step_back
)
1878 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1894 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1900 else if (p
< needle
)
1906 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1913 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1917 if (le64toh(o
->entry
.seqnum
) == needle
)
1919 else if (le64toh(o
->entry
.seqnum
) < needle
)
1925 int journal_file_move_to_entry_by_seqnum(
1928 direction_t direction
,
1932 return generic_array_bisect(f
,
1933 le64toh(f
->header
->entry_array_offset
),
1934 le64toh(f
->header
->n_entries
),
1941 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1948 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1952 if (le64toh(o
->entry
.realtime
) == needle
)
1954 else if (le64toh(o
->entry
.realtime
) < needle
)
1960 int journal_file_move_to_entry_by_realtime(
1963 direction_t direction
,
1967 return generic_array_bisect(f
,
1968 le64toh(f
->header
->entry_array_offset
),
1969 le64toh(f
->header
->n_entries
),
1971 test_object_realtime
,
1976 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1983 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1987 if (le64toh(o
->entry
.monotonic
) == needle
)
1989 else if (le64toh(o
->entry
.monotonic
) < needle
)
1995 static int find_data_object_by_boot_id(
2001 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2003 sd_id128_to_string(boot_id
, t
+ 9);
2004 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2007 int journal_file_move_to_entry_by_monotonic(
2011 direction_t direction
,
2020 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2026 return generic_array_bisect_plus_one(f
,
2027 le64toh(o
->data
.entry_offset
),
2028 le64toh(o
->data
.entry_array_offset
),
2029 le64toh(o
->data
.n_entries
),
2031 test_object_monotonic
,
2036 void journal_file_reset_location(JournalFile
*f
) {
2037 f
->location_type
= LOCATION_HEAD
;
2038 f
->current_offset
= 0;
2039 f
->current_seqnum
= 0;
2040 f
->current_realtime
= 0;
2041 f
->current_monotonic
= 0;
2042 zero(f
->current_boot_id
);
2043 f
->current_xor_hash
= 0;
2046 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2047 f
->location_type
= LOCATION_SEEK
;
2048 f
->current_offset
= offset
;
2049 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2050 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2051 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2052 f
->current_boot_id
= o
->entry
.boot_id
;
2053 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2056 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2059 assert(af
->location_type
== LOCATION_SEEK
);
2060 assert(bf
->location_type
== LOCATION_SEEK
);
2062 /* If contents and timestamps match, these entries are
2063 * identical, even if the seqnum does not match */
2064 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2065 af
->current_monotonic
== bf
->current_monotonic
&&
2066 af
->current_realtime
== bf
->current_realtime
&&
2067 af
->current_xor_hash
== bf
->current_xor_hash
)
2070 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2072 /* If this is from the same seqnum source, compare
2074 if (af
->current_seqnum
< bf
->current_seqnum
)
2076 if (af
->current_seqnum
> bf
->current_seqnum
)
2079 /* Wow! This is weird, different data but the same
2080 * seqnums? Something is borked, but let's make the
2081 * best of it and compare by time. */
2084 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2086 /* If the boot id matches, compare monotonic time */
2087 if (af
->current_monotonic
< bf
->current_monotonic
)
2089 if (af
->current_monotonic
> bf
->current_monotonic
)
2093 /* Otherwise, compare UTC time */
2094 if (af
->current_realtime
< bf
->current_realtime
)
2096 if (af
->current_realtime
> bf
->current_realtime
)
2099 /* Finally, compare by contents */
2100 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2102 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2108 int journal_file_next_entry(
2111 direction_t direction
,
2112 Object
**ret
, uint64_t *offset
) {
2119 n
= le64toh(f
->header
->n_entries
);
2124 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2126 r
= generic_array_bisect(f
,
2127 le64toh(f
->header
->entry_array_offset
),
2128 le64toh(f
->header
->n_entries
),
2137 if (direction
== DIRECTION_DOWN
) {
2150 /* And jump to it */
2151 r
= generic_array_get(f
,
2152 le64toh(f
->header
->entry_array_offset
),
2159 (direction
== DIRECTION_DOWN
? ofs
<= p
: ofs
>= p
)) {
2160 log_debug("%s: entry array corrupted at entry %"PRIu64
,
2171 int journal_file_next_entry_for_data(
2173 Object
*o
, uint64_t p
,
2174 uint64_t data_offset
,
2175 direction_t direction
,
2176 Object
**ret
, uint64_t *offset
) {
2183 assert(p
> 0 || !o
);
2185 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2189 n
= le64toh(d
->data
.n_entries
);
2194 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2196 if (o
->object
.type
!= OBJECT_ENTRY
)
2199 r
= generic_array_bisect_plus_one(f
,
2200 le64toh(d
->data
.entry_offset
),
2201 le64toh(d
->data
.entry_array_offset
),
2202 le64toh(d
->data
.n_entries
),
2212 if (direction
== DIRECTION_DOWN
) {
2226 return generic_array_get_plus_one(f
,
2227 le64toh(d
->data
.entry_offset
),
2228 le64toh(d
->data
.entry_array_offset
),
2233 int journal_file_move_to_entry_by_offset_for_data(
2235 uint64_t data_offset
,
2237 direction_t direction
,
2238 Object
**ret
, uint64_t *offset
) {
2245 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2249 return generic_array_bisect_plus_one(f
,
2250 le64toh(d
->data
.entry_offset
),
2251 le64toh(d
->data
.entry_array_offset
),
2252 le64toh(d
->data
.n_entries
),
2259 int journal_file_move_to_entry_by_monotonic_for_data(
2261 uint64_t data_offset
,
2264 direction_t direction
,
2265 Object
**ret
, uint64_t *offset
) {
2273 /* First, seek by time */
2274 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2280 r
= generic_array_bisect_plus_one(f
,
2281 le64toh(o
->data
.entry_offset
),
2282 le64toh(o
->data
.entry_array_offset
),
2283 le64toh(o
->data
.n_entries
),
2285 test_object_monotonic
,
2291 /* And now, continue seeking until we find an entry that
2292 * exists in both bisection arrays */
2298 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2302 r
= generic_array_bisect_plus_one(f
,
2303 le64toh(d
->data
.entry_offset
),
2304 le64toh(d
->data
.entry_array_offset
),
2305 le64toh(d
->data
.n_entries
),
2313 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2317 r
= generic_array_bisect_plus_one(f
,
2318 le64toh(o
->data
.entry_offset
),
2319 le64toh(o
->data
.entry_array_offset
),
2320 le64toh(o
->data
.n_entries
),
2342 int journal_file_move_to_entry_by_seqnum_for_data(
2344 uint64_t data_offset
,
2346 direction_t direction
,
2347 Object
**ret
, uint64_t *offset
) {
2354 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2358 return generic_array_bisect_plus_one(f
,
2359 le64toh(d
->data
.entry_offset
),
2360 le64toh(d
->data
.entry_array_offset
),
2361 le64toh(d
->data
.n_entries
),
2368 int journal_file_move_to_entry_by_realtime_for_data(
2370 uint64_t data_offset
,
2372 direction_t direction
,
2373 Object
**ret
, uint64_t *offset
) {
2380 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2384 return generic_array_bisect_plus_one(f
,
2385 le64toh(d
->data
.entry_offset
),
2386 le64toh(d
->data
.entry_array_offset
),
2387 le64toh(d
->data
.n_entries
),
2389 test_object_realtime
,
2394 void journal_file_dump(JournalFile
*f
) {
2401 journal_file_print_header(f
);
2403 p
= le64toh(f
->header
->header_size
);
2405 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
2409 switch (o
->object
.type
) {
2412 printf("Type: OBJECT_UNUSED\n");
2416 printf("Type: OBJECT_DATA\n");
2420 printf("Type: OBJECT_FIELD\n");
2424 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
2425 le64toh(o
->entry
.seqnum
),
2426 le64toh(o
->entry
.monotonic
),
2427 le64toh(o
->entry
.realtime
));
2430 case OBJECT_FIELD_HASH_TABLE
:
2431 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2434 case OBJECT_DATA_HASH_TABLE
:
2435 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2438 case OBJECT_ENTRY_ARRAY
:
2439 printf("Type: OBJECT_ENTRY_ARRAY\n");
2443 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
2444 le64toh(o
->tag
.seqnum
),
2445 le64toh(o
->tag
.epoch
));
2449 printf("Type: unknown (%i)\n", o
->object
.type
);
2453 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
2454 printf("Flags: %s\n",
2455 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
2457 if (p
== le64toh(f
->header
->tail_object_offset
))
2460 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
2465 log_error("File corrupt");
2468 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
2471 x
= format_timestamp(buf
, l
, t
);
2477 void journal_file_print_header(JournalFile
*f
) {
2478 char a
[33], b
[33], c
[33], d
[33];
2479 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
2481 char bytes
[FORMAT_BYTES_MAX
];
2485 printf("File Path: %s\n"
2489 "Sequential Number ID: %s\n"
2491 "Compatible Flags:%s%s\n"
2492 "Incompatible Flags:%s%s%s\n"
2493 "Header size: %"PRIu64
"\n"
2494 "Arena size: %"PRIu64
"\n"
2495 "Data Hash Table Size: %"PRIu64
"\n"
2496 "Field Hash Table Size: %"PRIu64
"\n"
2497 "Rotate Suggested: %s\n"
2498 "Head Sequential Number: %"PRIu64
"\n"
2499 "Tail Sequential Number: %"PRIu64
"\n"
2500 "Head Realtime Timestamp: %s\n"
2501 "Tail Realtime Timestamp: %s\n"
2502 "Tail Monotonic Timestamp: %s\n"
2503 "Objects: %"PRIu64
"\n"
2504 "Entry Objects: %"PRIu64
"\n",
2506 sd_id128_to_string(f
->header
->file_id
, a
),
2507 sd_id128_to_string(f
->header
->machine_id
, b
),
2508 sd_id128_to_string(f
->header
->boot_id
, c
),
2509 sd_id128_to_string(f
->header
->seqnum_id
, d
),
2510 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
2511 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
2512 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
2513 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
2514 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
2515 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
2516 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
2517 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
2518 le64toh(f
->header
->header_size
),
2519 le64toh(f
->header
->arena_size
),
2520 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
2521 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
2522 yes_no(journal_file_rotate_suggested(f
, 0)),
2523 le64toh(f
->header
->head_entry_seqnum
),
2524 le64toh(f
->header
->tail_entry_seqnum
),
2525 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)),
2526 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)),
2527 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
),
2528 le64toh(f
->header
->n_objects
),
2529 le64toh(f
->header
->n_entries
));
2531 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
2532 printf("Data Objects: %"PRIu64
"\n"
2533 "Data Hash Table Fill: %.1f%%\n",
2534 le64toh(f
->header
->n_data
),
2535 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
2537 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
2538 printf("Field Objects: %"PRIu64
"\n"
2539 "Field Hash Table Fill: %.1f%%\n",
2540 le64toh(f
->header
->n_fields
),
2541 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
2543 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
2544 printf("Tag Objects: %"PRIu64
"\n",
2545 le64toh(f
->header
->n_tags
));
2546 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
2547 printf("Entry Array Objects: %"PRIu64
"\n",
2548 le64toh(f
->header
->n_entry_arrays
));
2550 if (fstat(f
->fd
, &st
) >= 0)
2551 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
2554 static int journal_file_warn_btrfs(JournalFile
*f
) {
2560 /* Before we write anything, check if the COW logic is turned
2561 * off on btrfs. Given our write pattern that is quite
2562 * unfriendly to COW file systems this should greatly improve
2563 * performance on COW file systems, such as btrfs, at the
2564 * expense of data integrity features (which shouldn't be too
2565 * bad, given that we do our own checksumming). */
2567 r
= btrfs_is_filesystem(f
->fd
);
2569 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
2573 r
= read_attr_fd(f
->fd
, &attrs
);
2575 return log_warning_errno(r
, "Failed to read file attributes: %m");
2577 if (attrs
& FS_NOCOW_FL
) {
2578 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2582 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2583 "This is likely to slow down journal access substantially, please consider turning "
2584 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
2589 int journal_file_open(
2595 JournalMetrics
*metrics
,
2596 MMapCache
*mmap_cache
,
2597 JournalFile
*template,
2598 JournalFile
**ret
) {
2600 bool newly_created
= false;
2608 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
2609 (flags
& O_ACCMODE
) != O_RDWR
)
2612 if (!endswith(fname
, ".journal") &&
2613 !endswith(fname
, ".journal~"))
2616 f
= new0(JournalFile
, 1);
2624 f
->prot
= prot_from_flags(flags
);
2625 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
2626 #if defined(HAVE_LZ4)
2627 f
->compress_lz4
= compress
;
2628 #elif defined(HAVE_XZ)
2629 f
->compress_xz
= compress
;
2636 f
->mmap
= mmap_cache_ref(mmap_cache
);
2638 f
->mmap
= mmap_cache_new();
2645 f
->path
= strdup(fname
);
2651 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
2652 if (!f
->chain_cache
) {
2657 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
2663 r
= journal_file_fstat(f
);
2667 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
2669 (void) journal_file_warn_btrfs(f
);
2671 /* Let's attach the creation time to the journal file,
2672 * so that the vacuuming code knows the age of this
2673 * file even if the file might end up corrupted one
2674 * day... Ideally we'd just use the creation time many
2675 * file systems maintain for each file, but there is
2676 * currently no usable API to query this, hence let's
2677 * emulate this via extended attributes. If extended
2678 * attributes are not supported we'll just skip this,
2679 * and rely solely on mtime/atime/ctime of the file. */
2681 fd_setcrtime(f
->fd
, 0);
2684 /* Try to load the FSPRG state, and if we can't, then
2685 * just don't do sealing */
2687 r
= journal_file_fss_load(f
);
2693 r
= journal_file_init_header(f
, template);
2697 r
= journal_file_fstat(f
);
2701 newly_created
= true;
2704 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
2709 r
= mmap_cache_get(f
->mmap
, f
->fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
);
2715 if (!newly_created
) {
2716 r
= journal_file_verify_header(f
);
2722 if (!newly_created
&& f
->writable
) {
2723 r
= journal_file_fss_load(f
);
2731 journal_default_metrics(metrics
, f
->fd
);
2732 f
->metrics
= *metrics
;
2733 } else if (template)
2734 f
->metrics
= template->metrics
;
2736 r
= journal_file_refresh_header(f
);
2742 r
= journal_file_hmac_setup(f
);
2747 if (newly_created
) {
2748 r
= journal_file_setup_field_hash_table(f
);
2752 r
= journal_file_setup_data_hash_table(f
);
2757 r
= journal_file_append_first_tag(f
);
2763 if (mmap_cache_got_sigbus(f
->mmap
, f
->fd
)) {
2772 if (f
->fd
>= 0 && mmap_cache_got_sigbus(f
->mmap
, f
->fd
))
2775 journal_file_close(f
);
2780 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
) {
2781 _cleanup_free_
char *p
= NULL
;
2783 JournalFile
*old_file
, *new_file
= NULL
;
2791 if (!old_file
->writable
)
2794 if (!endswith(old_file
->path
, ".journal"))
2797 l
= strlen(old_file
->path
);
2798 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
2799 (int) l
- 8, old_file
->path
,
2800 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
2801 le64toh((*f
)->header
->head_entry_seqnum
),
2802 le64toh((*f
)->header
->head_entry_realtime
));
2806 /* Try to rename the file to the archived version. If the file
2807 * already was deleted, we'll get ENOENT, let's ignore that
2809 r
= rename(old_file
->path
, p
);
2810 if (r
< 0 && errno
!= ENOENT
)
2813 old_file
->header
->state
= STATE_ARCHIVED
;
2815 /* Currently, btrfs is not very good with out write patterns
2816 * and fragments heavily. Let's defrag our journal files when
2817 * we archive them */
2818 old_file
->defrag_on_close
= true;
2820 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, old_file
, &new_file
);
2821 journal_file_close(old_file
);
2827 int journal_file_open_reliably(
2833 JournalMetrics
*metrics
,
2834 MMapCache
*mmap_cache
,
2835 JournalFile
*template,
2836 JournalFile
**ret
) {
2840 _cleanup_free_
char *p
= NULL
;
2842 r
= journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2844 -EBADMSG
, /* corrupted */
2845 -ENODATA
, /* truncated */
2846 -EHOSTDOWN
, /* other machine */
2847 -EPROTONOSUPPORT
, /* incompatible feature */
2848 -EBUSY
, /* unclean shutdown */
2849 -ESHUTDOWN
, /* already archived */
2850 -EIO
, /* IO error, including SIGBUS on mmap */
2851 -EIDRM
/* File has been deleted */))
2854 if ((flags
& O_ACCMODE
) == O_RDONLY
)
2857 if (!(flags
& O_CREAT
))
2860 if (!endswith(fname
, ".journal"))
2863 /* The file is corrupted. Rotate it away and try it again (but only once) */
2866 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
2868 now(CLOCK_REALTIME
),
2872 if (rename(fname
, p
) < 0)
2875 /* btrfs doesn't cope well with our write pattern and
2876 * fragments heavily. Let's defrag all files we rotate */
2878 (void) chattr_path(p
, false, FS_NOCOW_FL
);
2879 (void) btrfs_defrag(p
);
2881 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
2883 return journal_file_open(fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, template, ret
);
2886 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
2888 uint64_t q
, xor_hash
= 0;
2901 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
2902 ts
.realtime
= le64toh(o
->entry
.realtime
);
2904 n
= journal_file_entry_n_items(o
);
2905 /* alloca() can't take 0, hence let's allocate at least one */
2906 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
2908 for (i
= 0; i
< n
; i
++) {
2915 q
= le64toh(o
->entry
.items
[i
].object_offset
);
2916 le_hash
= o
->entry
.items
[i
].hash
;
2918 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
2922 if (le_hash
!= o
->data
.hash
)
2925 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
2928 /* We hit the limit on 32bit machines */
2929 if ((uint64_t) t
!= l
)
2932 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
2933 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2936 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
2937 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
2941 data
= from
->compress_buffer
;
2944 return -EPROTONOSUPPORT
;
2947 data
= o
->data
.payload
;
2949 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
2953 xor_hash
^= le64toh(u
->data
.hash
);
2954 items
[i
].object_offset
= htole64(h
);
2955 items
[i
].hash
= u
->data
.hash
;
2957 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
2962 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
2964 if (mmap_cache_got_sigbus(to
->mmap
, to
->fd
))
2970 void journal_reset_metrics(JournalMetrics
*m
) {
2973 /* Set everything to "pick automatic values". */
2975 *m
= (JournalMetrics
) {
2976 .min_use
= (uint64_t) -1,
2977 .max_use
= (uint64_t) -1,
2978 .min_size
= (uint64_t) -1,
2979 .max_size
= (uint64_t) -1,
2980 .keep_free
= (uint64_t) -1,
2981 .n_max_files
= (uint64_t) -1,
2985 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
2986 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
2993 if (fstatvfs(fd
, &ss
) >= 0)
2994 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
2996 log_debug_errno(errno
, "Failed to detremine disk size: %m");
3000 if (m
->max_use
== (uint64_t) -1) {
3003 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3005 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3006 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3008 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3009 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3011 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3013 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3015 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3016 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3019 if (m
->min_use
== (uint64_t) -1)
3020 m
->min_use
= DEFAULT_MIN_USE
;
3022 if (m
->min_use
> m
->max_use
)
3023 m
->min_use
= m
->max_use
;
3025 if (m
->max_size
== (uint64_t) -1) {
3026 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3028 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3029 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3031 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3033 if (m
->max_size
!= 0) {
3034 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3035 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3037 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3038 m
->max_use
= m
->max_size
*2;
3041 if (m
->min_size
== (uint64_t) -1)
3042 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3044 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3046 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3047 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3049 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3050 m
->max_size
= m
->min_size
;
3053 if (m
->keep_free
== (uint64_t) -1) {
3056 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3058 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3059 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3062 m
->keep_free
= DEFAULT_KEEP_FREE
;
3065 if (m
->n_max_files
== (uint64_t) -1)
3066 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3068 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3069 format_bytes(a
, sizeof(a
), m
->min_use
),
3070 format_bytes(b
, sizeof(b
), m
->max_use
),
3071 format_bytes(c
, sizeof(c
), m
->max_size
),
3072 format_bytes(d
, sizeof(d
), m
->min_size
),
3073 format_bytes(e
, sizeof(e
), m
->keep_free
),
3077 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3082 if (f
->header
->head_entry_realtime
== 0)
3085 *from
= le64toh(f
->header
->head_entry_realtime
);
3089 if (f
->header
->tail_entry_realtime
== 0)
3092 *to
= le64toh(f
->header
->tail_entry_realtime
);
3098 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3106 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3110 if (le64toh(o
->data
.n_entries
) <= 0)
3114 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3118 *from
= le64toh(o
->entry
.monotonic
);
3122 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3126 r
= generic_array_get_plus_one(f
,
3127 le64toh(o
->data
.entry_offset
),
3128 le64toh(o
->data
.entry_array_offset
),
3129 le64toh(o
->data
.n_entries
)-1,
3134 *to
= le64toh(o
->entry
.monotonic
);
3140 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3143 /* If we gained new header fields we gained new features,
3144 * hence suggest a rotation */
3145 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3146 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3150 /* Let's check if the hash tables grew over a certain fill
3151 * level (75%, borrowing this value from Java's hash table
3152 * implementation), and if so suggest a rotation. To calculate
3153 * the fill level we need the n_data field, which only exists
3154 * in newer versions. */
3156 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3157 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3158 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3160 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3161 le64toh(f
->header
->n_data
),
3162 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3163 (unsigned long long) f
->last_stat
.st_size
,
3164 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3168 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3169 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3170 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3172 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3173 le64toh(f
->header
->n_fields
),
3174 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3178 /* Are the data objects properly indexed by field objects? */
3179 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3180 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3181 le64toh(f
->header
->n_data
) > 0 &&
3182 le64toh(f
->header
->n_fields
) == 0)
3185 if (max_file_usec
> 0) {
3188 h
= le64toh(f
->header
->head_entry_realtime
);
3189 t
= now(CLOCK_REALTIME
);
3191 if (h
> 0 && t
> h
+ max_file_usec
)