1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
6 #include <linux/magic.h>
10 #include <sys/statvfs.h>
16 #include "alloc-util.h"
17 #include "chattr-util.h"
21 #include "format-util.h"
23 #include "journal-authenticate.h"
24 #include "journal-def.h"
25 #include "journal-file.h"
27 #include "memory-util.h"
28 #include "path-util.h"
29 #include "random-util.h"
31 #include "sort-util.h"
32 #include "stat-util.h"
33 #include "string-table.h"
34 #include "string-util.h"
36 #include "sync-util.h"
37 #include "xattr-util.h"
39 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
42 #define DEFAULT_COMPRESS_THRESHOLD (512ULL)
43 #define MIN_COMPRESS_THRESHOLD (8ULL)
45 /* This is the minimum journal file size */
46 #define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL) /* 512 KiB */
48 /* These are the lower and upper bounds if we deduce the max_use value
49 * from the file system size */
50 #define MAX_USE_LOWER (1 * 1024 * 1024ULL) /* 1 MiB */
51 #define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
53 /* Those are the lower and upper bounds for the minimal use limit,
54 * i.e. how much we'll use even if keep_free suggests otherwise. */
55 #define MIN_USE_LOW (1 * 1024 * 1024ULL) /* 1 MiB */
56 #define MIN_USE_HIGH (16 * 1024 * 1024ULL) /* 16 MiB */
58 /* This is the upper bound if we deduce max_size from max_use */
59 #define MAX_SIZE_UPPER (128 * 1024 * 1024ULL) /* 128 MiB */
61 /* This is the upper bound if we deduce the keep_free value from the file system size */
63 #define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
65 /* This is the keep_free value when we can't determine the system size */
67 #define DEFAULT_KEEP_FREE (1024 * 1024ULL) /* 1 MB */
69 /* This is the default maximum number of journal files to keep around. */
70 #define DEFAULT_N_MAX_FILES 100
72 /* n_data was the first entry we added after the initial file format design */
73 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
75 /* How many entries to keep in the entry array chain cache at max */
76 #define CHAIN_CACHE_MAX 20
78 /* How much to increase the journal file size at once each time we allocate something new. */
79 #define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL) /* 8MB */
81 /* Reread fstat() of the file for detecting deletions at least this often */
82 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
84 /* The mmap context to use for the header: we pick the one above the last defined type */
85 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
87 /* Longest hash chain to rotate after */
88 #define HASH_CHAIN_DEPTH_MAX 100
91 # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
94 int journal_file_tail_end_by_pread(JournalFile
*f
, uint64_t *ret_offset
) {
102 /* Same as journal_file_tail_end_by_mmap() below, but operates with pread() to avoid the mmap cache
103 * (and thus is thread safe) */
105 p
= le64toh(f
->header
->tail_object_offset
);
107 p
= le64toh(f
->header
->header_size
);
112 r
= journal_file_read_object_header(f
, OBJECT_UNUSED
, p
, &tail
);
116 sz
= le64toh(tail
.object
.size
);
117 if (sz
> UINT64_MAX
- sizeof(uint64_t) + 1)
121 if (p
> UINT64_MAX
- sz
)
132 int journal_file_tail_end_by_mmap(JournalFile
*f
, uint64_t *ret_offset
) {
140 /* Same as journal_file_tail_end_by_pread() above, but operates with the usual mmap logic */
142 p
= le64toh(f
->header
->tail_object_offset
);
144 p
= le64toh(f
->header
->header_size
);
149 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
153 sz
= le64toh(READ_NOW(tail
->object
.size
));
154 if (sz
> UINT64_MAX
- sizeof(uint64_t) + 1)
158 if (p
> UINT64_MAX
- sz
)
169 int journal_file_set_offline_thread_join(JournalFile
*f
) {
174 if (f
->offline_state
== OFFLINE_JOINED
)
177 r
= pthread_join(f
->offline_thread
, NULL
);
181 f
->offline_state
= OFFLINE_JOINED
;
183 if (mmap_cache_fd_got_sigbus(f
->cache_fd
))
189 static int journal_file_set_online(JournalFile
*f
) {
197 if (f
->fd
< 0 || !f
->header
)
201 switch (f
->offline_state
) {
203 /* No offline thread, no need to wait. */
207 case OFFLINE_SYNCING
:
208 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_CANCEL
))
210 /* Canceled syncing prior to offlining, no need to wait. */
214 case OFFLINE_AGAIN_FROM_SYNCING
:
215 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_CANCEL
))
217 /* Canceled restart from syncing, no need to wait. */
221 case OFFLINE_AGAIN_FROM_OFFLINING
:
222 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_CANCEL
))
224 /* Canceled restart from offlining, must wait for offlining to complete however. */
229 r
= journal_file_set_offline_thread_join(f
);
239 if (mmap_cache_fd_got_sigbus(f
->cache_fd
))
242 switch (f
->header
->state
) {
247 f
->header
->state
= STATE_ONLINE
;
256 JournalFile
* journal_file_close(JournalFile
*f
) {
261 mmap_cache_fd_free(f
->cache_fd
);
267 ordered_hashmap_free_free(f
->chain_cache
);
270 free(f
->compress_buffer
);
275 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
277 free(f
->fsprg_state
);
282 gcry_md_close(f
->hmac
);
288 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
295 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
296 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
298 h
.incompatible_flags
|= htole32(
299 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
300 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
|
301 f
->compress_zstd
* HEADER_INCOMPATIBLE_COMPRESSED_ZSTD
|
302 f
->keyed_hash
* HEADER_INCOMPATIBLE_KEYED_HASH
);
304 h
.compatible_flags
= htole32(
305 f
->seal
* HEADER_COMPATIBLE_SEALED
);
307 r
= sd_id128_randomize(&h
.file_id
);
312 h
.seqnum_id
= template->header
->seqnum_id
;
313 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
315 h
.seqnum_id
= h
.file_id
;
317 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
327 static int journal_file_refresh_header(JournalFile
*f
) {
333 r
= sd_id128_get_machine(&f
->header
->machine_id
);
334 if (IN_SET(r
, -ENOENT
, -ENOMEDIUM
))
335 /* We don't have a machine-id, let's continue without */
336 zero(f
->header
->machine_id
);
340 r
= sd_id128_get_boot(&f
->header
->boot_id
);
344 r
= journal_file_set_online(f
);
346 /* Sync the online state to disk; likely just created a new file, also sync the directory this file is located in */
348 (void) fsync_full(f
->fd
);
353 static bool warn_wrong_flags(const JournalFile
*f
, bool compatible
) {
354 const uint32_t any
= compatible
? HEADER_COMPATIBLE_ANY
: HEADER_INCOMPATIBLE_ANY
,
355 supported
= compatible
? HEADER_COMPATIBLE_SUPPORTED
: HEADER_INCOMPATIBLE_SUPPORTED
;
356 const char *type
= compatible
? "compatible" : "incompatible";
359 flags
= le32toh(compatible
? f
->header
->compatible_flags
: f
->header
->incompatible_flags
);
361 if (flags
& ~supported
) {
363 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32
,
364 f
->path
, type
, flags
& ~any
);
365 flags
= (flags
& any
) & ~supported
;
369 _cleanup_free_
char *t
= NULL
;
372 if (flags
& HEADER_COMPATIBLE_SEALED
)
373 strv
[n
++] = "sealed";
375 if (flags
& HEADER_INCOMPATIBLE_COMPRESSED_XZ
)
376 strv
[n
++] = "xz-compressed";
377 if (flags
& HEADER_INCOMPATIBLE_COMPRESSED_LZ4
)
378 strv
[n
++] = "lz4-compressed";
379 if (flags
& HEADER_INCOMPATIBLE_COMPRESSED_ZSTD
)
380 strv
[n
++] = "zstd-compressed";
381 if (flags
& HEADER_INCOMPATIBLE_KEYED_HASH
)
382 strv
[n
++] = "keyed-hash";
385 assert(n
< ELEMENTSOF(strv
));
387 t
= strv_join((char**) strv
, ", ");
388 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
389 f
->path
, type
, n
> 1 ? "flags" : "flag", strnull(t
));
397 static int journal_file_verify_header(JournalFile
*f
) {
398 uint64_t arena_size
, header_size
;
403 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
406 /* In both read and write mode we refuse to open files with incompatible
407 * flags we don't know. */
408 if (warn_wrong_flags(f
, false))
409 return -EPROTONOSUPPORT
;
411 /* When open for writing we refuse to open files with compatible flags, too. */
412 if (f
->writable
&& warn_wrong_flags(f
, true))
413 return -EPROTONOSUPPORT
;
415 if (f
->header
->state
>= _STATE_MAX
)
418 header_size
= le64toh(READ_NOW(f
->header
->header_size
));
420 /* The first addition was n_data, so check that we are at least this large */
421 if (header_size
< HEADER_SIZE_MIN
)
424 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
427 arena_size
= le64toh(READ_NOW(f
->header
->arena_size
));
429 if (UINT64_MAX
- header_size
< arena_size
|| header_size
+ arena_size
> (uint64_t) f
->last_stat
.st_size
)
432 if (le64toh(f
->header
->tail_object_offset
) > header_size
+ arena_size
)
435 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
436 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
437 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
438 !VALID64(le64toh(f
->header
->entry_array_offset
)))
442 sd_id128_t machine_id
;
446 r
= sd_id128_get_machine(&machine_id
);
450 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
453 state
= f
->header
->state
;
455 if (state
== STATE_ARCHIVED
)
456 return -ESHUTDOWN
; /* Already archived */
457 else if (state
== STATE_ONLINE
)
458 return log_debug_errno(SYNTHETIC_ERRNO(EBUSY
),
459 "Journal file %s is already online. Assuming unclean closing.",
461 else if (state
!= STATE_OFFLINE
)
462 return log_debug_errno(SYNTHETIC_ERRNO(EBUSY
),
463 "Journal file %s has unknown state %i.",
466 if (f
->header
->field_hash_table_size
== 0 || f
->header
->data_hash_table_size
== 0)
469 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
470 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks the sequential reading of entries */
472 if (le64toh(f
->header
->tail_entry_realtime
) > now(CLOCK_REALTIME
))
473 return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY
),
474 "Journal file %s is from the future, refusing to append new data to it that'd be older.",
478 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
479 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
480 f
->compress_zstd
= JOURNAL_HEADER_COMPRESSED_ZSTD(f
->header
);
482 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
484 f
->keyed_hash
= JOURNAL_HEADER_KEYED_HASH(f
->header
);
489 int journal_file_fstat(JournalFile
*f
) {
495 if (fstat(f
->fd
, &f
->last_stat
) < 0)
498 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
500 /* Refuse dealing with files that aren't regular */
501 r
= stat_verify_regular(&f
->last_stat
);
505 /* Refuse appending to files that are already deleted */
506 if (f
->last_stat
.st_nlink
<= 0)
512 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
513 uint64_t old_size
, new_size
, old_header_size
, old_arena_size
;
519 /* We assume that this file is not sparse, and we know that for sure, since we always call
520 * posix_fallocate() ourselves */
522 if (size
> PAGE_ALIGN_DOWN(UINT64_MAX
) - offset
)
525 if (mmap_cache_fd_got_sigbus(f
->cache_fd
))
528 old_header_size
= le64toh(READ_NOW(f
->header
->header_size
));
529 old_arena_size
= le64toh(READ_NOW(f
->header
->arena_size
));
530 if (old_arena_size
> PAGE_ALIGN_DOWN(UINT64_MAX
) - old_header_size
)
533 old_size
= old_header_size
+ old_arena_size
;
535 new_size
= MAX(PAGE_ALIGN(offset
+ size
), old_header_size
);
537 if (new_size
<= old_size
) {
539 /* We already pre-allocated enough space, but before
540 * we write to it, let's check with fstat() if the
541 * file got deleted, in order to make sure we don't throw
542 * away the data immediately. Don't check fstat() for
543 * all writes though, but only once every 10s. */
545 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
548 return journal_file_fstat(f
);
551 /* Allocate more space. */
553 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
556 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
559 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
562 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
564 if (new_size
- old_size
> available
)
569 /* Increase by larger blocks at once */
570 new_size
= DIV_ROUND_UP(new_size
, FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
571 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
572 new_size
= f
->metrics
.max_size
;
574 /* Note that the glibc fallocate() fallback is very
575 inefficient, hence we try to minimize the allocation area here */
577 r
= posix_fallocate_loop(f
->fd
, old_size
, new_size
- old_size
);
581 f
->header
->arena_size
= htole64(new_size
- old_header_size
);
583 return journal_file_fstat(f
);
586 static unsigned type_to_context(ObjectType type
) {
587 /* One context for each type, plus one catch-all for the rest */
588 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
589 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
590 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
593 static int journal_file_move_to(
609 if (size
> UINT64_MAX
- offset
)
612 /* Avoid SIGBUS on invalid accesses */
613 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
614 /* Hmm, out of range? Let's refresh the fstat() data
615 * first, before we trust that check. */
617 r
= journal_file_fstat(f
);
621 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
622 return -EADDRNOTAVAIL
;
625 return mmap_cache_fd_get(f
->cache_fd
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
);
628 static uint64_t minimum_header_size(Object
*o
) {
630 static const uint64_t table
[] = {
631 [OBJECT_DATA
] = sizeof(DataObject
),
632 [OBJECT_FIELD
] = sizeof(FieldObject
),
633 [OBJECT_ENTRY
] = sizeof(EntryObject
),
634 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
635 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
636 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
637 [OBJECT_TAG
] = sizeof(TagObject
),
640 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
641 return sizeof(ObjectHeader
);
643 return table
[o
->object
.type
];
646 /* Lightweight object checks. We want this to be fast, so that we won't
647 * slow down every journal_file_move_to_object() call too much. */
648 static int journal_file_check_object(JournalFile
*f
, uint64_t offset
, Object
*o
) {
652 switch (o
->object
.type
) {
655 if ((le64toh(o
->data
.entry_offset
) == 0) ^ (le64toh(o
->data
.n_entries
) == 0))
656 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
657 "Bad n_entries: %" PRIu64
": %" PRIu64
,
658 le64toh(o
->data
.n_entries
),
661 if (le64toh(o
->object
.size
) <= offsetof(Object
, data
.payload
))
662 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
663 "Bad object size (<= %zu): %" PRIu64
": %" PRIu64
,
664 offsetof(Object
, data
.payload
),
665 le64toh(o
->object
.size
),
668 if (!VALID64(le64toh(o
->data
.next_hash_offset
)) ||
669 !VALID64(le64toh(o
->data
.next_field_offset
)) ||
670 !VALID64(le64toh(o
->data
.entry_offset
)) ||
671 !VALID64(le64toh(o
->data
.entry_array_offset
)))
672 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
673 "Invalid offset, next_hash_offset=" OFSfmt
", next_field_offset=" OFSfmt
", entry_offset=" OFSfmt
", entry_array_offset=" OFSfmt
": %" PRIu64
,
674 le64toh(o
->data
.next_hash_offset
),
675 le64toh(o
->data
.next_field_offset
),
676 le64toh(o
->data
.entry_offset
),
677 le64toh(o
->data
.entry_array_offset
),
683 if (le64toh(o
->object
.size
) <= offsetof(Object
, field
.payload
))
684 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
685 "Bad field size (<= %zu): %" PRIu64
": %" PRIu64
,
686 offsetof(Object
, field
.payload
),
687 le64toh(o
->object
.size
),
690 if (!VALID64(le64toh(o
->field
.next_hash_offset
)) ||
691 !VALID64(le64toh(o
->field
.head_data_offset
)))
692 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
693 "Invalid offset, next_hash_offset=" OFSfmt
", head_data_offset=" OFSfmt
": %" PRIu64
,
694 le64toh(o
->field
.next_hash_offset
),
695 le64toh(o
->field
.head_data_offset
),
702 sz
= le64toh(READ_NOW(o
->object
.size
));
703 if (sz
< offsetof(Object
, entry
.items
) ||
704 (sz
- offsetof(Object
, entry
.items
)) % sizeof(EntryItem
) != 0)
705 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
706 "Bad entry size (<= %zu): %" PRIu64
": %" PRIu64
,
707 offsetof(Object
, entry
.items
),
711 if ((sz
- offsetof(Object
, entry
.items
)) / sizeof(EntryItem
) <= 0)
712 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
713 "Invalid number items in entry: %" PRIu64
": %" PRIu64
,
714 (sz
- offsetof(Object
, entry
.items
)) / sizeof(EntryItem
),
717 if (le64toh(o
->entry
.seqnum
) <= 0)
718 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
719 "Invalid entry seqnum: %" PRIx64
": %" PRIu64
,
720 le64toh(o
->entry
.seqnum
),
723 if (!VALID_REALTIME(le64toh(o
->entry
.realtime
)))
724 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
725 "Invalid entry realtime timestamp: %" PRIu64
": %" PRIu64
,
726 le64toh(o
->entry
.realtime
),
729 if (!VALID_MONOTONIC(le64toh(o
->entry
.monotonic
)))
730 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
731 "Invalid entry monotonic timestamp: %" PRIu64
": %" PRIu64
,
732 le64toh(o
->entry
.monotonic
),
738 case OBJECT_DATA_HASH_TABLE
:
739 case OBJECT_FIELD_HASH_TABLE
: {
742 sz
= le64toh(READ_NOW(o
->object
.size
));
743 if (sz
< offsetof(Object
, hash_table
.items
) ||
744 (sz
- offsetof(Object
, hash_table
.items
)) % sizeof(HashItem
) != 0 ||
745 (sz
- offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
) <= 0)
746 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
747 "Invalid %s hash table size: %" PRIu64
": %" PRIu64
,
748 o
->object
.type
== OBJECT_DATA_HASH_TABLE
? "data" : "field",
755 case OBJECT_ENTRY_ARRAY
: {
758 sz
= le64toh(READ_NOW(o
->object
.size
));
759 if (sz
< offsetof(Object
, entry_array
.items
) ||
760 (sz
- offsetof(Object
, entry_array
.items
)) % sizeof(le64_t
) != 0 ||
761 (sz
- offsetof(Object
, entry_array
.items
)) / sizeof(le64_t
) <= 0)
762 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
763 "Invalid object entry array size: %" PRIu64
": %" PRIu64
,
767 if (!VALID64(le64toh(o
->entry_array
.next_entry_array_offset
)))
768 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
769 "Invalid object entry array next_entry_array_offset: " OFSfmt
": %" PRIu64
,
770 le64toh(o
->entry_array
.next_entry_array_offset
),
777 if (le64toh(o
->object
.size
) != sizeof(TagObject
))
778 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
779 "Invalid object tag size: %" PRIu64
": %" PRIu64
,
780 le64toh(o
->object
.size
),
783 if (!VALID_EPOCH(le64toh(o
->tag
.epoch
)))
784 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
785 "Invalid object tag epoch: %" PRIu64
": %" PRIu64
,
786 le64toh(o
->tag
.epoch
), offset
);
794 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
802 /* Objects may only be located at multiple of 64 bit */
803 if (!VALID64(offset
))
804 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
805 "Attempt to move to object at non-64bit boundary: %" PRIu64
,
808 /* Object may not be located in the file header */
809 if (offset
< le64toh(f
->header
->header_size
))
810 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
811 "Attempt to move to object located in file header: %" PRIu64
,
814 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
);
819 s
= le64toh(READ_NOW(o
->object
.size
));
822 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
823 "Attempt to move to uninitialized object: %" PRIu64
,
825 if (s
< sizeof(ObjectHeader
))
826 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
827 "Attempt to move to overly short object: %" PRIu64
,
830 if (o
->object
.type
<= OBJECT_UNUSED
)
831 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
832 "Attempt to move to object with invalid type: %" PRIu64
,
835 if (s
< minimum_header_size(o
))
836 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
837 "Attempt to move to truncated object: %" PRIu64
,
840 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
)
841 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
842 "Attempt to move to object of unexpected type: %" PRIu64
,
845 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
);
851 r
= journal_file_check_object(f
, offset
, o
);
861 int journal_file_read_object_header(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
*ret
) {
869 /* Objects may only be located at multiple of 64 bit */
870 if (!VALID64(offset
))
871 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
872 "Attempt to read object at non-64bit boundary: %" PRIu64
,
875 /* Object may not be located in the file header */
876 if (offset
< le64toh(f
->header
->header_size
))
877 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
878 "Attempt to read object located in file header: %" PRIu64
,
881 /* This will likely read too much data but it avoids having to call pread() twice. */
882 n
= pread(f
->fd
, &o
, sizeof(o
), offset
);
884 return log_debug_errno(errno
, "Failed to read journal file at offset: %" PRIu64
,
887 if ((size_t) n
< sizeof(o
.object
))
888 return log_debug_errno(SYNTHETIC_ERRNO(EIO
),
889 "Failed to read short object at offset: %" PRIu64
,
892 s
= le64toh(o
.object
.size
);
894 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
895 "Attempt to read uninitialized object: %" PRIu64
,
897 if (s
< sizeof(o
.object
))
898 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
899 "Attempt to read overly short object: %" PRIu64
,
902 if (o
.object
.type
<= OBJECT_UNUSED
)
903 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
904 "Attempt to read object with invalid type: %" PRIu64
,
907 if (s
< minimum_header_size(&o
))
908 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
909 "Attempt to read truncated object: %" PRIu64
,
912 if ((size_t) n
< minimum_header_size(&o
))
913 return log_debug_errno(SYNTHETIC_ERRNO(EIO
),
914 "Short read while reading object: %" PRIu64
,
917 if (type
> OBJECT_UNUSED
&& o
.object
.type
!= type
)
918 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
919 "Attempt to read object of unexpected type: %" PRIu64
,
922 r
= journal_file_check_object(f
, offset
, &o
);
932 static uint64_t journal_file_entry_seqnum(
941 /* Picks a new sequence number for the entry we are about to add and returns it. */
943 ret
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
946 /* If an external seqnum counter was passed, we update both the local and the external one,
947 * and set it to the maximum of both */
949 if (*seqnum
+ 1 > ret
)
955 f
->header
->tail_entry_seqnum
= htole64(ret
);
957 if (f
->header
->head_entry_seqnum
== 0)
958 f
->header
->head_entry_seqnum
= htole64(ret
);
963 int journal_file_append_object(
968 uint64_t *ret_offset
) {
977 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
978 assert(size
>= sizeof(ObjectHeader
));
980 r
= journal_file_set_online(f
);
984 r
= journal_file_tail_end_by_mmap(f
, &p
);
988 r
= journal_file_allocate(f
, p
, size
);
992 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
);
997 o
->object
= (ObjectHeader
) {
999 .size
= htole64(size
),
1002 f
->header
->tail_object_offset
= htole64(p
);
1003 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
1014 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
1022 /* We estimate that we need 1 hash table entry per 768 bytes
1023 of journal file and we want to make sure we never get
1024 beyond 75% fill level. Calculate the hash table size for
1025 the maximum file size based on these metrics. */
1027 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
1028 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
1029 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
1031 log_debug("Reserving %"PRIu64
" entries in data hash table.", s
/ sizeof(HashItem
));
1033 r
= journal_file_append_object(f
,
1034 OBJECT_DATA_HASH_TABLE
,
1035 offsetof(Object
, hash_table
.items
) + s
,
1040 memzero(o
->hash_table
.items
, s
);
1042 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1043 f
->header
->data_hash_table_size
= htole64(s
);
1048 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
1056 /* We use a fixed size hash table for the fields as this
1057 * number should grow very slowly only */
1059 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
1060 log_debug("Reserving %"PRIu64
" entries in field hash table.", s
/ sizeof(HashItem
));
1062 r
= journal_file_append_object(f
,
1063 OBJECT_FIELD_HASH_TABLE
,
1064 offsetof(Object
, hash_table
.items
) + s
,
1069 memzero(o
->hash_table
.items
, s
);
1071 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1072 f
->header
->field_hash_table_size
= htole64(s
);
1077 int journal_file_map_data_hash_table(JournalFile
*f
) {
1085 if (f
->data_hash_table
)
1088 p
= le64toh(f
->header
->data_hash_table_offset
);
1089 s
= le64toh(f
->header
->data_hash_table_size
);
1091 r
= journal_file_move_to(f
,
1092 OBJECT_DATA_HASH_TABLE
,
1099 f
->data_hash_table
= t
;
1103 int journal_file_map_field_hash_table(JournalFile
*f
) {
1111 if (f
->field_hash_table
)
1114 p
= le64toh(f
->header
->field_hash_table_offset
);
1115 s
= le64toh(f
->header
->field_hash_table_size
);
1117 r
= journal_file_move_to(f
,
1118 OBJECT_FIELD_HASH_TABLE
,
1125 f
->field_hash_table
= t
;
1129 static int journal_file_link_field(
1140 assert(f
->field_hash_table
);
1144 if (o
->object
.type
!= OBJECT_FIELD
)
1147 m
= le64toh(READ_NOW(f
->header
->field_hash_table_size
)) / sizeof(HashItem
);
1151 /* This might alter the window we are looking at */
1152 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
1155 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
1157 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
1159 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1163 o
->field
.next_hash_offset
= htole64(offset
);
1166 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1168 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
1169 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
1174 static int journal_file_link_data(
1185 assert(f
->data_hash_table
);
1189 if (o
->object
.type
!= OBJECT_DATA
)
1192 m
= le64toh(READ_NOW(f
->header
->data_hash_table_size
)) / sizeof(HashItem
);
1196 /* This might alter the window we are looking at */
1197 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
1198 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
1199 o
->data
.n_entries
= 0;
1202 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
1204 /* Only entry in the hash table is easy */
1205 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
1207 /* Move back to the previous data object, to patch in
1210 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1214 o
->data
.next_hash_offset
= htole64(offset
);
1217 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1219 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
1220 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
1225 static int next_hash_offset(
1228 le64_t
*next_hash_offset
,
1230 le64_t
*header_max_depth
) {
1234 nextp
= le64toh(READ_NOW(*next_hash_offset
));
1236 if (nextp
<= *p
) /* Refuse going in loops */
1237 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
1238 "Detected hash item loop in %s, refusing.", f
->path
);
1242 /* If the depth of this hash chain is larger than all others we have seen so far, record it */
1243 if (header_max_depth
&& f
->writable
)
1244 *header_max_depth
= htole64(MAX(*depth
, le64toh(*header_max_depth
)));
1251 int journal_file_find_field_object_with_hash(
1253 const void *field
, uint64_t size
, uint64_t hash
,
1254 Object
**ret
, uint64_t *ret_offset
) {
1256 uint64_t p
, osize
, h
, m
, depth
= 0;
1261 assert(field
&& size
> 0);
1263 /* If the field hash table is empty, we can't find anything */
1264 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
1267 /* Map the field hash table, if it isn't mapped yet. */
1268 r
= journal_file_map_field_hash_table(f
);
1272 osize
= offsetof(Object
, field
.payload
) + size
;
1274 m
= le64toh(READ_NOW(f
->header
->field_hash_table_size
)) / sizeof(HashItem
);
1279 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
1283 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1287 if (le64toh(o
->field
.hash
) == hash
&&
1288 le64toh(o
->object
.size
) == osize
&&
1289 memcmp(o
->field
.payload
, field
, size
) == 0) {
1299 r
= next_hash_offset(
1302 &o
->field
.next_hash_offset
,
1304 JOURNAL_HEADER_CONTAINS(f
->header
, field_hash_chain_depth
) ? &f
->header
->field_hash_chain_depth
: NULL
);
1312 uint64_t journal_file_hash_data(
1318 assert(data
|| sz
== 0);
1320 /* We try to unify our codebase on siphash, hence new-styled journal files utilizing the keyed hash
1321 * function use siphash. Old journal files use the Jenkins hash. */
1323 if (JOURNAL_HEADER_KEYED_HASH(f
->header
))
1324 return siphash24(data
, sz
, f
->header
->file_id
.bytes
);
1326 return jenkins_hash64(data
, sz
);
1329 int journal_file_find_field_object(
1331 const void *field
, uint64_t size
,
1332 Object
**ret
, uint64_t *ret_offset
) {
1335 assert(field
&& size
> 0);
1337 return journal_file_find_field_object_with_hash(
1340 journal_file_hash_data(f
, field
, size
),
1344 int journal_file_find_data_object_with_hash(
1346 const void *data
, uint64_t size
, uint64_t hash
,
1347 Object
**ret
, uint64_t *ret_offset
) {
1349 uint64_t p
, osize
, h
, m
, depth
= 0;
1354 assert(data
|| size
== 0);
1356 /* If there's no data hash table, then there's no entry. */
1357 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
1360 /* Map the data hash table, if it isn't mapped yet. */
1361 r
= journal_file_map_data_hash_table(f
);
1365 osize
= offsetof(Object
, data
.payload
) + size
;
1367 m
= le64toh(READ_NOW(f
->header
->data_hash_table_size
)) / sizeof(HashItem
);
1372 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
1377 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1381 if (le64toh(o
->data
.hash
) != hash
)
1384 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
1385 #if HAVE_COMPRESSION
1389 l
= le64toh(READ_NOW(o
->object
.size
));
1390 if (l
<= offsetof(Object
, data
.payload
))
1393 l
-= offsetof(Object
, data
.payload
);
1395 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
1396 o
->data
.payload
, l
, &f
->compress_buffer
, &rsize
, 0);
1400 if (rsize
== size
&&
1401 memcmp(f
->compress_buffer
, data
, size
) == 0) {
1412 return -EPROTONOSUPPORT
;
1414 } else if (le64toh(o
->object
.size
) == osize
&&
1415 memcmp(o
->data
.payload
, data
, size
) == 0) {
1427 r
= next_hash_offset(
1430 &o
->data
.next_hash_offset
,
1432 JOURNAL_HEADER_CONTAINS(f
->header
, data_hash_chain_depth
) ? &f
->header
->data_hash_chain_depth
: NULL
);
1440 int journal_file_find_data_object(
1442 const void *data
, uint64_t size
,
1443 Object
**ret
, uint64_t *ret_offset
) {
1446 assert(data
|| size
== 0);
1448 return journal_file_find_data_object_with_hash(
1451 journal_file_hash_data(f
, data
, size
),
1455 bool journal_field_valid(const char *p
, size_t l
, bool allow_protected
) {
1456 /* We kinda enforce POSIX syntax recommendations for
1457 environment variables here, but make a couple of additional
1460 http://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html */
1465 /* No empty field names */
1469 /* Don't allow names longer than 64 chars */
1473 /* Variables starting with an underscore are protected */
1474 if (!allow_protected
&& p
[0] == '_')
1477 /* Don't allow digits as first character */
1478 if (p
[0] >= '0' && p
[0] <= '9')
1481 /* Only allow A-Z0-9 and '_' */
1482 for (const char *a
= p
; a
< p
+ l
; a
++)
1483 if ((*a
< 'A' || *a
> 'Z') &&
1484 (*a
< '0' || *a
> '9') &&
1491 static int journal_file_append_field(
1493 const void *field
, uint64_t size
,
1494 Object
**ret
, uint64_t *ret_offset
) {
1502 assert(field
&& size
> 0);
1504 if (!journal_field_valid(field
, size
, true))
1507 hash
= journal_file_hash_data(f
, field
, size
);
1509 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, ret
, ret_offset
);
1515 osize
= offsetof(Object
, field
.payload
) + size
;
1516 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1520 o
->field
.hash
= htole64(hash
);
1521 memcpy(o
->field
.payload
, field
, size
);
1523 r
= journal_file_link_field(f
, o
, p
, hash
);
1527 /* The linking might have altered the window, so let's only pass the offset to hmac which will
1528 * move to the object again if needed. */
1531 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, NULL
, p
);
1537 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, ret
);
1548 static int journal_file_append_data(
1550 const void *data
, uint64_t size
,
1551 Object
**ret
, uint64_t *ret_offset
) {
1553 uint64_t hash
, p
, fp
, osize
;
1555 int r
, compression
= 0;
1560 if (!data
|| size
== 0)
1563 hash
= journal_file_hash_data(f
, data
, size
);
1565 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, ret
, ret_offset
);
1571 eq
= memchr(data
, '=', size
);
1575 osize
= offsetof(Object
, data
.payload
) + size
;
1576 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1580 o
->data
.hash
= htole64(hash
);
1582 #if HAVE_COMPRESSION
1583 if (JOURNAL_FILE_COMPRESS(f
) && size
>= f
->compress_threshold_bytes
) {
1586 compression
= compress_blob(data
, size
, o
->data
.payload
, size
- 1, &rsize
);
1588 if (compression
>= 0) {
1589 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1590 o
->object
.flags
|= compression
;
1592 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1593 size
, rsize
, object_compressed_to_string(compression
));
1595 /* Compression didn't work, we don't really care why, let's continue without compression */
1600 if (compression
== 0)
1601 memcpy_safe(o
->data
.payload
, data
, size
);
1603 r
= journal_file_link_data(f
, o
, p
, hash
);
1607 /* The linking might have altered the window, so let's refresh our pointer. */
1608 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1613 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1618 /* Create field object ... */
1619 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1623 /* ... and link it in. */
1624 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1625 fo
->field
.head_data_offset
= le64toh(p
);
1636 uint64_t journal_file_entry_n_items(Object
*o
) {
1640 if (o
->object
.type
!= OBJECT_ENTRY
)
1643 sz
= le64toh(READ_NOW(o
->object
.size
));
1644 if (sz
< offsetof(Object
, entry
.items
))
1647 return (sz
- offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1650 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1655 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1658 sz
= le64toh(READ_NOW(o
->object
.size
));
1659 if (sz
< offsetof(Object
, entry_array
.items
))
1662 return (sz
- offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1665 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1670 if (!IN_SET(o
->object
.type
, OBJECT_DATA_HASH_TABLE
, OBJECT_FIELD_HASH_TABLE
))
1673 sz
= le64toh(READ_NOW(o
->object
.size
));
1674 if (sz
< offsetof(Object
, hash_table
.items
))
1677 return (sz
- offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1680 static int link_entry_into_array(JournalFile
*f
,
1685 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1694 a
= le64toh(*first
);
1695 i
= hidx
= le64toh(READ_NOW(*idx
));
1698 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1702 n
= journal_file_entry_array_n_items(o
);
1704 o
->entry_array
.items
[i
] = htole64(p
);
1705 *idx
= htole64(hidx
+ 1);
1711 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1722 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1723 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1729 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1734 o
->entry_array
.items
[i
] = htole64(p
);
1737 *first
= htole64(q
);
1739 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1743 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1746 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1747 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1749 *idx
= htole64(hidx
+ 1);
1754 static int link_entry_into_array_plus_one(JournalFile
*f
,
1769 hidx
= le64toh(READ_NOW(*idx
));
1770 if (hidx
== UINT64_MAX
)
1773 *extra
= htole64(p
);
1777 i
= htole64(hidx
- 1);
1778 r
= link_entry_into_array(f
, first
, &i
, p
);
1783 *idx
= htole64(hidx
+ 1);
1787 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1795 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1796 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1800 return link_entry_into_array_plus_one(f
,
1801 &o
->data
.entry_offset
,
1802 &o
->data
.entry_array_offset
,
1807 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1816 if (o
->object
.type
!= OBJECT_ENTRY
)
1819 __sync_synchronize();
1821 /* Link up the entry itself */
1822 r
= link_entry_into_array(f
,
1823 &f
->header
->entry_array_offset
,
1824 &f
->header
->n_entries
,
1829 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1831 if (f
->header
->head_entry_realtime
== 0)
1832 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1834 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1835 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1837 /* Link up the items */
1838 n
= journal_file_entry_n_items(o
);
1839 for (uint64_t i
= 0; i
< n
; i
++) {
1842 /* If we fail to link an entry item because we can't allocate a new entry array, don't fail
1843 * immediately but try to link the other entry items since it might still be possible to link
1844 * those if they don't require a new entry array to be allocated. */
1846 k
= journal_file_link_entry_item(f
, o
, offset
, i
);
1856 static int journal_file_append_entry_internal(
1858 const dual_timestamp
*ts
,
1859 const sd_id128_t
*boot_id
,
1861 const EntryItem items
[], unsigned n_items
,
1863 Object
**ret
, uint64_t *ret_offset
) {
1871 assert(items
|| n_items
== 0);
1874 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1876 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1880 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1881 memcpy_safe(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1882 o
->entry
.realtime
= htole64(ts
->realtime
);
1883 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1884 o
->entry
.xor_hash
= htole64(xor_hash
);
1886 f
->header
->boot_id
= *boot_id
;
1887 o
->entry
.boot_id
= f
->header
->boot_id
;
1890 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1895 r
= journal_file_link_entry(f
, o
, np
);
1908 void journal_file_post_change(JournalFile
*f
) {
1914 /* inotify() does not receive IN_MODIFY events from file
1915 * accesses done via mmap(). After each access we hence
1916 * trigger IN_MODIFY by truncating the journal file to its
1917 * current size which triggers IN_MODIFY. */
1919 __sync_synchronize();
1921 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1922 log_debug_errno(errno
, "Failed to truncate file to its own size: %m");
1925 static int post_change_thunk(sd_event_source
*timer
, uint64_t usec
, void *userdata
) {
1928 journal_file_post_change(userdata
);
1933 static void schedule_post_change(JournalFile
*f
) {
1937 assert(f
->post_change_timer
);
1939 r
= sd_event_source_get_enabled(f
->post_change_timer
, NULL
);
1941 log_debug_errno(r
, "Failed to get ftruncate timer state: %m");
1947 r
= sd_event_source_set_time_relative(f
->post_change_timer
, f
->post_change_timer_period
);
1949 log_debug_errno(r
, "Failed to set time for scheduling ftruncate: %m");
1953 r
= sd_event_source_set_enabled(f
->post_change_timer
, SD_EVENT_ONESHOT
);
1955 log_debug_errno(r
, "Failed to enable scheduled ftruncate: %m");
1962 /* On failure, let's simply post the change immediately. */
1963 journal_file_post_change(f
);
1966 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1967 int journal_file_enable_post_change_timer(JournalFile
*f
, sd_event
*e
, usec_t t
) {
1968 _cleanup_(sd_event_source_unrefp
) sd_event_source
*timer
= NULL
;
1972 assert_return(!f
->post_change_timer
, -EINVAL
);
1976 r
= sd_event_add_time(e
, &timer
, CLOCK_MONOTONIC
, 0, 0, post_change_thunk
, f
);
1980 r
= sd_event_source_set_enabled(timer
, SD_EVENT_OFF
);
1984 f
->post_change_timer
= TAKE_PTR(timer
);
1985 f
->post_change_timer_period
= t
;
1990 static int entry_item_cmp(const EntryItem
*a
, const EntryItem
*b
) {
1991 return CMP(le64toh(a
->object_offset
), le64toh(b
->object_offset
));
1994 static size_t remove_duplicate_entry_items(EntryItem items
[], size_t n
) {
1996 /* This function relies on the items array being sorted. */
2002 for (size_t i
= 1; i
< n
; i
++)
2003 if (items
[i
].object_offset
!= items
[j
- 1].object_offset
)
2004 items
[j
++] = items
[i
];
2009 int journal_file_append_entry(
2011 const dual_timestamp
*ts
,
2012 const sd_id128_t
*boot_id
,
2013 const struct iovec iovec
[], unsigned n_iovec
,
2015 Object
**ret
, uint64_t *ret_offset
) {
2019 uint64_t xor_hash
= 0;
2020 struct dual_timestamp _ts
;
2024 assert(iovec
&& n_iovec
> 0);
2027 if (!VALID_REALTIME(ts
->realtime
))
2028 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
2029 "Invalid realtime timestamp %" PRIu64
", refusing entry.",
2031 if (!VALID_MONOTONIC(ts
->monotonic
))
2032 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
2033 "Invalid monotomic timestamp %" PRIu64
", refusing entry.",
2036 dual_timestamp_get(&_ts
);
2041 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
2046 items
= newa(EntryItem
, n_iovec
);
2048 for (size_t i
= 0; i
< n_iovec
; i
++) {
2052 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
2056 /* When calculating the XOR hash field, we need to take special care if the "keyed-hash"
2057 * journal file flag is on. We use the XOR hash field to quickly determine the identity of a
2058 * specific record, and give records with otherwise identical position (i.e. match in seqno,
2059 * timestamp, …) a stable ordering. But for that we can't have it that the hash of the
2060 * objects in each file is different since they are keyed. Hence let's calculate the Jenkins
2061 * hash here for that. This also has the benefit that cursors for old and new journal files
2062 * are completely identical (they include the XOR hash after all). For classic Jenkins-hash
2063 * files things are easier, we can just take the value from the stored record directly. */
2065 if (JOURNAL_HEADER_KEYED_HASH(f
->header
))
2066 xor_hash
^= jenkins_hash64(iovec
[i
].iov_base
, iovec
[i
].iov_len
);
2068 xor_hash
^= le64toh(o
->data
.hash
);
2070 items
[i
] = (EntryItem
) {
2071 .object_offset
= htole64(p
),
2072 .hash
= o
->data
.hash
,
2076 /* Order by the position on disk, in order to improve seek
2077 * times for rotating media. */
2078 typesafe_qsort(items
, n_iovec
, entry_item_cmp
);
2079 n_iovec
= remove_duplicate_entry_items(items
, n_iovec
);
2081 r
= journal_file_append_entry_internal(f
, ts
, boot_id
, xor_hash
, items
, n_iovec
, seqnum
, ret
, ret_offset
);
2083 /* If the memory mapping triggered a SIGBUS then we return an
2084 * IO error and ignore the error code passed down to us, since
2085 * it is very likely just an effect of a nullified replacement
2088 if (mmap_cache_fd_got_sigbus(f
->cache_fd
))
2091 if (f
->post_change_timer
)
2092 schedule_post_change(f
);
2094 journal_file_post_change(f
);
2099 typedef struct ChainCacheItem
{
2100 uint64_t first
; /* the array at the beginning of the chain */
2101 uint64_t array
; /* the cached array */
2102 uint64_t begin
; /* the first item in the cached array */
2103 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
2104 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
2107 static void chain_cache_put(
2114 uint64_t last_index
) {
2117 /* If the chain item to cache for this chain is the
2118 * first one it's not worth caching anything */
2122 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
2123 ci
= ordered_hashmap_steal_first(h
);
2126 ci
= new(ChainCacheItem
, 1);
2133 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
2138 assert(ci
->first
== first
);
2143 ci
->last_index
= last_index
;
2146 static int bump_array_index(uint64_t *i
, direction_t direction
, uint64_t n
) {
2149 /* Increase or decrease the specified index, in the right direction. */
2151 if (direction
== DIRECTION_DOWN
) {
2166 static int bump_entry_array(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t first
, direction_t direction
, uint64_t *ret
) {
2174 if (direction
== DIRECTION_DOWN
)
2175 return le64toh(o
->entry_array
.next_entry_array_offset
);
2177 /* Entry array chains are a singly linked list, so to find the previous array in the chain, we have
2178 * to start iterating from the top. */
2182 while (p
> 0 && p
!= offset
) {
2183 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, p
, &o
);
2188 p
= le64toh(o
->entry_array
.next_entry_array_offset
);
2191 /* If we can't find the previous entry array in the entry array chain, we're likely dealing with a
2192 * corrupted journal file. */
2201 static int generic_array_get(
2205 direction_t direction
,
2206 Object
**ret
, uint64_t *ret_offset
) {
2209 uint64_t p
= 0, a
, t
= 0, k
;
2217 /* Try the chain cache first */
2218 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2219 if (ci
&& i
> ci
->total
) {
2226 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
2227 if (IN_SET(r
, -EBADMSG
, -EADDRNOTAVAIL
)) {
2228 /* If there's corruption and we're going downwards, let's pretend we reached the
2229 * final entry in the entry array chain. */
2231 if (direction
== DIRECTION_DOWN
)
2234 /* If there's corruption and we're going upwards, move back to the previous entry
2235 * array and start iterating entries from there. */
2237 r
= bump_entry_array(f
, NULL
, a
, first
, DIRECTION_UP
, &a
);
2248 k
= journal_file_entry_array_n_items(o
);
2254 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
2257 /* If we've found the right location, now look for the first non-corrupt entry object (in the right
2261 /* In the first iteration of the while loop, we reuse i, k and o from the previous while
2263 if (i
== UINT64_MAX
) {
2264 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
2268 k
= journal_file_entry_array_n_items(o
);
2272 i
= direction
== DIRECTION_DOWN
? 0 : k
- 1;
2276 p
= le64toh(o
->entry_array
.items
[i
]);
2278 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, ret
);
2280 /* Let's cache this item for the next invocation */
2281 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
2288 if (!IN_SET(r
, -EADDRNOTAVAIL
, -EBADMSG
))
2291 /* OK, so this entry is borked. Most likely some entry didn't get synced to
2292 * disk properly, let's see if the next one might work for us instead. */
2293 log_debug_errno(r
, "Entry item %" PRIu64
" is bad, skipping over it.", i
);
2294 } while (bump_array_index(&i
, direction
, k
) > 0);
2296 r
= bump_entry_array(f
, o
, a
, first
, direction
, &a
);
2307 static int generic_array_get_plus_one(
2312 direction_t direction
,
2313 Object
**ret
, uint64_t *ret_offset
) {
2320 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, ret
);
2321 if (IN_SET(r
, -EADDRNOTAVAIL
, -EBADMSG
))
2322 return generic_array_get(f
, first
, 0, direction
, ret
, ret_offset
);
2327 *ret_offset
= extra
;
2332 return generic_array_get(f
, first
, i
- 1, direction
, ret
, ret_offset
);
2341 static int generic_array_bisect(
2346 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2347 direction_t direction
,
2349 uint64_t *ret_offset
,
2350 uint64_t *ret_idx
) {
2352 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= UINT64_MAX
;
2353 bool subtract_one
= false;
2354 Object
*array
= NULL
;
2359 assert(test_object
);
2361 /* Start with the first array in the chain */
2364 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2365 if (ci
&& n
> ci
->total
&& ci
->begin
!= 0) {
2366 /* Ah, we have iterated this bisection array chain
2367 * previously! Let's see if we can skip ahead in the
2368 * chain, as far as the last time. But we can't jump
2369 * backwards in the chain, so let's check that
2372 r
= test_object(f
, ci
->begin
, needle
);
2376 if (r
== TEST_LEFT
) {
2377 /* OK, what we are looking for is right of the
2378 * begin of this EntryArray, so let's jump
2379 * straight to previously cached array in the
2385 last_index
= ci
->last_index
;
2390 uint64_t left
, right
, k
, lp
;
2392 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
2396 k
= journal_file_entry_array_n_items(array
);
2402 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
2406 r
= test_object(f
, p
, needle
);
2407 if (r
== -EBADMSG
) {
2408 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2415 if (r
== TEST_FOUND
)
2416 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2418 if (r
== TEST_RIGHT
) {
2422 if (last_index
!= UINT64_MAX
) {
2423 assert(last_index
<= right
);
2425 /* If we cached the last index we
2426 * looked at, let's try to not to jump
2427 * too wildly around and see if we can
2428 * limit the range to look at early to
2429 * the immediate neighbors of the last
2430 * index we looked at. */
2432 if (last_index
> 0) {
2433 uint64_t x
= last_index
- 1;
2435 p
= le64toh(array
->entry_array
.items
[x
]);
2439 r
= test_object(f
, p
, needle
);
2443 if (r
== TEST_FOUND
)
2444 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2446 if (r
== TEST_RIGHT
)
2452 if (last_index
< right
) {
2453 uint64_t y
= last_index
+ 1;
2455 p
= le64toh(array
->entry_array
.items
[y
]);
2459 r
= test_object(f
, p
, needle
);
2463 if (r
== TEST_FOUND
)
2464 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2466 if (r
== TEST_RIGHT
)
2474 if (left
== right
) {
2475 if (direction
== DIRECTION_UP
)
2476 subtract_one
= true;
2482 assert(left
< right
);
2483 i
= (left
+ right
) / 2;
2485 p
= le64toh(array
->entry_array
.items
[i
]);
2489 r
= test_object(f
, p
, needle
);
2490 if (r
== -EBADMSG
) {
2491 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2498 if (r
== TEST_FOUND
)
2499 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2501 if (r
== TEST_RIGHT
)
2509 if (direction
== DIRECTION_UP
) {
2511 subtract_one
= true;
2522 last_index
= UINT64_MAX
;
2523 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
2529 if (subtract_one
&& t
== 0 && i
== 0)
2532 /* Let's cache this item for the next invocation */
2533 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : UINT64_MAX
) : i
);
2535 if (subtract_one
&& i
== 0)
2537 else if (subtract_one
)
2538 p
= le64toh(array
->entry_array
.items
[i
-1]);
2540 p
= le64toh(array
->entry_array
.items
[i
]);
2543 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, ret
);
2552 *ret_idx
= t
+ i
+ (subtract_one
? -1 : 0);
2557 static int generic_array_bisect_plus_one(
2563 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2564 direction_t direction
,
2566 uint64_t *ret_offset
,
2567 uint64_t *ret_idx
) {
2570 bool step_back
= false;
2573 assert(test_object
);
2578 /* This bisects the array in object 'first', but first checks
2580 r
= test_object(f
, extra
, needle
);
2584 if (r
== TEST_FOUND
)
2585 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2587 /* if we are looking with DIRECTION_UP then we need to first
2588 see if in the actual array there is a matching entry, and
2589 return the last one of that. But if there isn't any we need
2590 to return this one. Hence remember this, and return it
2593 step_back
= direction
== DIRECTION_UP
;
2595 if (r
== TEST_RIGHT
) {
2596 if (direction
== DIRECTION_DOWN
)
2602 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, ret_offset
, ret_idx
);
2604 if (r
== 0 && step_back
)
2607 if (r
> 0 && ret_idx
)
2614 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, ret
);
2620 *ret_offset
= extra
;
2628 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2634 else if (p
< needle
)
2640 int journal_file_move_to_entry_by_offset(
2643 direction_t direction
,
2645 uint64_t *ret_offset
) {
2650 return generic_array_bisect(
2652 le64toh(f
->header
->entry_array_offset
),
2653 le64toh(f
->header
->n_entries
),
2657 ret
, ret_offset
, NULL
);
2660 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2668 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2672 sq
= le64toh(READ_NOW(o
->entry
.seqnum
));
2675 else if (sq
< needle
)
2681 int journal_file_move_to_entry_by_seqnum(
2684 direction_t direction
,
2686 uint64_t *ret_offset
) {
2690 return generic_array_bisect(
2692 le64toh(f
->header
->entry_array_offset
),
2693 le64toh(f
->header
->n_entries
),
2697 ret
, ret_offset
, NULL
);
2700 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2708 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2712 rt
= le64toh(READ_NOW(o
->entry
.realtime
));
2715 else if (rt
< needle
)
2721 int journal_file_move_to_entry_by_realtime(
2724 direction_t direction
,
2726 uint64_t *ret_offset
) {
2730 return generic_array_bisect(
2732 le64toh(f
->header
->entry_array_offset
),
2733 le64toh(f
->header
->n_entries
),
2735 test_object_realtime
,
2737 ret
, ret_offset
, NULL
);
2740 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2748 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2752 m
= le64toh(READ_NOW(o
->entry
.monotonic
));
2755 else if (m
< needle
)
2761 static int find_data_object_by_boot_id(
2767 char t
[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2769 sd_id128_to_string(boot_id
, t
+ 9);
2770 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2773 int journal_file_move_to_entry_by_monotonic(
2777 direction_t direction
,
2779 uint64_t *ret_offset
) {
2786 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2792 return generic_array_bisect_plus_one(
2794 le64toh(o
->data
.entry_offset
),
2795 le64toh(o
->data
.entry_array_offset
),
2796 le64toh(o
->data
.n_entries
),
2798 test_object_monotonic
,
2800 ret
, ret_offset
, NULL
);
2803 void journal_file_reset_location(JournalFile
*f
) {
2804 f
->location_type
= LOCATION_HEAD
;
2805 f
->current_offset
= 0;
2806 f
->current_seqnum
= 0;
2807 f
->current_realtime
= 0;
2808 f
->current_monotonic
= 0;
2809 zero(f
->current_boot_id
);
2810 f
->current_xor_hash
= 0;
2813 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2814 f
->location_type
= LOCATION_SEEK
;
2815 f
->current_offset
= offset
;
2816 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2817 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2818 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2819 f
->current_boot_id
= o
->entry
.boot_id
;
2820 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2823 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2830 assert(af
->location_type
== LOCATION_SEEK
);
2831 assert(bf
->location_type
== LOCATION_SEEK
);
2833 /* If contents, timestamps and seqnum match, these entries are
2835 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2836 af
->current_monotonic
== bf
->current_monotonic
&&
2837 af
->current_realtime
== bf
->current_realtime
&&
2838 af
->current_xor_hash
== bf
->current_xor_hash
&&
2839 sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
) &&
2840 af
->current_seqnum
== bf
->current_seqnum
)
2843 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2845 /* If this is from the same seqnum source, compare
2847 r
= CMP(af
->current_seqnum
, bf
->current_seqnum
);
2851 /* Wow! This is weird, different data but the same
2852 * seqnums? Something is borked, but let's make the
2853 * best of it and compare by time. */
2856 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2858 /* If the boot id matches, compare monotonic time */
2859 r
= CMP(af
->current_monotonic
, bf
->current_monotonic
);
2864 /* Otherwise, compare UTC time */
2865 r
= CMP(af
->current_realtime
, bf
->current_realtime
);
2869 /* Finally, compare by contents */
2870 return CMP(af
->current_xor_hash
, bf
->current_xor_hash
);
2873 static bool check_properly_ordered(uint64_t new_offset
, uint64_t old_offset
, direction_t direction
) {
2875 /* Consider it an error if any of the two offsets is uninitialized */
2876 if (old_offset
== 0 || new_offset
== 0)
2879 /* If we go down, the new offset must be larger than the old one. */
2880 return direction
== DIRECTION_DOWN
?
2881 new_offset
> old_offset
:
2882 new_offset
< old_offset
;
2885 int journal_file_next_entry(
2888 direction_t direction
,
2889 Object
**ret
, uint64_t *ret_offset
) {
2897 n
= le64toh(READ_NOW(f
->header
->n_entries
));
2902 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2904 r
= generic_array_bisect(f
,
2905 le64toh(f
->header
->entry_array_offset
),
2906 le64toh(f
->header
->n_entries
),
2915 r
= bump_array_index(&i
, direction
, n
);
2920 /* And jump to it */
2921 r
= generic_array_get(f
, le64toh(f
->header
->entry_array_offset
), i
, direction
, ret
, &ofs
);
2925 /* Ensure our array is properly ordered. */
2926 if (p
> 0 && !check_properly_ordered(ofs
, p
, direction
))
2927 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG
),
2928 "%s: entry array not properly ordered at entry %" PRIu64
,
2937 int journal_file_next_entry_for_data(
2940 direction_t direction
,
2941 Object
**ret
, uint64_t *ret_offset
) {
2948 assert(d
->object
.type
== OBJECT_DATA
);
2950 n
= le64toh(READ_NOW(d
->data
.n_entries
));
2954 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2956 r
= generic_array_get_plus_one(f
,
2957 le64toh(d
->data
.entry_offset
),
2958 le64toh(d
->data
.entry_array_offset
),
2971 int journal_file_move_to_entry_by_offset_for_data(
2975 direction_t direction
,
2976 Object
**ret
, uint64_t *ret_offset
) {
2980 assert(d
->object
.type
== OBJECT_DATA
);
2982 return generic_array_bisect_plus_one(
2984 le64toh(d
->data
.entry_offset
),
2985 le64toh(d
->data
.entry_array_offset
),
2986 le64toh(d
->data
.n_entries
),
2990 ret
, ret_offset
, NULL
);
2993 int journal_file_move_to_entry_by_monotonic_for_data(
2998 direction_t direction
,
2999 Object
**ret
, uint64_t *ret_offset
) {
3003 uint64_t b
, z
, entry_offset
, entry_array_offset
, n_entries
;
3007 assert(d
->object
.type
== OBJECT_DATA
);
3009 /* Save all the required data before the data object gets invalidated. */
3010 entry_offset
= le64toh(READ_NOW(d
->data
.entry_offset
));
3011 entry_array_offset
= le64toh(READ_NOW(d
->data
.entry_array_offset
));
3012 n_entries
= le64toh(READ_NOW(d
->data
.n_entries
));
3014 /* First, seek by time */
3015 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
3021 r
= generic_array_bisect_plus_one(f
,
3022 le64toh(o
->data
.entry_offset
),
3023 le64toh(o
->data
.entry_array_offset
),
3024 le64toh(o
->data
.n_entries
),
3026 test_object_monotonic
,
3032 /* And now, continue seeking until we find an entry that
3033 * exists in both bisection arrays */
3035 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
3042 r
= generic_array_bisect_plus_one(f
,
3053 r
= generic_array_bisect_plus_one(f
,
3054 le64toh(o
->data
.entry_offset
),
3055 le64toh(o
->data
.entry_array_offset
),
3056 le64toh(o
->data
.n_entries
),
3067 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, q
, ret
);
3082 int journal_file_move_to_entry_by_seqnum_for_data(
3086 direction_t direction
,
3087 Object
**ret
, uint64_t *ret_offset
) {
3091 assert(d
->object
.type
== OBJECT_DATA
);
3093 return generic_array_bisect_plus_one(
3095 le64toh(d
->data
.entry_offset
),
3096 le64toh(d
->data
.entry_array_offset
),
3097 le64toh(d
->data
.n_entries
),
3101 ret
, ret_offset
, NULL
);
3104 int journal_file_move_to_entry_by_realtime_for_data(
3108 direction_t direction
,
3109 Object
**ret
, uint64_t *ret_offset
) {
3113 assert(d
->object
.type
== OBJECT_DATA
);
3115 return generic_array_bisect_plus_one(
3117 le64toh(d
->data
.entry_offset
),
3118 le64toh(d
->data
.entry_array_offset
),
3119 le64toh(d
->data
.n_entries
),
3121 test_object_realtime
,
3123 ret
, ret_offset
, NULL
);
3126 void journal_file_dump(JournalFile
*f
) {
3134 journal_file_print_header(f
);
3136 p
= le64toh(READ_NOW(f
->header
->header_size
));
3140 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
3144 s
= journal_object_type_to_string(o
->object
.type
);
3146 switch (o
->object
.type
) {
3151 printf("Type: %s seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
3153 le64toh(o
->entry
.seqnum
),
3154 le64toh(o
->entry
.monotonic
),
3155 le64toh(o
->entry
.realtime
));
3161 printf("Type: %s seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
3163 le64toh(o
->tag
.seqnum
),
3164 le64toh(o
->tag
.epoch
));
3169 printf("Type: %s \n", s
);
3171 printf("Type: unknown (%i)", o
->object
.type
);
3176 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
3177 printf("Flags: %s\n",
3178 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
3180 if (p
== le64toh(f
->header
->tail_object_offset
))
3183 p
+= ALIGN64(le64toh(o
->object
.size
));
3188 log_error("File corrupt");
3191 /* Note: the lifetime of the compound literal is the immediately surrounding block. */
3192 #define FORMAT_TIMESTAMP_SAFE(t) (FORMAT_TIMESTAMP(t) ?: " --- ")
/* Dump a human-readable summary of the journal file header to stdout (this is what
 * "journalctl --header" shows). All on-disk header fields are little-endian, hence
 * the le32toh()/le64toh() conversions before display.
 *
 * NOTE(review): this copy of the source is lossy — several original lines are missing
 * here (the assert()s, the declaration of 'st', a few format-string lines such as
 * "File ID:"/"Machine ID:"/"Boot ID:"/"State:" and the matching f->path argument),
 * so the format string and argument list below do not visibly line up. Verify against
 * the upstream systemd source before editing. */
void journal_file_print_header(JournalFile *f) {
        printf("File path: %s\n"
               "Sequential number ID: %s\n"
               "Compatible flags:%s%s\n"
               "Incompatible flags:%s%s%s%s%s\n"
               "Header size: %"PRIu64"\n"
               "Arena size: %"PRIu64"\n"
               "Data hash table size: %"PRIu64"\n"
               "Field hash table size: %"PRIu64"\n"
               "Rotate suggested: %s\n"
               "Head sequential number: %"PRIu64" (%"PRIx64")\n"
               "Tail sequential number: %"PRIu64" (%"PRIx64")\n"
               "Head realtime timestamp: %s (%"PRIx64")\n"
               "Tail realtime timestamp: %s (%"PRIx64")\n"
               "Tail monotonic timestamp: %s (%"PRIx64")\n"
               "Objects: %"PRIu64"\n"
               "Entry objects: %"PRIu64"\n",
               SD_ID128_TO_STRING(f->header->file_id),
               SD_ID128_TO_STRING(f->header->machine_id),
               SD_ID128_TO_STRING(f->header->boot_id),
               SD_ID128_TO_STRING(f->header->seqnum_id),
               f->header->state == STATE_OFFLINE ? "OFFLINE" :
               f->header->state == STATE_ONLINE ? "ONLINE" :
               f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
               JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
               /* Unknown compatible/incompatible flag bits are shown as " ???" */
               (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
               JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
               JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
               JOURNAL_HEADER_COMPRESSED_ZSTD(f->header) ? " COMPRESSED-ZSTD" : "",
               JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "",
               (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
               le64toh(f->header->header_size),
               le64toh(f->header->arena_size),
               /* Hash table sizes are stored in bytes; show them as item counts */
               le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
               le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
               yes_no(journal_file_rotate_suggested(f, 0, LOG_DEBUG)),
               /* Each seqnum/timestamp is printed twice: decimal and raw hex */
               le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
               le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
               FORMAT_TIMESTAMP_SAFE(le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
               FORMAT_TIMESTAMP_SAFE(le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
               FORMAT_TIMESPAN(le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
               le64toh(f->header->n_objects),
               le64toh(f->header->n_entries));

        /* The counters below were added to the header format over time; only print
         * them if this file's header is large enough to actually contain them. */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                printf("Data objects: %"PRIu64"\n"
                       "Data hash table fill: %.1f%%\n",
                       le64toh(f->header->n_data),
                       100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                printf("Field objects: %"PRIu64"\n"
                       "Field hash table fill: %.1f%%\n",
                       le64toh(f->header->n_fields),
                       100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
                printf("Tag objects: %"PRIu64"\n",
                       le64toh(f->header->n_tags));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                printf("Entry array objects: %"PRIu64"\n",
                       le64toh(f->header->n_entry_arrays));

        if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth))
                printf("Deepest field hash chain: %" PRIu64"\n",
                       f->header->field_hash_chain_depth);

        if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth))
                printf("Deepest data hash chain: %" PRIu64"\n",
                       f->header->data_hash_chain_depth);

        /* st_blocks is in 512-byte units per POSIX, hence the multiplication.
         * NOTE(review): the declaration of 'st' (struct stat) is among the lines
         * missing from this copy. */
        if (fstat(f->fd, &st) >= 0)
                printf("Disk usage: %s\n", FORMAT_BYTES((uint64_t) st.st_blocks * 512ULL));
} /* NOTE(review): closing brace restored; it was dropped by the lossy extraction. */
/* Warn (once, at open time) if a freshly created journal file lives on btrfs with
 * copy-on-write still enabled, since the journal's write pattern performs poorly on
 * COW file systems.
 *
 * NOTE(review): this copy is lossy — the local declarations ('r', 'attrs'), the
 * assert(), the 'if (r < 0)' guards in front of the log_warning_errno() calls, the
 * early 'return 0' paths and the final return are missing. Verify against upstream. */
static int journal_file_warn_btrfs(JournalFile *f) {

        /* Before we write anything, check if the COW logic is turned
         * off on btrfs. Given our write pattern that is quite
         * unfriendly to COW file systems this should greatly improve
         * performance on COW file systems, such as btrfs, at the
         * expense of data integrity features (which shouldn't be too
         * bad, given that we do our own checksumming). */

        r = fd_is_fs_type(f->fd, BTRFS_SUPER_MAGIC);
        /* NOTE(review): an 'if (r < 0)' guard is missing here in this copy. */
                return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");

        r = read_attr_fd(f->fd, &attrs);
        /* NOTE(review): an 'if (r < 0)' guard is missing here in this copy. */
                return log_warning_errno(r, "Failed to read file attributes: %m");

        /* FS_NOCOW_FL set means COW is already disabled on this file — nothing to warn about. */
        if (attrs & FS_NOCOW_FL) {
                log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");

        log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
                   "This is likely to slow down journal access substantially, please consider turning "
                   "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
/* Open (or create, if O_CREAT is in 'flags') a journal file and return it in *ret.
 * Either an already-open 'fd' or a path 'fname' must be provided. 'template', if
 * non-NULL, supplies metrics and the post-change timer for the new file. On success
 * ownership of any passed-in fd transfers to the returned JournalFile.
 *
 * NOTE(review): this copy is heavily lossy — the first half of the parameter list
 * (fd, fname, flags, mode, compress, seal, ... — inferred from the body's use of
 * those names, TODO confirm against upstream), the local declarations ('r', 'f',
 * 'h'), nearly all 'if (r < 0)' error paths, the 'fail:' cleanup, the returns and
 * several closing braces are missing. Only the visible logic is annotated below. */
int journal_file_open(
                uint64_t compress_threshold_bytes,
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                JournalFile *template,
                JournalFile **ret) {

        bool newly_created = false;

        /* At least one way to address the file must be given. */
        assert(fd >= 0 || fname);

        /* Only plain read-only or read-write opens are supported. */
        if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
        /* Creating a file read-only makes no sense. */
        if ((flags & O_ACCMODE) == O_RDONLY && FLAGS_SET(flags, O_CREAT))
        /* Newly created files must carry the .journal suffix. */
        if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))

        f = new(JournalFile, 1);

        *f = (JournalFile) {
                .writable = (flags & O_ACCMODE) != O_RDONLY,
                .compress_zstd = compress,
                .compress_lz4 = compress,
                .compress_xz = compress,
                /* UINT64_MAX means "pick the default"; otherwise clamp to the minimum. */
                .compress_threshold_bytes = compress_threshold_bytes == UINT64_MAX ?
                                            DEFAULT_COMPRESS_THRESHOLD :
                                            MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),

        /* We turn on keyed hashes by default, but provide an environment variable to turn them off, if
         * people really want that */
        r = getenv_bool("SYSTEMD_JOURNAL_KEYED_HASH");
                log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_KEYED_HASH environment variable, ignoring: %m");
        f->keyed_hash = true;

        /* Log the effective settings once per change, to avoid flooding the debug log. */
        if (DEBUG_LOGGING) {
                static int last_seal = -1, last_compress = -1, last_keyed_hash = -1;
                static uint64_t last_bytes = UINT64_MAX;

                if (last_seal != f->seal ||
                    last_keyed_hash != f->keyed_hash ||
                    last_compress != JOURNAL_FILE_COMPRESS(f) ||
                    last_bytes != f->compress_threshold_bytes) {

                        log_debug("Journal effective settings seal=%s keyed_hash=%s compress=%s compress_threshold_bytes=%s",
                                  yes_no(f->seal), yes_no(f->keyed_hash), yes_no(JOURNAL_FILE_COMPRESS(f)),
                                  FORMAT_BYTES(f->compress_threshold_bytes));
                        last_seal = f->seal;
                        last_keyed_hash = f->keyed_hash;
                        last_compress = JOURNAL_FILE_COMPRESS(f);
                        last_bytes = f->compress_threshold_bytes;

        f->path = strdup(fname);

        /* If we don't know the path, fill in something explanatory and vaguely useful */
        if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {

        f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
        if (!f->chain_cache) {

        /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
         * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
         * it doesn't hurt in that case. */

        f->fd = openat_report_new(AT_FDCWD, f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode, &newly_created);

        /* fds we opened here by us should also be closed by us. */

        /* Restore blocking mode now that the open itself can no longer hang. */
        r = fd_nonblock(f->fd, false);

        if (!newly_created) {
                r = journal_file_fstat(f);

        r = journal_file_fstat(f);

        /* If we just got the fd passed in, we don't really know if we created the file anew */
        newly_created = f->last_stat.st_size == 0 && f->writable;

        f->cache_fd = mmap_cache_add_fd(mmap_cache, f->fd, prot_from_flags(flags));

        if (newly_created) {
                (void) journal_file_warn_btrfs(f);

                /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
                 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
                 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
                 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
                 * solely on mtime/atime/ctime of the file. */
                (void) fd_setcrtime(f->fd, 0);

                /* Try to load the FSPRG state, and if we can't, then
                 * just don't do sealing */
                r = journal_file_fss_load(f);

                r = journal_file_init_header(f, template);

                r = journal_file_fstat(f);

        /* A header smaller than the minimum cannot be a valid journal file. */
        if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {

        r = mmap_cache_fd_get(f->cache_fd, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
        /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
         * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error. */

        if (!newly_created) {
                r = journal_file_verify_header(f);

        if (!newly_created && f->writable) {
                r = journal_file_fss_load(f);

        /* Pick default metrics derived from the backing file system, then store them. */
        journal_default_metrics(metrics, f->fd);
        f->metrics = *metrics;
        } else if (template)
                f->metrics = template->metrics;

        r = journal_file_refresh_header(f);

        r = journal_file_hmac_setup(f);

        if (newly_created) {
                r = journal_file_setup_field_hash_table(f);

                r = journal_file_setup_data_hash_table(f);

                r = journal_file_append_first_tag(f);

        /* SIGBUS during setup means the file was truncated/corrupted underneath us. */
        if (mmap_cache_fd_got_sigbus(f->cache_fd)) {

        if (template && template->post_change_timer) {
                r = journal_file_enable_post_change_timer(
                                sd_event_source_get_event(template->post_change_timer),
                                template->post_change_timer_period);

        /* The file is opened now successfully, thus we take possession of any passed in fd. */

        /* NOTE(review): the 'fail:' cleanup path below is partially missing in this copy. */
        if (f->cache_fd && mmap_cache_fd_got_sigbus(f->cache_fd))

        (void) journal_file_close(f);
/* Rename a live journal file to its archived name
 * ("<prefix>@<seqnum-id>-<head-seqnum>-<head-realtime>.journal") and mark it for
 * archival on the next offlining. If 'ret_previous_path' is non-NULL the caller
 * receives ownership of the old path string.
 *
 * NOTE(review): this copy is lossy — the assert()s, the error 'return' statements
 * after the guard conditions, the tail of the function and its closing brace are
 * missing. Verify against upstream before editing. */
int journal_file_archive(JournalFile *f, char **ret_previous_path) {
        _cleanup_free_ char *p = NULL;

        /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
         * rotation, since we don't know the actual path, and couldn't rename the file hence. */
        if (path_startswith(f->path, "/proc/self/fd"))

        if (!endswith(f->path, ".journal"))

        /* "- 8" strips the ".journal" suffix from the current path. */
        if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64 "-%016"PRIx64 ".journal",
                     (int) strlen(f->path) - 8, f->path,
                     SD_ID128_FORMAT_VAL(f->header->seqnum_id),
                     le64toh(f->header->head_entry_seqnum),
                     le64toh(f->header->head_entry_realtime)) < 0)

        /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
         * ignore that case. */
        if (rename(f->path, p) < 0 && errno != ENOENT)

        /* Sync the rename to disk */
        (void) fsync_directory_of_file(f->fd);

        if (ret_previous_path)
                /* Ownership of the old path string moves to the caller. */
                *ret_previous_path = f->path;

        f->path = TAKE_PTR(p);

        /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
         * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
         * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue
         * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
         * occurs. */
/* Rename a (suspect) journal file under 'dir_fd' to a "*.journal~" name that embeds
 * the current realtime timestamp, marking it as corrupted/uncleanly shut down without
 * inspecting or modifying its contents.
 *
 * NOTE(review): this copy is lossy — the error 'return' statements, one asprintf()
 * argument (presumably a random value, TODO confirm against upstream), the final
 * return and the closing brace are missing. */
int journal_file_dispose(int dir_fd, const char *fname) {
        _cleanup_free_ char *p = NULL;

        /* Renames a journal file to *.journal~, i.e. to mark it as corrupted or otherwise uncleanly shutdown. Note that
         * this is done without looking into the file or changing any of its contents. The idea is that this is called
         * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
         * for writing anymore. */

        if (!endswith(fname, ".journal"))

        /* "- 8" strips the ".journal" suffix from the original name. */
        if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
                     (int) strlen(fname) - 8, fname,
                     now(CLOCK_REALTIME),

        if (renameat(dir_fd, fname, dir_fd, p) < 0)
/* Copy a single entry object 'o' (located at offset 'p' in file 'from') into journal
 * file 'to': each referenced data object is re-appended to the destination (decompressing
 * it first if needed), the entry's XOR hash is recomputed for the destination's hash
 * mode, and a new entry referencing the copied data is appended.
 *
 * NOTE(review): this copy is lossy — local declarations ('r', 'l', 't', 'u', 'h',
 * 'rsize', 'data', 'ts', 'items'), the assert()s, the 'if (r < 0)' error paths, the
 * '#else'/'#endif' around the non-compression branch, the tail of the function and
 * its closing brace are missing. Verify against upstream before editing. */
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
        uint64_t q, n, xor_hash = 0;
        const sd_id128_t *boot_id;

        /* Carry over the source entry's timestamps and boot ID unchanged. */
        ts = (dual_timestamp) {
                .monotonic = le64toh(o->entry.monotonic),
                .realtime = le64toh(o->entry.realtime),

        boot_id = &o->entry.boot_id;

        n = journal_file_entry_n_items(o);
        items = newa(EntryItem, n);

        for (uint64_t i = 0; i < n; i++) {
                q = le64toh(o->entry.items[i].object_offset);

                /* Note: this re-points 'o' at the i-th data object; the entry object is
                 * re-fetched after the loop. */
                r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);

                l = le64toh(READ_NOW(o->object.size));
                if (l < offsetof(Object, data.payload))

                /* 'l' is now the payload length, without the object header. */
                l -= offsetof(Object, data.payload);

                /* We hit the limit on 32bit machines */
                if ((uint64_t) t != l)

                if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if HAVE_COMPRESSION
                        r = decompress_blob(
                                        o->object.flags & OBJECT_COMPRESSION_MASK,
                                        &from->compress_buffer, &rsize,

                        data = from->compress_buffer;
                        /* Compiled without compression support: cannot copy compressed payloads. */
                        return -EPROTONOSUPPORT;
                        data = o->data.payload;

                r = journal_file_append_data(to, data, l, &u, &h);

                /* The XOR hash must match the destination file's hashing mode, which may
                 * differ from the source's (keyed jenkins vs. plain payload hash). */
                if (JOURNAL_HEADER_KEYED_HASH(to->header))
                        xor_hash ^= jenkins_hash64(data, l);
                        xor_hash ^= le64toh(u->data.hash);

                items[i] = (EntryItem) {
                        .object_offset = htole64(h),
                        .hash = u->data.hash,

        /* Restore 'o' to the entry object — the loop above moved it to data objects. */
        r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);

        r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n, NULL, NULL, NULL);

        if (mmap_cache_fd_got_sigbus(to->cache_fd))
/* Reset all journal size metrics to UINT64_MAX, the sentinel meaning "pick an
 * automatic value" — journal_default_metrics() later replaces each sentinel with a
 * value derived from the backing file system. */
void journal_reset_metrics(JournalMetrics *m) {

        /* Set everything to "pick automatic values". */

        *m = (JournalMetrics) {
                .min_use = UINT64_MAX,
                .max_use = UINT64_MAX,
                .min_size = UINT64_MAX,
                .max_size = UINT64_MAX,
                .keep_free = UINT64_MAX,
                .n_max_files = UINT64_MAX,
        };
} /* NOTE(review): the "};" and "}" terminators were dropped by the lossy extraction
   * and have been restored here; an assert(m) also appears to be missing above. */
/* Replace every UINT64_MAX sentinel in *m (see journal_reset_metrics()) with a value
 * derived from the size of the file system backing 'fd', clamped to the compile-time
 * bounds defined at the top of this file, then log the effective settings.
 *
 * NOTE(review): this copy is lossy — the 'struct statvfs ss' declaration, the 'else'
 * lines in front of the fallback assignments, the second argument of the max_size
 * MIN() and the keep_free MIN(), the final log_debug() argument and the closing brace
 * are missing. Verify against upstream before editing. */
void journal_default_metrics(JournalMetrics *m, int fd) {
        uint64_t fs_size = 0;

        if (fstatvfs(fd, &ss) >= 0)
                /* f_frsize is the fundamental block size f_blocks is counted in. */
                fs_size = ss.f_frsize * ss.f_blocks;
                /* NOTE(review): this log line belongs to a missing 'else' branch. */
                log_debug_errno(errno, "Failed to determine disk size: %m");

        if (m->max_use == UINT64_MAX) {

                m->max_use = CLAMP(PAGE_ALIGN(fs_size / 10), /* 10% of file system size */
                                   MAX_USE_LOWER, MAX_USE_UPPER);
                /* NOTE(review): fallback for unknown fs size; 'else' line missing. */
                m->max_use = MAX_USE_LOWER;

        m->max_use = PAGE_ALIGN(m->max_use);

        /* max_use must leave room for at least two minimum-size files (rotation pair). */
        if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
                m->max_use = JOURNAL_FILE_SIZE_MIN*2;

        if (m->min_use == UINT64_MAX) {

                m->min_use = CLAMP(PAGE_ALIGN(fs_size / 50), /* 2% of file system size */
                                   MIN_USE_LOW, MIN_USE_HIGH);
                m->min_use = MIN_USE_LOW;

        if (m->min_use > m->max_use)
                m->min_use = m->max_use;

        if (m->max_size == UINT64_MAX)
                m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */

        m->max_size = PAGE_ALIGN(m->max_size);

        if (m->max_size != 0) {
                if (m->max_size < JOURNAL_FILE_SIZE_MIN)
                        m->max_size = JOURNAL_FILE_SIZE_MIN;

                /* Keep the invariant max_use >= 2 * max_size. */
                if (m->max_use != 0 && m->max_size*2 > m->max_use)
                        m->max_use = m->max_size*2;

        if (m->min_size == UINT64_MAX)
                m->min_size = JOURNAL_FILE_SIZE_MIN;

        /* A max_size of 0 means "unlimited", hence the ?: fallback to UINT64_MAX. */
        m->min_size = CLAMP(PAGE_ALIGN(m->min_size),
                            JOURNAL_FILE_SIZE_MIN,
                            m->max_size ?: UINT64_MAX);

        if (m->keep_free == UINT64_MAX) {

                m->keep_free = MIN(PAGE_ALIGN(fs_size / 20), /* 5% of file system size */

                m->keep_free = DEFAULT_KEEP_FREE;

        if (m->n_max_files == UINT64_MAX)
                m->n_max_files = DEFAULT_N_MAX_FILES;

        log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
                  FORMAT_BYTES(m->min_use),
                  FORMAT_BYTES(m->max_use),
                  FORMAT_BYTES(m->max_size),
                  FORMAT_BYTES(m->min_size),
                  FORMAT_BYTES(m->keep_free),
/* Report the realtime timestamps of the oldest (*from) and newest (*to) entries in
 * the file, taken straight from the header. A zero header field means "no entry yet".
 *
 * NOTE(review): this copy is lossy — the assert()s, the 'if (from)' / 'if (to)'
 * wrappers (the output parameters are presumably optional, TODO confirm), the error
 * returns after the zero checks, the final return and the closing brace are missing. */
int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {

        if (f->header->head_entry_realtime == 0)

        *from = le64toh(f->header->head_entry_realtime);

        if (f->header->tail_entry_realtime == 0)

        *to = le64toh(f->header->tail_entry_realtime);
/* Report the monotonic timestamps of the oldest (*from) and newest (*to) entries that
 * belong to the given 'boot_id'. Entries of one boot are found via the data object for
 * its _BOOT_ID= field: the first linked entry yields *from, the last one yields *to.
 *
 * NOTE(review): this copy is lossy — the local declarations ('r', 'o', 'p'), the
 * assert()s, the 'if (r < 0)'/'if (r == 0)' checks, the 'if (from)'/'if (to)' wrappers,
 * the trailing generic_array_get_plus_one() arguments, the returns and the closing
 * brace are missing. Verify against upstream before editing. */
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {

        /* Look up the data object for this boot's _BOOT_ID= field. */
        r = find_data_object_by_boot_id(f, boot_id, &o, &p);

        if (le64toh(o->data.n_entries) <= 0)

        /* First entry referencing this boot ID → oldest monotonic timestamp. */
        r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);

        *from = le64toh(o->entry.monotonic);

        /* Re-fetch the data object ('o' was moved to the entry above). */
        r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);

        /* Last entry in the data object's entry array → newest monotonic timestamp. */
        r = generic_array_get_plus_one(f,
                                       le64toh(o->data.entry_offset),
                                       le64toh(o->data.entry_array_offset),
                                       le64toh(o->data.n_entries) - 1,

        *to = le64toh(o->entry.monotonic);
/* Decide whether this journal file should be rotated, logging the reason at
 * 'log_level'. Rotation is suggested when: the header predates the current format,
 * either hash table is over ~75% full, a hash chain exceeds HASH_CHAIN_DEPTH_MAX
 * (possible hash-flooding), data objects exist without any field objects, or the
 * oldest entry is older than 'max_file_usec' (0 disables the age check).
 *
 * NOTE(review): this copy is lossy — the assert()s, the 'log_full(log_level,' /
 * 'log_full_errno(' call heads in front of the bare message strings below, several
 * 'f->path' arguments, all 'return true' statements, the final 'return false' and
 * the closing brace are missing. Verify against upstream before editing. */
bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec, int log_level) {

        /* If we gained new header fields we gained new features,
         * hence suggest a rotation */
        if (le64toh(f->header->header_size) < sizeof(Header)) {
                log_full(log_level, "%s uses an outdated header, suggesting rotation.", f->path);

        /* Let's check if the hash tables grew over a certain fill level (75%, borrowing this value from
         * Java's hash table implementation), and if so suggest a rotation. To calculate the fill level we
         * need the n_data field, which only exists in newer versions. */

        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                /* n_data/table_slots > 3/4, computed without division as n*4 > slots*3. */
                if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
                        "Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
                        le64toh(f->header->n_data),
                        le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
                        (unsigned long long) f->last_stat.st_size,
                        f->last_stat.st_size / le64toh(f->header->n_data));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
                        "Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
                        le64toh(f->header->n_fields),
                        le64toh(f->header->field_hash_table_size) / sizeof(HashItem));

        /* If there are too many hash collisions somebody is most likely playing games with us. Hence, if our
         * longest chain is longer than some threshold, let's suggest rotation. */
        if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) &&
            le64toh(f->header->data_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
                "Data hash table of %s has deepest hash chain of length %" PRIu64 ", suggesting rotation.",
                f->path, le64toh(f->header->data_hash_chain_depth));

        if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) &&
            le64toh(f->header->field_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
                "Field hash table of %s has deepest hash chain of length at %" PRIu64 ", suggesting rotation.",
                f->path, le64toh(f->header->field_hash_chain_depth));

        /* Are the data objects properly indexed by field objects? */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
            JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
            le64toh(f->header->n_data) > 0 &&
            le64toh(f->header->n_fields) == 0) {
                "Data objects of %s are not indexed by field objects, suggesting rotation.",

        if (max_file_usec > 0) {
                /* NOTE(review): the declarations of 'h' and 't' (usec_t) are missing here. */

                h = le64toh(f->header->head_entry_realtime);
                t = now(CLOCK_REALTIME);

                if (h > 0 && t > h + max_file_usec) {
                        "Oldest entry in %s is older than the configured file retention duration (%s), suggesting rotation.",
                        f->path, FORMAT_TIMESPAN(max_file_usec, USEC_PER_SEC));
/* Human-readable names for the journal object types, indexed by ObjectType; used by
 * the generated journal_object_type_to_string() lookup below (e.g. for verification
 * and debug output). */
static const char * const journal_object_type_table[] = {
        [OBJECT_UNUSED] = "unused",
        [OBJECT_DATA] = "data",
        [OBJECT_FIELD] = "field",
        [OBJECT_ENTRY] = "entry",
        [OBJECT_DATA_HASH_TABLE] = "data hash table",
        [OBJECT_FIELD_HASH_TABLE] = "field hash table",
        [OBJECT_ENTRY_ARRAY] = "entry array",
        [OBJECT_TAG] = "tag",
}; /* NOTE(review): the "};" terminator was dropped by the lossy extraction and has
    * been restored here. */

/* Generates journal_object_type_to_string(ObjectType). */
DEFINE_STRING_TABLE_LOOKUP_TO_STRING(journal_object_type, ObjectType);