1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2011 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
36 #include "journal-authenticate.h"
37 #include "journal-def.h"
38 #include "journal-file.h"
40 #include "parse-util.h"
41 #include "path-util.h"
42 #include "random-util.h"
45 #include "string-util.h"
47 #include "xattr-util.h"
49 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
50 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
52 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
54 /* This is the minimum journal file size */
55 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
57 /* These are the lower and upper bounds if we deduce the max_use value
58 * from the file system size */
59 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
60 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
62 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
63 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
65 /* This is the upper bound if we deduce max_size from max_use */
66 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
68 /* This is the upper bound if we deduce the keep_free value from the
70 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
72 /* This is the keep_free value when we can't determine the system
74 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
76 /* This is the default maximum number of journal files to keep around. */
77 #define DEFAULT_N_MAX_FILES (100)
79 /* n_data was the first entry we added after the initial file format design */
80 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
82 /* How many entries to keep in the entry array chain cache at max */
83 #define CHAIN_CACHE_MAX 20
85 /* How much to increase the journal file size at once each time we allocate something new. */
86 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
88 /* Reread fstat() of the file for detecting deletions at least this often */
89 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
91 /* The mmap context to use for the header we pick as one above the last defined typed */
92 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
95 # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
98 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
99 * As a result we use atomic operations on f->offline_state for inter-thread communications with
100 * journal_file_set_offline() and journal_file_set_online(). */
101 static void journal_file_set_offline_internal(JournalFile
*f
) {
107 switch (f
->offline_state
) {
109 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_DONE
))
113 case OFFLINE_AGAIN_FROM_SYNCING
:
114 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_SYNCING
))
118 case OFFLINE_AGAIN_FROM_OFFLINING
:
119 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_SYNCING
))
123 case OFFLINE_SYNCING
:
126 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_OFFLINING
))
129 f
->header
->state
= f
->archive
? STATE_ARCHIVED
: STATE_OFFLINE
;
133 case OFFLINE_OFFLINING
:
134 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_DONE
))
141 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
147 static void * journal_file_set_offline_thread(void *arg
) {
148 JournalFile
*f
= arg
;
150 journal_file_set_offline_internal(f
);
155 static int journal_file_set_offline_thread_join(JournalFile
*f
) {
160 if (f
->offline_state
== OFFLINE_JOINED
)
163 r
= pthread_join(f
->offline_thread
, NULL
);
167 f
->offline_state
= OFFLINE_JOINED
;
169 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
175 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
176 static bool journal_file_set_offline_try_restart(JournalFile
*f
) {
178 switch (f
->offline_state
) {
179 case OFFLINE_AGAIN_FROM_SYNCING
:
180 case OFFLINE_AGAIN_FROM_OFFLINING
:
184 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_AGAIN_FROM_SYNCING
))
188 case OFFLINE_SYNCING
:
189 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_AGAIN_FROM_SYNCING
))
193 case OFFLINE_OFFLINING
:
194 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_AGAIN_FROM_OFFLINING
))
204 /* Sets a journal offline.
206 * If wait is false then an offline is dispatched in a separate thread for a
207 * subsequent journal_file_set_offline() or journal_file_set_online() of the
208 * same journal to synchronize with.
210 * If wait is true, then either an existing offline thread will be restarted
211 * and joined, or if none exists the offline is simply performed in this
212 * context without involving another thread.
214 int journal_file_set_offline(JournalFile
*f
, bool wait
) {
223 if (!(f
->fd
>= 0 && f
->header
))
226 /* An offlining journal is implicitly online and may modify f->header->state,
227 * we must also join any potentially lingering offline thread when not online. */
228 if (!journal_file_is_offlining(f
) && f
->header
->state
!= STATE_ONLINE
)
229 return journal_file_set_offline_thread_join(f
);
231 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
232 restarted
= journal_file_set_offline_try_restart(f
);
233 if ((restarted
&& wait
) || !restarted
) {
234 r
= journal_file_set_offline_thread_join(f
);
242 /* Initiate a new offline. */
243 f
->offline_state
= OFFLINE_SYNCING
;
245 if (wait
) /* Without using a thread if waiting. */
246 journal_file_set_offline_internal(f
);
248 r
= pthread_create(&f
->offline_thread
, NULL
, journal_file_set_offline_thread
, f
);
250 f
->offline_state
= OFFLINE_JOINED
;
258 static int journal_file_set_online(JournalFile
*f
) {
266 if (!(f
->fd
>= 0 && f
->header
))
270 switch (f
->offline_state
) {
272 /* No offline thread, no need to wait. */
276 case OFFLINE_SYNCING
:
277 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_CANCEL
))
279 /* Canceled syncing prior to offlining, no need to wait. */
282 case OFFLINE_AGAIN_FROM_SYNCING
:
283 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_CANCEL
))
285 /* Canceled restart from syncing, no need to wait. */
288 case OFFLINE_AGAIN_FROM_OFFLINING
:
289 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_CANCEL
))
291 /* Canceled restart from offlining, must wait for offlining to complete however. */
296 r
= journal_file_set_offline_thread_join(f
);
306 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
309 switch (f
->header
->state
) {
314 f
->header
->state
= STATE_ONLINE
;
323 bool journal_file_is_offlining(JournalFile
*f
) {
326 __sync_synchronize();
328 if (IN_SET(f
->offline_state
, OFFLINE_DONE
, OFFLINE_JOINED
))
334 JournalFile
* journal_file_close(JournalFile
*f
) {
338 /* Write the final tag */
339 if (f
->seal
&& f
->writable
) {
342 r
= journal_file_append_tag(f
);
344 log_error_errno(r
, "Failed to append tag when closing journal: %m");
348 if (f
->post_change_timer
) {
351 if (sd_event_source_get_enabled(f
->post_change_timer
, &enabled
) >= 0)
352 if (enabled
== SD_EVENT_ONESHOT
)
353 journal_file_post_change(f
);
355 (void) sd_event_source_set_enabled(f
->post_change_timer
, SD_EVENT_OFF
);
356 sd_event_source_unref(f
->post_change_timer
);
359 journal_file_set_offline(f
, true);
361 if (f
->mmap
&& f
->cache_fd
)
362 mmap_cache_free_fd(f
->mmap
, f
->cache_fd
);
364 if (f
->fd
>= 0 && f
->defrag_on_close
) {
366 /* Be friendly to btrfs: turn COW back on again now,
367 * and defragment the file. We won't write to the file
368 * ever again, hence remove all fragmentation, and
369 * reenable all the good bits COW usually provides
370 * (such as data checksumming). */
372 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
373 (void) btrfs_defrag_fd(f
->fd
);
380 mmap_cache_unref(f
->mmap
);
382 ordered_hashmap_free_free(f
->chain_cache
);
384 #if HAVE_XZ || HAVE_LZ4
385 free(f
->compress_buffer
);
390 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
392 free(f
->fsprg_state
);
397 gcry_md_close(f
->hmac
);
403 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
410 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
411 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
413 h
.incompatible_flags
|= htole32(
414 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
415 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
417 h
.compatible_flags
= htole32(
418 f
->seal
* HEADER_COMPATIBLE_SEALED
);
420 r
= sd_id128_randomize(&h
.file_id
);
425 h
.seqnum_id
= template->header
->seqnum_id
;
426 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
428 h
.seqnum_id
= h
.file_id
;
430 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
440 static int fsync_directory_of_file(int fd
) {
441 _cleanup_free_
char *path
= NULL
, *dn
= NULL
;
442 _cleanup_close_
int dfd
= -1;
446 if (fstat(fd
, &st
) < 0)
449 if (!S_ISREG(st
.st_mode
))
452 r
= fd_get_path(fd
, &path
);
456 if (!path_is_absolute(path
))
459 dn
= dirname_malloc(path
);
463 dfd
= open(dn
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
473 static int journal_file_refresh_header(JournalFile
*f
) {
480 r
= sd_id128_get_machine(&f
->header
->machine_id
);
484 r
= sd_id128_get_boot(&boot_id
);
488 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
489 f
->tail_entry_monotonic_valid
= true;
491 f
->header
->boot_id
= boot_id
;
493 r
= journal_file_set_online(f
);
495 /* Sync the online state to disk */
498 /* We likely just created a new file, also sync the directory this file is located in. */
499 (void) fsync_directory_of_file(f
->fd
);
504 static bool warn_wrong_flags(const JournalFile
*f
, bool compatible
) {
505 const uint32_t any
= compatible
? HEADER_COMPATIBLE_ANY
: HEADER_INCOMPATIBLE_ANY
,
506 supported
= compatible
? HEADER_COMPATIBLE_SUPPORTED
: HEADER_INCOMPATIBLE_SUPPORTED
;
507 const char *type
= compatible
? "compatible" : "incompatible";
510 flags
= le32toh(compatible
? f
->header
->compatible_flags
: f
->header
->incompatible_flags
);
512 if (flags
& ~supported
) {
514 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32
,
515 f
->path
, type
, flags
& ~any
);
516 flags
= (flags
& any
) & ~supported
;
520 _cleanup_free_
char *t
= NULL
;
522 if (compatible
&& (flags
& HEADER_COMPATIBLE_SEALED
))
523 strv
[n
++] = "sealed";
524 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_XZ
))
525 strv
[n
++] = "xz-compressed";
526 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_LZ4
))
527 strv
[n
++] = "lz4-compressed";
529 assert(n
< ELEMENTSOF(strv
));
531 t
= strv_join((char**) strv
, ", ");
532 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
533 f
->path
, type
, n
> 1 ? "flags" : "flag", strnull(t
));
541 static int journal_file_verify_header(JournalFile
*f
) {
542 uint64_t arena_size
, header_size
;
547 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
550 /* In both read and write mode we refuse to open files with incompatible
551 * flags we don't know. */
552 if (warn_wrong_flags(f
, false))
553 return -EPROTONOSUPPORT
;
555 /* When open for writing we refuse to open files with compatible flags, too. */
556 if (f
->writable
&& warn_wrong_flags(f
, true))
557 return -EPROTONOSUPPORT
;
559 if (f
->header
->state
>= _STATE_MAX
)
562 header_size
= le64toh(f
->header
->header_size
);
564 /* The first addition was n_data, so check that we are at least this large */
565 if (header_size
< HEADER_SIZE_MIN
)
568 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
571 arena_size
= le64toh(f
->header
->arena_size
);
573 if (UINT64_MAX
- header_size
< arena_size
|| header_size
+ arena_size
> (uint64_t) f
->last_stat
.st_size
)
576 if (le64toh(f
->header
->tail_object_offset
) > header_size
+ arena_size
)
579 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
580 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
581 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
582 !VALID64(le64toh(f
->header
->entry_array_offset
)))
586 sd_id128_t machine_id
;
590 r
= sd_id128_get_machine(&machine_id
);
594 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
597 state
= f
->header
->state
;
599 if (state
== STATE_ARCHIVED
)
600 return -ESHUTDOWN
; /* Already archived */
601 else if (state
== STATE_ONLINE
) {
602 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
604 } else if (state
!= STATE_OFFLINE
) {
605 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
609 if (f
->header
->field_hash_table_size
== 0 || f
->header
->data_hash_table_size
== 0)
612 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
613 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
615 if (le64toh(f
->header
->tail_entry_realtime
) > now(CLOCK_REALTIME
)) {
616 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f
->path
);
621 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
622 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
624 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
629 static int journal_file_fstat(JournalFile
*f
) {
633 if (fstat(f
->fd
, &f
->last_stat
) < 0)
636 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
638 /* Refuse appending to files that are already deleted */
639 if (f
->last_stat
.st_nlink
<= 0)
645 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
646 uint64_t old_size
, new_size
;
652 /* We assume that this file is not sparse, and we know that
653 * for sure, since we always call posix_fallocate()
656 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
660 le64toh(f
->header
->header_size
) +
661 le64toh(f
->header
->arena_size
);
663 new_size
= PAGE_ALIGN(offset
+ size
);
664 if (new_size
< le64toh(f
->header
->header_size
))
665 new_size
= le64toh(f
->header
->header_size
);
667 if (new_size
<= old_size
) {
669 /* We already pre-allocated enough space, but before
670 * we write to it, let's check with fstat() if the
671 * file got deleted, in order make sure we don't throw
672 * away the data immediately. Don't check fstat() for
673 * all writes though, but only once ever 10s. */
675 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
678 return journal_file_fstat(f
);
681 /* Allocate more space. */
683 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
686 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
689 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
692 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
694 if (new_size
- old_size
> available
)
699 /* Increase by larger blocks at once */
700 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
701 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
702 new_size
= f
->metrics
.max_size
;
704 /* Note that the glibc fallocate() fallback is very
705 inefficient, hence we try to minimize the allocation area
707 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
711 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
713 return journal_file_fstat(f
);
716 static unsigned type_to_context(ObjectType type
) {
717 /* One context for each type, plus one catch-all for the rest */
718 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
719 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
720 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
723 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
, size_t *ret_size
) {
732 /* Avoid SIGBUS on invalid accesses */
733 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
734 /* Hmm, out of range? Let's refresh the fstat() data
735 * first, before we trust that check. */
737 r
= journal_file_fstat(f
);
741 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
742 return -EADDRNOTAVAIL
;
745 return mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
, ret_size
);
748 static uint64_t minimum_header_size(Object
*o
) {
750 static const uint64_t table
[] = {
751 [OBJECT_DATA
] = sizeof(DataObject
),
752 [OBJECT_FIELD
] = sizeof(FieldObject
),
753 [OBJECT_ENTRY
] = sizeof(EntryObject
),
754 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
755 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
756 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
757 [OBJECT_TAG
] = sizeof(TagObject
),
760 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
761 return sizeof(ObjectHeader
);
763 return table
[o
->object
.type
];
766 /* Lightweight object checks. We want this to be fast, so that we won't
767 * slowdown every journal_file_move_to_object() call too much. */
768 static int journal_file_check_object(JournalFile
*f
, uint64_t offset
, Object
*o
) {
772 switch (o
->object
.type
) {
775 if ((le64toh(o
->data
.entry_offset
) == 0) ^ (le64toh(o
->data
.n_entries
) == 0)) {
776 log_debug("Bad n_entries: %"PRIu64
": %"PRIu64
,
777 le64toh(o
->data
.n_entries
), offset
);
781 if (le64toh(o
->object
.size
) - offsetof(DataObject
, payload
) <= 0) {
782 log_debug("Bad object size (<= %zu): %"PRIu64
": %"PRIu64
,
783 offsetof(DataObject
, payload
),
784 le64toh(o
->object
.size
),
789 if (!VALID64(le64toh(o
->data
.next_hash_offset
)) ||
790 !VALID64(le64toh(o
->data
.next_field_offset
)) ||
791 !VALID64(le64toh(o
->data
.entry_offset
)) ||
792 !VALID64(le64toh(o
->data
.entry_array_offset
))) {
793 log_debug("Invalid offset, next_hash_offset="OFSfmt
", next_field_offset="OFSfmt
794 ", entry_offset="OFSfmt
", entry_array_offset="OFSfmt
": %"PRIu64
,
795 le64toh(o
->data
.next_hash_offset
),
796 le64toh(o
->data
.next_field_offset
),
797 le64toh(o
->data
.entry_offset
),
798 le64toh(o
->data
.entry_array_offset
),
807 if (le64toh(o
->object
.size
) - offsetof(FieldObject
, payload
) <= 0) {
809 "Bad field size (<= %zu): %"PRIu64
": %"PRIu64
,
810 offsetof(FieldObject
, payload
),
811 le64toh(o
->object
.size
),
816 if (!VALID64(le64toh(o
->field
.next_hash_offset
)) ||
817 !VALID64(le64toh(o
->field
.head_data_offset
))) {
819 "Invalid offset, next_hash_offset="OFSfmt
820 ", head_data_offset="OFSfmt
": %"PRIu64
,
821 le64toh(o
->field
.next_hash_offset
),
822 le64toh(o
->field
.head_data_offset
),
829 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) % sizeof(EntryItem
) != 0) {
831 "Bad entry size (<= %zu): %"PRIu64
": %"PRIu64
,
832 offsetof(EntryObject
, items
),
833 le64toh(o
->object
.size
),
838 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
) <= 0) {
840 "Invalid number items in entry: %"PRIu64
": %"PRIu64
,
841 (le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
),
846 if (le64toh(o
->entry
.seqnum
) <= 0) {
848 "Invalid entry seqnum: %"PRIx64
": %"PRIu64
,
849 le64toh(o
->entry
.seqnum
),
854 if (!VALID_REALTIME(le64toh(o
->entry
.realtime
))) {
856 "Invalid entry realtime timestamp: %"PRIu64
": %"PRIu64
,
857 le64toh(o
->entry
.realtime
),
862 if (!VALID_MONOTONIC(le64toh(o
->entry
.monotonic
))) {
864 "Invalid entry monotonic timestamp: %"PRIu64
": %"PRIu64
,
865 le64toh(o
->entry
.monotonic
),
872 case OBJECT_DATA_HASH_TABLE
:
873 case OBJECT_FIELD_HASH_TABLE
:
874 if ((le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) % sizeof(HashItem
) != 0 ||
875 (le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) / sizeof(HashItem
) <= 0) {
877 "Invalid %s hash table size: %"PRIu64
": %"PRIu64
,
878 o
->object
.type
== OBJECT_DATA_HASH_TABLE
? "data" : "field",
879 le64toh(o
->object
.size
),
886 case OBJECT_ENTRY_ARRAY
:
887 if ((le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) % sizeof(le64_t
) != 0 ||
888 (le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) / sizeof(le64_t
) <= 0) {
890 "Invalid object entry array size: %"PRIu64
": %"PRIu64
,
891 le64toh(o
->object
.size
),
896 if (!VALID64(le64toh(o
->entry_array
.next_entry_array_offset
))) {
898 "Invalid object entry array next_entry_array_offset: "OFSfmt
": %"PRIu64
,
899 le64toh(o
->entry_array
.next_entry_array_offset
),
907 if (le64toh(o
->object
.size
) != sizeof(TagObject
)) {
909 "Invalid object tag size: %"PRIu64
": %"PRIu64
,
910 le64toh(o
->object
.size
),
915 if (!VALID_EPOCH(le64toh(o
->tag
.epoch
))) {
917 "Invalid object tag epoch: %"PRIu64
": %"PRIu64
,
918 le64toh(o
->tag
.epoch
),
929 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
939 /* Objects may only be located at multiple of 64 bit */
940 if (!VALID64(offset
)) {
941 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64
, offset
);
945 /* Object may not be located in the file header */
946 if (offset
< le64toh(f
->header
->header_size
)) {
947 log_debug("Attempt to move to object located in file header: %" PRIu64
, offset
);
951 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
, &tsize
);
956 s
= le64toh(o
->object
.size
);
959 log_debug("Attempt to move to uninitialized object: %" PRIu64
, offset
);
962 if (s
< sizeof(ObjectHeader
)) {
963 log_debug("Attempt to move to overly short object: %" PRIu64
, offset
);
967 if (o
->object
.type
<= OBJECT_UNUSED
) {
968 log_debug("Attempt to move to object with invalid type: %" PRIu64
, offset
);
972 if (s
< minimum_header_size(o
)) {
973 log_debug("Attempt to move to truncated object: %" PRIu64
, offset
);
977 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
) {
978 log_debug("Attempt to move to object of unexpected type: %" PRIu64
, offset
);
983 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
, NULL
);
990 r
= journal_file_check_object(f
, offset
, o
);
998 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
1004 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
1007 /* If an external seqnum counter was passed, we update
1008 * both the local and the external one, and set it to
1009 * the maximum of both */
1011 if (*seqnum
+ 1 > r
)
1017 f
->header
->tail_entry_seqnum
= htole64(r
);
1019 if (f
->header
->head_entry_seqnum
== 0)
1020 f
->header
->head_entry_seqnum
= htole64(r
);
1025 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
1033 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
1034 assert(size
>= sizeof(ObjectHeader
));
1038 r
= journal_file_set_online(f
);
1042 p
= le64toh(f
->header
->tail_object_offset
);
1044 p
= le64toh(f
->header
->header_size
);
1046 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
1050 p
+= ALIGN64(le64toh(tail
->object
.size
));
1053 r
= journal_file_allocate(f
, p
, size
);
1057 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
, NULL
);
1064 o
->object
.type
= type
;
1065 o
->object
.size
= htole64(size
);
1067 f
->header
->tail_object_offset
= htole64(p
);
1068 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
1076 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
1084 /* We estimate that we need 1 hash table entry per 768 bytes
1085 of journal file and we want to make sure we never get
1086 beyond 75% fill level. Calculate the hash table size for
1087 the maximum file size based on these metrics. */
1089 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
1090 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
1091 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
1093 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
1095 r
= journal_file_append_object(f
,
1096 OBJECT_DATA_HASH_TABLE
,
1097 offsetof(Object
, hash_table
.items
) + s
,
1102 memzero(o
->hash_table
.items
, s
);
1104 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1105 f
->header
->data_hash_table_size
= htole64(s
);
1110 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
1118 /* We use a fixed size hash table for the fields as this
1119 * number should grow very slowly only */
1121 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
1122 r
= journal_file_append_object(f
,
1123 OBJECT_FIELD_HASH_TABLE
,
1124 offsetof(Object
, hash_table
.items
) + s
,
1129 memzero(o
->hash_table
.items
, s
);
1131 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1132 f
->header
->field_hash_table_size
= htole64(s
);
1137 int journal_file_map_data_hash_table(JournalFile
*f
) {
1145 if (f
->data_hash_table
)
1148 p
= le64toh(f
->header
->data_hash_table_offset
);
1149 s
= le64toh(f
->header
->data_hash_table_size
);
1151 r
= journal_file_move_to(f
,
1152 OBJECT_DATA_HASH_TABLE
,
1159 f
->data_hash_table
= t
;
1163 int journal_file_map_field_hash_table(JournalFile
*f
) {
1171 if (f
->field_hash_table
)
1174 p
= le64toh(f
->header
->field_hash_table_offset
);
1175 s
= le64toh(f
->header
->field_hash_table_size
);
1177 r
= journal_file_move_to(f
,
1178 OBJECT_FIELD_HASH_TABLE
,
1185 f
->field_hash_table
= t
;
1189 static int journal_file_link_field(
1200 assert(f
->field_hash_table
);
1204 if (o
->object
.type
!= OBJECT_FIELD
)
1207 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1211 /* This might alter the window we are looking at */
1212 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
1215 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
1217 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
1219 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1223 o
->field
.next_hash_offset
= htole64(offset
);
1226 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1228 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
1229 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
1234 static int journal_file_link_data(
1245 assert(f
->data_hash_table
);
1249 if (o
->object
.type
!= OBJECT_DATA
)
1252 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1256 /* This might alter the window we are looking at */
1257 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
1258 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
1259 o
->data
.n_entries
= 0;
1262 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
1264 /* Only entry in the hash table is easy */
1265 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
1267 /* Move back to the previous data object, to patch in
1270 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1274 o
->data
.next_hash_offset
= htole64(offset
);
1277 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1279 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
1280 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
1285 int journal_file_find_field_object_with_hash(
1287 const void *field
, uint64_t size
, uint64_t hash
,
1288 Object
**ret
, uint64_t *offset
) {
1290 uint64_t p
, osize
, h
, m
;
1295 assert(field
&& size
> 0);
1297 /* If the field hash table is empty, we can't find anything */
1298 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
1301 /* Map the field hash table, if it isn't mapped yet. */
1302 r
= journal_file_map_field_hash_table(f
);
1306 osize
= offsetof(Object
, field
.payload
) + size
;
1308 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1313 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
1318 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1322 if (le64toh(o
->field
.hash
) == hash
&&
1323 le64toh(o
->object
.size
) == osize
&&
1324 memcmp(o
->field
.payload
, field
, size
) == 0) {
1334 p
= le64toh(o
->field
.next_hash_offset
);
1340 int journal_file_find_field_object(
1342 const void *field
, uint64_t size
,
1343 Object
**ret
, uint64_t *offset
) {
1348 assert(field
&& size
> 0);
1350 hash
= hash64(field
, size
);
1352 return journal_file_find_field_object_with_hash(f
,
1357 int journal_file_find_data_object_with_hash(
1359 const void *data
, uint64_t size
, uint64_t hash
,
1360 Object
**ret
, uint64_t *offset
) {
1362 uint64_t p
, osize
, h
, m
;
1367 assert(data
|| size
== 0);
1369 /* If there's no data hash table, then there's no entry. */
1370 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
1373 /* Map the data hash table, if it isn't mapped yet. */
1374 r
= journal_file_map_data_hash_table(f
);
1378 osize
= offsetof(Object
, data
.payload
) + size
;
1380 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1385 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
1390 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1394 if (le64toh(o
->data
.hash
) != hash
)
1397 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
1398 #if HAVE_XZ || HAVE_LZ4
1402 l
= le64toh(o
->object
.size
);
1403 if (l
<= offsetof(Object
, data
.payload
))
1406 l
-= offsetof(Object
, data
.payload
);
1408 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
1409 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
1413 if (rsize
== size
&&
1414 memcmp(f
->compress_buffer
, data
, size
) == 0) {
1425 return -EPROTONOSUPPORT
;
1427 } else if (le64toh(o
->object
.size
) == osize
&&
1428 memcmp(o
->data
.payload
, data
, size
) == 0) {
1440 p
= le64toh(o
->data
.next_hash_offset
);
1446 int journal_file_find_data_object(
1448 const void *data
, uint64_t size
,
1449 Object
**ret
, uint64_t *offset
) {
1454 assert(data
|| size
== 0);
1456 hash
= hash64(data
, size
);
1458 return journal_file_find_data_object_with_hash(f
,
1463 static int journal_file_append_field(
1465 const void *field
, uint64_t size
,
1466 Object
**ret
, uint64_t *offset
) {
1474 assert(field
&& size
> 0);
1476 hash
= hash64(field
, size
);
1478 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1492 osize
= offsetof(Object
, field
.payload
) + size
;
1493 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1497 o
->field
.hash
= htole64(hash
);
1498 memcpy(o
->field
.payload
, field
, size
);
1500 r
= journal_file_link_field(f
, o
, p
, hash
);
1504 /* The linking might have altered the window, so let's
1505 * refresh our pointer */
1506 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1511 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1525 static int journal_file_append_data(
1527 const void *data
, uint64_t size
,
1528 Object
**ret
, uint64_t *offset
) {
1533 int r
, compression
= 0;
1537 assert(data
|| size
== 0);
1539 hash
= hash64(data
, size
);
1541 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1555 osize
= offsetof(Object
, data
.payload
) + size
;
1556 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1560 o
->data
.hash
= htole64(hash
);
1562 #if HAVE_XZ || HAVE_LZ4
1563 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1566 compression
= compress_blob(data
, size
, o
->data
.payload
, size
- 1, &rsize
);
1568 if (compression
>= 0) {
1569 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1570 o
->object
.flags
|= compression
;
1572 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1573 size
, rsize
, object_compressed_to_string(compression
));
1575 /* Compression didn't work, we don't really care why, let's continue without compression */
1580 if (compression
== 0)
1581 memcpy_safe(o
->data
.payload
, data
, size
);
1583 r
= journal_file_link_data(f
, o
, p
, hash
);
1588 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1593 /* The linking might have altered the window, so let's
1594 * refresh our pointer */
1595 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1602 eq
= memchr(data
, '=', size
);
1603 if (eq
&& eq
> data
) {
1607 /* Create field object ... */
1608 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1612 /* ... and link it in. */
1613 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1614 fo
->field
.head_data_offset
= le64toh(p
);
1626 uint64_t journal_file_entry_n_items(Object
*o
) {
1629 if (o
->object
.type
!= OBJECT_ENTRY
)
1632 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1635 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1638 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1641 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1644 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1647 if (!IN_SET(o
->object
.type
, OBJECT_DATA_HASH_TABLE
, OBJECT_FIELD_HASH_TABLE
))
1650 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1653 static int link_entry_into_array(JournalFile
*f
,
1658 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1667 a
= le64toh(*first
);
1668 i
= hidx
= le64toh(*idx
);
1671 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1675 n
= journal_file_entry_array_n_items(o
);
1677 o
->entry_array
.items
[i
] = htole64(p
);
1678 *idx
= htole64(hidx
+ 1);
1684 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1695 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1696 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1702 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1707 o
->entry_array
.items
[i
] = htole64(p
);
1710 *first
= htole64(q
);
1712 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1716 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1719 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1720 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1722 *idx
= htole64(hidx
+ 1);
1727 static int link_entry_into_array_plus_one(JournalFile
*f
,
1742 *extra
= htole64(p
);
1746 i
= htole64(le64toh(*idx
) - 1);
1747 r
= link_entry_into_array(f
, first
, &i
, p
);
1752 *idx
= htole64(le64toh(*idx
) + 1);
1756 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1763 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1767 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1771 return link_entry_into_array_plus_one(f
,
1772 &o
->data
.entry_offset
,
1773 &o
->data
.entry_array_offset
,
1778 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1787 if (o
->object
.type
!= OBJECT_ENTRY
)
1790 __sync_synchronize();
1792 /* Link up the entry itself */
1793 r
= link_entry_into_array(f
,
1794 &f
->header
->entry_array_offset
,
1795 &f
->header
->n_entries
,
1800 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1802 if (f
->header
->head_entry_realtime
== 0)
1803 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1805 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1806 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1808 f
->tail_entry_monotonic_valid
= true;
1810 /* Link up the items */
1811 n
= journal_file_entry_n_items(o
);
1812 for (i
= 0; i
< n
; i
++) {
1813 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1821 static int journal_file_append_entry_internal(
1823 const dual_timestamp
*ts
,
1825 const EntryItem items
[], unsigned n_items
,
1827 Object
**ret
, uint64_t *offset
) {
1835 assert(items
|| n_items
== 0);
1838 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1840 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1844 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1845 memcpy_safe(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1846 o
->entry
.realtime
= htole64(ts
->realtime
);
1847 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1848 o
->entry
.xor_hash
= htole64(xor_hash
);
1849 o
->entry
.boot_id
= f
->header
->boot_id
;
1852 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1857 r
= journal_file_link_entry(f
, o
, np
);
1870 void journal_file_post_change(JournalFile
*f
) {
1873 /* inotify() does not receive IN_MODIFY events from file
1874 * accesses done via mmap(). After each access we hence
1875 * trigger IN_MODIFY by truncating the journal file to its
1876 * current size which triggers IN_MODIFY. */
1878 __sync_synchronize();
1880 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1881 log_debug_errno(errno
, "Failed to truncate file to its own size: %m");
1884 static int post_change_thunk(sd_event_source
*timer
, uint64_t usec
, void *userdata
) {
1887 journal_file_post_change(userdata
);
1892 static void schedule_post_change(JournalFile
*f
) {
1893 sd_event_source
*timer
;
1898 assert(f
->post_change_timer
);
1900 timer
= f
->post_change_timer
;
1902 r
= sd_event_source_get_enabled(timer
, &enabled
);
1904 log_debug_errno(r
, "Failed to get ftruncate timer state: %m");
1908 if (enabled
== SD_EVENT_ONESHOT
)
1911 r
= sd_event_now(sd_event_source_get_event(timer
), CLOCK_MONOTONIC
, &now
);
1913 log_debug_errno(r
, "Failed to get clock's now for scheduling ftruncate: %m");
1917 r
= sd_event_source_set_time(timer
, now
+f
->post_change_timer_period
);
1919 log_debug_errno(r
, "Failed to set time for scheduling ftruncate: %m");
1923 r
= sd_event_source_set_enabled(timer
, SD_EVENT_ONESHOT
);
1925 log_debug_errno(r
, "Failed to enable scheduled ftruncate: %m");
1932 /* On failure, let's simply post the change immediately. */
1933 journal_file_post_change(f
);
1936 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1937 int journal_file_enable_post_change_timer(JournalFile
*f
, sd_event
*e
, usec_t t
) {
1938 _cleanup_(sd_event_source_unrefp
) sd_event_source
*timer
= NULL
;
1942 assert_return(!f
->post_change_timer
, -EINVAL
);
1946 r
= sd_event_add_time(e
, &timer
, CLOCK_MONOTONIC
, 0, 0, post_change_thunk
, f
);
1950 r
= sd_event_source_set_enabled(timer
, SD_EVENT_OFF
);
1954 f
->post_change_timer
= timer
;
1956 f
->post_change_timer_period
= t
;
1961 static int entry_item_cmp(const void *_a
, const void *_b
) {
1962 const EntryItem
*a
= _a
, *b
= _b
;
1964 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1966 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1971 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1975 uint64_t xor_hash
= 0;
1976 struct dual_timestamp _ts
;
1980 assert(iovec
|| n_iovec
== 0);
1983 dual_timestamp_get(&_ts
);
1988 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1993 /* alloca() can't take 0, hence let's allocate at least one */
1994 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1996 for (i
= 0; i
< n_iovec
; i
++) {
2000 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
2004 xor_hash
^= le64toh(o
->data
.hash
);
2005 items
[i
].object_offset
= htole64(p
);
2006 items
[i
].hash
= o
->data
.hash
;
2009 /* Order by the position on disk, in order to improve seek
2010 * times for rotating media. */
2011 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
2013 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
2015 /* If the memory mapping triggered a SIGBUS then we return an
2016 * IO error and ignore the error code passed down to us, since
2017 * it is very likely just an effect of a nullified replacement
2020 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
2023 if (f
->post_change_timer
)
2024 schedule_post_change(f
);
2026 journal_file_post_change(f
);
/* Per-chain cache entry used to speed up repeated traversal/bisection
 * of entry-array chains (keyed by the chain's first array offset). */
typedef struct ChainCacheItem {
        uint64_t first;      /* the array at the beginning of the chain */
        uint64_t array;      /* the cached array */
        uint64_t begin;      /* the first item in the cached array */
        uint64_t total;      /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
2039 static void chain_cache_put(
2046 uint64_t last_index
) {
2049 /* If the chain item to cache for this chain is the
2050 * first one it's not worth caching anything */
2054 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
2055 ci
= ordered_hashmap_steal_first(h
);
2058 ci
= new(ChainCacheItem
, 1);
2065 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
2070 assert(ci
->first
== first
);
2075 ci
->last_index
= last_index
;
2078 static int generic_array_get(
2082 Object
**ret
, uint64_t *offset
) {
2085 uint64_t p
= 0, a
, t
= 0;
2093 /* Try the chain cache first */
2094 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2095 if (ci
&& i
> ci
->total
) {
2104 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
2108 k
= journal_file_entry_array_n_items(o
);
2110 p
= le64toh(o
->entry_array
.items
[i
]);
2116 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
2122 /* Let's cache this item for the next invocation */
2123 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
2125 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2138 static int generic_array_get_plus_one(
2143 Object
**ret
, uint64_t *offset
) {
2152 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2165 return generic_array_get(f
, first
, i
-1, ret
, offset
);
2174 static int generic_array_bisect(
2179 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2180 direction_t direction
,
2185 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
2186 bool subtract_one
= false;
2187 Object
*o
, *array
= NULL
;
2192 assert(test_object
);
2194 /* Start with the first array in the chain */
2197 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2198 if (ci
&& n
> ci
->total
) {
2199 /* Ah, we have iterated this bisection array chain
2200 * previously! Let's see if we can skip ahead in the
2201 * chain, as far as the last time. But we can't jump
2202 * backwards in the chain, so let's check that
2205 r
= test_object(f
, ci
->begin
, needle
);
2209 if (r
== TEST_LEFT
) {
2210 /* OK, what we are looking for is right of the
2211 * begin of this EntryArray, so let's jump
2212 * straight to previously cached array in the
2218 last_index
= ci
->last_index
;
2223 uint64_t left
, right
, k
, lp
;
2225 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
2229 k
= journal_file_entry_array_n_items(array
);
2235 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
2239 r
= test_object(f
, p
, needle
);
2240 if (r
== -EBADMSG
) {
2241 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2248 if (r
== TEST_FOUND
)
2249 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2251 if (r
== TEST_RIGHT
) {
2255 if (last_index
!= (uint64_t) -1) {
2256 assert(last_index
<= right
);
2258 /* If we cached the last index we
2259 * looked at, let's try to not to jump
2260 * too wildly around and see if we can
2261 * limit the range to look at early to
2262 * the immediate neighbors of the last
2263 * index we looked at. */
2265 if (last_index
> 0) {
2266 uint64_t x
= last_index
- 1;
2268 p
= le64toh(array
->entry_array
.items
[x
]);
2272 r
= test_object(f
, p
, needle
);
2276 if (r
== TEST_FOUND
)
2277 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2279 if (r
== TEST_RIGHT
)
2285 if (last_index
< right
) {
2286 uint64_t y
= last_index
+ 1;
2288 p
= le64toh(array
->entry_array
.items
[y
]);
2292 r
= test_object(f
, p
, needle
);
2296 if (r
== TEST_FOUND
)
2297 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2299 if (r
== TEST_RIGHT
)
2307 if (left
== right
) {
2308 if (direction
== DIRECTION_UP
)
2309 subtract_one
= true;
2315 assert(left
< right
);
2316 i
= (left
+ right
) / 2;
2318 p
= le64toh(array
->entry_array
.items
[i
]);
2322 r
= test_object(f
, p
, needle
);
2323 if (r
== -EBADMSG
) {
2324 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2331 if (r
== TEST_FOUND
)
2332 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2334 if (r
== TEST_RIGHT
)
2342 if (direction
== DIRECTION_UP
) {
2344 subtract_one
= true;
2355 last_index
= (uint64_t) -1;
2356 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
2362 if (subtract_one
&& t
== 0 && i
== 0)
2365 /* Let's cache this item for the next invocation */
2366 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
2368 if (subtract_one
&& i
== 0)
2370 else if (subtract_one
)
2371 p
= le64toh(array
->entry_array
.items
[i
-1]);
2373 p
= le64toh(array
->entry_array
.items
[i
]);
2375 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2386 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
2391 static int generic_array_bisect_plus_one(
2397 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2398 direction_t direction
,
2404 bool step_back
= false;
2408 assert(test_object
);
2413 /* This bisects the array in object 'first', but first checks
2415 r
= test_object(f
, extra
, needle
);
2419 if (r
== TEST_FOUND
)
2420 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2422 /* if we are looking with DIRECTION_UP then we need to first
2423 see if in the actual array there is a matching entry, and
2424 return the last one of that. But if there isn't any we need
2425 to return this one. Hence remember this, and return it
2428 step_back
= direction
== DIRECTION_UP
;
2430 if (r
== TEST_RIGHT
) {
2431 if (direction
== DIRECTION_DOWN
)
2437 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
2439 if (r
== 0 && step_back
)
2448 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2464 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2470 else if (p
< needle
)
2476 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2483 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2487 if (le64toh(o
->entry
.seqnum
) == needle
)
2489 else if (le64toh(o
->entry
.seqnum
) < needle
)
2495 int journal_file_move_to_entry_by_seqnum(
2498 direction_t direction
,
2504 return generic_array_bisect(f
,
2505 le64toh(f
->header
->entry_array_offset
),
2506 le64toh(f
->header
->n_entries
),
2513 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2520 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2524 if (le64toh(o
->entry
.realtime
) == needle
)
2526 else if (le64toh(o
->entry
.realtime
) < needle
)
2532 int journal_file_move_to_entry_by_realtime(
2535 direction_t direction
,
2541 return generic_array_bisect(f
,
2542 le64toh(f
->header
->entry_array_offset
),
2543 le64toh(f
->header
->n_entries
),
2545 test_object_realtime
,
2550 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2557 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2561 if (le64toh(o
->entry
.monotonic
) == needle
)
2563 else if (le64toh(o
->entry
.monotonic
) < needle
)
2569 static int find_data_object_by_boot_id(
2575 char t
[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2577 sd_id128_to_string(boot_id
, t
+ 9);
2578 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2581 int journal_file_move_to_entry_by_monotonic(
2585 direction_t direction
,
2594 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2600 return generic_array_bisect_plus_one(f
,
2601 le64toh(o
->data
.entry_offset
),
2602 le64toh(o
->data
.entry_array_offset
),
2603 le64toh(o
->data
.n_entries
),
2605 test_object_monotonic
,
2610 void journal_file_reset_location(JournalFile
*f
) {
2611 f
->location_type
= LOCATION_HEAD
;
2612 f
->current_offset
= 0;
2613 f
->current_seqnum
= 0;
2614 f
->current_realtime
= 0;
2615 f
->current_monotonic
= 0;
2616 zero(f
->current_boot_id
);
2617 f
->current_xor_hash
= 0;
2620 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2621 f
->location_type
= LOCATION_SEEK
;
2622 f
->current_offset
= offset
;
2623 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2624 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2625 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2626 f
->current_boot_id
= o
->entry
.boot_id
;
2627 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2630 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2635 assert(af
->location_type
== LOCATION_SEEK
);
2636 assert(bf
->location_type
== LOCATION_SEEK
);
2638 /* If contents and timestamps match, these entries are
2639 * identical, even if the seqnum does not match */
2640 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2641 af
->current_monotonic
== bf
->current_monotonic
&&
2642 af
->current_realtime
== bf
->current_realtime
&&
2643 af
->current_xor_hash
== bf
->current_xor_hash
)
2646 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2648 /* If this is from the same seqnum source, compare
2650 if (af
->current_seqnum
< bf
->current_seqnum
)
2652 if (af
->current_seqnum
> bf
->current_seqnum
)
2655 /* Wow! This is weird, different data but the same
2656 * seqnums? Something is borked, but let's make the
2657 * best of it and compare by time. */
2660 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2662 /* If the boot id matches, compare monotonic time */
2663 if (af
->current_monotonic
< bf
->current_monotonic
)
2665 if (af
->current_monotonic
> bf
->current_monotonic
)
2669 /* Otherwise, compare UTC time */
2670 if (af
->current_realtime
< bf
->current_realtime
)
2672 if (af
->current_realtime
> bf
->current_realtime
)
2675 /* Finally, compare by contents */
2676 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2678 if (af
->current_xor_hash
> bf
->current_xor_hash
)
/* Advance *i by one step in the given direction (down = increment,
 * up = decrement).  NOTE(review): the body handling both directions
 * and its behaviour at the array bounds (presumably returning 0 when
 * the index cannot move further, nonzero otherwise) is elided from
 * this view — confirm against the full implementation before editing. */
2684 static int bump_array_index(uint64_t *i
, direction_t direction
, uint64_t n
) {
2686 /* Increase or decrease the specified index, in the right direction. */
2688 if (direction
== DIRECTION_DOWN
) {
2703 static bool check_properly_ordered(uint64_t new_offset
, uint64_t old_offset
, direction_t direction
) {
2705 /* Consider it an error if any of the two offsets is uninitialized */
2706 if (old_offset
== 0 || new_offset
== 0)
2709 /* If we go down, the new offset must be larger than the old one. */
2710 return direction
== DIRECTION_DOWN
?
2711 new_offset
> old_offset
:
2712 new_offset
< old_offset
;
2715 int journal_file_next_entry(
2718 direction_t direction
,
2719 Object
**ret
, uint64_t *offset
) {
2727 n
= le64toh(f
->header
->n_entries
);
2732 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2734 r
= generic_array_bisect(f
,
2735 le64toh(f
->header
->entry_array_offset
),
2736 le64toh(f
->header
->n_entries
),
2745 r
= bump_array_index(&i
, direction
, n
);
2750 /* And jump to it */
2752 r
= generic_array_get(f
,
2753 le64toh(f
->header
->entry_array_offset
),
2761 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2762 * the next one might work for us instead. */
2763 log_debug_errno(r
, "Entry item %" PRIu64
" is bad, skipping over it.", i
);
2765 r
= bump_array_index(&i
, direction
, n
);
2770 /* Ensure our array is properly ordered. */
2771 if (p
> 0 && !check_properly_ordered(ofs
, p
, direction
)) {
2772 log_debug("%s: entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2782 int journal_file_next_entry_for_data(
2784 Object
*o
, uint64_t p
,
2785 uint64_t data_offset
,
2786 direction_t direction
,
2787 Object
**ret
, uint64_t *offset
) {
2794 assert(p
> 0 || !o
);
2796 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2800 n
= le64toh(d
->data
.n_entries
);
2805 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2807 if (o
->object
.type
!= OBJECT_ENTRY
)
2810 r
= generic_array_bisect_plus_one(f
,
2811 le64toh(d
->data
.entry_offset
),
2812 le64toh(d
->data
.entry_array_offset
),
2813 le64toh(d
->data
.n_entries
),
2823 r
= bump_array_index(&i
, direction
, n
);
2829 r
= generic_array_get_plus_one(f
,
2830 le64toh(d
->data
.entry_offset
),
2831 le64toh(d
->data
.entry_array_offset
),
2839 log_debug_errno(r
, "Data entry item %" PRIu64
" is bad, skipping over it.", i
);
2841 r
= bump_array_index(&i
, direction
, n
);
2846 /* Ensure our array is properly ordered. */
2847 if (p
> 0 && check_properly_ordered(ofs
, p
, direction
)) {
2848 log_debug("%s data entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2858 int journal_file_move_to_entry_by_offset_for_data(
2860 uint64_t data_offset
,
2862 direction_t direction
,
2863 Object
**ret
, uint64_t *offset
) {
2870 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2874 return generic_array_bisect_plus_one(f
,
2875 le64toh(d
->data
.entry_offset
),
2876 le64toh(d
->data
.entry_array_offset
),
2877 le64toh(d
->data
.n_entries
),
2884 int journal_file_move_to_entry_by_monotonic_for_data(
2886 uint64_t data_offset
,
2889 direction_t direction
,
2890 Object
**ret
, uint64_t *offset
) {
2898 /* First, seek by time */
2899 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2905 r
= generic_array_bisect_plus_one(f
,
2906 le64toh(o
->data
.entry_offset
),
2907 le64toh(o
->data
.entry_array_offset
),
2908 le64toh(o
->data
.n_entries
),
2910 test_object_monotonic
,
2916 /* And now, continue seeking until we find an entry that
2917 * exists in both bisection arrays */
2923 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2927 r
= generic_array_bisect_plus_one(f
,
2928 le64toh(d
->data
.entry_offset
),
2929 le64toh(d
->data
.entry_array_offset
),
2930 le64toh(d
->data
.n_entries
),
2938 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2942 r
= generic_array_bisect_plus_one(f
,
2943 le64toh(o
->data
.entry_offset
),
2944 le64toh(o
->data
.entry_array_offset
),
2945 le64toh(o
->data
.n_entries
),
2967 int journal_file_move_to_entry_by_seqnum_for_data(
2969 uint64_t data_offset
,
2971 direction_t direction
,
2972 Object
**ret
, uint64_t *offset
) {
2979 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2983 return generic_array_bisect_plus_one(f
,
2984 le64toh(d
->data
.entry_offset
),
2985 le64toh(d
->data
.entry_array_offset
),
2986 le64toh(d
->data
.n_entries
),
2993 int journal_file_move_to_entry_by_realtime_for_data(
2995 uint64_t data_offset
,
2997 direction_t direction
,
2998 Object
**ret
, uint64_t *offset
) {
3005 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
3009 return generic_array_bisect_plus_one(f
,
3010 le64toh(d
->data
.entry_offset
),
3011 le64toh(d
->data
.entry_array_offset
),
3012 le64toh(d
->data
.n_entries
),
3014 test_object_realtime
,
3019 void journal_file_dump(JournalFile
*f
) {
3027 journal_file_print_header(f
);
3029 p
= le64toh(f
->header
->header_size
);
3031 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
3035 switch (o
->object
.type
) {
3038 printf("Type: OBJECT_UNUSED\n");
3042 printf("Type: OBJECT_DATA\n");
3046 printf("Type: OBJECT_FIELD\n");
3050 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
3051 le64toh(o
->entry
.seqnum
),
3052 le64toh(o
->entry
.monotonic
),
3053 le64toh(o
->entry
.realtime
));
3056 case OBJECT_FIELD_HASH_TABLE
:
3057 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3060 case OBJECT_DATA_HASH_TABLE
:
3061 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3064 case OBJECT_ENTRY_ARRAY
:
3065 printf("Type: OBJECT_ENTRY_ARRAY\n");
3069 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
3070 le64toh(o
->tag
.seqnum
),
3071 le64toh(o
->tag
.epoch
));
3075 printf("Type: unknown (%i)\n", o
->object
.type
);
3079 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
3080 printf("Flags: %s\n",
3081 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
3083 if (p
== le64toh(f
->header
->tail_object_offset
))
3086 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
3091 log_error("File corrupt");
3094 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
3097 x
= format_timestamp(buf
, l
, t
);
3103 void journal_file_print_header(JournalFile
*f
) {
3104 char a
[33], b
[33], c
[33], d
[33];
3105 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
3107 char bytes
[FORMAT_BYTES_MAX
];
3112 printf("File Path: %s\n"
3116 "Sequential Number ID: %s\n"
3118 "Compatible Flags:%s%s\n"
3119 "Incompatible Flags:%s%s%s\n"
3120 "Header size: %"PRIu64
"\n"
3121 "Arena size: %"PRIu64
"\n"
3122 "Data Hash Table Size: %"PRIu64
"\n"
3123 "Field Hash Table Size: %"PRIu64
"\n"
3124 "Rotate Suggested: %s\n"
3125 "Head Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3126 "Tail Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3127 "Head Realtime Timestamp: %s (%"PRIx64
")\n"
3128 "Tail Realtime Timestamp: %s (%"PRIx64
")\n"
3129 "Tail Monotonic Timestamp: %s (%"PRIx64
")\n"
3130 "Objects: %"PRIu64
"\n"
3131 "Entry Objects: %"PRIu64
"\n",
3133 sd_id128_to_string(f
->header
->file_id
, a
),
3134 sd_id128_to_string(f
->header
->machine_id
, b
),
3135 sd_id128_to_string(f
->header
->boot_id
, c
),
3136 sd_id128_to_string(f
->header
->seqnum_id
, d
),
3137 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
3138 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
3139 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
3140 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
3141 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
3142 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
3143 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
3144 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
3145 le64toh(f
->header
->header_size
),
3146 le64toh(f
->header
->arena_size
),
3147 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3148 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
3149 yes_no(journal_file_rotate_suggested(f
, 0)),
3150 le64toh(f
->header
->head_entry_seqnum
), le64toh(f
->header
->head_entry_seqnum
),
3151 le64toh(f
->header
->tail_entry_seqnum
), le64toh(f
->header
->tail_entry_seqnum
),
3152 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)), le64toh(f
->header
->head_entry_realtime
),
3153 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)), le64toh(f
->header
->tail_entry_realtime
),
3154 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
), le64toh(f
->header
->tail_entry_monotonic
),
3155 le64toh(f
->header
->n_objects
),
3156 le64toh(f
->header
->n_entries
));
3158 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3159 printf("Data Objects: %"PRIu64
"\n"
3160 "Data Hash Table Fill: %.1f%%\n",
3161 le64toh(f
->header
->n_data
),
3162 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
3164 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3165 printf("Field Objects: %"PRIu64
"\n"
3166 "Field Hash Table Fill: %.1f%%\n",
3167 le64toh(f
->header
->n_fields
),
3168 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
3170 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
3171 printf("Tag Objects: %"PRIu64
"\n",
3172 le64toh(f
->header
->n_tags
));
3173 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
3174 printf("Entry Array Objects: %"PRIu64
"\n",
3175 le64toh(f
->header
->n_entry_arrays
));
3177 if (fstat(f
->fd
, &st
) >= 0)
3178 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
3181 static int journal_file_warn_btrfs(JournalFile
*f
) {
3187 /* Before we write anything, check if the COW logic is turned
3188 * off on btrfs. Given our write pattern that is quite
3189 * unfriendly to COW file systems this should greatly improve
3190 * performance on COW file systems, such as btrfs, at the
3191 * expense of data integrity features (which shouldn't be too
3192 * bad, given that we do our own checksumming). */
3194 r
= btrfs_is_filesystem(f
->fd
);
3196 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
3200 r
= read_attr_fd(f
->fd
, &attrs
);
3202 return log_warning_errno(r
, "Failed to read file attributes: %m");
3204 if (attrs
& FS_NOCOW_FL
) {
3205 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3209 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3210 "This is likely to slow down journal access substantially, please consider turning "
3211 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
3216 int journal_file_open(
3223 JournalMetrics
*metrics
,
3224 MMapCache
*mmap_cache
,
3225 Set
*deferred_closes
,
3226 JournalFile
*template,
3227 JournalFile
**ret
) {
3229 bool newly_created
= false;
3235 assert(fd
>= 0 || fname
);
3237 if (!IN_SET((flags
& O_ACCMODE
), O_RDONLY
, O_RDWR
))
3241 if (!endswith(fname
, ".journal") &&
3242 !endswith(fname
, ".journal~"))
3246 f
= new0(JournalFile
, 1);
3254 f
->prot
= prot_from_flags(flags
);
3255 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
3257 f
->compress_lz4
= compress
;
3259 f
->compress_xz
= compress
;
3266 f
->mmap
= mmap_cache_ref(mmap_cache
);
3268 f
->mmap
= mmap_cache_new();
3276 f
->path
= strdup(fname
);
3282 /* If we don't know the path, fill in something explanatory and vaguely useful */
3283 if (asprintf(&f
->path
, "/proc/self/%i", fd
) < 0) {
3289 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
3290 if (!f
->chain_cache
) {
3296 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
3302 /* fds we opened here by us should also be closed by us. */
3306 f
->cache_fd
= mmap_cache_add_fd(f
->mmap
, f
->fd
);
3312 r
= journal_file_fstat(f
);
3316 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
3318 (void) journal_file_warn_btrfs(f
);
3320 /* Let's attach the creation time to the journal file,
3321 * so that the vacuuming code knows the age of this
3322 * file even if the file might end up corrupted one
3323 * day... Ideally we'd just use the creation time many
3324 * file systems maintain for each file, but there is
3325 * currently no usable API to query this, hence let's
3326 * emulate this via extended attributes. If extended
3327 * attributes are not supported we'll just skip this,
3328 * and rely solely on mtime/atime/ctime of the file. */
3330 fd_setcrtime(f
->fd
, 0);
3333 /* Try to load the FSPRG state, and if we can't, then
3334 * just don't do sealing */
3336 r
= journal_file_fss_load(f
);
3342 r
= journal_file_init_header(f
, template);
3346 r
= journal_file_fstat(f
);
3350 newly_created
= true;
3353 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
3358 r
= mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
, NULL
);
3364 if (!newly_created
) {
3365 set_clear_with_destructor(deferred_closes
, journal_file_close
);
3367 r
= journal_file_verify_header(f
);
3373 if (!newly_created
&& f
->writable
) {
3374 r
= journal_file_fss_load(f
);
3382 journal_default_metrics(metrics
, f
->fd
);
3383 f
->metrics
= *metrics
;
3384 } else if (template)
3385 f
->metrics
= template->metrics
;
3387 r
= journal_file_refresh_header(f
);
3393 r
= journal_file_hmac_setup(f
);
3398 if (newly_created
) {
3399 r
= journal_file_setup_field_hash_table(f
);
3403 r
= journal_file_setup_data_hash_table(f
);
3408 r
= journal_file_append_first_tag(f
);
3414 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
)) {
3419 if (template && template->post_change_timer
) {
3420 r
= journal_file_enable_post_change_timer(
3422 sd_event_source_get_event(template->post_change_timer
),
3423 template->post_change_timer_period
);
3429 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3436 if (f
->cache_fd
&& mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
3439 (void) journal_file_close(f
);
3444 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
, Set
*deferred_closes
) {
3445 _cleanup_free_
char *p
= NULL
;
3447 JournalFile
*old_file
, *new_file
= NULL
;
3455 if (!old_file
->writable
)
3458 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3459 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3460 if (path_startswith(old_file
->path
, "/proc/self/fd"))
3463 if (!endswith(old_file
->path
, ".journal"))
3466 l
= strlen(old_file
->path
);
3467 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
3468 (int) l
- 8, old_file
->path
,
3469 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
3470 le64toh((*f
)->header
->head_entry_seqnum
),
3471 le64toh((*f
)->header
->head_entry_realtime
));
3475 /* Try to rename the file to the archived version. If the file
3476 * already was deleted, we'll get ENOENT, let's ignore that
3478 r
= rename(old_file
->path
, p
);
3479 if (r
< 0 && errno
!= ENOENT
)
3482 /* Sync the rename to disk */
3483 (void) fsync_directory_of_file(old_file
->fd
);
3485 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3486 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3487 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3488 * would result in the rotated journal never getting fsync() called before closing.
3489 * Now we simply queue the archive state by setting an archive bit, leaving the state
3490 * as STATE_ONLINE so proper offlining occurs. */
3491 old_file
->archive
= true;
3493 /* Currently, btrfs is not very good with out write patterns
3494 * and fragments heavily. Let's defrag our journal files when
3495 * we archive them */
3496 old_file
->defrag_on_close
= true;
3498 r
= journal_file_open(-1, old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, deferred_closes
, old_file
, &new_file
);
3500 if (deferred_closes
&&
3501 set_put(deferred_closes
, old_file
) >= 0)
3502 (void) journal_file_set_offline(old_file
, false);
3504 (void) journal_file_close(old_file
);
/* Like journal_file_open(), but if the file turns out to be corrupted or was
 * shut down uncleanly, renames it out of the way (with a ".journal~" suffix)
 * and retries the open exactly once, creating a fresh file.
 *
 * NOTE(review): this excerpt appears truncated — the leading parameters
 * (presumably fname, flags, mode, compress, seal) and several error-return
 * lines are not visible here; confirm against the full source. */
int journal_file_open_reliably(
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                Set *deferred_closes,
                JournalFile *template,
                JournalFile **ret) {

        _cleanup_free_ char *p = NULL;

        /* First attempt: a plain open. */
        r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
        /* Only these errors indicate a damaged/unusable file worth rotating
         * away; anything else is propagated to the caller unchanged. */
                    -EBADMSG,           /* Corrupted */
                    -ENODATA,           /* Truncated */
                    -EHOSTDOWN,         /* Other machine */
                    -EPROTONOSUPPORT,   /* Incompatible feature */
                    -EBUSY,             /* Unclean shutdown */
                    -ESHUTDOWN,         /* Already archived */
                    -EIO,               /* IO error, including SIGBUS on mmap */
                    -EIDRM,             /* File has been deleted */
                    -ETXTBSY))          /* File is from the future */

        /* Read-only callers may not rename/replace the file. */
        if ((flags & O_ACCMODE) == O_RDONLY)

        /* Without O_CREAT we would not be allowed to create the replacement. */
        if (!(flags & O_CREAT))

        if (!endswith(fname, ".journal"))

        /* The file is corrupted. Rotate it away and try it again (but only once) */

        /* Archived name: "<prefix>@<realtime>-<random>.journal~" (the "~"
         * marks it as a damaged file set aside, not a regular archive). */
        if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
                     now(CLOCK_REALTIME),

        if (rename(fname, p) < 0)

        /* btrfs doesn't cope well with our write pattern and
         * fragments heavily. Let's defrag all files we rotate */

        /* Drop NOCOW on the set-aside copy so btrfs can actually defrag it. */
        (void) chattr_path(p, 0, FS_NOCOW_FL);
        (void) btrfs_defrag(p);

        log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);

        /* Second (and final) attempt, now with a freshly created file. */
        return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
/* Copies a single entry object o (located at offset p in journal file "from")
 * into journal file "to": each referenced data object is re-appended to the
 * target (decompressing if needed), then the entry itself is appended with a
 * freshly computed XOR hash. seqnum/ret/offset are forwarded to the append.
 *
 * NOTE(review): this excerpt appears truncated — local declarations (i, n, r,
 * items, ts, and the per-item locals) and the error-return lines after each
 * call are not visible here; confirm against the full source. */
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
        uint64_t q, xor_hash = 0;

        /* Preserve the source entry's timestamps (stored little-endian on disk). */
        ts.monotonic = le64toh(o->entry.monotonic);
        ts.realtime = le64toh(o->entry.realtime);

        n = journal_file_entry_n_items(o);
        /* alloca() can't take 0, hence let's allocate at least one */
        items = alloca(sizeof(EntryItem) * MAX(1u, n));

        for (i = 0; i < n; i++) {

                /* Offset and recorded hash of the i-th data object the entry references. */
                q = le64toh(o->entry.items[i].object_offset);
                le_hash = o->entry.items[i].hash;

                /* NB: this re-points o at the data object — it no longer
                 * references the entry afterwards. */
                r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);

                /* Cross-check the entry's stored hash against the data object's own. */
                if (le_hash != o->data.hash)

                /* Payload length = object size minus the data-object header. */
                l = le64toh(o->object.size) - offsetof(Object, data.payload);

                /* We hit the limit on 32bit machines */
                if ((uint64_t) t != l)

                if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if HAVE_XZ || HAVE_LZ4
                        /* Decompress into from's scratch buffer, which is reused
                         * (and grown as needed) across items. */
                        r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
                                            o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);

                        data = from->compress_buffer;

                        /* Built without compression support: compressed items cannot be copied. */
                        return -EPROTONOSUPPORT;

                        data = o->data.payload;

                /* Append the (possibly decompressed) payload to the target file;
                 * u/h receive the new data object and its offset there. */
                r = journal_file_append_data(to, data, l, &u, &h);

                /* The entry's XOR hash must be recomputed from the *target*
                 * file's data-object hashes, not copied from the source. */
                xor_hash ^= le64toh(u->data.hash);
                items[i].object_offset = htole64(h);
                items[i].hash = u->data.hash;

                /* Re-map the source entry at p, since o was re-pointed at a
                 * data object earlier in this iteration. */
                r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);

        r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);

        /* Check whether the mmap window took a SIGBUS during the writes
         * (presumably mapped to an I/O error below — confirm in full source). */
        if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3655 void journal_reset_metrics(JournalMetrics
*m
) {
3658 /* Set everything to "pick automatic values". */
3660 *m
= (JournalMetrics
) {
3661 .min_use
= (uint64_t) -1,
3662 .max_use
= (uint64_t) -1,
3663 .min_size
= (uint64_t) -1,
3664 .max_size
= (uint64_t) -1,
3665 .keep_free
= (uint64_t) -1,
3666 .n_max_files
= (uint64_t) -1,
3670 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
3671 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
3678 if (fstatvfs(fd
, &ss
) >= 0)
3679 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
3681 log_debug_errno(errno
, "Failed to detremine disk size: %m");
3685 if (m
->max_use
== (uint64_t) -1) {
3688 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3690 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3691 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3693 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3694 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3696 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3698 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3700 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3701 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3704 if (m
->min_use
== (uint64_t) -1)
3705 m
->min_use
= DEFAULT_MIN_USE
;
3707 if (m
->min_use
> m
->max_use
)
3708 m
->min_use
= m
->max_use
;
3710 if (m
->max_size
== (uint64_t) -1) {
3711 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3713 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3714 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3716 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3718 if (m
->max_size
!= 0) {
3719 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3720 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3722 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3723 m
->max_use
= m
->max_size
*2;
3726 if (m
->min_size
== (uint64_t) -1)
3727 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3729 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3731 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3732 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3734 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3735 m
->max_size
= m
->min_size
;
3738 if (m
->keep_free
== (uint64_t) -1) {
3741 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3743 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3744 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3747 m
->keep_free
= DEFAULT_KEEP_FREE
;
3750 if (m
->n_max_files
== (uint64_t) -1)
3751 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3753 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3754 format_bytes(a
, sizeof(a
), m
->min_use
),
3755 format_bytes(b
, sizeof(b
), m
->max_use
),
3756 format_bytes(c
, sizeof(c
), m
->max_size
),
3757 format_bytes(d
, sizeof(d
), m
->min_size
),
3758 format_bytes(e
, sizeof(e
), m
->keep_free
),
/* Reports the realtime (wall-clock) timestamp range covered by the entries in
 * journal file f, read straight from the mapped file header: *from gets the
 * head entry's timestamp, *to the tail entry's.
 *
 * NOTE(review): this excerpt appears truncated — the NULL-pointer guards
 * around each output and the return statements are not visible here. */
int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {

        /* A zero head timestamp presumably means the file holds no entries
         * yet, so there is no cutoff to report — confirm in full source. */
        if (f->header->head_entry_realtime == 0)

        /* Header fields are little-endian on disk; convert to host order. */
        *from = le64toh(f->header->head_entry_realtime);

        if (f->header->tail_entry_realtime == 0)

        *to = le64toh(f->header->tail_entry_realtime);
/* Determines the monotonic timestamp range (*from, *to) of all entries in f
 * belonging to the boot identified by boot_id, by walking from the matching
 * _BOOT_ID= data object to its first and last linked entries.
 *
 * NOTE(review): this excerpt appears truncated — declarations of r/o/p, the
 * error checks after each call, the trailing arguments of the
 * generic_array_get_plus_one() call and the return statements are not
 * visible here. */
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {

        /* Look up the data object for this boot ID; o and p receive the
         * mapped object and its file offset. */
        r = find_data_object_by_boot_id(f, boot_id, &o, &p);

        /* No entries reference this boot at all. */
        if (le64toh(o->data.n_entries) <= 0)

        /* The first entry linked from the data object carries the range start. */
        r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);

        *from = le64toh(o->entry.monotonic);

        /* Re-map the data object at p — o was re-pointed at an entry above. */
        r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);

        /* Fetch entry number n_entries-1, i.e. the last entry of this boot. */
        r = generic_array_get_plus_one(f,
                                       le64toh(o->data.entry_offset),
                                       le64toh(o->data.entry_array_offset),
                                       le64toh(o->data.n_entries)-1,

        *to = le64toh(o->entry.monotonic);
3826 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3830 /* If we gained new header fields we gained new features,
3831 * hence suggest a rotation */
3832 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3833 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3837 /* Let's check if the hash tables grew over a certain fill
3838 * level (75%, borrowing this value from Java's hash table
3839 * implementation), and if so suggest a rotation. To calculate
3840 * the fill level we need the n_data field, which only exists
3841 * in newer versions. */
3843 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3844 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3845 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3847 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3848 le64toh(f
->header
->n_data
),
3849 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3850 (unsigned long long) f
->last_stat
.st_size
,
3851 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3855 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3856 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3857 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3859 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3860 le64toh(f
->header
->n_fields
),
3861 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3865 /* Are the data objects properly indexed by field objects? */
3866 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3867 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3868 le64toh(f
->header
->n_data
) > 0 &&
3869 le64toh(f
->header
->n_fields
) == 0)
3872 if (max_file_usec
> 0) {
3875 h
= le64toh(f
->header
->head_entry_realtime
);
3876 t
= now(CLOCK_REALTIME
);
3878 if (h
> 0 && t
> h
+ max_file_usec
)