1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2011 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
27 #include <sys/statvfs.h>
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
37 #include "journal-authenticate.h"
38 #include "journal-def.h"
39 #include "journal-file.h"
41 #include "parse-util.h"
42 #include "path-util.h"
43 #include "random-util.h"
46 #include "stat-util.h"
47 #include "string-util.h"
49 #include "xattr-util.h"
51 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
52 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
54 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
56 /* This is the minimum journal file size */
57 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
59 /* These are the lower and upper bounds if we deduce the max_use value
60 * from the file system size */
61 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
62 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
64 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
65 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
67 /* This is the upper bound if we deduce max_size from max_use */
68 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
70 /* This is the upper bound if we deduce the keep_free value from the
72 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
74 /* This is the keep_free value when we can't determine the system
76 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
78 /* This is the default maximum number of journal files to keep around. */
79 #define DEFAULT_N_MAX_FILES (100)
81 /* n_data was the first entry we added after the initial file format design */
82 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
84 /* How many entries to keep in the entry array chain cache at max */
85 #define CHAIN_CACHE_MAX 20
87 /* How much to increase the journal file size at once each time we allocate something new. */
88 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
90 /* Reread fstat() of the file for detecting deletions at least this often */
91 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
93 /* The mmap context to use for the header we pick as one above the last defined typed */
94 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
97 # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
100 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
101 * As a result we use atomic operations on f->offline_state for inter-thread communications with
102 * journal_file_set_offline() and journal_file_set_online(). */
103 static void journal_file_set_offline_internal(JournalFile
*f
) {
109 switch (f
->offline_state
) {
111 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_DONE
))
115 case OFFLINE_AGAIN_FROM_SYNCING
:
116 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_SYNCING
))
120 case OFFLINE_AGAIN_FROM_OFFLINING
:
121 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_SYNCING
))
125 case OFFLINE_SYNCING
:
128 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_OFFLINING
))
131 f
->header
->state
= f
->archive
? STATE_ARCHIVED
: STATE_OFFLINE
;
135 case OFFLINE_OFFLINING
:
136 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_DONE
))
143 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
149 static void * journal_file_set_offline_thread(void *arg
) {
150 JournalFile
*f
= arg
;
152 (void) pthread_setname_np(pthread_self(), "journal-offline");
154 journal_file_set_offline_internal(f
);
159 static int journal_file_set_offline_thread_join(JournalFile
*f
) {
164 if (f
->offline_state
== OFFLINE_JOINED
)
167 r
= pthread_join(f
->offline_thread
, NULL
);
171 f
->offline_state
= OFFLINE_JOINED
;
173 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
179 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
180 static bool journal_file_set_offline_try_restart(JournalFile
*f
) {
182 switch (f
->offline_state
) {
183 case OFFLINE_AGAIN_FROM_SYNCING
:
184 case OFFLINE_AGAIN_FROM_OFFLINING
:
188 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_AGAIN_FROM_SYNCING
))
192 case OFFLINE_SYNCING
:
193 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_AGAIN_FROM_SYNCING
))
197 case OFFLINE_OFFLINING
:
198 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_AGAIN_FROM_OFFLINING
))
208 /* Sets a journal offline.
210 * If wait is false then an offline is dispatched in a separate thread for a
211 * subsequent journal_file_set_offline() or journal_file_set_online() of the
212 * same journal to synchronize with.
214 * If wait is true, then either an existing offline thread will be restarted
215 * and joined, or if none exists the offline is simply performed in this
216 * context without involving another thread.
218 int journal_file_set_offline(JournalFile
*f
, bool wait
) {
227 if (!(f
->fd
>= 0 && f
->header
))
230 /* An offlining journal is implicitly online and may modify f->header->state,
231 * we must also join any potentially lingering offline thread when not online. */
232 if (!journal_file_is_offlining(f
) && f
->header
->state
!= STATE_ONLINE
)
233 return journal_file_set_offline_thread_join(f
);
235 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
236 restarted
= journal_file_set_offline_try_restart(f
);
237 if ((restarted
&& wait
) || !restarted
) {
238 r
= journal_file_set_offline_thread_join(f
);
246 /* Initiate a new offline. */
247 f
->offline_state
= OFFLINE_SYNCING
;
249 if (wait
) /* Without using a thread if waiting. */
250 journal_file_set_offline_internal(f
);
252 sigset_t ss
, saved_ss
;
255 if (sigfillset(&ss
) < 0)
258 r
= pthread_sigmask(SIG_BLOCK
, &ss
, &saved_ss
);
262 r
= pthread_create(&f
->offline_thread
, NULL
, journal_file_set_offline_thread
, f
);
264 k
= pthread_sigmask(SIG_SETMASK
, &saved_ss
, NULL
);
266 f
->offline_state
= OFFLINE_JOINED
;
276 static int journal_file_set_online(JournalFile
*f
) {
284 if (!(f
->fd
>= 0 && f
->header
))
288 switch (f
->offline_state
) {
290 /* No offline thread, no need to wait. */
294 case OFFLINE_SYNCING
:
295 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_CANCEL
))
297 /* Canceled syncing prior to offlining, no need to wait. */
300 case OFFLINE_AGAIN_FROM_SYNCING
:
301 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_CANCEL
))
303 /* Canceled restart from syncing, no need to wait. */
306 case OFFLINE_AGAIN_FROM_OFFLINING
:
307 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_CANCEL
))
309 /* Canceled restart from offlining, must wait for offlining to complete however. */
314 r
= journal_file_set_offline_thread_join(f
);
324 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
327 switch (f
->header
->state
) {
332 f
->header
->state
= STATE_ONLINE
;
341 bool journal_file_is_offlining(JournalFile
*f
) {
344 __sync_synchronize();
346 if (IN_SET(f
->offline_state
, OFFLINE_DONE
, OFFLINE_JOINED
))
352 JournalFile
* journal_file_close(JournalFile
*f
) {
356 /* Write the final tag */
357 if (f
->seal
&& f
->writable
) {
360 r
= journal_file_append_tag(f
);
362 log_error_errno(r
, "Failed to append tag when closing journal: %m");
366 if (f
->post_change_timer
) {
369 if (sd_event_source_get_enabled(f
->post_change_timer
, &enabled
) >= 0)
370 if (enabled
== SD_EVENT_ONESHOT
)
371 journal_file_post_change(f
);
373 (void) sd_event_source_set_enabled(f
->post_change_timer
, SD_EVENT_OFF
);
374 sd_event_source_unref(f
->post_change_timer
);
377 journal_file_set_offline(f
, true);
379 if (f
->mmap
&& f
->cache_fd
)
380 mmap_cache_free_fd(f
->mmap
, f
->cache_fd
);
382 if (f
->fd
>= 0 && f
->defrag_on_close
) {
384 /* Be friendly to btrfs: turn COW back on again now,
385 * and defragment the file. We won't write to the file
386 * ever again, hence remove all fragmentation, and
387 * reenable all the good bits COW usually provides
388 * (such as data checksumming). */
390 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
391 (void) btrfs_defrag_fd(f
->fd
);
398 mmap_cache_unref(f
->mmap
);
400 ordered_hashmap_free_free(f
->chain_cache
);
402 #if HAVE_XZ || HAVE_LZ4
403 free(f
->compress_buffer
);
408 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
410 free(f
->fsprg_state
);
415 gcry_md_close(f
->hmac
);
421 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
428 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
429 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
431 h
.incompatible_flags
|= htole32(
432 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
433 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
435 h
.compatible_flags
= htole32(
436 f
->seal
* HEADER_COMPATIBLE_SEALED
);
438 r
= sd_id128_randomize(&h
.file_id
);
443 h
.seqnum_id
= template->header
->seqnum_id
;
444 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
446 h
.seqnum_id
= h
.file_id
;
448 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
458 static int journal_file_refresh_header(JournalFile
*f
) {
465 r
= sd_id128_get_machine(&f
->header
->machine_id
);
469 r
= sd_id128_get_boot(&boot_id
);
473 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
474 f
->tail_entry_monotonic_valid
= true;
476 f
->header
->boot_id
= boot_id
;
478 r
= journal_file_set_online(f
);
480 /* Sync the online state to disk */
483 /* We likely just created a new file, also sync the directory this file is located in. */
484 (void) fsync_directory_of_file(f
->fd
);
489 static bool warn_wrong_flags(const JournalFile
*f
, bool compatible
) {
490 const uint32_t any
= compatible
? HEADER_COMPATIBLE_ANY
: HEADER_INCOMPATIBLE_ANY
,
491 supported
= compatible
? HEADER_COMPATIBLE_SUPPORTED
: HEADER_INCOMPATIBLE_SUPPORTED
;
492 const char *type
= compatible
? "compatible" : "incompatible";
495 flags
= le32toh(compatible
? f
->header
->compatible_flags
: f
->header
->incompatible_flags
);
497 if (flags
& ~supported
) {
499 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32
,
500 f
->path
, type
, flags
& ~any
);
501 flags
= (flags
& any
) & ~supported
;
505 _cleanup_free_
char *t
= NULL
;
507 if (compatible
&& (flags
& HEADER_COMPATIBLE_SEALED
))
508 strv
[n
++] = "sealed";
509 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_XZ
))
510 strv
[n
++] = "xz-compressed";
511 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_LZ4
))
512 strv
[n
++] = "lz4-compressed";
514 assert(n
< ELEMENTSOF(strv
));
516 t
= strv_join((char**) strv
, ", ");
517 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
518 f
->path
, type
, n
> 1 ? "flags" : "flag", strnull(t
));
526 static int journal_file_verify_header(JournalFile
*f
) {
527 uint64_t arena_size
, header_size
;
532 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
535 /* In both read and write mode we refuse to open files with incompatible
536 * flags we don't know. */
537 if (warn_wrong_flags(f
, false))
538 return -EPROTONOSUPPORT
;
540 /* When open for writing we refuse to open files with compatible flags, too. */
541 if (f
->writable
&& warn_wrong_flags(f
, true))
542 return -EPROTONOSUPPORT
;
544 if (f
->header
->state
>= _STATE_MAX
)
547 header_size
= le64toh(f
->header
->header_size
);
549 /* The first addition was n_data, so check that we are at least this large */
550 if (header_size
< HEADER_SIZE_MIN
)
553 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
556 arena_size
= le64toh(f
->header
->arena_size
);
558 if (UINT64_MAX
- header_size
< arena_size
|| header_size
+ arena_size
> (uint64_t) f
->last_stat
.st_size
)
561 if (le64toh(f
->header
->tail_object_offset
) > header_size
+ arena_size
)
564 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
565 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
566 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
567 !VALID64(le64toh(f
->header
->entry_array_offset
)))
571 sd_id128_t machine_id
;
575 r
= sd_id128_get_machine(&machine_id
);
579 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
582 state
= f
->header
->state
;
584 if (state
== STATE_ARCHIVED
)
585 return -ESHUTDOWN
; /* Already archived */
586 else if (state
== STATE_ONLINE
) {
587 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
589 } else if (state
!= STATE_OFFLINE
) {
590 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
594 if (f
->header
->field_hash_table_size
== 0 || f
->header
->data_hash_table_size
== 0)
597 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
598 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
600 if (le64toh(f
->header
->tail_entry_realtime
) > now(CLOCK_REALTIME
)) {
601 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f
->path
);
606 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
607 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
609 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
614 static int journal_file_fstat(JournalFile
*f
) {
620 if (fstat(f
->fd
, &f
->last_stat
) < 0)
623 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
625 /* Refuse dealing with with files that aren't regular */
626 r
= stat_verify_regular(&f
->last_stat
);
630 /* Refuse appending to files that are already deleted */
631 if (f
->last_stat
.st_nlink
<= 0)
637 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
638 uint64_t old_size
, new_size
;
644 /* We assume that this file is not sparse, and we know that
645 * for sure, since we always call posix_fallocate()
648 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
652 le64toh(f
->header
->header_size
) +
653 le64toh(f
->header
->arena_size
);
655 new_size
= PAGE_ALIGN(offset
+ size
);
656 if (new_size
< le64toh(f
->header
->header_size
))
657 new_size
= le64toh(f
->header
->header_size
);
659 if (new_size
<= old_size
) {
661 /* We already pre-allocated enough space, but before
662 * we write to it, let's check with fstat() if the
663 * file got deleted, in order make sure we don't throw
664 * away the data immediately. Don't check fstat() for
665 * all writes though, but only once ever 10s. */
667 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
670 return journal_file_fstat(f
);
673 /* Allocate more space. */
675 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
678 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
681 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
684 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
686 if (new_size
- old_size
> available
)
691 /* Increase by larger blocks at once */
692 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
693 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
694 new_size
= f
->metrics
.max_size
;
696 /* Note that the glibc fallocate() fallback is very
697 inefficient, hence we try to minimize the allocation area
699 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
703 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
705 return journal_file_fstat(f
);
708 static unsigned type_to_context(ObjectType type
) {
709 /* One context for each type, plus one catch-all for the rest */
710 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
711 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
712 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
715 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
, size_t *ret_size
) {
724 /* Avoid SIGBUS on invalid accesses */
725 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
726 /* Hmm, out of range? Let's refresh the fstat() data
727 * first, before we trust that check. */
729 r
= journal_file_fstat(f
);
733 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
734 return -EADDRNOTAVAIL
;
737 return mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
, ret_size
);
740 static uint64_t minimum_header_size(Object
*o
) {
742 static const uint64_t table
[] = {
743 [OBJECT_DATA
] = sizeof(DataObject
),
744 [OBJECT_FIELD
] = sizeof(FieldObject
),
745 [OBJECT_ENTRY
] = sizeof(EntryObject
),
746 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
747 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
748 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
749 [OBJECT_TAG
] = sizeof(TagObject
),
752 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
753 return sizeof(ObjectHeader
);
755 return table
[o
->object
.type
];
758 /* Lightweight object checks. We want this to be fast, so that we won't
759 * slowdown every journal_file_move_to_object() call too much. */
760 static int journal_file_check_object(JournalFile
*f
, uint64_t offset
, Object
*o
) {
764 switch (o
->object
.type
) {
767 if ((le64toh(o
->data
.entry_offset
) == 0) ^ (le64toh(o
->data
.n_entries
) == 0)) {
768 log_debug("Bad n_entries: %"PRIu64
": %"PRIu64
,
769 le64toh(o
->data
.n_entries
), offset
);
773 if (le64toh(o
->object
.size
) - offsetof(DataObject
, payload
) <= 0) {
774 log_debug("Bad object size (<= %zu): %"PRIu64
": %"PRIu64
,
775 offsetof(DataObject
, payload
),
776 le64toh(o
->object
.size
),
781 if (!VALID64(le64toh(o
->data
.next_hash_offset
)) ||
782 !VALID64(le64toh(o
->data
.next_field_offset
)) ||
783 !VALID64(le64toh(o
->data
.entry_offset
)) ||
784 !VALID64(le64toh(o
->data
.entry_array_offset
))) {
785 log_debug("Invalid offset, next_hash_offset="OFSfmt
", next_field_offset="OFSfmt
786 ", entry_offset="OFSfmt
", entry_array_offset="OFSfmt
": %"PRIu64
,
787 le64toh(o
->data
.next_hash_offset
),
788 le64toh(o
->data
.next_field_offset
),
789 le64toh(o
->data
.entry_offset
),
790 le64toh(o
->data
.entry_array_offset
),
799 if (le64toh(o
->object
.size
) - offsetof(FieldObject
, payload
) <= 0) {
801 "Bad field size (<= %zu): %"PRIu64
": %"PRIu64
,
802 offsetof(FieldObject
, payload
),
803 le64toh(o
->object
.size
),
808 if (!VALID64(le64toh(o
->field
.next_hash_offset
)) ||
809 !VALID64(le64toh(o
->field
.head_data_offset
))) {
811 "Invalid offset, next_hash_offset="OFSfmt
812 ", head_data_offset="OFSfmt
": %"PRIu64
,
813 le64toh(o
->field
.next_hash_offset
),
814 le64toh(o
->field
.head_data_offset
),
821 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) % sizeof(EntryItem
) != 0) {
823 "Bad entry size (<= %zu): %"PRIu64
": %"PRIu64
,
824 offsetof(EntryObject
, items
),
825 le64toh(o
->object
.size
),
830 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
) <= 0) {
832 "Invalid number items in entry: %"PRIu64
": %"PRIu64
,
833 (le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
),
838 if (le64toh(o
->entry
.seqnum
) <= 0) {
840 "Invalid entry seqnum: %"PRIx64
": %"PRIu64
,
841 le64toh(o
->entry
.seqnum
),
846 if (!VALID_REALTIME(le64toh(o
->entry
.realtime
))) {
848 "Invalid entry realtime timestamp: %"PRIu64
": %"PRIu64
,
849 le64toh(o
->entry
.realtime
),
854 if (!VALID_MONOTONIC(le64toh(o
->entry
.monotonic
))) {
856 "Invalid entry monotonic timestamp: %"PRIu64
": %"PRIu64
,
857 le64toh(o
->entry
.monotonic
),
864 case OBJECT_DATA_HASH_TABLE
:
865 case OBJECT_FIELD_HASH_TABLE
:
866 if ((le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) % sizeof(HashItem
) != 0 ||
867 (le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) / sizeof(HashItem
) <= 0) {
869 "Invalid %s hash table size: %"PRIu64
": %"PRIu64
,
870 o
->object
.type
== OBJECT_DATA_HASH_TABLE
? "data" : "field",
871 le64toh(o
->object
.size
),
878 case OBJECT_ENTRY_ARRAY
:
879 if ((le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) % sizeof(le64_t
) != 0 ||
880 (le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) / sizeof(le64_t
) <= 0) {
882 "Invalid object entry array size: %"PRIu64
": %"PRIu64
,
883 le64toh(o
->object
.size
),
888 if (!VALID64(le64toh(o
->entry_array
.next_entry_array_offset
))) {
890 "Invalid object entry array next_entry_array_offset: "OFSfmt
": %"PRIu64
,
891 le64toh(o
->entry_array
.next_entry_array_offset
),
899 if (le64toh(o
->object
.size
) != sizeof(TagObject
)) {
901 "Invalid object tag size: %"PRIu64
": %"PRIu64
,
902 le64toh(o
->object
.size
),
907 if (!VALID_EPOCH(le64toh(o
->tag
.epoch
))) {
909 "Invalid object tag epoch: %"PRIu64
": %"PRIu64
,
910 le64toh(o
->tag
.epoch
),
921 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
931 /* Objects may only be located at multiple of 64 bit */
932 if (!VALID64(offset
)) {
933 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64
, offset
);
937 /* Object may not be located in the file header */
938 if (offset
< le64toh(f
->header
->header_size
)) {
939 log_debug("Attempt to move to object located in file header: %" PRIu64
, offset
);
943 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
, &tsize
);
948 s
= le64toh(o
->object
.size
);
951 log_debug("Attempt to move to uninitialized object: %" PRIu64
, offset
);
954 if (s
< sizeof(ObjectHeader
)) {
955 log_debug("Attempt to move to overly short object: %" PRIu64
, offset
);
959 if (o
->object
.type
<= OBJECT_UNUSED
) {
960 log_debug("Attempt to move to object with invalid type: %" PRIu64
, offset
);
964 if (s
< minimum_header_size(o
)) {
965 log_debug("Attempt to move to truncated object: %" PRIu64
, offset
);
969 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
) {
970 log_debug("Attempt to move to object of unexpected type: %" PRIu64
, offset
);
975 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
, NULL
);
982 r
= journal_file_check_object(f
, offset
, o
);
990 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
996 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
999 /* If an external seqnum counter was passed, we update
1000 * both the local and the external one, and set it to
1001 * the maximum of both */
1003 if (*seqnum
+ 1 > r
)
1009 f
->header
->tail_entry_seqnum
= htole64(r
);
1011 if (f
->header
->head_entry_seqnum
== 0)
1012 f
->header
->head_entry_seqnum
= htole64(r
);
1017 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
1025 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
1026 assert(size
>= sizeof(ObjectHeader
));
1030 r
= journal_file_set_online(f
);
1034 p
= le64toh(f
->header
->tail_object_offset
);
1036 p
= le64toh(f
->header
->header_size
);
1038 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
1042 p
+= ALIGN64(le64toh(tail
->object
.size
));
1045 r
= journal_file_allocate(f
, p
, size
);
1049 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
, NULL
);
1056 o
->object
.type
= type
;
1057 o
->object
.size
= htole64(size
);
1059 f
->header
->tail_object_offset
= htole64(p
);
1060 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
1068 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
1076 /* We estimate that we need 1 hash table entry per 768 bytes
1077 of journal file and we want to make sure we never get
1078 beyond 75% fill level. Calculate the hash table size for
1079 the maximum file size based on these metrics. */
1081 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
1082 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
1083 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
1085 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
1087 r
= journal_file_append_object(f
,
1088 OBJECT_DATA_HASH_TABLE
,
1089 offsetof(Object
, hash_table
.items
) + s
,
1094 memzero(o
->hash_table
.items
, s
);
1096 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1097 f
->header
->data_hash_table_size
= htole64(s
);
1102 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
1110 /* We use a fixed size hash table for the fields as this
1111 * number should grow very slowly only */
1113 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
1114 r
= journal_file_append_object(f
,
1115 OBJECT_FIELD_HASH_TABLE
,
1116 offsetof(Object
, hash_table
.items
) + s
,
1121 memzero(o
->hash_table
.items
, s
);
1123 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1124 f
->header
->field_hash_table_size
= htole64(s
);
1129 int journal_file_map_data_hash_table(JournalFile
*f
) {
1137 if (f
->data_hash_table
)
1140 p
= le64toh(f
->header
->data_hash_table_offset
);
1141 s
= le64toh(f
->header
->data_hash_table_size
);
1143 r
= journal_file_move_to(f
,
1144 OBJECT_DATA_HASH_TABLE
,
1151 f
->data_hash_table
= t
;
1155 int journal_file_map_field_hash_table(JournalFile
*f
) {
1163 if (f
->field_hash_table
)
1166 p
= le64toh(f
->header
->field_hash_table_offset
);
1167 s
= le64toh(f
->header
->field_hash_table_size
);
1169 r
= journal_file_move_to(f
,
1170 OBJECT_FIELD_HASH_TABLE
,
1177 f
->field_hash_table
= t
;
1181 static int journal_file_link_field(
1192 assert(f
->field_hash_table
);
1196 if (o
->object
.type
!= OBJECT_FIELD
)
1199 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1203 /* This might alter the window we are looking at */
1204 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
1207 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
1209 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
1211 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1215 o
->field
.next_hash_offset
= htole64(offset
);
1218 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1220 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
1221 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
1226 static int journal_file_link_data(
1237 assert(f
->data_hash_table
);
1241 if (o
->object
.type
!= OBJECT_DATA
)
1244 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1248 /* This might alter the window we are looking at */
1249 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
1250 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
1251 o
->data
.n_entries
= 0;
1254 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
1256 /* Only entry in the hash table is easy */
1257 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
1259 /* Move back to the previous data object, to patch in
1262 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1266 o
->data
.next_hash_offset
= htole64(offset
);
1269 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1271 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
1272 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
1277 int journal_file_find_field_object_with_hash(
1279 const void *field
, uint64_t size
, uint64_t hash
,
1280 Object
**ret
, uint64_t *offset
) {
1282 uint64_t p
, osize
, h
, m
;
1287 assert(field
&& size
> 0);
1289 /* If the field hash table is empty, we can't find anything */
1290 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
1293 /* Map the field hash table, if it isn't mapped yet. */
1294 r
= journal_file_map_field_hash_table(f
);
1298 osize
= offsetof(Object
, field
.payload
) + size
;
1300 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1305 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
1310 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1314 if (le64toh(o
->field
.hash
) == hash
&&
1315 le64toh(o
->object
.size
) == osize
&&
1316 memcmp(o
->field
.payload
, field
, size
) == 0) {
1326 p
= le64toh(o
->field
.next_hash_offset
);
1332 int journal_file_find_field_object(
1334 const void *field
, uint64_t size
,
1335 Object
**ret
, uint64_t *offset
) {
1340 assert(field
&& size
> 0);
1342 hash
= hash64(field
, size
);
1344 return journal_file_find_field_object_with_hash(f
,
1349 int journal_file_find_data_object_with_hash(
1351 const void *data
, uint64_t size
, uint64_t hash
,
1352 Object
**ret
, uint64_t *offset
) {
1354 uint64_t p
, osize
, h
, m
;
1359 assert(data
|| size
== 0);
1361 /* If there's no data hash table, then there's no entry. */
1362 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
1365 /* Map the data hash table, if it isn't mapped yet. */
1366 r
= journal_file_map_data_hash_table(f
);
1370 osize
= offsetof(Object
, data
.payload
) + size
;
1372 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1377 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
1382 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1386 if (le64toh(o
->data
.hash
) != hash
)
1389 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
1390 #if HAVE_XZ || HAVE_LZ4
1394 l
= le64toh(o
->object
.size
);
1395 if (l
<= offsetof(Object
, data
.payload
))
1398 l
-= offsetof(Object
, data
.payload
);
1400 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
1401 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
1405 if (rsize
== size
&&
1406 memcmp(f
->compress_buffer
, data
, size
) == 0) {
1417 return -EPROTONOSUPPORT
;
1419 } else if (le64toh(o
->object
.size
) == osize
&&
1420 memcmp(o
->data
.payload
, data
, size
) == 0) {
1432 p
= le64toh(o
->data
.next_hash_offset
);
1438 int journal_file_find_data_object(
1440 const void *data
, uint64_t size
,
1441 Object
**ret
, uint64_t *offset
) {
1446 assert(data
|| size
== 0);
1448 hash
= hash64(data
, size
);
1450 return journal_file_find_data_object_with_hash(f
,
1455 static int journal_file_append_field(
1457 const void *field
, uint64_t size
,
1458 Object
**ret
, uint64_t *offset
) {
1466 assert(field
&& size
> 0);
1468 hash
= hash64(field
, size
);
1470 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1484 osize
= offsetof(Object
, field
.payload
) + size
;
1485 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1489 o
->field
.hash
= htole64(hash
);
1490 memcpy(o
->field
.payload
, field
, size
);
1492 r
= journal_file_link_field(f
, o
, p
, hash
);
1496 /* The linking might have altered the window, so let's
1497 * refresh our pointer */
1498 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1503 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
/* Return (via *ret/*offset) a DATA object for the given payload, creating
 * one if needed. New objects are optionally compressed (XZ/LZ4) when large
 * enough, linked into the data hash table, HMAC'd, and — if the payload
 * looks like "FIELD=value" — chained onto the corresponding FIELD object.
 * NOTE(review): many interior lines (returns, error checks, braces) are
 * elided from this extraction; code text left byte-identical. */
1517 static int journal_file_append_data(
1519 const void *data
, uint64_t size
,
1520 Object
**ret
, uint64_t *offset
) {
1525 int r
, compression
 = 0;
1529 assert(data
 || size
 == 0);
1533 hash
 = hash64(data
, size
);
/* Deduplicate: an identical payload reuses the existing object. */
1533 was elided here? -- no: see next fragment */
1533 r
 = journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1547 osize
 = offsetof(Object
, data
.payload
) + size
;
1548 r
 = journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1552 o
->data
.hash
 = htole64(hash
);
1554 #if HAVE_XZ || HAVE_LZ4
/* Only bother compressing payloads >= COMPRESSION_SIZE_THRESHOLD (512B). */
1555 if (JOURNAL_FILE_COMPRESS(f
) && size
 >= COMPRESSION_SIZE_THRESHOLD
) {
/* Compress into the payload area; "size - 1" caps output so compression
 * is only kept when it actually saves space. */
1558 compression
 = compress_blob(data
, size
, o
->data
.payload
, size
 - 1, &rsize
);
1560 if (compression
 >= 0) {
/* Shrink the object's logical size to the compressed length and record
 * which codec was used in the object flags. */
1561 o
->object
.size
 = htole64(offsetof(Object
, data
.payload
) + rsize
);
1562 o
->object
.flags
 |= compression
;
1564 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1565 size
, rsize
, object_compressed_to_string(compression
));
1567 /* Compression didn't work, we don't really care why, let's continue without compression */
/* compression == 0 means "stored uncompressed": copy the raw payload. */
1572 if (compression
 == 0)
1573 memcpy_safe(o
->data
.payload
, data
, size
);
1575 r
 = journal_file_link_data(f
, o
, p
, hash
);
1580 r
 = journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1585 /* The linking might have altered the window, so let's
1586 * refresh our pointer */
1587 r
 = journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
/* "FIELD=value" payloads get cross-linked with their FIELD object so
 * field enumeration can find all data objects per field. */
1594 eq
 = memchr(data
, '=', size
);
1595 if (eq
 && eq
 > data
) {
1599 /* Create field object ... */
1600 r
 = journal_file_append_field(f
, data
, (uint8_t*) eq
 - (uint8_t*) data
, &fo
, &fp
);
1604 /* ... and link it in. */
/* Prepend this data object to the field's singly-linked data list. */
1605 o
->data
.next_field_offset
 = fo
->field
.head_data_offset
;
1606 fo
->field
.head_data_offset
 = le64toh(p
);
/* Number of EntryItem slots in an ENTRY object, derived from the object's
 * on-disk size. NOTE(review): the non-ENTRY early-return value is on an
 * elided line — presumably 0; confirm against the full source. */
1618 uint64_t journal_file_entry_n_items(Object
*o
) {
1621 if (o
->object
.type
 != OBJECT_ENTRY
)
1624 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
/* Number of uint64_t entry-offset slots in an ENTRY_ARRAY object, derived
 * from the object size. Non-ENTRY_ARRAY early-return value is elided. */
1627 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1630 if (o
->object
.type
 != OBJECT_ENTRY_ARRAY
)
1633 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
/* Number of HashItem buckets in a DATA or FIELD hash-table object, derived
 * from the object size. Early-return value for other types is elided. */
1636 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1639 if (!IN_SET(o
->object
.type
, OBJECT_DATA_HASH_TABLE
, OBJECT_FIELD_HASH_TABLE
))
1642 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
/* Append entry offset p into the entry-array chain rooted at *first,
 * at index *idx. If the tail array is full, a new ENTRY_ARRAY object is
 * appended and chained in (or becomes the new chain head when the chain
 * was empty), and the header's n_entry_arrays counter is bumped.
 * NOTE(review): loop structure/returns are elided in this extraction. */
1645 static int link_entry_into_array(JournalFile
*f
,
1650 uint64_t n
 = 0, ap
 = 0, q
, i
, a
, hidx
;
/* a walks the array chain; i counts down from the requested index. */
1659 a
 = le64toh(*first
);
1660 i
 = hidx
 = le64toh(*idx
);
1663 r
 = journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1667 n
 = journal_file_entry_array_n_items(o
);
/* Slot free in this array: store the entry offset and bump the index. */
1669 o
->entry_array
.items
[i
] = htole64(p
);
1670 *idx
 = htole64(hidx
 + 1);
/* Otherwise advance to the next array in the chain (ap presumably tracks
 * the previous array's offset — elided here; confirm). */
1676 a
 = le64toh(o
->entry_array
.next_entry_array_offset
);
/* Chain exhausted: append a fresh ENTRY_ARRAY (size computation for n
 * items visible; the doubling policy itself is on elided lines). */
1687 r
 = journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1688 offsetof(Object
, entry_array
.items
) + n
 * sizeof(uint64_t),
1694 r
 = journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1699 o
->entry_array
.items
[i
] = htole64(p
);
/* Empty chain: the new array becomes the head... */
1702 *first
 = htole64(q
);
/* ...otherwise hook it onto the previous tail array at offset ap. */
1704 r
 = journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1708 o
->entry_array
.next_entry_array_offset
 = htole64(q
);
1711 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1712 f
->header
->n_entry_arrays
 = htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1714 *idx
 = htole64(hidx
 + 1);
/* Like link_entry_into_array(), but the first slot lives in *extra rather
 * than in the array chain: index 0 goes to *extra, higher indices are
 * stored in the chain shifted down by one. Keeps *idx counting all slots
 * including the extra one. */
1719 static int link_entry_into_array_plus_one(JournalFile
*f
,
/* Index 0: store directly in the inline "extra" slot. */
1734 *extra
 = htole64(p
);
/* Index > 0: delegate with the index shifted down by one. */
1738 i
 = htole64(le64toh(*idx
) - 1);
1739 r
 = link_entry_into_array(f
, first
, &i
, p
);
1744 *idx
 = htole64(le64toh(*idx
) + 1);
/* Back-link item i of the ENTRY object o (at `offset`) into the DATA
 * object it references: the entry's offset is recorded in the data
 * object's entry_offset/entry_array chain. Note: the local o is re-used
 * for the DATA object after the move_to_object() call. */
1748 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1755 p
 = le64toh(o
->entry
.items
[i
].object_offset
);
1759 r
 = journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1763 return link_entry_into_array_plus_one(f
,
1764 &o
->data
.entry_offset
,
1765 &o
->data
.entry_array_offset
,
/* Make a freshly appended ENTRY object reachable: link it into the global
 * entry array, update head/tail timestamps in the header, and back-link
 * every item into its DATA object. */
1770 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1779 if (o
->object
.type
 != OBJECT_ENTRY
)
/* Full memory barrier so the entry's payload is globally visible before
 * any pointer to it is published in the file. */
1782 __sync_synchronize();
1784 /* Link up the entry itself */
1785 r
 = link_entry_into_array(f
,
1786 &f
->header
->entry_array_offset
,
1787 &f
->header
->n_entries
,
1792 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
/* First entry ever sets the head realtime stamp; tail stamps always move. */
1794 if (f
->header
->head_entry_realtime
 == 0)
1795 f
->header
->head_entry_realtime
 = o
->entry
.realtime
;
1797 f
->header
->tail_entry_realtime
 = o
->entry
.realtime
;
1798 f
->header
->tail_entry_monotonic
 = o
->entry
.monotonic
;
1800 f
->tail_entry_monotonic_valid
 = true;
1802 /* Link up the items */
1803 n
 = journal_file_entry_n_items(o
);
1804 for (i
 = 0; i
 < n
; i
++) {
1805 r
 = journal_file_link_entry_item(f
, o
, offset
, i
);
/* Allocate and populate a new ENTRY object from pre-built items, stamp it
 * with seqnum/timestamps/boot id, HMAC it, then link it into the file.
 * Items are assumed to already reference appended DATA objects. */
1813 static int journal_file_append_entry_internal(
1815 const dual_timestamp
*ts
,
1817 const EntryItem items
[], unsigned n_items
,
1819 Object
**ret
, uint64_t *offset
) {
1827 assert(items
 || n_items
 == 0);
1830 osize
 = offsetof(Object
, entry
.items
) + (n_items
 * sizeof(EntryItem
));
1832 r
 = journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
/* journal_file_entry_seqnum() presumably also advances *seqnum — elided
 * here; confirm against its definition. */
1836 o
->entry
.seqnum
 = htole64(journal_file_entry_seqnum(f
, seqnum
));
/* memcpy_safe tolerates n_items == 0 with a NULL items pointer. */
1837 memcpy_safe(o
->entry
.items
, items
, n_items
 * sizeof(EntryItem
));
1838 o
->entry
.realtime
 = htole64(ts
->realtime
);
1839 o
->entry
.monotonic
 = htole64(ts
->monotonic
);
1840 o
->entry
.xor_hash
 = htole64(xor_hash
);
/* boot_id is stored as-is from the header (already little-endian on disk). */
1841 o
->entry
.boot_id
 = f
->header
->boot_id
;
1844 r
 = journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1849 r
 = journal_file_link_entry(f
, o
, np
);
/* Notify inotify watchers after mmap'ed writes by truncating the file to
 * its current size — a no-op for the data but it generates IN_MODIFY. */
1862 void journal_file_post_change(JournalFile
*f
) {
1865 /* inotify() does not receive IN_MODIFY events from file
1866 * accesses done via mmap(). After each access we hence
1867 * trigger IN_MODIFY by truncating the journal file to its
1868 * current size which triggers IN_MODIFY. */
/* Barrier: make sure all mmap stores are flushed before the ftruncate. */
1870 __sync_synchronize();
/* Failure here is non-fatal — watchers just miss one wakeup. */
1872 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1873 log_debug_errno(errno
, "Failed to truncate file to its own size: %m");
/* sd-event timer callback: userdata is the JournalFile*; forwards to
 * journal_file_post_change(). The usec argument is unused. */
1876 static int post_change_thunk(sd_event_source
*timer
, uint64_t usec
, void *userdata
) {
1879 journal_file_post_change(userdata
);
/* Arm the coalescing ftruncate timer: if it is already pending (ONESHOT)
 * do nothing, otherwise set it to now + post_change_timer_period. On any
 * sd-event failure, fall back to posting the change synchronously. */
1884 static void schedule_post_change(JournalFile
*f
) {
1885 sd_event_source
*timer
;
1890 assert(f
->post_change_timer
);
1892 timer
 = f
->post_change_timer
;
1894 r
 = sd_event_source_get_enabled(timer
, &enabled
);
1896 log_debug_errno(r
, "Failed to get ftruncate timer state: %m");
/* Already armed — coalesce with the pending expiry. */
1900 if (enabled
 == SD_EVENT_ONESHOT
)
1903 r
 = sd_event_now(sd_event_source_get_event(timer
), CLOCK_MONOTONIC
, &now
);
1905 log_debug_errno(r
, "Failed to get clock's now for scheduling ftruncate: %m");
1909 r
 = sd_event_source_set_time(timer
, now
+f
->post_change_timer_period
);
1911 log_debug_errno(r
, "Failed to set time for scheduling ftruncate: %m");
1915 r
 = sd_event_source_set_enabled(timer
, SD_EVENT_ONESHOT
);
1917 log_debug_errno(r
, "Failed to enable scheduled ftruncate: %m");
1924 /* On failure, let's simply post the change immediately. */
1925 journal_file_post_change(f
);
1928 /* Enable coalesced change posting in a timer on the provided sd_event instance */
/* Create a disabled CLOCK_MONOTONIC timer on event loop e that will fire
 * post_change_thunk(f); store it plus the coalescing period t on f.
 * Fails with -EINVAL if a timer is already installed. */
1929 int journal_file_enable_post_change_timer(JournalFile
*f
, sd_event
*e
, usec_t t
) {
/* _cleanup_ unrefs the source automatically on early error returns. */
1930 _cleanup_(sd_event_source_unrefp
) sd_event_source
*timer
 = NULL
;
1934 assert_return(!f
->post_change_timer
, -EINVAL
);
1938 r
 = sd_event_add_time(e
, &timer
, CLOCK_MONOTONIC
, 0, 0, post_change_thunk
, f
);
/* Created OFF; schedule_post_change() arms it on demand. */
1942 r
 = sd_event_source_set_enabled(timer
, SD_EVENT_OFF
);
/* Ownership transfer to f (presumably paired with a TAKE_PTR/NULL-out on
 * an elided line; confirm). */
1946 f
->post_change_timer
 = timer
;
1948 f
->post_change_timer_period
 = t
;
/* qsort comparator ordering EntryItems by ascending on-disk object offset
 * (comparison done on host-endian values). Return statements elided. */
1953 static int entry_item_cmp(const void *_a
, const void *_b
) {
1954 const EntryItem
*a
 = _a
, *b
 = _b
;
1956 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1958 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
/* Public entry-append path: append one DATA object per iovec, build the
 * item list (hash + offset each), sort items by disk position, append the
 * ENTRY object, then trigger/schedule the inotify post-change. */
1963 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1967 uint64_t xor_hash
 = 0;
1968 struct dual_timestamp _ts
;
1972 assert(iovec
 || n_iovec
 == 0);
/* Fallback timestamp when the caller passed ts == NULL (selection logic
 * is on elided lines; confirm). */
1975 dual_timestamp_get(&_ts
);
/* Sealing: emit a tag object first if the tag interval elapsed. */
1980 r
 = journal_file_maybe_append_tag(f
, ts
->realtime
);
1985 /* alloca() can't take 0, hence let's allocate at least one */
1986 items
 = alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1988 for (i
 = 0; i
 < n_iovec
; i
++) {
1992 r
 = journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
/* xor_hash is the order-independent XOR of all item hashes, stored in
 * the entry for fast cross-file comparison. */
1996 xor_hash
 ^= le64toh(o
->data
.hash
);
1997 items
[i
].object_offset
 = htole64(p
);
1998 items
[i
].hash
 = o
->data
.hash
;
2001 /* Order by the position on disk, in order to improve seek
2002 * times for rotating media. */
2003 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
2005 r
 = journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
2007 /* If the memory mapping triggered a SIGBUS then we return an
2008 * IO error and ignore the error code passed down to us, since
2009 * it is very likely just an effect of a nullified replacement
2012 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
/* Coalesce ftruncate notifications via the timer when available,
 * otherwise post immediately. */
2015 if (f
->post_change_timer
)
2016 schedule_post_change(f
);
2018 journal_file_post_change(f
);
/* Per-chain memo used to resume iteration/bisection of an entry-array
 * chain without rewalking it from the head; keyed by `first`. */
2023 typedef struct ChainCacheItem
 {
2024 uint64_t first
; /* the array at the beginning of the chain */
2025 uint64_t array
; /* the cached array */
2026 uint64_t begin
; /* the first item in the cached array */
2027 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
2028 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
/* Record (or refresh) the chain-cache entry for chain `first`. When the
 * cache is full the oldest item is recycled; allocation failure is simply
 * ignored (cache is best-effort). */
2031 static void chain_cache_put(
2038 uint64_t last_index
) {
2041 /* If the chain item to cache for this chain is the
2042 * first one it's not worth caching anything */
/* Reuse the oldest slot instead of growing past CHAIN_CACHE_MAX. */
2046 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
2047 ci
 = ordered_hashmap_steal_first(h
);
2050 ci
 = new(ChainCacheItem
, 1);
/* Key is &ci->first, i.e. the item stores its own hashmap key. */
2057 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
/* Updating an existing item: key must not have changed. */
2062 assert(ci
->first
 == first
);
2067 ci
->last_index
 = last_index
;
2070 static int generic_array_get(
2074 Object
**ret
, uint64_t *offset
) {
2077 uint64_t p
= 0, a
, t
= 0;
2085 /* Try the chain cache first */
2086 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2087 if (ci
&& i
> ci
->total
) {
2096 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
2100 k
= journal_file_entry_array_n_items(o
);
2102 p
= le64toh(o
->entry_array
.items
[i
]);
2108 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
2114 /* Let's cache this item for the next invocation */
2115 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
2117 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
/* Like generic_array_get() but with an inline "extra" first slot:
 * index 0 resolves via `extra`, indices >= 1 map to chain index i-1. */
2130 static int generic_array_get_plus_one(
2135 Object
**ret
, uint64_t *offset
) {
2144 r
 = journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2157 return generic_array_get(f
, first
, i
-1, ret
, offset
);
/* Core bisection over an entry-array chain of n items: find the entry
 * that matches `needle` per test_object()/direction. Walks the chain
 * array by array; within the last relevant array it binary-searches,
 * first probing the cached last_index's neighbors to exploit locality.
 * subtract_one handles DIRECTION_UP landing one slot past the match.
 * NOTE(review): loop headers, returns and many branch bodies are elided
 * in this extraction; code text left byte-identical. */
2166 static int generic_array_bisect(
2171 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2172 direction_t direction
,
2177 uint64_t a
, p
, t
 = 0, i
 = 0, last_p
 = 0, last_index
 = (uint64_t) -1;
2178 bool subtract_one
 = false;
2179 Object
*o
, *array
 = NULL
;
2184 assert(test_object
);
2186 /* Start with the first array in the chain */
2189 ci
 = ordered_hashmap_get(f
->chain_cache
, &first
);
2190 if (ci
 && n
 > ci
->total
) {
2191 /* Ah, we have iterated this bisection array chain
2192 * previously! Let's see if we can skip ahead in the
2193 * chain, as far as the last time. But we can't jump
2194 * backwards in the chain, so let's check that
2197 r
 = test_object(f
, ci
->begin
, needle
);
2201 if (r
 == TEST_LEFT
) {
2202 /* OK, what we are looking for is right of the
2203 * begin of this EntryArray, so let's jump
2204 * straight to previously cached array in the
2210 last_index
 = ci
->last_index
;
/* Per-array loop: bisect within the current array. */
2215 uint64_t left
, right
, k
, lp
;
2217 r
 = journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
2221 k
 = journal_file_entry_array_n_items(array
);
/* Probe the last item of this array first to decide whether the needle
 * lies within it at all. */
2227 lp
 = p
 = le64toh(array
->entry_array
.items
[i
]);
2231 r
 = test_object(f
, p
, needle
);
2232 if (r
 == -EBADMSG
) {
2233 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
/* FOUND counts as RIGHT when seeking down, LEFT when seeking up, so the
 * bisection converges on the first/last match respectively. */
2240 if (r
 == TEST_FOUND
)
2241 r
 = direction
 == DIRECTION_DOWN
 ? TEST_RIGHT
 : TEST_LEFT
;
2243 if (r
 == TEST_RIGHT
) {
/* Needle is inside this array: bisect [left, right]. */
2247 if (last_index
 != (uint64_t) -1) {
2248 assert(last_index
 <= right
);
2250 /* If we cached the last index we
2251 * looked at, let's try to not to jump
2252 * too wildly around and see if we can
2253 * limit the range to look at early to
2254 * the immediate neighbors of the last
2255 * index we looked at. */
2257 if (last_index
 > 0) {
2258 uint64_t x
 = last_index
 - 1;
2260 p
 = le64toh(array
->entry_array
.items
[x
]);
2264 r
 = test_object(f
, p
, needle
);
2268 if (r
 == TEST_FOUND
)
2269 r
 = direction
 == DIRECTION_DOWN
 ? TEST_RIGHT
 : TEST_LEFT
;
2271 if (r
 == TEST_RIGHT
)
2277 if (last_index
 < right
) {
2278 uint64_t y
 = last_index
 + 1;
2280 p
 = le64toh(array
->entry_array
.items
[y
]);
2284 r
 = test_object(f
, p
, needle
);
2288 if (r
 == TEST_FOUND
)
2289 r
 = direction
 == DIRECTION_DOWN
 ? TEST_RIGHT
 : TEST_LEFT
;
2291 if (r
 == TEST_RIGHT
)
/* Classic binary-search loop body. */
2299 if (left
 == right
) {
2300 if (direction
 == DIRECTION_UP
)
2301 subtract_one
 = true;
2307 assert(left
 < right
);
2308 i
 = (left
 + right
) / 2;
2310 p
 = le64toh(array
->entry_array
.items
[i
]);
2314 r
 = test_object(f
, p
, needle
);
2315 if (r
 == -EBADMSG
) {
2316 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2323 if (r
 == TEST_FOUND
)
2324 r
 = direction
 == DIRECTION_DOWN
 ? TEST_RIGHT
 : TEST_LEFT
;
2326 if (r
 == TEST_RIGHT
)
2334 if (direction
 == DIRECTION_UP
) {
2336 subtract_one
 = true;
/* Needle is beyond this array: drop the cached hint and continue with
 * the next array in the chain. */
2347 last_index
 = (uint64_t) -1;
2348 a
 = le64toh(array
->entry_array
.next_entry_array_offset
);
2354 if (subtract_one
 && t
 == 0 && i
 == 0)
2357 /* Let's cache this item for the next invocation */
2358 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
 ? (i
 > 0 ? i
-1 : (uint64_t) -1) : i
);
/* Resolve the final item, honoring the one-slot step-back for UP. */
2360 if (subtract_one
 && i
 == 0)
2362 else if (subtract_one
)
2363 p
 = le64toh(array
->entry_array
.items
[i
-1]);
2365 p
 = le64toh(array
->entry_array
.items
[i
]);
2367 r
 = journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
/* Global index = items before this array + in-array index (− 1 if we
 * stepped back). */
2378 *idx
 = t
 + i
 + (subtract_one
 ? -1 : 0);
/* Bisect an array chain that has one inline "extra" entry in front of it:
 * test the extra entry first, then bisect the chain over the remaining
 * n-1 items; step_back implements DIRECTION_UP falling back to the extra
 * entry when the chain holds no match. */
2383 static int generic_array_bisect_plus_one(
2389 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2390 direction_t direction
,
2396 bool step_back
 = false;
2400 assert(test_object
);
2405 /* This bisects the array in object 'first', but first checks
2407 r
 = test_object(f
, extra
, needle
);
2411 if (r
 == TEST_FOUND
)
2412 r
 = direction
 == DIRECTION_DOWN
 ? TEST_RIGHT
 : TEST_LEFT
;
2414 /* if we are looking with DIRECTION_UP then we need to first
2415 see if in the actual array there is a matching entry, and
2416 return the last one of that. But if there isn't any we need
2417 to return this one. Hence remember this, and return it
2420 step_back
 = direction
 == DIRECTION_UP
;
2422 if (r
 == TEST_RIGHT
) {
/* DOWN + match at the extra entry: answer is the extra entry itself
 * (handled on elided lines). */
2423 if (direction
 == DIRECTION_DOWN
)
2429 r
 = generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
2431 if (r
 == 0 && step_back
)
/* Fall back to returning the extra entry. */
2440 r
 = journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
/* Bisection predicate: compare an entry's file offset p against the
 * needle offset (TEST_* return values on elided lines). */
2456 _pure_
 static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2462 else if (p
 < needle
)
/* Bisection predicate: load the ENTRY at p and compare its seqnum against
 * the needle (TEST_* return values on elided lines). */
2468 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2475 r
 = journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2479 if (le64toh(o
->entry
.seqnum
) == needle
)
2481 else if (le64toh(o
->entry
.seqnum
) < needle
)
/* Seek to the entry with the given sequence number by bisecting the
 * global entry array with test_object_seqnum. */
2487 int journal_file_move_to_entry_by_seqnum(
2490 direction_t direction
,
2496 return generic_array_bisect(f
,
2497 le64toh(f
->header
->entry_array_offset
),
2498 le64toh(f
->header
->n_entries
),
/* Bisection predicate: load the ENTRY at p and compare its realtime
 * timestamp against the needle (TEST_* returns elided). */
2505 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2512 r
 = journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2516 if (le64toh(o
->entry
.realtime
) == needle
)
2518 else if (le64toh(o
->entry
.realtime
) < needle
)
/* Seek to the entry closest to the given realtime timestamp by bisecting
 * the global entry array with test_object_realtime. */
2524 int journal_file_move_to_entry_by_realtime(
2527 direction_t direction
,
2533 return generic_array_bisect(f
,
2534 le64toh(f
->header
->entry_array_offset
),
2535 le64toh(f
->header
->n_entries
),
2537 test_object_realtime
,
/* Bisection predicate: load the ENTRY at p and compare its monotonic
 * timestamp against the needle (only meaningful within one boot id). */
2542 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2549 r
 = journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2553 if (le64toh(o
->entry
.monotonic
) == needle
)
2555 else if (le64toh(o
->entry
.monotonic
) < needle
)
/* Find the DATA object for "_BOOT_ID=<id>": format the 32-char id into a
 * stack buffer after the fixed prefix and look it up (sizeof(t)-1 drops
 * the trailing NUL from the lookup length). */
2561 static int find_data_object_by_boot_id(
2567 char t
[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2569 sd_id128_to_string(boot_id
, t
 + 9);
2570 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
/* Seek by monotonic time within a specific boot: resolve the boot's
 * _BOOT_ID= data object, then bisect its per-data entry list. */
2573 int journal_file_move_to_entry_by_monotonic(
2577 direction_t direction
,
2586 r
 = find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2592 return generic_array_bisect_plus_one(f
,
2593 le64toh(o
->data
.entry_offset
),
2594 le64toh(o
->data
.entry_array_offset
),
2595 le64toh(o
->data
.n_entries
),
2597 test_object_monotonic
,
/* Reset the per-file read cursor to the head: clears the cached offset,
 * seqnum, timestamps, boot id and xor hash. */
2602 void journal_file_reset_location(JournalFile
*f
) {
2603 f
->location_type
 = LOCATION_HEAD
;
2604 f
->current_offset
 = 0;
2605 f
->current_seqnum
 = 0;
2606 f
->current_realtime
 = 0;
2607 f
->current_monotonic
 = 0;
2608 zero(f
->current_boot_id
);
2609 f
->current_xor_hash
 = 0;
/* Cache the identity of entry o (at `offset`) as the file's current read
 * position, converting on-disk little-endian fields to host order. */
2612 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2613 f
->location_type
 = LOCATION_SEEK
;
2614 f
->current_offset
 = offset
;
2615 f
->current_seqnum
 = le64toh(o
->entry
.seqnum
);
2616 f
->current_realtime
 = le64toh(o
->entry
.realtime
);
2617 f
->current_monotonic
 = le64toh(o
->entry
.monotonic
);
/* boot_id is an sd_id128 and is copied verbatim (no endianness). */
2618 f
->current_boot_id
 = o
->entry
.boot_id
;
2619 f
->current_xor_hash
 = le64toh(o
->entry
.xor_hash
);
/* Globally order the current entries of two files (for interleaved
 * reading): identical content → equal; same seqnum source → by seqnum;
 * same boot → by monotonic time; else by realtime; last resort xor hash.
 * Both files must have a saved (LOCATION_SEEK) position. */
2622 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2627 assert(af
->location_type
 == LOCATION_SEEK
);
2628 assert(bf
->location_type
 == LOCATION_SEEK
);
2630 /* If contents and timestamps match, these entries are
2631 * identical, even if the seqnum does not match */
2632 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2633 af
->current_monotonic
 == bf
->current_monotonic
 &&
2634 af
->current_realtime
 == bf
->current_realtime
 &&
2635 af
->current_xor_hash
 == bf
->current_xor_hash
)
2638 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2640 /* If this is from the same seqnum source, compare
2642 if (af
->current_seqnum
 < bf
->current_seqnum
)
2644 if (af
->current_seqnum
 > bf
->current_seqnum
)
2647 /* Wow! This is weird, different data but the same
2648 * seqnums? Something is borked, but let's make the
2649 * best of it and compare by time. */
2652 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2654 /* If the boot id matches, compare monotonic time */
2655 if (af
->current_monotonic
 < bf
->current_monotonic
)
2657 if (af
->current_monotonic
 > bf
->current_monotonic
)
2661 /* Otherwise, compare UTC time */
2662 if (af
->current_realtime
 < bf
->current_realtime
)
2664 if (af
->current_realtime
 > bf
->current_realtime
)
2667 /* Finally, compare by contents */
2668 if (af
->current_xor_hash
 < bf
->current_xor_hash
)
2670 if (af
->current_xor_hash
 > bf
->current_xor_hash
)
/* Step an array index by one in the given direction, clamping at the
 * bounds 0 and n (bound handling on elided lines). */
2676 static int bump_array_index(uint64_t *i
, direction_t direction
, uint64_t n
) {
2678 /* Increase or decrease the specified index, in the right direction. */
2680 if (direction
 == DIRECTION_DOWN
) {
/* Sanity check for iteration: successive entry offsets must be strictly
 * monotonic in the direction of travel; offset 0 means "uninitialized"
 * and always fails the check. */
2695 static bool check_properly_ordered(uint64_t new_offset
, uint64_t old_offset
, direction_t direction
) {
2697 /* Consider it an error if any of the two offsets is uninitialized */
2698 if (old_offset
 == 0 || new_offset
 == 0)
2701 /* If we go down, the new offset must be larger than the old one. */
2702 return direction
 == DIRECTION_DOWN
 ?
2703 new_offset
 > old_offset
 :
2704 new_offset
 < old_offset
;
/* Advance the iteration cursor one entry in `direction` from offset p
 * (or start at head/tail when p == 0): locate p's index by offset
 * bisection, bump the index, fetch that entry, skipping over entries
 * that fail to load, and verify the array stays properly ordered. */
2707 int journal_file_next_entry(
2710 direction_t direction
,
2711 Object
**ret
, uint64_t *offset
) {
2719 n
 = le64toh(f
->header
->n_entries
);
/* No reference offset: start from the first (DOWN) or last (UP) entry. */
2724 i
 = direction
 == DIRECTION_DOWN
 ? 0 : n
 - 1;
/* Otherwise find the index of the current entry by its file offset. */
2726 r
 = generic_array_bisect(f
,
2727 le64toh(f
->header
->entry_array_offset
),
2728 le64toh(f
->header
->n_entries
),
2737 r
 = bump_array_index(&i
, direction
, n
);
2742 /* And jump to it */
2744 r
 = generic_array_get(f
,
2745 le64toh(f
->header
->entry_array_offset
),
2753 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2754 * the next one might work for us instead. */
2755 log_debug_errno(r
, "Entry item %" PRIu64
" is bad, skipping over it.", i
);
2757 r
 = bump_array_index(&i
, direction
, n
);
2762 /* Ensure our array is properly ordered. */
2763 if (p
 > 0 && !check_properly_ordered(ofs
, p
, direction
)) {
2764 log_debug("%s: entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
/* Like journal_file_next_entry() but restricted to the entries that
 * reference a given DATA object (match iteration): bisect that data
 * object's entry list to find the current position, bump, fetch, skip
 * bad entries, and verify ordering. */
2774 int journal_file_next_entry_for_data(
2776 Object
*o
, uint64_t p
,
2777 uint64_t data_offset
,
2778 direction_t direction
,
2779 Object
**ret
, uint64_t *offset
) {
/* Either both o and p are provided (current entry), or neither. */
2786 assert(p
 > 0 || !o
);
2788 r
 = journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2792 n
 = le64toh(d
->data
.n_entries
);
2797 i
 = direction
 == DIRECTION_DOWN
 ? 0 : n
 - 1;
2799 if (o
->object
.type
 != OBJECT_ENTRY
)
2802 r
 = generic_array_bisect_plus_one(f
,
2803 le64toh(d
->data
.entry_offset
),
2804 le64toh(d
->data
.entry_array_offset
),
2805 le64toh(d
->data
.n_entries
),
2815 r
 = bump_array_index(&i
, direction
, n
);
2821 r
 = generic_array_get_plus_one(f
,
2822 le64toh(d
->data
.entry_offset
),
2823 le64toh(d
->data
.entry_array_offset
),
2831 log_debug_errno(r
, "Data entry item %" PRIu64
" is bad, skipping over it.", i
);
2833 r
 = bump_array_index(&i
, direction
, n
);
2838 /* Ensure our array is properly ordered. */
/* NOTE(review): the sibling journal_file_next_entry negates this call
 * ("!check_properly_ordered(...)") before logging the mis-order; the
 * missing '!' here looks suspicious — confirm against the full source
 * before relying on this branch. */
2839 if (p
 > 0 && check_properly_ordered(ofs
, p
, direction
)) {
2840 log_debug("%s data entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
/* Within the entries referencing a DATA object, seek to the entry at (or
 * nearest, per direction) a given global file offset. */
2850 int journal_file_move_to_entry_by_offset_for_data(
2852 uint64_t data_offset
,
2854 direction_t direction
,
2855 Object
**ret
, uint64_t *offset
) {
2862 r
 = journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2866 return generic_array_bisect_plus_one(f
,
2867 le64toh(d
->data
.entry_offset
),
2868 le64toh(d
->data
.entry_array_offset
),
2869 le64toh(d
->data
.n_entries
),
/* Seek by (boot id, monotonic time) restricted to entries referencing a
 * DATA object: first bisect the boot's _BOOT_ID= entry list by monotonic
 * time, then alternately bisect the data object's list and the boot's
 * list by offset until an entry present in both is found.
 * NOTE(review): the convergence loop's control flow is elided here. */
2876 int journal_file_move_to_entry_by_monotonic_for_data(
2878 uint64_t data_offset
,
2881 direction_t direction
,
2882 Object
**ret
, uint64_t *offset
) {
2890 /* First, seek by time */
2891 r
 = find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2897 r
 = generic_array_bisect_plus_one(f
,
2898 le64toh(o
->data
.entry_offset
),
2899 le64toh(o
->data
.entry_array_offset
),
2900 le64toh(o
->data
.n_entries
),
2902 test_object_monotonic
,
2908 /* And now, continue seeking until we find an entry that
2909 * exists in both bisection arrays */
2915 r
 = journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
/* Step 1 of each round: offset-bisect within the data object's list. */
2919 r
 = generic_array_bisect_plus_one(f
,
2920 le64toh(d
->data
.entry_offset
),
2921 le64toh(d
->data
.entry_array_offset
),
2922 le64toh(d
->data
.n_entries
),
/* Step 2: offset-bisect within the boot id's list (b = its offset). */
2930 r
 = journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2934 r
 = generic_array_bisect_plus_one(f
,
2935 le64toh(o
->data
.entry_offset
),
2936 le64toh(o
->data
.entry_array_offset
),
2937 le64toh(o
->data
.n_entries
),
/* Within the entries referencing a DATA object, seek by sequence number. */
2959 int journal_file_move_to_entry_by_seqnum_for_data(
2961 uint64_t data_offset
,
2963 direction_t direction
,
2964 Object
**ret
, uint64_t *offset
) {
2971 r
 = journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2975 return generic_array_bisect_plus_one(f
,
2976 le64toh(d
->data
.entry_offset
),
2977 le64toh(d
->data
.entry_array_offset
),
2978 le64toh(d
->data
.n_entries
),
/* Within the entries referencing a DATA object, seek by realtime
 * timestamp using test_object_realtime. */
2985 int journal_file_move_to_entry_by_realtime_for_data(
2987 uint64_t data_offset
,
2989 direction_t direction
,
2990 Object
**ret
, uint64_t *offset
) {
2997 r
 = journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
3001 return generic_array_bisect_plus_one(f
,
3002 le64toh(d
->data
.entry_offset
),
3003 le64toh(d
->data
.entry_array_offset
),
3004 le64toh(d
->data
.n_entries
),
3006 test_object_realtime
,
/* Debug dump: print the header, then walk every object from header_size
 * to tail_object_offset, printing its type (and key fields for ENTRY and
 * TAG objects) plus any compression flag. On a walk error, reports the
 * file as corrupt. NOTE(review): several `case` labels and `break`s are
 * elided in this extraction. */
3011 void journal_file_dump(JournalFile
*f
) {
3019 journal_file_print_header(f
);
/* Objects start immediately after the header. */
3021 p
 = le64toh(f
->header
->header_size
);
/* OBJECT_UNUSED acts as a wildcard type filter for the move. */
3023 r
 = journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
3027 switch (o
->object
.type
) {
3030 printf("Type: OBJECT_UNUSED\n");
3034 printf("Type: OBJECT_DATA\n");
3038 printf("Type: OBJECT_FIELD\n");
3042 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
3043 le64toh(o
->entry
.seqnum
),
3044 le64toh(o
->entry
.monotonic
),
3045 le64toh(o
->entry
.realtime
));
3048 case OBJECT_FIELD_HASH_TABLE
:
3049 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3052 case OBJECT_DATA_HASH_TABLE
:
3053 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3056 case OBJECT_ENTRY_ARRAY
:
3057 printf("Type: OBJECT_ENTRY_ARRAY\n");
3061 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
3062 le64toh(o
->tag
.seqnum
),
3063 le64toh(o
->tag
.epoch
));
3067 printf("Type: unknown (%i)\n", o
->object
.type
);
3071 if (o
->object
.flags
 & OBJECT_COMPRESSION_MASK
)
3072 printf("Flags: %s\n",
3073 object_compressed_to_string(o
->object
.flags
 & OBJECT_COMPRESSION_MASK
));
/* Stop after the last object. */
3075 if (p
 == le64toh(f
->header
->tail_object_offset
))
/* Objects are 64-bit aligned on disk. */
3078 p
 = p
 + ALIGN64(le64toh(o
->object
.size
));
3083 log_error("File corrupt");
/* format_timestamp() wrapper that never returns NULL — a fallback string
 * is returned on failure (fallback branch elided here). */
3086 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
3089 x
 = format_timestamp(buf
, l
, t
);
/* Print a human-readable summary of the journal file header: ids, state,
 * flags, sizes, hash-table fill levels, head/tail seqnums and timestamps,
 * object counters (where the header is new enough to contain them), and
 * on-disk usage. Read-only; output goes to stdout. */
3095 void journal_file_print_header(JournalFile
*f
) {
/* a..d: 33-byte buffers for sd_id128_to_string (32 hex chars + NUL). */
3096 char a
[33], b
[33], c
[33], d
[33];
3097 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
3099 char bytes
[FORMAT_BYTES_MAX
];
3104 printf("File Path: %s\n"
3108 "Sequential Number ID: %s\n"
3110 "Compatible Flags:%s%s\n"
3111 "Incompatible Flags:%s%s%s\n"
3112 "Header size: %"PRIu64
"\n"
3113 "Arena size: %"PRIu64
"\n"
3114 "Data Hash Table Size: %"PRIu64
"\n"
3115 "Field Hash Table Size: %"PRIu64
"\n"
3116 "Rotate Suggested: %s\n"
3117 "Head Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3118 "Tail Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3119 "Head Realtime Timestamp: %s (%"PRIx64
")\n"
3120 "Tail Realtime Timestamp: %s (%"PRIx64
")\n"
3121 "Tail Monotonic Timestamp: %s (%"PRIx64
")\n"
3122 "Objects: %"PRIu64
"\n"
3123 "Entry Objects: %"PRIu64
"\n",
3125 sd_id128_to_string(f
->header
->file_id
, a
),
3126 sd_id128_to_string(f
->header
->machine_id
, b
),
3127 sd_id128_to_string(f
->header
->boot_id
, c
),
3128 sd_id128_to_string(f
->header
->seqnum_id
, d
),
3129 f
->header
->state
 == STATE_OFFLINE
 ? "OFFLINE" :
3130 f
->header
->state
 == STATE_ONLINE
 ? "ONLINE" :
3131 f
->header
->state
 == STATE_ARCHIVED
 ? "ARCHIVED" : "UNKNOWN",
3132 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
/* " ???" marks flag bits this build does not know about. */
3133 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
3134 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
3135 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
3136 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
3137 le64toh(f
->header
->header_size
),
3138 le64toh(f
->header
->arena_size
),
3139 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3140 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
3141 yes_no(journal_file_rotate_suggested(f
, 0)),
3142 le64toh(f
->header
->head_entry_seqnum
), le64toh(f
->header
->head_entry_seqnum
),
3143 le64toh(f
->header
->tail_entry_seqnum
), le64toh(f
->header
->tail_entry_seqnum
),
3144 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)), le64toh(f
->header
->head_entry_realtime
),
3145 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)), le64toh(f
->header
->tail_entry_realtime
),
3146 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
), le64toh(f
->header
->tail_entry_monotonic
),
3147 le64toh(f
->header
->n_objects
),
3148 le64toh(f
->header
->n_entries
));
/* The following counters only exist in newer header revisions. */
3150 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3151 printf("Data Objects: %"PRIu64
"\n"
3152 "Data Hash Table Fill: %.1f%%\n",
3153 le64toh(f
->header
->n_data
),
3154 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
3156 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3157 printf("Field Objects: %"PRIu64
"\n"
3158 "Field Hash Table Fill: %.1f%%\n",
3159 le64toh(f
->header
->n_fields
),
3160 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
3162 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
3163 printf("Tag Objects: %"PRIu64
"\n",
3164 le64toh(f
->header
->n_tags
));
3165 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
3166 printf("Entry Array Objects: %"PRIu64
"\n",
3167 le64toh(f
->header
->n_entry_arrays
));
/* st_blocks counts 512-byte units regardless of the fs block size. */
3169 if (fstat(f
->fd
, &st
) >= 0)
3170 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
 * 512ULL));
/* On btrfs, warn loudly if copy-on-write is still enabled for this
 * journal file, since the journal's write pattern fragments COW
 * filesystems badly; non-btrfs filesystems pass silently. */
3173 static int journal_file_warn_btrfs(JournalFile
*f
) {
3179 /* Before we write anything, check if the COW logic is turned
3180 * off on btrfs. Given our write pattern that is quite
3181 * unfriendly to COW file systems this should greatly improve
3182 * performance on COW file systems, such as btrfs, at the
3183 * expense of data integrity features (which shouldn't be too
3184 * bad, given that we do our own checksumming). */
3186 r
 = btrfs_is_filesystem(f
->fd
);
3188 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
3192 r
 = read_attr_fd(f
->fd
, &attrs
);
3194 return log_warning_errno(r
, "Failed to read file attributes: %m");
3196 if (attrs
 & FS_NOCOW_FL
) {
3197 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3201 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3202 "This is likely to slow down journal access substantially, please consider turning "
3203 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
/* Open (or create) a journal file from a path and/or pre-opened fd:
 * allocates the JournalFile, sets up the mmap cache, opens/validates the
 * file, initializes a fresh header for empty writable files (btrfs
 * warning, crtime xattr, FSS state, hash tables, first tag), verifies
 * existing headers, installs metrics, and optionally inherits the
 * post-change timer from a template file. On any failure the partially
 * constructed object is torn down via journal_file_close().
 * NOTE(review): error-return branches and several assignments are elided
 * in this extraction; code text left byte-identical. */
3208 int journal_file_open(
3215 JournalMetrics
*metrics
,
3216 MMapCache
*mmap_cache
,
3217 Set
*deferred_closes
,
3218 JournalFile
*template,
3219 JournalFile
**ret
) {
3221 bool newly_created
 = false;
/* Need at least one way to reach the file. */
3227 assert(fd
 >= 0 || fname
);
3229 if (!IN_SET((flags
 & O_ACCMODE
), O_RDONLY
, O_RDWR
))
/* Newly created files must carry the .journal suffix. */
3232 if (fname
 && (flags
 & O_CREAT
) && !endswith(fname
, ".journal"))
3235 f
 = new0(JournalFile
, 1);
3243 f
->prot
 = prot_from_flags(flags
);
3244 f
->writable
 = (flags
 & O_ACCMODE
) != O_RDONLY
;
/* Both set from one `compress` flag; which codec wins is decided by
 * build configuration on elided lines — confirm. */
3246 f
->compress_lz4
 = compress
;
3248 f
->compress_xz
 = compress
;
/* Share the caller's mmap cache when given, else create a private one. */
3255 f
->mmap
 = mmap_cache_ref(mmap_cache
);
3257 f
->mmap
 = mmap_cache_new();
3265 f
->path
 = strdup(fname
);
3273 /* If we don't know the path, fill in something explanatory and vaguely useful */
3274 if (asprintf(&f
->path
, "/proc/self/%i", fd
) < 0) {
3280 f
->chain_cache
 = ordered_hashmap_new(&uint64_hash_ops
);
3281 if (!f
->chain_cache
) {
3287 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3288 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3289 * it doesn't hurt in that case. */
3291 f
->fd
 = open(f
->path
, f
->flags
|O_CLOEXEC
|O_NONBLOCK
, f
->mode
);
3297 /* fds we opened here by us should also be closed by us. */
/* Drop O_NONBLOCK again now that the open itself can't block. */
3300 r
 = fd_nonblock(f
->fd
, false);
3305 f
->cache_fd
 = mmap_cache_add_fd(f
->mmap
, f
->fd
);
3311 r
 = journal_file_fstat(f
);
/* Empty + writable == we are creating this journal from scratch. */
3315 if (f
->last_stat
.st_size
 == 0 && f
->writable
) {
3317 (void) journal_file_warn_btrfs(f
);
3319 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3320 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3321 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3322 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3323 * solely on mtime/atime/ctime of the file. */
3324 (void) fd_setcrtime(f
->fd
, 0);
3327 /* Try to load the FSPRG state, and if we can't, then
3328 * just don't do sealing */
3330 r
 = journal_file_fss_load(f
);
3336 r
 = journal_file_init_header(f
, template);
3340 r
 = journal_file_fstat(f
);
3344 newly_created
 = true;
/* A file smaller than the minimum header cannot be a valid journal. */
3347 if (f
->last_stat
.st_size
 < (off_t
) HEADER_SIZE_MIN
) {
3352 r
 = mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
, NULL
);
3358 if (!newly_created
) {
/* Flush deferred closes first so a pending close of this same file
 * can't race the verification (presumably — confirm ordering intent). */
3359 set_clear_with_destructor(deferred_closes
, journal_file_close
);
3361 r
 = journal_file_verify_header(f
);
3367 if (!newly_created
 && f
->writable
) {
3368 r
 = journal_file_fss_load(f
);
/* Metrics: normalize the caller's values against the backing fs, or
 * inherit from the template. */
3376 journal_default_metrics(metrics
, f
->fd
);
3377 f
->metrics
 = *metrics
;
3378 } else if (template)
3379 f
->metrics
 = template->metrics
;
3381 r
 = journal_file_refresh_header(f
);
3387 r
 = journal_file_hmac_setup(f
);
3392 if (newly_created
) {
3393 r
 = journal_file_setup_field_hash_table(f
);
3397 r
 = journal_file_setup_data_hash_table(f
);
3402 r
 = journal_file_append_first_tag(f
);
3408 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
)) {
3413 if (template && template->post_change_timer
) {
3414 r
 = journal_file_enable_post_change_timer(
3416 sd_event_source_get_event(template->post_change_timer
),
3417 template->post_change_timer_period
);
3423 /* The file is opened now successfully, thus we take possession of any passed in fd. */
/* Error path: tear down everything allocated above. */
3430 if (f
->cache_fd
 && mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
3433 (void) journal_file_close(f
);
3438 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
, Set
*deferred_closes
) {
3439 _cleanup_free_
char *p
= NULL
;
3441 JournalFile
*old_file
, *new_file
= NULL
;
3449 if (!old_file
->writable
)
3452 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3453 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3454 if (path_startswith(old_file
->path
, "/proc/self/fd"))
3457 if (!endswith(old_file
->path
, ".journal"))
3460 l
= strlen(old_file
->path
);
3461 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
3462 (int) l
- 8, old_file
->path
,
3463 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
3464 le64toh((*f
)->header
->head_entry_seqnum
),
3465 le64toh((*f
)->header
->head_entry_realtime
));
3469 /* Try to rename the file to the archived version. If the file
3470 * already was deleted, we'll get ENOENT, let's ignore that
3472 r
= rename(old_file
->path
, p
);
3473 if (r
< 0 && errno
!= ENOENT
)
3476 /* Sync the rename to disk */
3477 (void) fsync_directory_of_file(old_file
->fd
);
3479 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3480 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3481 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3482 * would result in the rotated journal never getting fsync() called before closing.
3483 * Now we simply queue the archive state by setting an archive bit, leaving the state
3484 * as STATE_ONLINE so proper offlining occurs. */
3485 old_file
->archive
= true;
3487 /* Currently, btrfs is not very good with out write patterns
3488 * and fragments heavily. Let's defrag our journal files when
3489 * we archive them */
3490 old_file
->defrag_on_close
= true;
3492 r
= journal_file_open(-1, old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, deferred_closes
, old_file
, &new_file
);
3494 if (deferred_closes
&&
3495 set_put(deferred_closes
, old_file
) >= 0)
3496 (void) journal_file_set_offline(old_file
, false);
3498 (void) journal_file_close(old_file
);
3504 int journal_file_open_reliably(
3510 JournalMetrics
*metrics
,
3511 MMapCache
*mmap_cache
,
3512 Set
*deferred_closes
,
3513 JournalFile
*template,
3514 JournalFile
**ret
) {
3518 _cleanup_free_
char *p
= NULL
;
3520 r
= journal_file_open(-1, fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, deferred_closes
, template, ret
);
3522 -EBADMSG
, /* Corrupted */
3523 -ENODATA
, /* Truncated */
3524 -EHOSTDOWN
, /* Other machine */
3525 -EPROTONOSUPPORT
, /* Incompatible feature */
3526 -EBUSY
, /* Unclean shutdown */
3527 -ESHUTDOWN
, /* Already archived */
3528 -EIO
, /* IO error, including SIGBUS on mmap */
3529 -EIDRM
, /* File has been deleted */
3530 -ETXTBSY
)) /* File is from the future */
3533 if ((flags
& O_ACCMODE
) == O_RDONLY
)
3536 if (!(flags
& O_CREAT
))
3539 if (!endswith(fname
, ".journal"))
3542 /* The file is corrupted. Rotate it away and try it again (but only once) */
3545 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
3547 now(CLOCK_REALTIME
),
3551 if (rename(fname
, p
) < 0)
3554 /* btrfs doesn't cope well with our write pattern and
3555 * fragments heavily. Let's defrag all files we rotate */
3557 (void) chattr_path(p
, 0, FS_NOCOW_FL
);
3558 (void) btrfs_defrag(p
);
3560 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
3562 return journal_file_open(-1, fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, deferred_closes
, template, ret
);
/* Copies one entry object (located at offset p in 'from', already resolved as 'o') into journal file
 * 'to': every data object the entry references is re-appended to 'to' — decompressing on the way if
 * the source object is compressed — and then a new entry referencing the copies is written.
 * Returns 0 (or the result of the final append) on success, a negative errno-style code on failure. */
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
        uint64_t i, n;
        uint64_t q, xor_hash = 0;
        int r;
        EntryItem *items;
        dual_timestamp ts;

        assert(from);
        assert(to);
        assert(o);
        assert(p);

        if (!to->writable)
                return -EPERM;

        /* Preserve the source entry's timestamps unmodified. */
        ts.monotonic = le64toh(o->entry.monotonic);
        ts.realtime = le64toh(o->entry.realtime);

        n = journal_file_entry_n_items(o);
        /* alloca() can't take 0, hence let's allocate at least one */
        items = alloca(sizeof(EntryItem) * MAX(1u, n));

        for (i = 0; i < n; i++) {
                uint64_t l, h;
                le64_t le_hash;
                size_t t;
                void *data;
                Object *u;

                q = le64toh(o->entry.items[i].object_offset);
                le_hash = o->entry.items[i].hash;

                /* Resolve the i-th data object. NOTE: this may remap the window and hence
                 * invalidate 'o' — which is why the entry is re-resolved at the end of each
                 * loop iteration below. */
                r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
                if (r < 0)
                        return r;

                /* The hash stored in the entry item must match the data object's own hash,
                 * otherwise the file is corrupt. */
                if (le_hash != o->data.hash)
                        return -EBADMSG;

                l = le64toh(o->object.size) - offsetof(Object, data.payload);
                t = (size_t) l;

                /* We hit the limit on 32bit machines */
                if ((uint64_t) t != l)
                        return -E2BIG;

                if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if HAVE_XZ || HAVE_LZ4
                        size_t rsize = 0;

                        /* Decompress into from's scratch buffer, which is grown as needed. */
                        r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
                                            o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
                        if (r < 0)
                                return r;

                        data = from->compress_buffer;
                        l = rsize;
#else
                        return -EPROTONOSUPPORT;
#endif
                } else
                        data = o->data.payload;

                /* Append the (possibly decompressed) payload to the destination journal; 'h' receives
                 * the offset of the appended data object in 'to'. */
                r = journal_file_append_data(to, data, l, &u, &h);
                if (r < 0)
                        return r;

                /* An entry's XOR hash is the XOR of all of its data objects' hashes. */
                xor_hash ^= le64toh(u->data.hash);
                items[i].object_offset = htole64(h);
                items[i].hash = u->data.hash;

                /* Re-map the source entry, since the mapping may have been invalidated above. */
                r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
                if (r < 0)
                        return r;
        }

        r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);

        /* If we caught a SIGBUS while touching the destination's mapping, the write is unreliable. */
        if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
                return -EIO;

        return r;
}
3649 void journal_reset_metrics(JournalMetrics
*m
) {
3652 /* Set everything to "pick automatic values". */
3654 *m
= (JournalMetrics
) {
3655 .min_use
= (uint64_t) -1,
3656 .max_use
= (uint64_t) -1,
3657 .min_size
= (uint64_t) -1,
3658 .max_size
= (uint64_t) -1,
3659 .keep_free
= (uint64_t) -1,
3660 .n_max_files
= (uint64_t) -1,
/* Replaces any metrics field still holding the (uint64_t) -1 sentinel (see journal_reset_metrics())
 * with a default derived from the size of the file system backing 'fd', then page-aligns and clamps
 * all fields so they are mutually consistent. The clamping order below is deliberate: later
 * constraints (e.g. max_size vs. max_use) must be applied after the fields they depend on. */
void journal_default_metrics(JournalMetrics *m, int fd) {
        char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
        struct statvfs ss;
        uint64_t fs_size;

        assert(m);
        assert(fd >= 0);

        /* Determine the total size of the backing file system; 0 means "unknown". */
        if (fstatvfs(fd, &ss) >= 0)
                fs_size = ss.f_frsize * ss.f_blocks;
        else {
                log_debug_errno(errno, "Failed to determine disk size: %m");
                fs_size = 0;
        }

        if (m->max_use == (uint64_t) -1) {

                if (fs_size > 0) {
                        m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */

                        if (m->max_use > DEFAULT_MAX_USE_UPPER)
                                m->max_use = DEFAULT_MAX_USE_UPPER;

                        if (m->max_use < DEFAULT_MAX_USE_LOWER)
                                m->max_use = DEFAULT_MAX_USE_LOWER;
                } else
                        m->max_use = DEFAULT_MAX_USE_LOWER;
        } else {
                m->max_use = PAGE_ALIGN(m->max_use);

                /* An explicitly configured non-zero max_use must leave room for at least
                 * two minimally-sized journal files. */
                if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
                        m->max_use = JOURNAL_FILE_SIZE_MIN*2;
        }

        if (m->min_use == (uint64_t) -1)
                m->min_use = DEFAULT_MIN_USE;

        if (m->min_use > m->max_use)
                m->min_use = m->max_use;

        if (m->max_size == (uint64_t) -1) {
                m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */

                if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
                        m->max_size = DEFAULT_MAX_SIZE_UPPER;
        } else
                m->max_size = PAGE_ALIGN(m->max_size);

        /* max_size == 0 disables the per-file limit; only clamp a non-zero value. */
        if (m->max_size != 0) {
                if (m->max_size < JOURNAL_FILE_SIZE_MIN)
                        m->max_size = JOURNAL_FILE_SIZE_MIN;

                /* Make sure max_use can hold at least two files of max_size. */
                if (m->max_use != 0 && m->max_size*2 > m->max_use)
                        m->max_use = m->max_size*2;
        }

        if (m->min_size == (uint64_t) -1)
                m->min_size = JOURNAL_FILE_SIZE_MIN;

        m->min_size = PAGE_ALIGN(m->min_size);

        if (m->min_size < JOURNAL_FILE_SIZE_MIN)
                m->min_size = JOURNAL_FILE_SIZE_MIN;

        if (m->max_size != 0 && m->min_size > m->max_size)
                m->max_size = m->min_size;

        if (m->keep_free == (uint64_t) -1) {

                if (fs_size > 0) {
                        m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */

                        if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
                                m->keep_free = DEFAULT_KEEP_FREE_UPPER;
                } else
                        m->keep_free = DEFAULT_KEEP_FREE;
        }

        if (m->n_max_files == (uint64_t) -1)
                m->n_max_files = DEFAULT_N_MAX_FILES;

        log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
                  format_bytes(a, sizeof(a), m->min_use),
                  format_bytes(b, sizeof(b), m->max_use),
                  format_bytes(c, sizeof(c), m->max_size),
                  format_bytes(d, sizeof(d), m->min_size),
                  format_bytes(e, sizeof(e), m->keep_free),
                  m->n_max_files);
}
3756 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3762 if (f
->header
->head_entry_realtime
== 0)
3765 *from
= le64toh(f
->header
->head_entry_realtime
);
3769 if (f
->header
->tail_entry_realtime
== 0)
3772 *to
= le64toh(f
->header
->tail_entry_realtime
);
3778 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3786 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3790 if (le64toh(o
->data
.n_entries
) <= 0)
3794 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3798 *from
= le64toh(o
->entry
.monotonic
);
3802 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3806 r
= generic_array_get_plus_one(f
,
3807 le64toh(o
->data
.entry_offset
),
3808 le64toh(o
->data
.entry_array_offset
),
3809 le64toh(o
->data
.n_entries
)-1,
3814 *to
= le64toh(o
->entry
.monotonic
);
3820 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3824 /* If we gained new header fields we gained new features,
3825 * hence suggest a rotation */
3826 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3827 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3831 /* Let's check if the hash tables grew over a certain fill
3832 * level (75%, borrowing this value from Java's hash table
3833 * implementation), and if so suggest a rotation. To calculate
3834 * the fill level we need the n_data field, which only exists
3835 * in newer versions. */
3837 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3838 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3839 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3841 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3842 le64toh(f
->header
->n_data
),
3843 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3844 (unsigned long long) f
->last_stat
.st_size
,
3845 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3849 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3850 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3851 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3853 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3854 le64toh(f
->header
->n_fields
),
3855 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3859 /* Are the data objects properly indexed by field objects? */
3860 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3861 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3862 le64toh(f
->header
->n_data
) > 0 &&
3863 le64toh(f
->header
->n_fields
) == 0)
3866 if (max_file_usec
> 0) {
3869 h
= le64toh(f
->header
->head_entry_realtime
);
3870 t
= now(CLOCK_REALTIME
);
3872 if (h
> 0 && t
> h
+ max_file_usec
)