/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2011 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stddef.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <unistd.h>

#include "alloc-util.h"
#include "btrfs-util.h"
#include "chattr-util.h"
#include "journal-authenticate.h"
#include "journal-def.h"
#include "journal-file.h"
#include "parse-util.h"
#include "path-util.h"
#include "random-util.h"
#include "string-util.h"
#include "xattr-util.h"
/* Default on-disk hash table sizes (in bytes) for the data and field hash tables. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Payloads smaller than this are never worth compressing. */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL)                 /* 512 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header we pick as one above the last defined typed */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX

/* The journal header is a packed struct; taking addresses of its members is deliberate here. */
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
98 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
99 * As a result we use atomic operations on f->offline_state for inter-thread communications with
100 * journal_file_set_offline() and journal_file_set_online(). */
101 static void journal_file_set_offline_internal(JournalFile
*f
) {
107 switch (f
->offline_state
) {
109 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_DONE
))
113 case OFFLINE_AGAIN_FROM_SYNCING
:
114 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_SYNCING
))
118 case OFFLINE_AGAIN_FROM_OFFLINING
:
119 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_SYNCING
))
123 case OFFLINE_SYNCING
:
126 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_OFFLINING
))
129 f
->header
->state
= f
->archive
? STATE_ARCHIVED
: STATE_OFFLINE
;
133 case OFFLINE_OFFLINING
:
134 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_DONE
))
141 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
147 static void * journal_file_set_offline_thread(void *arg
) {
148 JournalFile
*f
= arg
;
150 journal_file_set_offline_internal(f
);
155 static int journal_file_set_offline_thread_join(JournalFile
*f
) {
160 if (f
->offline_state
== OFFLINE_JOINED
)
163 r
= pthread_join(f
->offline_thread
, NULL
);
167 f
->offline_state
= OFFLINE_JOINED
;
169 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
175 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
176 static bool journal_file_set_offline_try_restart(JournalFile
*f
) {
178 switch (f
->offline_state
) {
179 case OFFLINE_AGAIN_FROM_SYNCING
:
180 case OFFLINE_AGAIN_FROM_OFFLINING
:
184 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_AGAIN_FROM_SYNCING
))
188 case OFFLINE_SYNCING
:
189 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_AGAIN_FROM_SYNCING
))
193 case OFFLINE_OFFLINING
:
194 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_AGAIN_FROM_OFFLINING
))
204 /* Sets a journal offline.
206 * If wait is false then an offline is dispatched in a separate thread for a
207 * subsequent journal_file_set_offline() or journal_file_set_online() of the
208 * same journal to synchronize with.
210 * If wait is true, then either an existing offline thread will be restarted
211 * and joined, or if none exists the offline is simply performed in this
212 * context without involving another thread.
214 int journal_file_set_offline(JournalFile
*f
, bool wait
) {
223 if (!(f
->fd
>= 0 && f
->header
))
226 /* An offlining journal is implicitly online and may modify f->header->state,
227 * we must also join any potentially lingering offline thread when not online. */
228 if (!journal_file_is_offlining(f
) && f
->header
->state
!= STATE_ONLINE
)
229 return journal_file_set_offline_thread_join(f
);
231 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
232 restarted
= journal_file_set_offline_try_restart(f
);
233 if ((restarted
&& wait
) || !restarted
) {
234 r
= journal_file_set_offline_thread_join(f
);
242 /* Initiate a new offline. */
243 f
->offline_state
= OFFLINE_SYNCING
;
245 if (wait
) /* Without using a thread if waiting. */
246 journal_file_set_offline_internal(f
);
248 r
= pthread_create(&f
->offline_thread
, NULL
, journal_file_set_offline_thread
, f
);
250 f
->offline_state
= OFFLINE_JOINED
;
258 static int journal_file_set_online(JournalFile
*f
) {
266 if (!(f
->fd
>= 0 && f
->header
))
270 switch (f
->offline_state
) {
272 /* No offline thread, no need to wait. */
276 case OFFLINE_SYNCING
:
277 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_CANCEL
))
279 /* Canceled syncing prior to offlining, no need to wait. */
282 case OFFLINE_AGAIN_FROM_SYNCING
:
283 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_CANCEL
))
285 /* Canceled restart from syncing, no need to wait. */
288 case OFFLINE_AGAIN_FROM_OFFLINING
:
289 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_CANCEL
))
291 /* Canceled restart from offlining, must wait for offlining to complete however. */
296 r
= journal_file_set_offline_thread_join(f
);
306 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
309 switch (f
->header
->state
) {
314 f
->header
->state
= STATE_ONLINE
;
323 bool journal_file_is_offlining(JournalFile
*f
) {
326 __sync_synchronize();
328 if (IN_SET(f
->offline_state
, OFFLINE_DONE
, OFFLINE_JOINED
))
334 JournalFile
* journal_file_close(JournalFile
*f
) {
338 /* Write the final tag */
339 if (f
->seal
&& f
->writable
) {
342 r
= journal_file_append_tag(f
);
344 log_error_errno(r
, "Failed to append tag when closing journal: %m");
348 if (f
->post_change_timer
) {
351 if (sd_event_source_get_enabled(f
->post_change_timer
, &enabled
) >= 0)
352 if (enabled
== SD_EVENT_ONESHOT
)
353 journal_file_post_change(f
);
355 (void) sd_event_source_set_enabled(f
->post_change_timer
, SD_EVENT_OFF
);
356 sd_event_source_unref(f
->post_change_timer
);
359 journal_file_set_offline(f
, true);
361 if (f
->mmap
&& f
->cache_fd
)
362 mmap_cache_free_fd(f
->mmap
, f
->cache_fd
);
364 if (f
->fd
>= 0 && f
->defrag_on_close
) {
366 /* Be friendly to btrfs: turn COW back on again now,
367 * and defragment the file. We won't write to the file
368 * ever again, hence remove all fragmentation, and
369 * reenable all the good bits COW usually provides
370 * (such as data checksumming). */
372 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
373 (void) btrfs_defrag_fd(f
->fd
);
380 mmap_cache_unref(f
->mmap
);
382 ordered_hashmap_free_free(f
->chain_cache
);
384 #if HAVE_XZ || HAVE_LZ4
385 free(f
->compress_buffer
);
390 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
392 free(f
->fsprg_state
);
397 gcry_md_close(f
->hmac
);
403 void journal_file_close_set(Set
*s
) {
408 while ((f
= set_steal_first(s
)))
409 (void) journal_file_close(f
);
412 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
419 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
420 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
422 h
.incompatible_flags
|= htole32(
423 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
424 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
426 h
.compatible_flags
= htole32(
427 f
->seal
* HEADER_COMPATIBLE_SEALED
);
429 r
= sd_id128_randomize(&h
.file_id
);
434 h
.seqnum_id
= template->header
->seqnum_id
;
435 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
437 h
.seqnum_id
= h
.file_id
;
439 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
449 static int fsync_directory_of_file(int fd
) {
450 _cleanup_free_
char *path
= NULL
, *dn
= NULL
;
451 _cleanup_close_
int dfd
= -1;
455 if (fstat(fd
, &st
) < 0)
458 if (!S_ISREG(st
.st_mode
))
461 r
= fd_get_path(fd
, &path
);
465 if (!path_is_absolute(path
))
468 dn
= dirname_malloc(path
);
472 dfd
= open(dn
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
482 static int journal_file_refresh_header(JournalFile
*f
) {
489 r
= sd_id128_get_machine(&f
->header
->machine_id
);
493 r
= sd_id128_get_boot(&boot_id
);
497 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
498 f
->tail_entry_monotonic_valid
= true;
500 f
->header
->boot_id
= boot_id
;
502 r
= journal_file_set_online(f
);
504 /* Sync the online state to disk */
507 /* We likely just created a new file, also sync the directory this file is located in. */
508 (void) fsync_directory_of_file(f
->fd
);
513 static bool warn_wrong_flags(const JournalFile
*f
, bool compatible
) {
514 const uint32_t any
= compatible
? HEADER_COMPATIBLE_ANY
: HEADER_INCOMPATIBLE_ANY
,
515 supported
= compatible
? HEADER_COMPATIBLE_SUPPORTED
: HEADER_INCOMPATIBLE_SUPPORTED
;
516 const char *type
= compatible
? "compatible" : "incompatible";
519 flags
= le32toh(compatible
? f
->header
->compatible_flags
: f
->header
->incompatible_flags
);
521 if (flags
& ~supported
) {
523 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32
,
524 f
->path
, type
, flags
& ~any
);
525 flags
= (flags
& any
) & ~supported
;
529 _cleanup_free_
char *t
= NULL
;
531 if (compatible
&& (flags
& HEADER_COMPATIBLE_SEALED
))
532 strv
[n
++] = "sealed";
533 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_XZ
))
534 strv
[n
++] = "xz-compressed";
535 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_LZ4
))
536 strv
[n
++] = "lz4-compressed";
538 assert(n
< ELEMENTSOF(strv
));
540 t
= strv_join((char**) strv
, ", ");
541 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
542 f
->path
, type
, n
> 1 ? "flags" : "flag", strnull(t
));
550 static int journal_file_verify_header(JournalFile
*f
) {
551 uint64_t arena_size
, header_size
;
556 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
559 /* In both read and write mode we refuse to open files with incompatible
560 * flags we don't know. */
561 if (warn_wrong_flags(f
, false))
562 return -EPROTONOSUPPORT
;
564 /* When open for writing we refuse to open files with compatible flags, too. */
565 if (f
->writable
&& warn_wrong_flags(f
, true))
566 return -EPROTONOSUPPORT
;
568 if (f
->header
->state
>= _STATE_MAX
)
571 header_size
= le64toh(f
->header
->header_size
);
573 /* The first addition was n_data, so check that we are at least this large */
574 if (header_size
< HEADER_SIZE_MIN
)
577 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
580 arena_size
= le64toh(f
->header
->arena_size
);
582 if (UINT64_MAX
- header_size
< arena_size
|| header_size
+ arena_size
> (uint64_t) f
->last_stat
.st_size
)
585 if (le64toh(f
->header
->tail_object_offset
) > header_size
+ arena_size
)
588 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
589 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
590 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
591 !VALID64(le64toh(f
->header
->entry_array_offset
)))
595 sd_id128_t machine_id
;
599 r
= sd_id128_get_machine(&machine_id
);
603 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
606 state
= f
->header
->state
;
608 if (state
== STATE_ARCHIVED
)
609 return -ESHUTDOWN
; /* Already archived */
610 else if (state
== STATE_ONLINE
) {
611 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
613 } else if (state
!= STATE_OFFLINE
) {
614 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
618 if (f
->header
->field_hash_table_size
== 0 || f
->header
->data_hash_table_size
== 0)
621 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
622 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
624 if (le64toh(f
->header
->tail_entry_realtime
) > now(CLOCK_REALTIME
)) {
625 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f
->path
);
630 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
631 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
633 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
638 static int journal_file_fstat(JournalFile
*f
) {
642 if (fstat(f
->fd
, &f
->last_stat
) < 0)
645 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
647 /* Refuse appending to files that are already deleted */
648 if (f
->last_stat
.st_nlink
<= 0)
654 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
655 uint64_t old_size
, new_size
;
661 /* We assume that this file is not sparse, and we know that
662 * for sure, since we always call posix_fallocate()
665 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
669 le64toh(f
->header
->header_size
) +
670 le64toh(f
->header
->arena_size
);
672 new_size
= PAGE_ALIGN(offset
+ size
);
673 if (new_size
< le64toh(f
->header
->header_size
))
674 new_size
= le64toh(f
->header
->header_size
);
676 if (new_size
<= old_size
) {
678 /* We already pre-allocated enough space, but before
679 * we write to it, let's check with fstat() if the
680 * file got deleted, in order make sure we don't throw
681 * away the data immediately. Don't check fstat() for
682 * all writes though, but only once ever 10s. */
684 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
687 return journal_file_fstat(f
);
690 /* Allocate more space. */
692 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
695 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
698 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
701 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
703 if (new_size
- old_size
> available
)
708 /* Increase by larger blocks at once */
709 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
710 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
711 new_size
= f
->metrics
.max_size
;
713 /* Note that the glibc fallocate() fallback is very
714 inefficient, hence we try to minimize the allocation area
716 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
720 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
722 return journal_file_fstat(f
);
725 static unsigned type_to_context(ObjectType type
) {
726 /* One context for each type, plus one catch-all for the rest */
727 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
728 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
729 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
732 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
, size_t *ret_size
) {
741 /* Avoid SIGBUS on invalid accesses */
742 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
743 /* Hmm, out of range? Let's refresh the fstat() data
744 * first, before we trust that check. */
746 r
= journal_file_fstat(f
);
750 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
751 return -EADDRNOTAVAIL
;
754 return mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
, ret_size
);
757 static uint64_t minimum_header_size(Object
*o
) {
759 static const uint64_t table
[] = {
760 [OBJECT_DATA
] = sizeof(DataObject
),
761 [OBJECT_FIELD
] = sizeof(FieldObject
),
762 [OBJECT_ENTRY
] = sizeof(EntryObject
),
763 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
764 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
765 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
766 [OBJECT_TAG
] = sizeof(TagObject
),
769 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
770 return sizeof(ObjectHeader
);
772 return table
[o
->object
.type
];
775 /* Lightweight object checks. We want this to be fast, so that we won't
776 * slowdown every journal_file_move_to_object() call too much. */
777 static int journal_file_check_object(JournalFile
*f
, uint64_t offset
, Object
*o
) {
781 switch (o
->object
.type
) {
784 if ((le64toh(o
->data
.entry_offset
) == 0) ^ (le64toh(o
->data
.n_entries
) == 0)) {
785 log_debug("Bad n_entries: %"PRIu64
": %"PRIu64
,
786 le64toh(o
->data
.n_entries
), offset
);
790 if (le64toh(o
->object
.size
) - offsetof(DataObject
, payload
) <= 0) {
791 log_debug("Bad object size (<= %zu): %"PRIu64
": %"PRIu64
,
792 offsetof(DataObject
, payload
),
793 le64toh(o
->object
.size
),
798 if (!VALID64(le64toh(o
->data
.next_hash_offset
)) ||
799 !VALID64(le64toh(o
->data
.next_field_offset
)) ||
800 !VALID64(le64toh(o
->data
.entry_offset
)) ||
801 !VALID64(le64toh(o
->data
.entry_array_offset
))) {
802 log_debug("Invalid offset, next_hash_offset="OFSfmt
", next_field_offset="OFSfmt
803 ", entry_offset="OFSfmt
", entry_array_offset="OFSfmt
": %"PRIu64
,
804 le64toh(o
->data
.next_hash_offset
),
805 le64toh(o
->data
.next_field_offset
),
806 le64toh(o
->data
.entry_offset
),
807 le64toh(o
->data
.entry_array_offset
),
816 if (le64toh(o
->object
.size
) - offsetof(FieldObject
, payload
) <= 0) {
818 "Bad field size (<= %zu): %"PRIu64
": %"PRIu64
,
819 offsetof(FieldObject
, payload
),
820 le64toh(o
->object
.size
),
825 if (!VALID64(le64toh(o
->field
.next_hash_offset
)) ||
826 !VALID64(le64toh(o
->field
.head_data_offset
))) {
828 "Invalid offset, next_hash_offset="OFSfmt
829 ", head_data_offset="OFSfmt
": %"PRIu64
,
830 le64toh(o
->field
.next_hash_offset
),
831 le64toh(o
->field
.head_data_offset
),
838 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) % sizeof(EntryItem
) != 0) {
840 "Bad entry size (<= %zu): %"PRIu64
": %"PRIu64
,
841 offsetof(EntryObject
, items
),
842 le64toh(o
->object
.size
),
847 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
) <= 0) {
849 "Invalid number items in entry: %"PRIu64
": %"PRIu64
,
850 (le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
),
855 if (le64toh(o
->entry
.seqnum
) <= 0) {
857 "Invalid entry seqnum: %"PRIx64
": %"PRIu64
,
858 le64toh(o
->entry
.seqnum
),
863 if (!VALID_REALTIME(le64toh(o
->entry
.realtime
))) {
865 "Invalid entry realtime timestamp: %"PRIu64
": %"PRIu64
,
866 le64toh(o
->entry
.realtime
),
871 if (!VALID_MONOTONIC(le64toh(o
->entry
.monotonic
))) {
873 "Invalid entry monotonic timestamp: %"PRIu64
": %"PRIu64
,
874 le64toh(o
->entry
.monotonic
),
881 case OBJECT_DATA_HASH_TABLE
:
882 case OBJECT_FIELD_HASH_TABLE
:
883 if ((le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) % sizeof(HashItem
) != 0 ||
884 (le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) / sizeof(HashItem
) <= 0) {
886 "Invalid %s hash table size: %"PRIu64
": %"PRIu64
,
887 o
->object
.type
== OBJECT_DATA_HASH_TABLE
? "data" : "field",
888 le64toh(o
->object
.size
),
895 case OBJECT_ENTRY_ARRAY
:
896 if ((le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) % sizeof(le64_t
) != 0 ||
897 (le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) / sizeof(le64_t
) <= 0) {
899 "Invalid object entry array size: %"PRIu64
": %"PRIu64
,
900 le64toh(o
->object
.size
),
905 if (!VALID64(le64toh(o
->entry_array
.next_entry_array_offset
))) {
907 "Invalid object entry array next_entry_array_offset: "OFSfmt
": %"PRIu64
,
908 le64toh(o
->entry_array
.next_entry_array_offset
),
916 if (le64toh(o
->object
.size
) != sizeof(TagObject
)) {
918 "Invalid object tag size: %"PRIu64
": %"PRIu64
,
919 le64toh(o
->object
.size
),
924 if (!VALID_EPOCH(le64toh(o
->tag
.epoch
))) {
926 "Invalid object tag epoch: %"PRIu64
": %"PRIu64
,
927 le64toh(o
->tag
.epoch
),
938 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
948 /* Objects may only be located at multiple of 64 bit */
949 if (!VALID64(offset
)) {
950 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64
, offset
);
954 /* Object may not be located in the file header */
955 if (offset
< le64toh(f
->header
->header_size
)) {
956 log_debug("Attempt to move to object located in file header: %" PRIu64
, offset
);
960 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
, &tsize
);
965 s
= le64toh(o
->object
.size
);
968 log_debug("Attempt to move to uninitialized object: %" PRIu64
, offset
);
971 if (s
< sizeof(ObjectHeader
)) {
972 log_debug("Attempt to move to overly short object: %" PRIu64
, offset
);
976 if (o
->object
.type
<= OBJECT_UNUSED
) {
977 log_debug("Attempt to move to object with invalid type: %" PRIu64
, offset
);
981 if (s
< minimum_header_size(o
)) {
982 log_debug("Attempt to move to truncated object: %" PRIu64
, offset
);
986 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
) {
987 log_debug("Attempt to move to object of unexpected type: %" PRIu64
, offset
);
992 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
, NULL
);
999 r
= journal_file_check_object(f
, offset
, o
);
1007 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
1013 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
1016 /* If an external seqnum counter was passed, we update
1017 * both the local and the external one, and set it to
1018 * the maximum of both */
1020 if (*seqnum
+ 1 > r
)
1026 f
->header
->tail_entry_seqnum
= htole64(r
);
1028 if (f
->header
->head_entry_seqnum
== 0)
1029 f
->header
->head_entry_seqnum
= htole64(r
);
1034 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
1042 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
1043 assert(size
>= sizeof(ObjectHeader
));
1047 r
= journal_file_set_online(f
);
1051 p
= le64toh(f
->header
->tail_object_offset
);
1053 p
= le64toh(f
->header
->header_size
);
1055 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
1059 p
+= ALIGN64(le64toh(tail
->object
.size
));
1062 r
= journal_file_allocate(f
, p
, size
);
1066 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
, NULL
);
1073 o
->object
.type
= type
;
1074 o
->object
.size
= htole64(size
);
1076 f
->header
->tail_object_offset
= htole64(p
);
1077 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
1085 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
1093 /* We estimate that we need 1 hash table entry per 768 bytes
1094 of journal file and we want to make sure we never get
1095 beyond 75% fill level. Calculate the hash table size for
1096 the maximum file size based on these metrics. */
1098 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
1099 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
1100 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
1102 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
1104 r
= journal_file_append_object(f
,
1105 OBJECT_DATA_HASH_TABLE
,
1106 offsetof(Object
, hash_table
.items
) + s
,
1111 memzero(o
->hash_table
.items
, s
);
1113 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1114 f
->header
->data_hash_table_size
= htole64(s
);
1119 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
1127 /* We use a fixed size hash table for the fields as this
1128 * number should grow very slowly only */
1130 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
1131 r
= journal_file_append_object(f
,
1132 OBJECT_FIELD_HASH_TABLE
,
1133 offsetof(Object
, hash_table
.items
) + s
,
1138 memzero(o
->hash_table
.items
, s
);
1140 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1141 f
->header
->field_hash_table_size
= htole64(s
);
1146 int journal_file_map_data_hash_table(JournalFile
*f
) {
1154 if (f
->data_hash_table
)
1157 p
= le64toh(f
->header
->data_hash_table_offset
);
1158 s
= le64toh(f
->header
->data_hash_table_size
);
1160 r
= journal_file_move_to(f
,
1161 OBJECT_DATA_HASH_TABLE
,
1168 f
->data_hash_table
= t
;
1172 int journal_file_map_field_hash_table(JournalFile
*f
) {
1180 if (f
->field_hash_table
)
1183 p
= le64toh(f
->header
->field_hash_table_offset
);
1184 s
= le64toh(f
->header
->field_hash_table_size
);
1186 r
= journal_file_move_to(f
,
1187 OBJECT_FIELD_HASH_TABLE
,
1194 f
->field_hash_table
= t
;
1198 static int journal_file_link_field(
1209 assert(f
->field_hash_table
);
1213 if (o
->object
.type
!= OBJECT_FIELD
)
1216 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1220 /* This might alter the window we are looking at */
1221 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
1224 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
1226 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
1228 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1232 o
->field
.next_hash_offset
= htole64(offset
);
1235 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1237 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
1238 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
1243 static int journal_file_link_data(
1254 assert(f
->data_hash_table
);
1258 if (o
->object
.type
!= OBJECT_DATA
)
1261 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1265 /* This might alter the window we are looking at */
1266 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
1267 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
1268 o
->data
.n_entries
= 0;
1271 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
1273 /* Only entry in the hash table is easy */
1274 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
1276 /* Move back to the previous data object, to patch in
1279 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1283 o
->data
.next_hash_offset
= htole64(offset
);
1286 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1288 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
1289 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
1294 int journal_file_find_field_object_with_hash(
1296 const void *field
, uint64_t size
, uint64_t hash
,
1297 Object
**ret
, uint64_t *offset
) {
1299 uint64_t p
, osize
, h
, m
;
1304 assert(field
&& size
> 0);
1306 /* If the field hash table is empty, we can't find anything */
1307 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
1310 /* Map the field hash table, if it isn't mapped yet. */
1311 r
= journal_file_map_field_hash_table(f
);
1315 osize
= offsetof(Object
, field
.payload
) + size
;
1317 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1322 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
1327 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1331 if (le64toh(o
->field
.hash
) == hash
&&
1332 le64toh(o
->object
.size
) == osize
&&
1333 memcmp(o
->field
.payload
, field
, size
) == 0) {
1343 p
= le64toh(o
->field
.next_hash_offset
);
1349 int journal_file_find_field_object(
1351 const void *field
, uint64_t size
,
1352 Object
**ret
, uint64_t *offset
) {
1357 assert(field
&& size
> 0);
1359 hash
= hash64(field
, size
);
1361 return journal_file_find_field_object_with_hash(f
,
1366 int journal_file_find_data_object_with_hash(
1368 const void *data
, uint64_t size
, uint64_t hash
,
1369 Object
**ret
, uint64_t *offset
) {
1371 uint64_t p
, osize
, h
, m
;
1376 assert(data
|| size
== 0);
1378 /* If there's no data hash table, then there's no entry. */
1379 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
1382 /* Map the data hash table, if it isn't mapped yet. */
1383 r
= journal_file_map_data_hash_table(f
);
1387 osize
= offsetof(Object
, data
.payload
) + size
;
1389 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1394 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
1399 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1403 if (le64toh(o
->data
.hash
) != hash
)
1406 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
1407 #if HAVE_XZ || HAVE_LZ4
1411 l
= le64toh(o
->object
.size
);
1412 if (l
<= offsetof(Object
, data
.payload
))
1415 l
-= offsetof(Object
, data
.payload
);
1417 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
1418 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
1422 if (rsize
== size
&&
1423 memcmp(f
->compress_buffer
, data
, size
) == 0) {
1434 return -EPROTONOSUPPORT
;
1436 } else if (le64toh(o
->object
.size
) == osize
&&
1437 memcmp(o
->data
.payload
, data
, size
) == 0) {
1449 p
= le64toh(o
->data
.next_hash_offset
);
1455 int journal_file_find_data_object(
1457 const void *data
, uint64_t size
,
1458 Object
**ret
, uint64_t *offset
) {
1463 assert(data
|| size
== 0);
1465 hash
= hash64(data
, size
);
1467 return journal_file_find_data_object_with_hash(f
,
1472 static int journal_file_append_field(
1474 const void *field
, uint64_t size
,
1475 Object
**ret
, uint64_t *offset
) {
1483 assert(field
&& size
> 0);
1485 hash
= hash64(field
, size
);
1487 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1501 osize
= offsetof(Object
, field
.payload
) + size
;
1502 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1506 o
->field
.hash
= htole64(hash
);
1507 memcpy(o
->field
.payload
, field
, size
);
1509 r
= journal_file_link_field(f
, o
, p
, hash
);
1513 /* The linking might have altered the window, so let's
1514 * refresh our pointer */
1515 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1520 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1534 static int journal_file_append_data(
1536 const void *data
, uint64_t size
,
1537 Object
**ret
, uint64_t *offset
) {
1542 int r
, compression
= 0;
1546 assert(data
|| size
== 0);
1548 hash
= hash64(data
, size
);
1550 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1564 osize
= offsetof(Object
, data
.payload
) + size
;
1565 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1569 o
->data
.hash
= htole64(hash
);
1571 #if HAVE_XZ || HAVE_LZ4
1572 if (JOURNAL_FILE_COMPRESS(f
) && size
>= COMPRESSION_SIZE_THRESHOLD
) {
1575 compression
= compress_blob(data
, size
, o
->data
.payload
, size
- 1, &rsize
);
1577 if (compression
>= 0) {
1578 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1579 o
->object
.flags
|= compression
;
1581 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1582 size
, rsize
, object_compressed_to_string(compression
));
1584 /* Compression didn't work, we don't really care why, let's continue without compression */
1589 if (compression
== 0)
1590 memcpy_safe(o
->data
.payload
, data
, size
);
1592 r
= journal_file_link_data(f
, o
, p
, hash
);
1597 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1602 /* The linking might have altered the window, so let's
1603 * refresh our pointer */
1604 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1611 eq
= memchr(data
, '=', size
);
1612 if (eq
&& eq
> data
) {
1616 /* Create field object ... */
1617 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1621 /* ... and link it in. */
1622 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1623 fo
->field
.head_data_offset
= le64toh(p
);
1635 uint64_t journal_file_entry_n_items(Object
*o
) {
1638 if (o
->object
.type
!= OBJECT_ENTRY
)
1641 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1644 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1647 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1650 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1653 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1656 if (!IN_SET(o
->object
.type
, OBJECT_DATA_HASH_TABLE
, OBJECT_FIELD_HASH_TABLE
))
1659 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1662 static int link_entry_into_array(JournalFile
*f
,
1667 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1676 a
= le64toh(*first
);
1677 i
= hidx
= le64toh(*idx
);
1680 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1684 n
= journal_file_entry_array_n_items(o
);
1686 o
->entry_array
.items
[i
] = htole64(p
);
1687 *idx
= htole64(hidx
+ 1);
1693 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1704 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1705 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1711 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1716 o
->entry_array
.items
[i
] = htole64(p
);
1719 *first
= htole64(q
);
1721 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1725 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1728 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1729 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1731 *idx
= htole64(hidx
+ 1);
1736 static int link_entry_into_array_plus_one(JournalFile
*f
,
1751 *extra
= htole64(p
);
1755 i
= htole64(le64toh(*idx
) - 1);
1756 r
= link_entry_into_array(f
, first
, &i
, p
);
1761 *idx
= htole64(le64toh(*idx
) + 1);
1765 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1772 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1776 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1780 return link_entry_into_array_plus_one(f
,
1781 &o
->data
.entry_offset
,
1782 &o
->data
.entry_array_offset
,
1787 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1796 if (o
->object
.type
!= OBJECT_ENTRY
)
1799 __sync_synchronize();
1801 /* Link up the entry itself */
1802 r
= link_entry_into_array(f
,
1803 &f
->header
->entry_array_offset
,
1804 &f
->header
->n_entries
,
1809 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1811 if (f
->header
->head_entry_realtime
== 0)
1812 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1814 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1815 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1817 f
->tail_entry_monotonic_valid
= true;
1819 /* Link up the items */
1820 n
= journal_file_entry_n_items(o
);
1821 for (i
= 0; i
< n
; i
++) {
1822 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1830 static int journal_file_append_entry_internal(
1832 const dual_timestamp
*ts
,
1834 const EntryItem items
[], unsigned n_items
,
1836 Object
**ret
, uint64_t *offset
) {
1844 assert(items
|| n_items
== 0);
1847 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1849 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1853 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1854 memcpy_safe(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1855 o
->entry
.realtime
= htole64(ts
->realtime
);
1856 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1857 o
->entry
.xor_hash
= htole64(xor_hash
);
1858 o
->entry
.boot_id
= f
->header
->boot_id
;
1861 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1866 r
= journal_file_link_entry(f
, o
, np
);
1879 void journal_file_post_change(JournalFile
*f
) {
1882 /* inotify() does not receive IN_MODIFY events from file
1883 * accesses done via mmap(). After each access we hence
1884 * trigger IN_MODIFY by truncating the journal file to its
1885 * current size which triggers IN_MODIFY. */
1887 __sync_synchronize();
1889 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1890 log_debug_errno(errno
, "Failed to truncate file to its own size: %m");
1893 static int post_change_thunk(sd_event_source
*timer
, uint64_t usec
, void *userdata
) {
1896 journal_file_post_change(userdata
);
1901 static void schedule_post_change(JournalFile
*f
) {
1902 sd_event_source
*timer
;
1907 assert(f
->post_change_timer
);
1909 timer
= f
->post_change_timer
;
1911 r
= sd_event_source_get_enabled(timer
, &enabled
);
1913 log_debug_errno(r
, "Failed to get ftruncate timer state: %m");
1917 if (enabled
== SD_EVENT_ONESHOT
)
1920 r
= sd_event_now(sd_event_source_get_event(timer
), CLOCK_MONOTONIC
, &now
);
1922 log_debug_errno(r
, "Failed to get clock's now for scheduling ftruncate: %m");
1926 r
= sd_event_source_set_time(timer
, now
+f
->post_change_timer_period
);
1928 log_debug_errno(r
, "Failed to set time for scheduling ftruncate: %m");
1932 r
= sd_event_source_set_enabled(timer
, SD_EVENT_ONESHOT
);
1934 log_debug_errno(r
, "Failed to enable scheduled ftruncate: %m");
1941 /* On failure, let's simply post the change immediately. */
1942 journal_file_post_change(f
);
1945 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1946 int journal_file_enable_post_change_timer(JournalFile
*f
, sd_event
*e
, usec_t t
) {
1947 _cleanup_(sd_event_source_unrefp
) sd_event_source
*timer
= NULL
;
1951 assert_return(!f
->post_change_timer
, -EINVAL
);
1955 r
= sd_event_add_time(e
, &timer
, CLOCK_MONOTONIC
, 0, 0, post_change_thunk
, f
);
1959 r
= sd_event_source_set_enabled(timer
, SD_EVENT_OFF
);
1963 f
->post_change_timer
= timer
;
1965 f
->post_change_timer_period
= t
;
1970 static int entry_item_cmp(const void *_a
, const void *_b
) {
1971 const EntryItem
*a
= _a
, *b
= _b
;
1973 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1975 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1980 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1984 uint64_t xor_hash
= 0;
1985 struct dual_timestamp _ts
;
1989 assert(iovec
|| n_iovec
== 0);
1992 dual_timestamp_get(&_ts
);
1997 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
2002 /* alloca() can't take 0, hence let's allocate at least one */
2003 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
2005 for (i
= 0; i
< n_iovec
; i
++) {
2009 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
2013 xor_hash
^= le64toh(o
->data
.hash
);
2014 items
[i
].object_offset
= htole64(p
);
2015 items
[i
].hash
= o
->data
.hash
;
2018 /* Order by the position on disk, in order to improve seek
2019 * times for rotating media. */
2020 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
2022 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
2024 /* If the memory mapping triggered a SIGBUS then we return an
2025 * IO error and ignore the error code passed down to us, since
2026 * it is very likely just an effect of a nullified replacement
2029 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
2032 if (f
->post_change_timer
)
2033 schedule_post_change(f
);
2035 journal_file_post_change(f
);
2040 typedef struct ChainCacheItem
{
2041 uint64_t first
; /* the array at the beginning of the chain */
2042 uint64_t array
; /* the cached array */
2043 uint64_t begin
; /* the first item in the cached array */
2044 uint64_t total
; /* the total number of items in all arrays before this one in the chain */
2045 uint64_t last_index
; /* the last index we looked at, to optimize locality when bisecting */
2048 static void chain_cache_put(
2055 uint64_t last_index
) {
2058 /* If the chain item to cache for this chain is the
2059 * first one it's not worth caching anything */
2063 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
2064 ci
= ordered_hashmap_steal_first(h
);
2067 ci
= new(ChainCacheItem
, 1);
2074 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
2079 assert(ci
->first
== first
);
2084 ci
->last_index
= last_index
;
2087 static int generic_array_get(
2091 Object
**ret
, uint64_t *offset
) {
2094 uint64_t p
= 0, a
, t
= 0;
2102 /* Try the chain cache first */
2103 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2104 if (ci
&& i
> ci
->total
) {
2113 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
2117 k
= journal_file_entry_array_n_items(o
);
2119 p
= le64toh(o
->entry_array
.items
[i
]);
2125 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
2131 /* Let's cache this item for the next invocation */
2132 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
2134 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2147 static int generic_array_get_plus_one(
2152 Object
**ret
, uint64_t *offset
) {
2161 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2174 return generic_array_get(f
, first
, i
-1, ret
, offset
);
2183 static int generic_array_bisect(
2188 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2189 direction_t direction
,
2194 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
2195 bool subtract_one
= false;
2196 Object
*o
, *array
= NULL
;
2201 assert(test_object
);
2203 /* Start with the first array in the chain */
2206 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2207 if (ci
&& n
> ci
->total
) {
2208 /* Ah, we have iterated this bisection array chain
2209 * previously! Let's see if we can skip ahead in the
2210 * chain, as far as the last time. But we can't jump
2211 * backwards in the chain, so let's check that
2214 r
= test_object(f
, ci
->begin
, needle
);
2218 if (r
== TEST_LEFT
) {
2219 /* OK, what we are looking for is right of the
2220 * begin of this EntryArray, so let's jump
2221 * straight to previously cached array in the
2227 last_index
= ci
->last_index
;
2232 uint64_t left
, right
, k
, lp
;
2234 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
2238 k
= journal_file_entry_array_n_items(array
);
2244 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
2248 r
= test_object(f
, p
, needle
);
2249 if (r
== -EBADMSG
) {
2250 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2257 if (r
== TEST_FOUND
)
2258 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2260 if (r
== TEST_RIGHT
) {
2264 if (last_index
!= (uint64_t) -1) {
2265 assert(last_index
<= right
);
2267 /* If we cached the last index we
2268 * looked at, let's try to not to jump
2269 * too wildly around and see if we can
2270 * limit the range to look at early to
2271 * the immediate neighbors of the last
2272 * index we looked at. */
2274 if (last_index
> 0) {
2275 uint64_t x
= last_index
- 1;
2277 p
= le64toh(array
->entry_array
.items
[x
]);
2281 r
= test_object(f
, p
, needle
);
2285 if (r
== TEST_FOUND
)
2286 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2288 if (r
== TEST_RIGHT
)
2294 if (last_index
< right
) {
2295 uint64_t y
= last_index
+ 1;
2297 p
= le64toh(array
->entry_array
.items
[y
]);
2301 r
= test_object(f
, p
, needle
);
2305 if (r
== TEST_FOUND
)
2306 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2308 if (r
== TEST_RIGHT
)
2316 if (left
== right
) {
2317 if (direction
== DIRECTION_UP
)
2318 subtract_one
= true;
2324 assert(left
< right
);
2325 i
= (left
+ right
) / 2;
2327 p
= le64toh(array
->entry_array
.items
[i
]);
2331 r
= test_object(f
, p
, needle
);
2332 if (r
== -EBADMSG
) {
2333 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2340 if (r
== TEST_FOUND
)
2341 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2343 if (r
== TEST_RIGHT
)
2351 if (direction
== DIRECTION_UP
) {
2353 subtract_one
= true;
2364 last_index
= (uint64_t) -1;
2365 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
2371 if (subtract_one
&& t
== 0 && i
== 0)
2374 /* Let's cache this item for the next invocation */
2375 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
2377 if (subtract_one
&& i
== 0)
2379 else if (subtract_one
)
2380 p
= le64toh(array
->entry_array
.items
[i
-1]);
2382 p
= le64toh(array
->entry_array
.items
[i
]);
2384 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2395 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
2400 static int generic_array_bisect_plus_one(
2406 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2407 direction_t direction
,
2413 bool step_back
= false;
2417 assert(test_object
);
2422 /* This bisects the array in object 'first', but first checks
2424 r
= test_object(f
, extra
, needle
);
2428 if (r
== TEST_FOUND
)
2429 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2431 /* if we are looking with DIRECTION_UP then we need to first
2432 see if in the actual array there is a matching entry, and
2433 return the last one of that. But if there isn't any we need
2434 to return this one. Hence remember this, and return it
2437 step_back
= direction
== DIRECTION_UP
;
2439 if (r
== TEST_RIGHT
) {
2440 if (direction
== DIRECTION_DOWN
)
2446 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
2448 if (r
== 0 && step_back
)
2457 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2473 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2479 else if (p
< needle
)
2485 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2492 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2496 if (le64toh(o
->entry
.seqnum
) == needle
)
2498 else if (le64toh(o
->entry
.seqnum
) < needle
)
2504 int journal_file_move_to_entry_by_seqnum(
2507 direction_t direction
,
2513 return generic_array_bisect(f
,
2514 le64toh(f
->header
->entry_array_offset
),
2515 le64toh(f
->header
->n_entries
),
2522 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2529 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2533 if (le64toh(o
->entry
.realtime
) == needle
)
2535 else if (le64toh(o
->entry
.realtime
) < needle
)
2541 int journal_file_move_to_entry_by_realtime(
2544 direction_t direction
,
2550 return generic_array_bisect(f
,
2551 le64toh(f
->header
->entry_array_offset
),
2552 le64toh(f
->header
->n_entries
),
2554 test_object_realtime
,
2559 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2566 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2570 if (le64toh(o
->entry
.monotonic
) == needle
)
2572 else if (le64toh(o
->entry
.monotonic
) < needle
)
2578 static int find_data_object_by_boot_id(
2584 char t
[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2586 sd_id128_to_string(boot_id
, t
+ 9);
2587 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2590 int journal_file_move_to_entry_by_monotonic(
2594 direction_t direction
,
2603 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2609 return generic_array_bisect_plus_one(f
,
2610 le64toh(o
->data
.entry_offset
),
2611 le64toh(o
->data
.entry_array_offset
),
2612 le64toh(o
->data
.n_entries
),
2614 test_object_monotonic
,
2619 void journal_file_reset_location(JournalFile
*f
) {
2620 f
->location_type
= LOCATION_HEAD
;
2621 f
->current_offset
= 0;
2622 f
->current_seqnum
= 0;
2623 f
->current_realtime
= 0;
2624 f
->current_monotonic
= 0;
2625 zero(f
->current_boot_id
);
2626 f
->current_xor_hash
= 0;
2629 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2630 f
->location_type
= LOCATION_SEEK
;
2631 f
->current_offset
= offset
;
2632 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2633 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2634 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2635 f
->current_boot_id
= o
->entry
.boot_id
;
2636 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2639 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2644 assert(af
->location_type
== LOCATION_SEEK
);
2645 assert(bf
->location_type
== LOCATION_SEEK
);
2647 /* If contents and timestamps match, these entries are
2648 * identical, even if the seqnum does not match */
2649 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2650 af
->current_monotonic
== bf
->current_monotonic
&&
2651 af
->current_realtime
== bf
->current_realtime
&&
2652 af
->current_xor_hash
== bf
->current_xor_hash
)
2655 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2657 /* If this is from the same seqnum source, compare
2659 if (af
->current_seqnum
< bf
->current_seqnum
)
2661 if (af
->current_seqnum
> bf
->current_seqnum
)
2664 /* Wow! This is weird, different data but the same
2665 * seqnums? Something is borked, but let's make the
2666 * best of it and compare by time. */
2669 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2671 /* If the boot id matches, compare monotonic time */
2672 if (af
->current_monotonic
< bf
->current_monotonic
)
2674 if (af
->current_monotonic
> bf
->current_monotonic
)
2678 /* Otherwise, compare UTC time */
2679 if (af
->current_realtime
< bf
->current_realtime
)
2681 if (af
->current_realtime
> bf
->current_realtime
)
2684 /* Finally, compare by contents */
2685 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2687 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2693 static int bump_array_index(uint64_t *i
, direction_t direction
, uint64_t n
) {
2695 /* Increase or decrease the specified index, in the right direction. */
2697 if (direction
== DIRECTION_DOWN
) {
2712 static bool check_properly_ordered(uint64_t new_offset
, uint64_t old_offset
, direction_t direction
) {
2714 /* Consider it an error if any of the two offsets is uninitialized */
2715 if (old_offset
== 0 || new_offset
== 0)
2718 /* If we go down, the new offset must be larger than the old one. */
2719 return direction
== DIRECTION_DOWN
?
2720 new_offset
> old_offset
:
2721 new_offset
< old_offset
;
2724 int journal_file_next_entry(
2727 direction_t direction
,
2728 Object
**ret
, uint64_t *offset
) {
2736 n
= le64toh(f
->header
->n_entries
);
2741 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2743 r
= generic_array_bisect(f
,
2744 le64toh(f
->header
->entry_array_offset
),
2745 le64toh(f
->header
->n_entries
),
2754 r
= bump_array_index(&i
, direction
, n
);
2759 /* And jump to it */
2761 r
= generic_array_get(f
,
2762 le64toh(f
->header
->entry_array_offset
),
2770 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2771 * the next one might work for us instead. */
2772 log_debug_errno(r
, "Entry item %" PRIu64
" is bad, skipping over it.", i
);
2774 r
= bump_array_index(&i
, direction
, n
);
2779 /* Ensure our array is properly ordered. */
2780 if (p
> 0 && !check_properly_ordered(ofs
, p
, direction
)) {
2781 log_debug("%s: entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2791 int journal_file_next_entry_for_data(
2793 Object
*o
, uint64_t p
,
2794 uint64_t data_offset
,
2795 direction_t direction
,
2796 Object
**ret
, uint64_t *offset
) {
2803 assert(p
> 0 || !o
);
2805 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2809 n
= le64toh(d
->data
.n_entries
);
2814 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2816 if (o
->object
.type
!= OBJECT_ENTRY
)
2819 r
= generic_array_bisect_plus_one(f
,
2820 le64toh(d
->data
.entry_offset
),
2821 le64toh(d
->data
.entry_array_offset
),
2822 le64toh(d
->data
.n_entries
),
2832 r
= bump_array_index(&i
, direction
, n
);
2838 r
= generic_array_get_plus_one(f
,
2839 le64toh(d
->data
.entry_offset
),
2840 le64toh(d
->data
.entry_array_offset
),
2848 log_debug_errno(r
, "Data entry item %" PRIu64
" is bad, skipping over it.", i
);
2850 r
= bump_array_index(&i
, direction
, n
);
2855 /* Ensure our array is properly ordered. */
2856 if (p
> 0 && check_properly_ordered(ofs
, p
, direction
)) {
2857 log_debug("%s data entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2867 int journal_file_move_to_entry_by_offset_for_data(
2869 uint64_t data_offset
,
2871 direction_t direction
,
2872 Object
**ret
, uint64_t *offset
) {
2879 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2883 return generic_array_bisect_plus_one(f
,
2884 le64toh(d
->data
.entry_offset
),
2885 le64toh(d
->data
.entry_array_offset
),
2886 le64toh(d
->data
.n_entries
),
2893 int journal_file_move_to_entry_by_monotonic_for_data(
2895 uint64_t data_offset
,
2898 direction_t direction
,
2899 Object
**ret
, uint64_t *offset
) {
2907 /* First, seek by time */
2908 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2914 r
= generic_array_bisect_plus_one(f
,
2915 le64toh(o
->data
.entry_offset
),
2916 le64toh(o
->data
.entry_array_offset
),
2917 le64toh(o
->data
.n_entries
),
2919 test_object_monotonic
,
2925 /* And now, continue seeking until we find an entry that
2926 * exists in both bisection arrays */
2932 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2936 r
= generic_array_bisect_plus_one(f
,
2937 le64toh(d
->data
.entry_offset
),
2938 le64toh(d
->data
.entry_array_offset
),
2939 le64toh(d
->data
.n_entries
),
2947 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2951 r
= generic_array_bisect_plus_one(f
,
2952 le64toh(o
->data
.entry_offset
),
2953 le64toh(o
->data
.entry_array_offset
),
2954 le64toh(o
->data
.n_entries
),
2976 int journal_file_move_to_entry_by_seqnum_for_data(
2978 uint64_t data_offset
,
2980 direction_t direction
,
2981 Object
**ret
, uint64_t *offset
) {
2988 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2992 return generic_array_bisect_plus_one(f
,
2993 le64toh(d
->data
.entry_offset
),
2994 le64toh(d
->data
.entry_array_offset
),
2995 le64toh(d
->data
.n_entries
),
3002 int journal_file_move_to_entry_by_realtime_for_data(
3004 uint64_t data_offset
,
3006 direction_t direction
,
3007 Object
**ret
, uint64_t *offset
) {
3014 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
3018 return generic_array_bisect_plus_one(f
,
3019 le64toh(d
->data
.entry_offset
),
3020 le64toh(d
->data
.entry_array_offset
),
3021 le64toh(d
->data
.n_entries
),
3023 test_object_realtime
,
3028 void journal_file_dump(JournalFile
*f
) {
3036 journal_file_print_header(f
);
3038 p
= le64toh(f
->header
->header_size
);
3040 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
3044 switch (o
->object
.type
) {
3047 printf("Type: OBJECT_UNUSED\n");
3051 printf("Type: OBJECT_DATA\n");
3055 printf("Type: OBJECT_FIELD\n");
3059 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
3060 le64toh(o
->entry
.seqnum
),
3061 le64toh(o
->entry
.monotonic
),
3062 le64toh(o
->entry
.realtime
));
3065 case OBJECT_FIELD_HASH_TABLE
:
3066 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3069 case OBJECT_DATA_HASH_TABLE
:
3070 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3073 case OBJECT_ENTRY_ARRAY
:
3074 printf("Type: OBJECT_ENTRY_ARRAY\n");
3078 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
3079 le64toh(o
->tag
.seqnum
),
3080 le64toh(o
->tag
.epoch
));
3084 printf("Type: unknown (%i)\n", o
->object
.type
);
3088 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
3089 printf("Flags: %s\n",
3090 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
3092 if (p
== le64toh(f
->header
->tail_object_offset
))
3095 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
3100 log_error("File corrupt");
3103 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
3106 x
= format_timestamp(buf
, l
, t
);
3112 void journal_file_print_header(JournalFile
*f
) {
3113 char a
[33], b
[33], c
[33], d
[33];
3114 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
3116 char bytes
[FORMAT_BYTES_MAX
];
3121 printf("File Path: %s\n"
3125 "Sequential Number ID: %s\n"
3127 "Compatible Flags:%s%s\n"
3128 "Incompatible Flags:%s%s%s\n"
3129 "Header size: %"PRIu64
"\n"
3130 "Arena size: %"PRIu64
"\n"
3131 "Data Hash Table Size: %"PRIu64
"\n"
3132 "Field Hash Table Size: %"PRIu64
"\n"
3133 "Rotate Suggested: %s\n"
3134 "Head Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3135 "Tail Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3136 "Head Realtime Timestamp: %s (%"PRIx64
")\n"
3137 "Tail Realtime Timestamp: %s (%"PRIx64
")\n"
3138 "Tail Monotonic Timestamp: %s (%"PRIx64
")\n"
3139 "Objects: %"PRIu64
"\n"
3140 "Entry Objects: %"PRIu64
"\n",
3142 sd_id128_to_string(f
->header
->file_id
, a
),
3143 sd_id128_to_string(f
->header
->machine_id
, b
),
3144 sd_id128_to_string(f
->header
->boot_id
, c
),
3145 sd_id128_to_string(f
->header
->seqnum_id
, d
),
3146 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
3147 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
3148 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
3149 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
3150 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
3151 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
3152 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
3153 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
3154 le64toh(f
->header
->header_size
),
3155 le64toh(f
->header
->arena_size
),
3156 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3157 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
3158 yes_no(journal_file_rotate_suggested(f
, 0)),
3159 le64toh(f
->header
->head_entry_seqnum
), le64toh(f
->header
->head_entry_seqnum
),
3160 le64toh(f
->header
->tail_entry_seqnum
), le64toh(f
->header
->tail_entry_seqnum
),
3161 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)), le64toh(f
->header
->head_entry_realtime
),
3162 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)), le64toh(f
->header
->tail_entry_realtime
),
3163 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
), le64toh(f
->header
->tail_entry_monotonic
),
3164 le64toh(f
->header
->n_objects
),
3165 le64toh(f
->header
->n_entries
));
3167 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3168 printf("Data Objects: %"PRIu64
"\n"
3169 "Data Hash Table Fill: %.1f%%\n",
3170 le64toh(f
->header
->n_data
),
3171 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
3173 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3174 printf("Field Objects: %"PRIu64
"\n"
3175 "Field Hash Table Fill: %.1f%%\n",
3176 le64toh(f
->header
->n_fields
),
3177 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
3179 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
3180 printf("Tag Objects: %"PRIu64
"\n",
3181 le64toh(f
->header
->n_tags
));
3182 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
3183 printf("Entry Array Objects: %"PRIu64
"\n",
3184 le64toh(f
->header
->n_entry_arrays
));
3186 if (fstat(f
->fd
, &st
) >= 0)
3187 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
3190 static int journal_file_warn_btrfs(JournalFile
*f
) {
3196 /* Before we write anything, check if the COW logic is turned
3197 * off on btrfs. Given our write pattern that is quite
3198 * unfriendly to COW file systems this should greatly improve
3199 * performance on COW file systems, such as btrfs, at the
3200 * expense of data integrity features (which shouldn't be too
3201 * bad, given that we do our own checksumming). */
3203 r
= btrfs_is_filesystem(f
->fd
);
3205 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
3209 r
= read_attr_fd(f
->fd
, &attrs
);
3211 return log_warning_errno(r
, "Failed to read file attributes: %m");
3213 if (attrs
& FS_NOCOW_FL
) {
3214 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3218 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3219 "This is likely to slow down journal access substantially, please consider turning "
3220 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
3225 int journal_file_open(
3232 JournalMetrics
*metrics
,
3233 MMapCache
*mmap_cache
,
3234 Set
*deferred_closes
,
3235 JournalFile
*template,
3236 JournalFile
**ret
) {
3238 bool newly_created
= false;
3244 assert(fd
>= 0 || fname
);
3246 if (!IN_SET((flags
& O_ACCMODE
), O_RDONLY
, O_RDWR
))
3250 if (!endswith(fname
, ".journal") &&
3251 !endswith(fname
, ".journal~"))
3255 f
= new0(JournalFile
, 1);
3263 f
->prot
= prot_from_flags(flags
);
3264 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
3266 f
->compress_lz4
= compress
;
3268 f
->compress_xz
= compress
;
3275 f
->mmap
= mmap_cache_ref(mmap_cache
);
3277 f
->mmap
= mmap_cache_new();
3285 f
->path
= strdup(fname
);
3291 /* If we don't know the path, fill in something explanatory and vaguely useful */
3292 if (asprintf(&f
->path
, "/proc/self/%i", fd
) < 0) {
3298 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
3299 if (!f
->chain_cache
) {
3305 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
3311 /* fds we opened here by us should also be closed by us. */
3315 f
->cache_fd
= mmap_cache_add_fd(f
->mmap
, f
->fd
);
3321 r
= journal_file_fstat(f
);
3325 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
3327 (void) journal_file_warn_btrfs(f
);
3329 /* Let's attach the creation time to the journal file,
3330 * so that the vacuuming code knows the age of this
3331 * file even if the file might end up corrupted one
3332 * day... Ideally we'd just use the creation time many
3333 * file systems maintain for each file, but there is
3334 * currently no usable API to query this, hence let's
3335 * emulate this via extended attributes. If extended
3336 * attributes are not supported we'll just skip this,
3337 * and rely solely on mtime/atime/ctime of the file. */
3339 fd_setcrtime(f
->fd
, 0);
3342 /* Try to load the FSPRG state, and if we can't, then
3343 * just don't do sealing */
3345 r
= journal_file_fss_load(f
);
3351 r
= journal_file_init_header(f
, template);
3355 r
= journal_file_fstat(f
);
3359 newly_created
= true;
3362 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
3367 r
= mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
, NULL
);
3373 if (!newly_created
) {
3374 if (deferred_closes
)
3375 journal_file_close_set(deferred_closes
);
3377 r
= journal_file_verify_header(f
);
3383 if (!newly_created
&& f
->writable
) {
3384 r
= journal_file_fss_load(f
);
3392 journal_default_metrics(metrics
, f
->fd
);
3393 f
->metrics
= *metrics
;
3394 } else if (template)
3395 f
->metrics
= template->metrics
;
3397 r
= journal_file_refresh_header(f
);
3403 r
= journal_file_hmac_setup(f
);
3408 if (newly_created
) {
3409 r
= journal_file_setup_field_hash_table(f
);
3413 r
= journal_file_setup_data_hash_table(f
);
3418 r
= journal_file_append_first_tag(f
);
3424 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
)) {
3429 if (template && template->post_change_timer
) {
3430 r
= journal_file_enable_post_change_timer(
3432 sd_event_source_get_event(template->post_change_timer
),
3433 template->post_change_timer_period
);
3439 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3446 if (f
->cache_fd
&& mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
3449 (void) journal_file_close(f
);
3454 int journal_file_rotate(JournalFile
**f
, bool compress
, bool seal
, Set
*deferred_closes
) {
3455 _cleanup_free_
char *p
= NULL
;
3457 JournalFile
*old_file
, *new_file
= NULL
;
3465 if (!old_file
->writable
)
3468 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3469 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3470 if (path_startswith(old_file
->path
, "/proc/self/fd"))
3473 if (!endswith(old_file
->path
, ".journal"))
3476 l
= strlen(old_file
->path
);
3477 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
3478 (int) l
- 8, old_file
->path
,
3479 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
3480 le64toh((*f
)->header
->head_entry_seqnum
),
3481 le64toh((*f
)->header
->head_entry_realtime
));
3485 /* Try to rename the file to the archived version. If the file
3486 * already was deleted, we'll get ENOENT, let's ignore that
3488 r
= rename(old_file
->path
, p
);
3489 if (r
< 0 && errno
!= ENOENT
)
3492 /* Sync the rename to disk */
3493 (void) fsync_directory_of_file(old_file
->fd
);
3495 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3496 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3497 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3498 * would result in the rotated journal never getting fsync() called before closing.
3499 * Now we simply queue the archive state by setting an archive bit, leaving the state
3500 * as STATE_ONLINE so proper offlining occurs. */
3501 old_file
->archive
= true;
3503 /* Currently, btrfs is not very good with out write patterns
3504 * and fragments heavily. Let's defrag our journal files when
3505 * we archive them */
3506 old_file
->defrag_on_close
= true;
3508 r
= journal_file_open(-1, old_file
->path
, old_file
->flags
, old_file
->mode
, compress
, seal
, NULL
, old_file
->mmap
, deferred_closes
, old_file
, &new_file
);
3510 if (deferred_closes
&&
3511 set_put(deferred_closes
, old_file
) >= 0)
3512 (void) journal_file_set_offline(old_file
, false);
3514 (void) journal_file_close(old_file
);
3520 int journal_file_open_reliably(
3526 JournalMetrics
*metrics
,
3527 MMapCache
*mmap_cache
,
3528 Set
*deferred_closes
,
3529 JournalFile
*template,
3530 JournalFile
**ret
) {
3534 _cleanup_free_
char *p
= NULL
;
3536 r
= journal_file_open(-1, fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, deferred_closes
, template, ret
);
3538 -EBADMSG
, /* Corrupted */
3539 -ENODATA
, /* Truncated */
3540 -EHOSTDOWN
, /* Other machine */
3541 -EPROTONOSUPPORT
, /* Incompatible feature */
3542 -EBUSY
, /* Unclean shutdown */
3543 -ESHUTDOWN
, /* Already archived */
3544 -EIO
, /* IO error, including SIGBUS on mmap */
3545 -EIDRM
, /* File has been deleted */
3546 -ETXTBSY
)) /* File is from the future */
3549 if ((flags
& O_ACCMODE
) == O_RDONLY
)
3552 if (!(flags
& O_CREAT
))
3555 if (!endswith(fname
, ".journal"))
3558 /* The file is corrupted. Rotate it away and try it again (but only once) */
3561 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
3563 now(CLOCK_REALTIME
),
3567 if (rename(fname
, p
) < 0)
3570 /* btrfs doesn't cope well with our write pattern and
3571 * fragments heavily. Let's defrag all files we rotate */
3573 (void) chattr_path(p
, 0, FS_NOCOW_FL
);
3574 (void) btrfs_defrag(p
);
3576 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
3578 return journal_file_open(-1, fname
, flags
, mode
, compress
, seal
, metrics
, mmap_cache
, deferred_closes
, template, ret
);
3581 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
3583 uint64_t q
, xor_hash
= 0;
3596 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
3597 ts
.realtime
= le64toh(o
->entry
.realtime
);
3599 n
= journal_file_entry_n_items(o
);
3600 /* alloca() can't take 0, hence let's allocate at least one */
3601 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
3603 for (i
= 0; i
< n
; i
++) {
3610 q
= le64toh(o
->entry
.items
[i
].object_offset
);
3611 le_hash
= o
->entry
.items
[i
].hash
;
3613 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
3617 if (le_hash
!= o
->data
.hash
)
3620 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
3623 /* We hit the limit on 32bit machines */
3624 if ((uint64_t) t
!= l
)
3627 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
3628 #if HAVE_XZ || HAVE_LZ4
3631 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
3632 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
3636 data
= from
->compress_buffer
;
3639 return -EPROTONOSUPPORT
;
3642 data
= o
->data
.payload
;
3644 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
3648 xor_hash
^= le64toh(u
->data
.hash
);
3649 items
[i
].object_offset
= htole64(h
);
3650 items
[i
].hash
= u
->data
.hash
;
3652 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
3657 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
3659 if (mmap_cache_got_sigbus(to
->mmap
, to
->cache_fd
))
3665 void journal_reset_metrics(JournalMetrics
*m
) {
3668 /* Set everything to "pick automatic values". */
3670 *m
= (JournalMetrics
) {
3671 .min_use
= (uint64_t) -1,
3672 .max_use
= (uint64_t) -1,
3673 .min_size
= (uint64_t) -1,
3674 .max_size
= (uint64_t) -1,
3675 .keep_free
= (uint64_t) -1,
3676 .n_max_files
= (uint64_t) -1,
3680 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
3681 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
3688 if (fstatvfs(fd
, &ss
) >= 0)
3689 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
3691 log_debug_errno(errno
, "Failed to detremine disk size: %m");
3695 if (m
->max_use
== (uint64_t) -1) {
3698 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3700 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3701 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3703 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3704 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3706 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3708 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3710 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3711 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3714 if (m
->min_use
== (uint64_t) -1)
3715 m
->min_use
= DEFAULT_MIN_USE
;
3717 if (m
->min_use
> m
->max_use
)
3718 m
->min_use
= m
->max_use
;
3720 if (m
->max_size
== (uint64_t) -1) {
3721 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3723 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3724 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3726 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3728 if (m
->max_size
!= 0) {
3729 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3730 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3732 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3733 m
->max_use
= m
->max_size
*2;
3736 if (m
->min_size
== (uint64_t) -1)
3737 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3739 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3741 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3742 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3744 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3745 m
->max_size
= m
->min_size
;
3748 if (m
->keep_free
== (uint64_t) -1) {
3751 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3753 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3754 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3757 m
->keep_free
= DEFAULT_KEEP_FREE
;
3760 if (m
->n_max_files
== (uint64_t) -1)
3761 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3763 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3764 format_bytes(a
, sizeof(a
), m
->min_use
),
3765 format_bytes(b
, sizeof(b
), m
->max_use
),
3766 format_bytes(c
, sizeof(c
), m
->max_size
),
3767 format_bytes(d
, sizeof(d
), m
->min_size
),
3768 format_bytes(e
, sizeof(e
), m
->keep_free
),
3772 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3778 if (f
->header
->head_entry_realtime
== 0)
3781 *from
= le64toh(f
->header
->head_entry_realtime
);
3785 if (f
->header
->tail_entry_realtime
== 0)
3788 *to
= le64toh(f
->header
->tail_entry_realtime
);
3794 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3802 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3806 if (le64toh(o
->data
.n_entries
) <= 0)
3810 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3814 *from
= le64toh(o
->entry
.monotonic
);
3818 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3822 r
= generic_array_get_plus_one(f
,
3823 le64toh(o
->data
.entry_offset
),
3824 le64toh(o
->data
.entry_array_offset
),
3825 le64toh(o
->data
.n_entries
)-1,
3830 *to
= le64toh(o
->entry
.monotonic
);
3836 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3840 /* If we gained new header fields we gained new features,
3841 * hence suggest a rotation */
3842 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3843 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3847 /* Let's check if the hash tables grew over a certain fill
3848 * level (75%, borrowing this value from Java's hash table
3849 * implementation), and if so suggest a rotation. To calculate
3850 * the fill level we need the n_data field, which only exists
3851 * in newer versions. */
3853 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3854 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3855 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3857 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3858 le64toh(f
->header
->n_data
),
3859 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3860 (unsigned long long) f
->last_stat
.st_size
,
3861 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3865 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3866 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3867 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3869 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3870 le64toh(f
->header
->n_fields
),
3871 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3875 /* Are the data objects properly indexed by field objects? */
3876 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3877 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3878 le64toh(f
->header
->n_data
) > 0 &&
3879 le64toh(f
->header
->n_fields
) == 0)
3882 if (max_file_usec
> 0) {
3885 h
= le64toh(f
->header
->head_entry_realtime
);
3886 t
= now(CLOCK_REALTIME
);
3888 if (h
> 0 && t
> h
+ max_file_usec
)