1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2011 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
#include <errno.h>
#include <fcntl.h>
#include <linux/fs.h>
#include <linux/magic.h>
#include <pthread.h>
#include <stddef.h>
#include <sys/mman.h>
#include <sys/statvfs.h>
#include <sys/uio.h>
#include <unistd.h>

#include "sd-event.h"

#include "alloc-util.h"
#include "btrfs-util.h"
#include "chattr-util.h"
#include "compress.h"
#include "fd-util.h"
#include "fs-util.h"
#include "journal-authenticate.h"
#include "journal-def.h"
#include "journal-file.h"
#include "lookup3.h"
#include "parse-util.h"
#include "path-util.h"
#include "random-util.h"
#include "set.h"
#include "stat-util.h"
#include "string-util.h"
#include "strv.h"
#include "xattr-util.h"
51 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
52 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
54 #define DEFAULT_COMPRESS_THRESHOLD (512ULL)
55 #define MIN_COMPRESS_THRESHOLD (8ULL)
57 /* This is the minimum journal file size */
58 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
60 /* These are the lower and upper bounds if we deduce the max_use value
61 * from the file system size */
62 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
63 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
65 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
66 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
68 /* This is the upper bound if we deduce max_size from max_use */
69 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
79 /* This is the default maximum number of journal files to keep around. */
80 #define DEFAULT_N_MAX_FILES (100)
82 /* n_data was the first entry we added after the initial file format design */
83 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
85 /* How many entries to keep in the entry array chain cache at max */
86 #define CHAIN_CACHE_MAX 20
88 /* How much to increase the journal file size at once each time we allocate something new. */
89 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
91 /* Reread fstat() of the file for detecting deletions at least this often */
92 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
94 /* The mmap context to use for the header we pick as one above the last defined typed */
95 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
#ifdef __clang__
#  pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
101 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
102 * As a result we use atomic operations on f->offline_state for inter-thread communications with
103 * journal_file_set_offline() and journal_file_set_online(). */
104 static void journal_file_set_offline_internal(JournalFile
*f
) {
110 switch (f
->offline_state
) {
112 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_DONE
))
116 case OFFLINE_AGAIN_FROM_SYNCING
:
117 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_SYNCING
))
121 case OFFLINE_AGAIN_FROM_OFFLINING
:
122 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_SYNCING
))
126 case OFFLINE_SYNCING
:
129 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_OFFLINING
))
132 f
->header
->state
= f
->archive
? STATE_ARCHIVED
: STATE_OFFLINE
;
136 case OFFLINE_OFFLINING
:
137 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_DONE
))
144 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
150 static void * journal_file_set_offline_thread(void *arg
) {
151 JournalFile
*f
= arg
;
153 (void) pthread_setname_np(pthread_self(), "journal-offline");
155 journal_file_set_offline_internal(f
);
160 static int journal_file_set_offline_thread_join(JournalFile
*f
) {
165 if (f
->offline_state
== OFFLINE_JOINED
)
168 r
= pthread_join(f
->offline_thread
, NULL
);
172 f
->offline_state
= OFFLINE_JOINED
;
174 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
180 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
181 static bool journal_file_set_offline_try_restart(JournalFile
*f
) {
183 switch (f
->offline_state
) {
184 case OFFLINE_AGAIN_FROM_SYNCING
:
185 case OFFLINE_AGAIN_FROM_OFFLINING
:
189 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_CANCEL
, OFFLINE_AGAIN_FROM_SYNCING
))
193 case OFFLINE_SYNCING
:
194 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_AGAIN_FROM_SYNCING
))
198 case OFFLINE_OFFLINING
:
199 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_OFFLINING
, OFFLINE_AGAIN_FROM_OFFLINING
))
209 /* Sets a journal offline.
211 * If wait is false then an offline is dispatched in a separate thread for a
212 * subsequent journal_file_set_offline() or journal_file_set_online() of the
213 * same journal to synchronize with.
215 * If wait is true, then either an existing offline thread will be restarted
216 * and joined, or if none exists the offline is simply performed in this
217 * context without involving another thread.
219 int journal_file_set_offline(JournalFile
*f
, bool wait
) {
228 if (!(f
->fd
>= 0 && f
->header
))
231 /* An offlining journal is implicitly online and may modify f->header->state,
232 * we must also join any potentially lingering offline thread when not online. */
233 if (!journal_file_is_offlining(f
) && f
->header
->state
!= STATE_ONLINE
)
234 return journal_file_set_offline_thread_join(f
);
236 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
237 restarted
= journal_file_set_offline_try_restart(f
);
238 if ((restarted
&& wait
) || !restarted
) {
239 r
= journal_file_set_offline_thread_join(f
);
247 /* Initiate a new offline. */
248 f
->offline_state
= OFFLINE_SYNCING
;
250 if (wait
) /* Without using a thread if waiting. */
251 journal_file_set_offline_internal(f
);
253 sigset_t ss
, saved_ss
;
256 if (sigfillset(&ss
) < 0)
259 r
= pthread_sigmask(SIG_BLOCK
, &ss
, &saved_ss
);
263 r
= pthread_create(&f
->offline_thread
, NULL
, journal_file_set_offline_thread
, f
);
265 k
= pthread_sigmask(SIG_SETMASK
, &saved_ss
, NULL
);
267 f
->offline_state
= OFFLINE_JOINED
;
277 static int journal_file_set_online(JournalFile
*f
) {
285 if (!(f
->fd
>= 0 && f
->header
))
289 switch (f
->offline_state
) {
291 /* No offline thread, no need to wait. */
295 case OFFLINE_SYNCING
:
296 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_SYNCING
, OFFLINE_CANCEL
))
298 /* Canceled syncing prior to offlining, no need to wait. */
301 case OFFLINE_AGAIN_FROM_SYNCING
:
302 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_SYNCING
, OFFLINE_CANCEL
))
304 /* Canceled restart from syncing, no need to wait. */
307 case OFFLINE_AGAIN_FROM_OFFLINING
:
308 if (!__sync_bool_compare_and_swap(&f
->offline_state
, OFFLINE_AGAIN_FROM_OFFLINING
, OFFLINE_CANCEL
))
310 /* Canceled restart from offlining, must wait for offlining to complete however. */
315 r
= journal_file_set_offline_thread_join(f
);
325 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
328 switch (f
->header
->state
) {
333 f
->header
->state
= STATE_ONLINE
;
342 bool journal_file_is_offlining(JournalFile
*f
) {
345 __sync_synchronize();
347 if (IN_SET(f
->offline_state
, OFFLINE_DONE
, OFFLINE_JOINED
))
353 JournalFile
* journal_file_close(JournalFile
*f
) {
357 /* Write the final tag */
358 if (f
->seal
&& f
->writable
) {
361 r
= journal_file_append_tag(f
);
363 log_error_errno(r
, "Failed to append tag when closing journal: %m");
367 if (f
->post_change_timer
) {
370 if (sd_event_source_get_enabled(f
->post_change_timer
, &enabled
) >= 0)
371 if (enabled
== SD_EVENT_ONESHOT
)
372 journal_file_post_change(f
);
374 (void) sd_event_source_set_enabled(f
->post_change_timer
, SD_EVENT_OFF
);
375 sd_event_source_unref(f
->post_change_timer
);
378 journal_file_set_offline(f
, true);
380 if (f
->mmap
&& f
->cache_fd
)
381 mmap_cache_free_fd(f
->mmap
, f
->cache_fd
);
383 if (f
->fd
>= 0 && f
->defrag_on_close
) {
385 /* Be friendly to btrfs: turn COW back on again now,
386 * and defragment the file. We won't write to the file
387 * ever again, hence remove all fragmentation, and
388 * reenable all the good bits COW usually provides
389 * (such as data checksumming). */
391 (void) chattr_fd(f
->fd
, 0, FS_NOCOW_FL
);
392 (void) btrfs_defrag_fd(f
->fd
);
399 mmap_cache_unref(f
->mmap
);
401 ordered_hashmap_free_free(f
->chain_cache
);
403 #if HAVE_XZ || HAVE_LZ4
404 free(f
->compress_buffer
);
409 munmap(f
->fss_file
, PAGE_ALIGN(f
->fss_file_size
));
411 free(f
->fsprg_state
);
416 gcry_md_close(f
->hmac
);
422 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
429 memcpy(h
.signature
, HEADER_SIGNATURE
, 8);
430 h
.header_size
= htole64(ALIGN64(sizeof(h
)));
432 h
.incompatible_flags
|= htole32(
433 f
->compress_xz
* HEADER_INCOMPATIBLE_COMPRESSED_XZ
|
434 f
->compress_lz4
* HEADER_INCOMPATIBLE_COMPRESSED_LZ4
);
436 h
.compatible_flags
= htole32(
437 f
->seal
* HEADER_COMPATIBLE_SEALED
);
439 r
= sd_id128_randomize(&h
.file_id
);
444 h
.seqnum_id
= template->header
->seqnum_id
;
445 h
.tail_entry_seqnum
= template->header
->tail_entry_seqnum
;
447 h
.seqnum_id
= h
.file_id
;
449 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
459 static int journal_file_refresh_header(JournalFile
*f
) {
466 r
= sd_id128_get_machine(&f
->header
->machine_id
);
470 r
= sd_id128_get_boot(&boot_id
);
474 f
->header
->boot_id
= boot_id
;
476 r
= journal_file_set_online(f
);
478 /* Sync the online state to disk */
481 /* We likely just created a new file, also sync the directory this file is located in. */
482 (void) fsync_directory_of_file(f
->fd
);
487 static bool warn_wrong_flags(const JournalFile
*f
, bool compatible
) {
488 const uint32_t any
= compatible
? HEADER_COMPATIBLE_ANY
: HEADER_INCOMPATIBLE_ANY
,
489 supported
= compatible
? HEADER_COMPATIBLE_SUPPORTED
: HEADER_INCOMPATIBLE_SUPPORTED
;
490 const char *type
= compatible
? "compatible" : "incompatible";
493 flags
= le32toh(compatible
? f
->header
->compatible_flags
: f
->header
->incompatible_flags
);
495 if (flags
& ~supported
) {
497 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32
,
498 f
->path
, type
, flags
& ~any
);
499 flags
= (flags
& any
) & ~supported
;
503 _cleanup_free_
char *t
= NULL
;
505 if (compatible
&& (flags
& HEADER_COMPATIBLE_SEALED
))
506 strv
[n
++] = "sealed";
507 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_XZ
))
508 strv
[n
++] = "xz-compressed";
509 if (!compatible
&& (flags
& HEADER_INCOMPATIBLE_COMPRESSED_LZ4
))
510 strv
[n
++] = "lz4-compressed";
512 assert(n
< ELEMENTSOF(strv
));
514 t
= strv_join((char**) strv
, ", ");
515 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
516 f
->path
, type
, n
> 1 ? "flags" : "flag", strnull(t
));
524 static int journal_file_verify_header(JournalFile
*f
) {
525 uint64_t arena_size
, header_size
;
530 if (memcmp(f
->header
->signature
, HEADER_SIGNATURE
, 8))
533 /* In both read and write mode we refuse to open files with incompatible
534 * flags we don't know. */
535 if (warn_wrong_flags(f
, false))
536 return -EPROTONOSUPPORT
;
538 /* When open for writing we refuse to open files with compatible flags, too. */
539 if (f
->writable
&& warn_wrong_flags(f
, true))
540 return -EPROTONOSUPPORT
;
542 if (f
->header
->state
>= _STATE_MAX
)
545 header_size
= le64toh(f
->header
->header_size
);
547 /* The first addition was n_data, so check that we are at least this large */
548 if (header_size
< HEADER_SIZE_MIN
)
551 if (JOURNAL_HEADER_SEALED(f
->header
) && !JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
554 arena_size
= le64toh(f
->header
->arena_size
);
556 if (UINT64_MAX
- header_size
< arena_size
|| header_size
+ arena_size
> (uint64_t) f
->last_stat
.st_size
)
559 if (le64toh(f
->header
->tail_object_offset
) > header_size
+ arena_size
)
562 if (!VALID64(le64toh(f
->header
->data_hash_table_offset
)) ||
563 !VALID64(le64toh(f
->header
->field_hash_table_offset
)) ||
564 !VALID64(le64toh(f
->header
->tail_object_offset
)) ||
565 !VALID64(le64toh(f
->header
->entry_array_offset
)))
569 sd_id128_t machine_id
;
573 r
= sd_id128_get_machine(&machine_id
);
577 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
580 state
= f
->header
->state
;
582 if (state
== STATE_ARCHIVED
)
583 return -ESHUTDOWN
; /* Already archived */
584 else if (state
== STATE_ONLINE
) {
585 log_debug("Journal file %s is already online. Assuming unclean closing.", f
->path
);
587 } else if (state
!= STATE_OFFLINE
) {
588 log_debug("Journal file %s has unknown state %i.", f
->path
, state
);
592 if (f
->header
->field_hash_table_size
== 0 || f
->header
->data_hash_table_size
== 0)
595 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
596 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
598 if (le64toh(f
->header
->tail_entry_realtime
) > now(CLOCK_REALTIME
)) {
599 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f
->path
);
604 f
->compress_xz
= JOURNAL_HEADER_COMPRESSED_XZ(f
->header
);
605 f
->compress_lz4
= JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
);
607 f
->seal
= JOURNAL_HEADER_SEALED(f
->header
);
612 static int journal_file_fstat(JournalFile
*f
) {
618 if (fstat(f
->fd
, &f
->last_stat
) < 0)
621 f
->last_stat_usec
= now(CLOCK_MONOTONIC
);
623 /* Refuse dealing with with files that aren't regular */
624 r
= stat_verify_regular(&f
->last_stat
);
628 /* Refuse appending to files that are already deleted */
629 if (f
->last_stat
.st_nlink
<= 0)
635 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
636 uint64_t old_size
, new_size
;
642 /* We assume that this file is not sparse, and we know that
643 * for sure, since we always call posix_fallocate()
646 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
650 le64toh(f
->header
->header_size
) +
651 le64toh(f
->header
->arena_size
);
653 new_size
= PAGE_ALIGN(offset
+ size
);
654 if (new_size
< le64toh(f
->header
->header_size
))
655 new_size
= le64toh(f
->header
->header_size
);
657 if (new_size
<= old_size
) {
659 /* We already pre-allocated enough space, but before
660 * we write to it, let's check with fstat() if the
661 * file got deleted, in order make sure we don't throw
662 * away the data immediately. Don't check fstat() for
663 * all writes though, but only once ever 10s. */
665 if (f
->last_stat_usec
+ LAST_STAT_REFRESH_USEC
> now(CLOCK_MONOTONIC
))
668 return journal_file_fstat(f
);
671 /* Allocate more space. */
673 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
676 if (new_size
> f
->metrics
.min_size
&& f
->metrics
.keep_free
> 0) {
679 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
682 available
= LESS_BY((uint64_t) svfs
.f_bfree
* (uint64_t) svfs
.f_bsize
, f
->metrics
.keep_free
);
684 if (new_size
- old_size
> available
)
689 /* Increase by larger blocks at once */
690 new_size
= ((new_size
+FILE_SIZE_INCREASE
-1) / FILE_SIZE_INCREASE
) * FILE_SIZE_INCREASE
;
691 if (f
->metrics
.max_size
> 0 && new_size
> f
->metrics
.max_size
)
692 new_size
= f
->metrics
.max_size
;
694 /* Note that the glibc fallocate() fallback is very
695 inefficient, hence we try to minimize the allocation area
697 r
= posix_fallocate(f
->fd
, old_size
, new_size
- old_size
);
701 f
->header
->arena_size
= htole64(new_size
- le64toh(f
->header
->header_size
));
703 return journal_file_fstat(f
);
706 static unsigned type_to_context(ObjectType type
) {
707 /* One context for each type, plus one catch-all for the rest */
708 assert_cc(_OBJECT_TYPE_MAX
<= MMAP_CACHE_MAX_CONTEXTS
);
709 assert_cc(CONTEXT_HEADER
< MMAP_CACHE_MAX_CONTEXTS
);
710 return type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
? type
: 0;
713 static int journal_file_move_to(JournalFile
*f
, ObjectType type
, bool keep_always
, uint64_t offset
, uint64_t size
, void **ret
, size_t *ret_size
) {
722 /* Avoid SIGBUS on invalid accesses */
723 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
) {
724 /* Hmm, out of range? Let's refresh the fstat() data
725 * first, before we trust that check. */
727 r
= journal_file_fstat(f
);
731 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
732 return -EADDRNOTAVAIL
;
735 return mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, type_to_context(type
), keep_always
, offset
, size
, &f
->last_stat
, ret
, ret_size
);
738 static uint64_t minimum_header_size(Object
*o
) {
740 static const uint64_t table
[] = {
741 [OBJECT_DATA
] = sizeof(DataObject
),
742 [OBJECT_FIELD
] = sizeof(FieldObject
),
743 [OBJECT_ENTRY
] = sizeof(EntryObject
),
744 [OBJECT_DATA_HASH_TABLE
] = sizeof(HashTableObject
),
745 [OBJECT_FIELD_HASH_TABLE
] = sizeof(HashTableObject
),
746 [OBJECT_ENTRY_ARRAY
] = sizeof(EntryArrayObject
),
747 [OBJECT_TAG
] = sizeof(TagObject
),
750 if (o
->object
.type
>= ELEMENTSOF(table
) || table
[o
->object
.type
] <= 0)
751 return sizeof(ObjectHeader
);
753 return table
[o
->object
.type
];
756 /* Lightweight object checks. We want this to be fast, so that we won't
757 * slowdown every journal_file_move_to_object() call too much. */
758 static int journal_file_check_object(JournalFile
*f
, uint64_t offset
, Object
*o
) {
762 switch (o
->object
.type
) {
765 if ((le64toh(o
->data
.entry_offset
) == 0) ^ (le64toh(o
->data
.n_entries
) == 0)) {
766 log_debug("Bad n_entries: %"PRIu64
": %"PRIu64
,
767 le64toh(o
->data
.n_entries
), offset
);
771 if (le64toh(o
->object
.size
) - offsetof(DataObject
, payload
) <= 0) {
772 log_debug("Bad object size (<= %zu): %"PRIu64
": %"PRIu64
,
773 offsetof(DataObject
, payload
),
774 le64toh(o
->object
.size
),
779 if (!VALID64(le64toh(o
->data
.next_hash_offset
)) ||
780 !VALID64(le64toh(o
->data
.next_field_offset
)) ||
781 !VALID64(le64toh(o
->data
.entry_offset
)) ||
782 !VALID64(le64toh(o
->data
.entry_array_offset
))) {
783 log_debug("Invalid offset, next_hash_offset="OFSfmt
", next_field_offset="OFSfmt
784 ", entry_offset="OFSfmt
", entry_array_offset="OFSfmt
": %"PRIu64
,
785 le64toh(o
->data
.next_hash_offset
),
786 le64toh(o
->data
.next_field_offset
),
787 le64toh(o
->data
.entry_offset
),
788 le64toh(o
->data
.entry_array_offset
),
797 if (le64toh(o
->object
.size
) - offsetof(FieldObject
, payload
) <= 0) {
799 "Bad field size (<= %zu): %"PRIu64
": %"PRIu64
,
800 offsetof(FieldObject
, payload
),
801 le64toh(o
->object
.size
),
806 if (!VALID64(le64toh(o
->field
.next_hash_offset
)) ||
807 !VALID64(le64toh(o
->field
.head_data_offset
))) {
809 "Invalid offset, next_hash_offset="OFSfmt
810 ", head_data_offset="OFSfmt
": %"PRIu64
,
811 le64toh(o
->field
.next_hash_offset
),
812 le64toh(o
->field
.head_data_offset
),
819 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) % sizeof(EntryItem
) != 0) {
821 "Bad entry size (<= %zu): %"PRIu64
": %"PRIu64
,
822 offsetof(EntryObject
, items
),
823 le64toh(o
->object
.size
),
828 if ((le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
) <= 0) {
830 "Invalid number items in entry: %"PRIu64
": %"PRIu64
,
831 (le64toh(o
->object
.size
) - offsetof(EntryObject
, items
)) / sizeof(EntryItem
),
836 if (le64toh(o
->entry
.seqnum
) <= 0) {
838 "Invalid entry seqnum: %"PRIx64
": %"PRIu64
,
839 le64toh(o
->entry
.seqnum
),
844 if (!VALID_REALTIME(le64toh(o
->entry
.realtime
))) {
846 "Invalid entry realtime timestamp: %"PRIu64
": %"PRIu64
,
847 le64toh(o
->entry
.realtime
),
852 if (!VALID_MONOTONIC(le64toh(o
->entry
.monotonic
))) {
854 "Invalid entry monotonic timestamp: %"PRIu64
": %"PRIu64
,
855 le64toh(o
->entry
.monotonic
),
862 case OBJECT_DATA_HASH_TABLE
:
863 case OBJECT_FIELD_HASH_TABLE
:
864 if ((le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) % sizeof(HashItem
) != 0 ||
865 (le64toh(o
->object
.size
) - offsetof(HashTableObject
, items
)) / sizeof(HashItem
) <= 0) {
867 "Invalid %s hash table size: %"PRIu64
": %"PRIu64
,
868 o
->object
.type
== OBJECT_DATA_HASH_TABLE
? "data" : "field",
869 le64toh(o
->object
.size
),
876 case OBJECT_ENTRY_ARRAY
:
877 if ((le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) % sizeof(le64_t
) != 0 ||
878 (le64toh(o
->object
.size
) - offsetof(EntryArrayObject
, items
)) / sizeof(le64_t
) <= 0) {
880 "Invalid object entry array size: %"PRIu64
": %"PRIu64
,
881 le64toh(o
->object
.size
),
886 if (!VALID64(le64toh(o
->entry_array
.next_entry_array_offset
))) {
888 "Invalid object entry array next_entry_array_offset: "OFSfmt
": %"PRIu64
,
889 le64toh(o
->entry_array
.next_entry_array_offset
),
897 if (le64toh(o
->object
.size
) != sizeof(TagObject
)) {
899 "Invalid object tag size: %"PRIu64
": %"PRIu64
,
900 le64toh(o
->object
.size
),
905 if (!VALID_EPOCH(le64toh(o
->tag
.epoch
))) {
907 "Invalid object tag epoch: %"PRIu64
": %"PRIu64
,
908 le64toh(o
->tag
.epoch
),
919 int journal_file_move_to_object(JournalFile
*f
, ObjectType type
, uint64_t offset
, Object
**ret
) {
929 /* Objects may only be located at multiple of 64 bit */
930 if (!VALID64(offset
)) {
931 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64
, offset
);
935 /* Object may not be located in the file header */
936 if (offset
< le64toh(f
->header
->header_size
)) {
937 log_debug("Attempt to move to object located in file header: %" PRIu64
, offset
);
941 r
= journal_file_move_to(f
, type
, false, offset
, sizeof(ObjectHeader
), &t
, &tsize
);
946 s
= le64toh(o
->object
.size
);
949 log_debug("Attempt to move to uninitialized object: %" PRIu64
, offset
);
952 if (s
< sizeof(ObjectHeader
)) {
953 log_debug("Attempt to move to overly short object: %" PRIu64
, offset
);
957 if (o
->object
.type
<= OBJECT_UNUSED
) {
958 log_debug("Attempt to move to object with invalid type: %" PRIu64
, offset
);
962 if (s
< minimum_header_size(o
)) {
963 log_debug("Attempt to move to truncated object: %" PRIu64
, offset
);
967 if (type
> OBJECT_UNUSED
&& o
->object
.type
!= type
) {
968 log_debug("Attempt to move to object of unexpected type: %" PRIu64
, offset
);
973 r
= journal_file_move_to(f
, type
, false, offset
, s
, &t
, NULL
);
980 r
= journal_file_check_object(f
, offset
, o
);
988 static uint64_t journal_file_entry_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
994 r
= le64toh(f
->header
->tail_entry_seqnum
) + 1;
997 /* If an external seqnum counter was passed, we update
998 * both the local and the external one, and set it to
999 * the maximum of both */
1001 if (*seqnum
+ 1 > r
)
1007 f
->header
->tail_entry_seqnum
= htole64(r
);
1009 if (f
->header
->head_entry_seqnum
== 0)
1010 f
->header
->head_entry_seqnum
= htole64(r
);
1015 int journal_file_append_object(JournalFile
*f
, ObjectType type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
1023 assert(type
> OBJECT_UNUSED
&& type
< _OBJECT_TYPE_MAX
);
1024 assert(size
>= sizeof(ObjectHeader
));
1028 r
= journal_file_set_online(f
);
1032 p
= le64toh(f
->header
->tail_object_offset
);
1034 p
= le64toh(f
->header
->header_size
);
1036 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &tail
);
1040 p
+= ALIGN64(le64toh(tail
->object
.size
));
1043 r
= journal_file_allocate(f
, p
, size
);
1047 r
= journal_file_move_to(f
, type
, false, p
, size
, &t
, NULL
);
1054 o
->object
.type
= type
;
1055 o
->object
.size
= htole64(size
);
1057 f
->header
->tail_object_offset
= htole64(p
);
1058 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
1066 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
1074 /* We estimate that we need 1 hash table entry per 768 bytes
1075 of journal file and we want to make sure we never get
1076 beyond 75% fill level. Calculate the hash table size for
1077 the maximum file size based on these metrics. */
1079 s
= (f
->metrics
.max_size
* 4 / 768 / 3) * sizeof(HashItem
);
1080 if (s
< DEFAULT_DATA_HASH_TABLE_SIZE
)
1081 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
1083 log_debug("Reserving %"PRIu64
" entries in hash table.", s
/ sizeof(HashItem
));
1085 r
= journal_file_append_object(f
,
1086 OBJECT_DATA_HASH_TABLE
,
1087 offsetof(Object
, hash_table
.items
) + s
,
1092 memzero(o
->hash_table
.items
, s
);
1094 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1095 f
->header
->data_hash_table_size
= htole64(s
);
1100 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
1108 /* We use a fixed size hash table for the fields as this
1109 * number should grow very slowly only */
1111 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
1112 r
= journal_file_append_object(f
,
1113 OBJECT_FIELD_HASH_TABLE
,
1114 offsetof(Object
, hash_table
.items
) + s
,
1119 memzero(o
->hash_table
.items
, s
);
1121 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
1122 f
->header
->field_hash_table_size
= htole64(s
);
1127 int journal_file_map_data_hash_table(JournalFile
*f
) {
1135 if (f
->data_hash_table
)
1138 p
= le64toh(f
->header
->data_hash_table_offset
);
1139 s
= le64toh(f
->header
->data_hash_table_size
);
1141 r
= journal_file_move_to(f
,
1142 OBJECT_DATA_HASH_TABLE
,
1149 f
->data_hash_table
= t
;
1153 int journal_file_map_field_hash_table(JournalFile
*f
) {
1161 if (f
->field_hash_table
)
1164 p
= le64toh(f
->header
->field_hash_table_offset
);
1165 s
= le64toh(f
->header
->field_hash_table_size
);
1167 r
= journal_file_move_to(f
,
1168 OBJECT_FIELD_HASH_TABLE
,
1175 f
->field_hash_table
= t
;
1179 static int journal_file_link_field(
1190 assert(f
->field_hash_table
);
1194 if (o
->object
.type
!= OBJECT_FIELD
)
1197 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1201 /* This might alter the window we are looking at */
1202 o
->field
.next_hash_offset
= o
->field
.head_data_offset
= 0;
1205 p
= le64toh(f
->field_hash_table
[h
].tail_hash_offset
);
1207 f
->field_hash_table
[h
].head_hash_offset
= htole64(offset
);
1209 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1213 o
->field
.next_hash_offset
= htole64(offset
);
1216 f
->field_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1218 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
1219 f
->header
->n_fields
= htole64(le64toh(f
->header
->n_fields
) + 1);
1224 static int journal_file_link_data(
1235 assert(f
->data_hash_table
);
1239 if (o
->object
.type
!= OBJECT_DATA
)
1242 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1246 /* This might alter the window we are looking at */
1247 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
1248 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
1249 o
->data
.n_entries
= 0;
1252 p
= le64toh(f
->data_hash_table
[h
].tail_hash_offset
);
1254 /* Only entry in the hash table is easy */
1255 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
1257 /* Move back to the previous data object, to patch in
1260 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1264 o
->data
.next_hash_offset
= htole64(offset
);
1267 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
1269 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
1270 f
->header
->n_data
= htole64(le64toh(f
->header
->n_data
) + 1);
1275 int journal_file_find_field_object_with_hash(
1277 const void *field
, uint64_t size
, uint64_t hash
,
1278 Object
**ret
, uint64_t *offset
) {
1280 uint64_t p
, osize
, h
, m
;
1285 assert(field
&& size
> 0);
1287 /* If the field hash table is empty, we can't find anything */
1288 if (le64toh(f
->header
->field_hash_table_size
) <= 0)
1291 /* Map the field hash table, if it isn't mapped yet. */
1292 r
= journal_file_map_field_hash_table(f
);
1296 osize
= offsetof(Object
, field
.payload
) + size
;
1298 m
= le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
);
1303 p
= le64toh(f
->field_hash_table
[h
].head_hash_offset
);
1308 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1312 if (le64toh(o
->field
.hash
) == hash
&&
1313 le64toh(o
->object
.size
) == osize
&&
1314 memcmp(o
->field
.payload
, field
, size
) == 0) {
1324 p
= le64toh(o
->field
.next_hash_offset
);
1330 int journal_file_find_field_object(
1332 const void *field
, uint64_t size
,
1333 Object
**ret
, uint64_t *offset
) {
1338 assert(field
&& size
> 0);
1340 hash
= hash64(field
, size
);
1342 return journal_file_find_field_object_with_hash(f
,
1347 int journal_file_find_data_object_with_hash(
1349 const void *data
, uint64_t size
, uint64_t hash
,
1350 Object
**ret
, uint64_t *offset
) {
1352 uint64_t p
, osize
, h
, m
;
1357 assert(data
|| size
== 0);
1359 /* If there's no data hash table, then there's no entry. */
1360 if (le64toh(f
->header
->data_hash_table_size
) <= 0)
1363 /* Map the data hash table, if it isn't mapped yet. */
1364 r
= journal_file_map_data_hash_table(f
);
1368 osize
= offsetof(Object
, data
.payload
) + size
;
1370 m
= le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
);
1375 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
1380 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1384 if (le64toh(o
->data
.hash
) != hash
)
1387 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
1388 #if HAVE_XZ || HAVE_LZ4
1392 l
= le64toh(o
->object
.size
);
1393 if (l
<= offsetof(Object
, data
.payload
))
1396 l
-= offsetof(Object
, data
.payload
);
1398 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
1399 o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
, 0);
1403 if (rsize
== size
&&
1404 memcmp(f
->compress_buffer
, data
, size
) == 0) {
1415 return -EPROTONOSUPPORT
;
1417 } else if (le64toh(o
->object
.size
) == osize
&&
1418 memcmp(o
->data
.payload
, data
, size
) == 0) {
1430 p
= le64toh(o
->data
.next_hash_offset
);
1436 int journal_file_find_data_object(
1438 const void *data
, uint64_t size
,
1439 Object
**ret
, uint64_t *offset
) {
1444 assert(data
|| size
== 0);
1446 hash
= hash64(data
, size
);
1448 return journal_file_find_data_object_with_hash(f
,
1453 static int journal_file_append_field(
1455 const void *field
, uint64_t size
,
1456 Object
**ret
, uint64_t *offset
) {
1464 assert(field
&& size
> 0);
1466 hash
= hash64(field
, size
);
1468 r
= journal_file_find_field_object_with_hash(f
, field
, size
, hash
, &o
, &p
);
1482 osize
= offsetof(Object
, field
.payload
) + size
;
1483 r
= journal_file_append_object(f
, OBJECT_FIELD
, osize
, &o
, &p
);
1487 o
->field
.hash
= htole64(hash
);
1488 memcpy(o
->field
.payload
, field
, size
);
1490 r
= journal_file_link_field(f
, o
, p
, hash
);
1494 /* The linking might have altered the window, so let's
1495 * refresh our pointer */
1496 r
= journal_file_move_to_object(f
, OBJECT_FIELD
, p
, &o
);
1501 r
= journal_file_hmac_put_object(f
, OBJECT_FIELD
, o
, p
);
1515 static int journal_file_append_data(
1517 const void *data
, uint64_t size
,
1518 Object
**ret
, uint64_t *offset
) {
1523 int r
, compression
= 0;
1527 assert(data
|| size
== 0);
1529 hash
= hash64(data
, size
);
1531 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
1545 osize
= offsetof(Object
, data
.payload
) + size
;
1546 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
1550 o
->data
.hash
= htole64(hash
);
1552 #if HAVE_XZ || HAVE_LZ4
1553 if (JOURNAL_FILE_COMPRESS(f
) && size
>= f
->compress_threshold_bytes
) {
1556 compression
= compress_blob(data
, size
, o
->data
.payload
, size
- 1, &rsize
);
1558 if (compression
>= 0) {
1559 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
1560 o
->object
.flags
|= compression
;
1562 log_debug("Compressed data object %"PRIu64
" -> %zu using %s",
1563 size
, rsize
, object_compressed_to_string(compression
));
1565 /* Compression didn't work, we don't really care why, let's continue without compression */
1570 if (compression
== 0)
1571 memcpy_safe(o
->data
.payload
, data
, size
);
1573 r
= journal_file_link_data(f
, o
, p
, hash
);
1578 r
= journal_file_hmac_put_object(f
, OBJECT_DATA
, o
, p
);
1583 /* The linking might have altered the window, so let's
1584 * refresh our pointer */
1585 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1592 eq
= memchr(data
, '=', size
);
1593 if (eq
&& eq
> data
) {
1597 /* Create field object ... */
1598 r
= journal_file_append_field(f
, data
, (uint8_t*) eq
- (uint8_t*) data
, &fo
, &fp
);
1602 /* ... and link it in. */
1603 o
->data
.next_field_offset
= fo
->field
.head_data_offset
;
1604 fo
->field
.head_data_offset
= le64toh(p
);
1616 uint64_t journal_file_entry_n_items(Object
*o
) {
1619 if (o
->object
.type
!= OBJECT_ENTRY
)
1622 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
1625 uint64_t journal_file_entry_array_n_items(Object
*o
) {
1628 if (o
->object
.type
!= OBJECT_ENTRY_ARRAY
)
1631 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
1634 uint64_t journal_file_hash_table_n_items(Object
*o
) {
1637 if (!IN_SET(o
->object
.type
, OBJECT_DATA_HASH_TABLE
, OBJECT_FIELD_HASH_TABLE
))
1640 return (le64toh(o
->object
.size
) - offsetof(Object
, hash_table
.items
)) / sizeof(HashItem
);
1643 static int link_entry_into_array(JournalFile
*f
,
1648 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
1657 a
= le64toh(*first
);
1658 i
= hidx
= le64toh(*idx
);
1661 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1665 n
= journal_file_entry_array_n_items(o
);
1667 o
->entry_array
.items
[i
] = htole64(p
);
1668 *idx
= htole64(hidx
+ 1);
1674 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1685 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
1686 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
1692 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY_ARRAY
, o
, q
);
1697 o
->entry_array
.items
[i
] = htole64(p
);
1700 *first
= htole64(q
);
1702 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
1706 o
->entry_array
.next_entry_array_offset
= htole64(q
);
1709 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
1710 f
->header
->n_entry_arrays
= htole64(le64toh(f
->header
->n_entry_arrays
) + 1);
1712 *idx
= htole64(hidx
+ 1);
1717 static int link_entry_into_array_plus_one(JournalFile
*f
,
1732 *extra
= htole64(p
);
1736 i
= htole64(le64toh(*idx
) - 1);
1737 r
= link_entry_into_array(f
, first
, &i
, p
);
1742 *idx
= htole64(le64toh(*idx
) + 1);
1746 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
1753 p
= le64toh(o
->entry
.items
[i
].object_offset
);
1757 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
1761 return link_entry_into_array_plus_one(f
,
1762 &o
->data
.entry_offset
,
1763 &o
->data
.entry_array_offset
,
1768 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
1777 if (o
->object
.type
!= OBJECT_ENTRY
)
1780 __sync_synchronize();
1782 /* Link up the entry itself */
1783 r
= link_entry_into_array(f
,
1784 &f
->header
->entry_array_offset
,
1785 &f
->header
->n_entries
,
1790 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1792 if (f
->header
->head_entry_realtime
== 0)
1793 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
1795 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
1796 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
1798 /* Link up the items */
1799 n
= journal_file_entry_n_items(o
);
1800 for (i
= 0; i
< n
; i
++) {
1801 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
1809 static int journal_file_append_entry_internal(
1811 const dual_timestamp
*ts
,
1813 const EntryItem items
[], unsigned n_items
,
1815 Object
**ret
, uint64_t *offset
) {
1823 assert(items
|| n_items
== 0);
1826 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
1828 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
1832 o
->entry
.seqnum
= htole64(journal_file_entry_seqnum(f
, seqnum
));
1833 memcpy_safe(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
1834 o
->entry
.realtime
= htole64(ts
->realtime
);
1835 o
->entry
.monotonic
= htole64(ts
->monotonic
);
1836 o
->entry
.xor_hash
= htole64(xor_hash
);
1837 o
->entry
.boot_id
= f
->header
->boot_id
;
1840 r
= journal_file_hmac_put_object(f
, OBJECT_ENTRY
, o
, np
);
1845 r
= journal_file_link_entry(f
, o
, np
);
1858 void journal_file_post_change(JournalFile
*f
) {
1861 /* inotify() does not receive IN_MODIFY events from file
1862 * accesses done via mmap(). After each access we hence
1863 * trigger IN_MODIFY by truncating the journal file to its
1864 * current size which triggers IN_MODIFY. */
1866 __sync_synchronize();
1868 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
1869 log_debug_errno(errno
, "Failed to truncate file to its own size: %m");
1872 static int post_change_thunk(sd_event_source
*timer
, uint64_t usec
, void *userdata
) {
1875 journal_file_post_change(userdata
);
1880 static void schedule_post_change(JournalFile
*f
) {
1881 sd_event_source
*timer
;
1886 assert(f
->post_change_timer
);
1888 timer
= f
->post_change_timer
;
1890 r
= sd_event_source_get_enabled(timer
, &enabled
);
1892 log_debug_errno(r
, "Failed to get ftruncate timer state: %m");
1896 if (enabled
== SD_EVENT_ONESHOT
)
1899 r
= sd_event_now(sd_event_source_get_event(timer
), CLOCK_MONOTONIC
, &now
);
1901 log_debug_errno(r
, "Failed to get clock's now for scheduling ftruncate: %m");
1905 r
= sd_event_source_set_time(timer
, now
+f
->post_change_timer_period
);
1907 log_debug_errno(r
, "Failed to set time for scheduling ftruncate: %m");
1911 r
= sd_event_source_set_enabled(timer
, SD_EVENT_ONESHOT
);
1913 log_debug_errno(r
, "Failed to enable scheduled ftruncate: %m");
1920 /* On failure, let's simply post the change immediately. */
1921 journal_file_post_change(f
);
1924 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1925 int journal_file_enable_post_change_timer(JournalFile
*f
, sd_event
*e
, usec_t t
) {
1926 _cleanup_(sd_event_source_unrefp
) sd_event_source
*timer
= NULL
;
1930 assert_return(!f
->post_change_timer
, -EINVAL
);
1934 r
= sd_event_add_time(e
, &timer
, CLOCK_MONOTONIC
, 0, 0, post_change_thunk
, f
);
1938 r
= sd_event_source_set_enabled(timer
, SD_EVENT_OFF
);
1942 f
->post_change_timer
= timer
;
1944 f
->post_change_timer_period
= t
;
1949 static int entry_item_cmp(const void *_a
, const void *_b
) {
1950 const EntryItem
*a
= _a
, *b
= _b
;
1952 if (le64toh(a
->object_offset
) < le64toh(b
->object_offset
))
1954 if (le64toh(a
->object_offset
) > le64toh(b
->object_offset
))
1959 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
1963 uint64_t xor_hash
= 0;
1964 struct dual_timestamp _ts
;
1968 assert(iovec
|| n_iovec
== 0);
1971 dual_timestamp_get(&_ts
);
1976 r
= journal_file_maybe_append_tag(f
, ts
->realtime
);
1981 /* alloca() can't take 0, hence let's allocate at least one */
1982 items
= alloca(sizeof(EntryItem
) * MAX(1u, n_iovec
));
1984 for (i
= 0; i
< n_iovec
; i
++) {
1988 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1992 xor_hash
^= le64toh(o
->data
.hash
);
1993 items
[i
].object_offset
= htole64(p
);
1994 items
[i
].hash
= o
->data
.hash
;
1997 /* Order by the position on disk, in order to improve seek
1998 * times for rotating media. */
1999 qsort_safe(items
, n_iovec
, sizeof(EntryItem
), entry_item_cmp
);
2001 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
2003 /* If the memory mapping triggered a SIGBUS then we return an
2004 * IO error and ignore the error code passed down to us, since
2005 * it is very likely just an effect of a nullified replacement
2008 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
2011 if (f
->post_change_timer
)
2012 schedule_post_change(f
);
2014 journal_file_post_change(f
);
/* Cached position within an entry-array chain, used to avoid rescanning
 * the chain from its head on repeated lookups. */
typedef struct ChainCacheItem {
        uint64_t first;      /* the array at the beginning of the chain */
        uint64_t array;      /* the cached array */
        uint64_t begin;      /* the first item in the cached array */
        uint64_t total;      /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
2027 static void chain_cache_put(
2034 uint64_t last_index
) {
2037 /* If the chain item to cache for this chain is the
2038 * first one it's not worth caching anything */
2042 if (ordered_hashmap_size(h
) >= CHAIN_CACHE_MAX
) {
2043 ci
= ordered_hashmap_steal_first(h
);
2046 ci
= new(ChainCacheItem
, 1);
2053 if (ordered_hashmap_put(h
, &ci
->first
, ci
) < 0) {
2058 assert(ci
->first
== first
);
2063 ci
->last_index
= last_index
;
2066 static int generic_array_get(
2070 Object
**ret
, uint64_t *offset
) {
2073 uint64_t p
= 0, a
, t
= 0;
2081 /* Try the chain cache first */
2082 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2083 if (ci
&& i
> ci
->total
) {
2092 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
2096 k
= journal_file_entry_array_n_items(o
);
2098 p
= le64toh(o
->entry_array
.items
[i
]);
2104 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
2110 /* Let's cache this item for the next invocation */
2111 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(o
->entry_array
.items
[0]), t
, i
);
2113 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2126 static int generic_array_get_plus_one(
2131 Object
**ret
, uint64_t *offset
) {
2140 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2153 return generic_array_get(f
, first
, i
-1, ret
, offset
);
2162 static int generic_array_bisect(
2167 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2168 direction_t direction
,
2173 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0, last_index
= (uint64_t) -1;
2174 bool subtract_one
= false;
2175 Object
*o
, *array
= NULL
;
2180 assert(test_object
);
2182 /* Start with the first array in the chain */
2185 ci
= ordered_hashmap_get(f
->chain_cache
, &first
);
2186 if (ci
&& n
> ci
->total
) {
2187 /* Ah, we have iterated this bisection array chain
2188 * previously! Let's see if we can skip ahead in the
2189 * chain, as far as the last time. But we can't jump
2190 * backwards in the chain, so let's check that
2193 r
= test_object(f
, ci
->begin
, needle
);
2197 if (r
== TEST_LEFT
) {
2198 /* OK, what we are looking for is right of the
2199 * begin of this EntryArray, so let's jump
2200 * straight to previously cached array in the
2206 last_index
= ci
->last_index
;
2211 uint64_t left
, right
, k
, lp
;
2213 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
2217 k
= journal_file_entry_array_n_items(array
);
2223 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
2227 r
= test_object(f
, p
, needle
);
2228 if (r
== -EBADMSG
) {
2229 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2236 if (r
== TEST_FOUND
)
2237 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2239 if (r
== TEST_RIGHT
) {
2243 if (last_index
!= (uint64_t) -1) {
2244 assert(last_index
<= right
);
2246 /* If we cached the last index we
2247 * looked at, let's try to not to jump
2248 * too wildly around and see if we can
2249 * limit the range to look at early to
2250 * the immediate neighbors of the last
2251 * index we looked at. */
2253 if (last_index
> 0) {
2254 uint64_t x
= last_index
- 1;
2256 p
= le64toh(array
->entry_array
.items
[x
]);
2260 r
= test_object(f
, p
, needle
);
2264 if (r
== TEST_FOUND
)
2265 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2267 if (r
== TEST_RIGHT
)
2273 if (last_index
< right
) {
2274 uint64_t y
= last_index
+ 1;
2276 p
= le64toh(array
->entry_array
.items
[y
]);
2280 r
= test_object(f
, p
, needle
);
2284 if (r
== TEST_FOUND
)
2285 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2287 if (r
== TEST_RIGHT
)
2295 if (left
== right
) {
2296 if (direction
== DIRECTION_UP
)
2297 subtract_one
= true;
2303 assert(left
< right
);
2304 i
= (left
+ right
) / 2;
2306 p
= le64toh(array
->entry_array
.items
[i
]);
2310 r
= test_object(f
, p
, needle
);
2311 if (r
== -EBADMSG
) {
2312 log_debug_errno(r
, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2319 if (r
== TEST_FOUND
)
2320 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2322 if (r
== TEST_RIGHT
)
2330 if (direction
== DIRECTION_UP
) {
2332 subtract_one
= true;
2343 last_index
= (uint64_t) -1;
2344 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
2350 if (subtract_one
&& t
== 0 && i
== 0)
2353 /* Let's cache this item for the next invocation */
2354 chain_cache_put(f
->chain_cache
, ci
, first
, a
, le64toh(array
->entry_array
.items
[0]), t
, subtract_one
? (i
> 0 ? i
-1 : (uint64_t) -1) : i
);
2356 if (subtract_one
&& i
== 0)
2358 else if (subtract_one
)
2359 p
= le64toh(array
->entry_array
.items
[i
-1]);
2361 p
= le64toh(array
->entry_array
.items
[i
]);
2363 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2374 *idx
= t
+ i
+ (subtract_one
? -1 : 0);
2379 static int generic_array_bisect_plus_one(
2385 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
2386 direction_t direction
,
2392 bool step_back
= false;
2396 assert(test_object
);
2401 /* This bisects the array in object 'first', but first checks
2403 r
= test_object(f
, extra
, needle
);
2407 if (r
== TEST_FOUND
)
2408 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
2410 /* if we are looking with DIRECTION_UP then we need to first
2411 see if in the actual array there is a matching entry, and
2412 return the last one of that. But if there isn't any we need
2413 to return this one. Hence remember this, and return it
2416 step_back
= direction
== DIRECTION_UP
;
2418 if (r
== TEST_RIGHT
) {
2419 if (direction
== DIRECTION_DOWN
)
2425 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
2427 if (r
== 0 && step_back
)
2436 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
2452 _pure_
static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2458 else if (p
< needle
)
2464 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2471 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2475 if (le64toh(o
->entry
.seqnum
) == needle
)
2477 else if (le64toh(o
->entry
.seqnum
) < needle
)
2483 int journal_file_move_to_entry_by_seqnum(
2486 direction_t direction
,
2492 return generic_array_bisect(f
,
2493 le64toh(f
->header
->entry_array_offset
),
2494 le64toh(f
->header
->n_entries
),
2501 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2508 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2512 if (le64toh(o
->entry
.realtime
) == needle
)
2514 else if (le64toh(o
->entry
.realtime
) < needle
)
2520 int journal_file_move_to_entry_by_realtime(
2523 direction_t direction
,
2529 return generic_array_bisect(f
,
2530 le64toh(f
->header
->entry_array_offset
),
2531 le64toh(f
->header
->n_entries
),
2533 test_object_realtime
,
2538 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
2545 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
2549 if (le64toh(o
->entry
.monotonic
) == needle
)
2551 else if (le64toh(o
->entry
.monotonic
) < needle
)
2557 static int find_data_object_by_boot_id(
2563 char t
[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2565 sd_id128_to_string(boot_id
, t
+ 9);
2566 return journal_file_find_data_object(f
, t
, sizeof(t
) - 1, o
, b
);
2569 int journal_file_move_to_entry_by_monotonic(
2573 direction_t direction
,
2582 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, NULL
);
2588 return generic_array_bisect_plus_one(f
,
2589 le64toh(o
->data
.entry_offset
),
2590 le64toh(o
->data
.entry_array_offset
),
2591 le64toh(o
->data
.n_entries
),
2593 test_object_monotonic
,
2598 void journal_file_reset_location(JournalFile
*f
) {
2599 f
->location_type
= LOCATION_HEAD
;
2600 f
->current_offset
= 0;
2601 f
->current_seqnum
= 0;
2602 f
->current_realtime
= 0;
2603 f
->current_monotonic
= 0;
2604 zero(f
->current_boot_id
);
2605 f
->current_xor_hash
= 0;
2608 void journal_file_save_location(JournalFile
*f
, Object
*o
, uint64_t offset
) {
2609 f
->location_type
= LOCATION_SEEK
;
2610 f
->current_offset
= offset
;
2611 f
->current_seqnum
= le64toh(o
->entry
.seqnum
);
2612 f
->current_realtime
= le64toh(o
->entry
.realtime
);
2613 f
->current_monotonic
= le64toh(o
->entry
.monotonic
);
2614 f
->current_boot_id
= o
->entry
.boot_id
;
2615 f
->current_xor_hash
= le64toh(o
->entry
.xor_hash
);
2618 int journal_file_compare_locations(JournalFile
*af
, JournalFile
*bf
) {
2623 assert(af
->location_type
== LOCATION_SEEK
);
2624 assert(bf
->location_type
== LOCATION_SEEK
);
2626 /* If contents and timestamps match, these entries are
2627 * identical, even if the seqnum does not match */
2628 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
) &&
2629 af
->current_monotonic
== bf
->current_monotonic
&&
2630 af
->current_realtime
== bf
->current_realtime
&&
2631 af
->current_xor_hash
== bf
->current_xor_hash
)
2634 if (sd_id128_equal(af
->header
->seqnum_id
, bf
->header
->seqnum_id
)) {
2636 /* If this is from the same seqnum source, compare
2638 if (af
->current_seqnum
< bf
->current_seqnum
)
2640 if (af
->current_seqnum
> bf
->current_seqnum
)
2643 /* Wow! This is weird, different data but the same
2644 * seqnums? Something is borked, but let's make the
2645 * best of it and compare by time. */
2648 if (sd_id128_equal(af
->current_boot_id
, bf
->current_boot_id
)) {
2650 /* If the boot id matches, compare monotonic time */
2651 if (af
->current_monotonic
< bf
->current_monotonic
)
2653 if (af
->current_monotonic
> bf
->current_monotonic
)
2657 /* Otherwise, compare UTC time */
2658 if (af
->current_realtime
< bf
->current_realtime
)
2660 if (af
->current_realtime
> bf
->current_realtime
)
2663 /* Finally, compare by contents */
2664 if (af
->current_xor_hash
< bf
->current_xor_hash
)
2666 if (af
->current_xor_hash
> bf
->current_xor_hash
)
2672 static int bump_array_index(uint64_t *i
, direction_t direction
, uint64_t n
) {
2674 /* Increase or decrease the specified index, in the right direction. */
2676 if (direction
== DIRECTION_DOWN
) {
2691 static bool check_properly_ordered(uint64_t new_offset
, uint64_t old_offset
, direction_t direction
) {
2693 /* Consider it an error if any of the two offsets is uninitialized */
2694 if (old_offset
== 0 || new_offset
== 0)
2697 /* If we go down, the new offset must be larger than the old one. */
2698 return direction
== DIRECTION_DOWN
?
2699 new_offset
> old_offset
:
2700 new_offset
< old_offset
;
2703 int journal_file_next_entry(
2706 direction_t direction
,
2707 Object
**ret
, uint64_t *offset
) {
2715 n
= le64toh(f
->header
->n_entries
);
2720 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2722 r
= generic_array_bisect(f
,
2723 le64toh(f
->header
->entry_array_offset
),
2724 le64toh(f
->header
->n_entries
),
2733 r
= bump_array_index(&i
, direction
, n
);
2738 /* And jump to it */
2740 r
= generic_array_get(f
,
2741 le64toh(f
->header
->entry_array_offset
),
2749 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2750 * the next one might work for us instead. */
2751 log_debug_errno(r
, "Entry item %" PRIu64
" is bad, skipping over it.", i
);
2753 r
= bump_array_index(&i
, direction
, n
);
2758 /* Ensure our array is properly ordered. */
2759 if (p
> 0 && !check_properly_ordered(ofs
, p
, direction
)) {
2760 log_debug("%s: entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2770 int journal_file_next_entry_for_data(
2772 Object
*o
, uint64_t p
,
2773 uint64_t data_offset
,
2774 direction_t direction
,
2775 Object
**ret
, uint64_t *offset
) {
2782 assert(p
> 0 || !o
);
2784 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2788 n
= le64toh(d
->data
.n_entries
);
2793 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
2795 if (o
->object
.type
!= OBJECT_ENTRY
)
2798 r
= generic_array_bisect_plus_one(f
,
2799 le64toh(d
->data
.entry_offset
),
2800 le64toh(d
->data
.entry_array_offset
),
2801 le64toh(d
->data
.n_entries
),
2811 r
= bump_array_index(&i
, direction
, n
);
2817 r
= generic_array_get_plus_one(f
,
2818 le64toh(d
->data
.entry_offset
),
2819 le64toh(d
->data
.entry_array_offset
),
2827 log_debug_errno(r
, "Data entry item %" PRIu64
" is bad, skipping over it.", i
);
2829 r
= bump_array_index(&i
, direction
, n
);
2834 /* Ensure our array is properly ordered. */
2835 if (p
> 0 && check_properly_ordered(ofs
, p
, direction
)) {
2836 log_debug("%s data entry array not properly ordered at entry %" PRIu64
, f
->path
, i
);
2846 int journal_file_move_to_entry_by_offset_for_data(
2848 uint64_t data_offset
,
2850 direction_t direction
,
2851 Object
**ret
, uint64_t *offset
) {
2858 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2862 return generic_array_bisect_plus_one(f
,
2863 le64toh(d
->data
.entry_offset
),
2864 le64toh(d
->data
.entry_array_offset
),
2865 le64toh(d
->data
.n_entries
),
2872 int journal_file_move_to_entry_by_monotonic_for_data(
2874 uint64_t data_offset
,
2877 direction_t direction
,
2878 Object
**ret
, uint64_t *offset
) {
2886 /* First, seek by time */
2887 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &b
);
2893 r
= generic_array_bisect_plus_one(f
,
2894 le64toh(o
->data
.entry_offset
),
2895 le64toh(o
->data
.entry_array_offset
),
2896 le64toh(o
->data
.n_entries
),
2898 test_object_monotonic
,
2904 /* And now, continue seeking until we find an entry that
2905 * exists in both bisection arrays */
2911 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2915 r
= generic_array_bisect_plus_one(f
,
2916 le64toh(d
->data
.entry_offset
),
2917 le64toh(d
->data
.entry_array_offset
),
2918 le64toh(d
->data
.n_entries
),
2926 r
= journal_file_move_to_object(f
, OBJECT_DATA
, b
, &o
);
2930 r
= generic_array_bisect_plus_one(f
,
2931 le64toh(o
->data
.entry_offset
),
2932 le64toh(o
->data
.entry_array_offset
),
2933 le64toh(o
->data
.n_entries
),
2955 int journal_file_move_to_entry_by_seqnum_for_data(
2957 uint64_t data_offset
,
2959 direction_t direction
,
2960 Object
**ret
, uint64_t *offset
) {
2967 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2971 return generic_array_bisect_plus_one(f
,
2972 le64toh(d
->data
.entry_offset
),
2973 le64toh(d
->data
.entry_array_offset
),
2974 le64toh(d
->data
.n_entries
),
2981 int journal_file_move_to_entry_by_realtime_for_data(
2983 uint64_t data_offset
,
2985 direction_t direction
,
2986 Object
**ret
, uint64_t *offset
) {
2993 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
2997 return generic_array_bisect_plus_one(f
,
2998 le64toh(d
->data
.entry_offset
),
2999 le64toh(d
->data
.entry_array_offset
),
3000 le64toh(d
->data
.n_entries
),
3002 test_object_realtime
,
3007 void journal_file_dump(JournalFile
*f
) {
3015 journal_file_print_header(f
);
3017 p
= le64toh(f
->header
->header_size
);
3019 r
= journal_file_move_to_object(f
, OBJECT_UNUSED
, p
, &o
);
3023 switch (o
->object
.type
) {
3026 printf("Type: OBJECT_UNUSED\n");
3030 printf("Type: OBJECT_DATA\n");
3034 printf("Type: OBJECT_FIELD\n");
3038 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64
" monotonic=%"PRIu64
" realtime=%"PRIu64
"\n",
3039 le64toh(o
->entry
.seqnum
),
3040 le64toh(o
->entry
.monotonic
),
3041 le64toh(o
->entry
.realtime
));
3044 case OBJECT_FIELD_HASH_TABLE
:
3045 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3048 case OBJECT_DATA_HASH_TABLE
:
3049 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3052 case OBJECT_ENTRY_ARRAY
:
3053 printf("Type: OBJECT_ENTRY_ARRAY\n");
3057 printf("Type: OBJECT_TAG seqnum=%"PRIu64
" epoch=%"PRIu64
"\n",
3058 le64toh(o
->tag
.seqnum
),
3059 le64toh(o
->tag
.epoch
));
3063 printf("Type: unknown (%i)\n", o
->object
.type
);
3067 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
)
3068 printf("Flags: %s\n",
3069 object_compressed_to_string(o
->object
.flags
& OBJECT_COMPRESSION_MASK
));
3071 if (p
== le64toh(f
->header
->tail_object_offset
))
3074 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
3079 log_error("File corrupt");
3082 static const char* format_timestamp_safe(char *buf
, size_t l
, usec_t t
) {
3085 x
= format_timestamp(buf
, l
, t
);
3091 void journal_file_print_header(JournalFile
*f
) {
3092 char a
[33], b
[33], c
[33], d
[33];
3093 char x
[FORMAT_TIMESTAMP_MAX
], y
[FORMAT_TIMESTAMP_MAX
], z
[FORMAT_TIMESTAMP_MAX
];
3095 char bytes
[FORMAT_BYTES_MAX
];
3100 printf("File Path: %s\n"
3104 "Sequential Number ID: %s\n"
3106 "Compatible Flags:%s%s\n"
3107 "Incompatible Flags:%s%s%s\n"
3108 "Header size: %"PRIu64
"\n"
3109 "Arena size: %"PRIu64
"\n"
3110 "Data Hash Table Size: %"PRIu64
"\n"
3111 "Field Hash Table Size: %"PRIu64
"\n"
3112 "Rotate Suggested: %s\n"
3113 "Head Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3114 "Tail Sequential Number: %"PRIu64
" (%"PRIx64
")\n"
3115 "Head Realtime Timestamp: %s (%"PRIx64
")\n"
3116 "Tail Realtime Timestamp: %s (%"PRIx64
")\n"
3117 "Tail Monotonic Timestamp: %s (%"PRIx64
")\n"
3118 "Objects: %"PRIu64
"\n"
3119 "Entry Objects: %"PRIu64
"\n",
3121 sd_id128_to_string(f
->header
->file_id
, a
),
3122 sd_id128_to_string(f
->header
->machine_id
, b
),
3123 sd_id128_to_string(f
->header
->boot_id
, c
),
3124 sd_id128_to_string(f
->header
->seqnum_id
, d
),
3125 f
->header
->state
== STATE_OFFLINE
? "OFFLINE" :
3126 f
->header
->state
== STATE_ONLINE
? "ONLINE" :
3127 f
->header
->state
== STATE_ARCHIVED
? "ARCHIVED" : "UNKNOWN",
3128 JOURNAL_HEADER_SEALED(f
->header
) ? " SEALED" : "",
3129 (le32toh(f
->header
->compatible_flags
) & ~HEADER_COMPATIBLE_ANY
) ? " ???" : "",
3130 JOURNAL_HEADER_COMPRESSED_XZ(f
->header
) ? " COMPRESSED-XZ" : "",
3131 JOURNAL_HEADER_COMPRESSED_LZ4(f
->header
) ? " COMPRESSED-LZ4" : "",
3132 (le32toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_ANY
) ? " ???" : "",
3133 le64toh(f
->header
->header_size
),
3134 le64toh(f
->header
->arena_size
),
3135 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3136 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
),
3137 yes_no(journal_file_rotate_suggested(f
, 0)),
3138 le64toh(f
->header
->head_entry_seqnum
), le64toh(f
->header
->head_entry_seqnum
),
3139 le64toh(f
->header
->tail_entry_seqnum
), le64toh(f
->header
->tail_entry_seqnum
),
3140 format_timestamp_safe(x
, sizeof(x
), le64toh(f
->header
->head_entry_realtime
)), le64toh(f
->header
->head_entry_realtime
),
3141 format_timestamp_safe(y
, sizeof(y
), le64toh(f
->header
->tail_entry_realtime
)), le64toh(f
->header
->tail_entry_realtime
),
3142 format_timespan(z
, sizeof(z
), le64toh(f
->header
->tail_entry_monotonic
), USEC_PER_MSEC
), le64toh(f
->header
->tail_entry_monotonic
),
3143 le64toh(f
->header
->n_objects
),
3144 le64toh(f
->header
->n_entries
));
3146 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3147 printf("Data Objects: %"PRIu64
"\n"
3148 "Data Hash Table Fill: %.1f%%\n",
3149 le64toh(f
->header
->n_data
),
3150 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))));
3152 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3153 printf("Field Objects: %"PRIu64
"\n"
3154 "Field Hash Table Fill: %.1f%%\n",
3155 le64toh(f
->header
->n_fields
),
3156 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))));
3158 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_tags
))
3159 printf("Tag Objects: %"PRIu64
"\n",
3160 le64toh(f
->header
->n_tags
));
3161 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_entry_arrays
))
3162 printf("Entry Array Objects: %"PRIu64
"\n",
3163 le64toh(f
->header
->n_entry_arrays
));
3165 if (fstat(f
->fd
, &st
) >= 0)
3166 printf("Disk usage: %s\n", format_bytes(bytes
, sizeof(bytes
), (uint64_t) st
.st_blocks
* 512ULL));
3169 static int journal_file_warn_btrfs(JournalFile
*f
) {
3175 /* Before we write anything, check if the COW logic is turned
3176 * off on btrfs. Given our write pattern that is quite
3177 * unfriendly to COW file systems this should greatly improve
3178 * performance on COW file systems, such as btrfs, at the
3179 * expense of data integrity features (which shouldn't be too
3180 * bad, given that we do our own checksumming). */
3182 r
= btrfs_is_filesystem(f
->fd
);
3184 return log_warning_errno(r
, "Failed to determine if journal is on btrfs: %m");
3188 r
= read_attr_fd(f
->fd
, &attrs
);
3190 return log_warning_errno(r
, "Failed to read file attributes: %m");
3192 if (attrs
& FS_NOCOW_FL
) {
3193 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3197 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3198 "This is likely to slow down journal access substantially, please consider turning "
3199 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f
->path
);
3204 int journal_file_open(
3210 uint64_t compress_threshold_bytes
,
3212 JournalMetrics
*metrics
,
3213 MMapCache
*mmap_cache
,
3214 Set
*deferred_closes
,
3215 JournalFile
*template,
3216 JournalFile
**ret
) {
3218 bool newly_created
= false;
3222 char bytes
[FORMAT_BYTES_MAX
];
3225 assert(fd
>= 0 || fname
);
3227 if (!IN_SET((flags
& O_ACCMODE
), O_RDONLY
, O_RDWR
))
3230 if (fname
&& (flags
& O_CREAT
) && !endswith(fname
, ".journal"))
3233 f
= new0(JournalFile
, 1);
3241 f
->prot
= prot_from_flags(flags
);
3242 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
3244 f
->compress_lz4
= compress
;
3246 f
->compress_xz
= compress
;
3249 if (compress_threshold_bytes
== (uint64_t) -1)
3250 f
->compress_threshold_bytes
= DEFAULT_COMPRESS_THRESHOLD
;
3252 f
->compress_threshold_bytes
= MAX(MIN_COMPRESS_THRESHOLD
, compress_threshold_bytes
);
3258 log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3259 yes_no(f
->seal
), yes_no(JOURNAL_FILE_COMPRESS(f
)),
3260 format_bytes(bytes
, sizeof(bytes
), f
->compress_threshold_bytes
));
3263 f
->mmap
= mmap_cache_ref(mmap_cache
);
3265 f
->mmap
= mmap_cache_new();
3273 f
->path
= strdup(fname
);
3281 /* If we don't know the path, fill in something explanatory and vaguely useful */
3282 if (asprintf(&f
->path
, "/proc/self/%i", fd
) < 0) {
3288 f
->chain_cache
= ordered_hashmap_new(&uint64_hash_ops
);
3289 if (!f
->chain_cache
) {
3295 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3296 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3297 * it doesn't hurt in that case. */
3299 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
|O_NONBLOCK
, f
->mode
);
3305 /* fds we opened here by us should also be closed by us. */
3308 r
= fd_nonblock(f
->fd
, false);
3313 f
->cache_fd
= mmap_cache_add_fd(f
->mmap
, f
->fd
);
3319 r
= journal_file_fstat(f
);
3323 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
3325 (void) journal_file_warn_btrfs(f
);
3327 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3328 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3329 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3330 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3331 * solely on mtime/atime/ctime of the file. */
3332 (void) fd_setcrtime(f
->fd
, 0);
3335 /* Try to load the FSPRG state, and if we can't, then
3336 * just don't do sealing */
3338 r
= journal_file_fss_load(f
);
3344 r
= journal_file_init_header(f
, template);
3348 r
= journal_file_fstat(f
);
3352 newly_created
= true;
3355 if (f
->last_stat
.st_size
< (off_t
) HEADER_SIZE_MIN
) {
3360 r
= mmap_cache_get(f
->mmap
, f
->cache_fd
, f
->prot
, CONTEXT_HEADER
, true, 0, PAGE_ALIGN(sizeof(Header
)), &f
->last_stat
, &h
, NULL
);
3366 if (!newly_created
) {
3367 set_clear_with_destructor(deferred_closes
, journal_file_close
);
3369 r
= journal_file_verify_header(f
);
3375 if (!newly_created
&& f
->writable
) {
3376 r
= journal_file_fss_load(f
);
3384 journal_default_metrics(metrics
, f
->fd
);
3385 f
->metrics
= *metrics
;
3386 } else if (template)
3387 f
->metrics
= template->metrics
;
3389 r
= journal_file_refresh_header(f
);
3395 r
= journal_file_hmac_setup(f
);
3400 if (newly_created
) {
3401 r
= journal_file_setup_field_hash_table(f
);
3405 r
= journal_file_setup_data_hash_table(f
);
3410 r
= journal_file_append_first_tag(f
);
3416 if (mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
)) {
3421 if (template && template->post_change_timer
) {
3422 r
= journal_file_enable_post_change_timer(
3424 sd_event_source_get_event(template->post_change_timer
),
3425 template->post_change_timer_period
);
3431 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3438 if (f
->cache_fd
&& mmap_cache_got_sigbus(f
->mmap
, f
->cache_fd
))
3441 (void) journal_file_close(f
);
3446 int journal_file_rotate(JournalFile
**f
, bool compress
, uint64_t compress_threshold_bytes
, bool seal
, Set
*deferred_closes
) {
3447 _cleanup_free_
char *p
= NULL
;
3449 JournalFile
*old_file
, *new_file
= NULL
;
3457 if (!old_file
->writable
)
3460 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3461 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3462 if (path_startswith(old_file
->path
, "/proc/self/fd"))
3465 if (!endswith(old_file
->path
, ".journal"))
3468 l
= strlen(old_file
->path
);
3469 r
= asprintf(&p
, "%.*s@" SD_ID128_FORMAT_STR
"-%016"PRIx64
"-%016"PRIx64
".journal",
3470 (int) l
- 8, old_file
->path
,
3471 SD_ID128_FORMAT_VAL(old_file
->header
->seqnum_id
),
3472 le64toh((*f
)->header
->head_entry_seqnum
),
3473 le64toh((*f
)->header
->head_entry_realtime
));
3477 /* Try to rename the file to the archived version. If the file
3478 * already was deleted, we'll get ENOENT, let's ignore that
3480 r
= rename(old_file
->path
, p
);
3481 if (r
< 0 && errno
!= ENOENT
)
3484 /* Sync the rename to disk */
3485 (void) fsync_directory_of_file(old_file
->fd
);
3487 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3488 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3489 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3490 * would result in the rotated journal never getting fsync() called before closing.
3491 * Now we simply queue the archive state by setting an archive bit, leaving the state
3492 * as STATE_ONLINE so proper offlining occurs. */
3493 old_file
->archive
= true;
3495 /* Currently, btrfs is not very good with out write patterns
3496 * and fragments heavily. Let's defrag our journal files when
3497 * we archive them */
3498 old_file
->defrag_on_close
= true;
3500 r
= journal_file_open(-1, old_file
->path
, old_file
->flags
, old_file
->mode
, compress
,
3501 compress_threshold_bytes
, seal
, NULL
, old_file
->mmap
, deferred_closes
,
3502 old_file
, &new_file
);
3504 if (deferred_closes
&&
3505 set_put(deferred_closes
, old_file
) >= 0)
3506 (void) journal_file_set_offline(old_file
, false);
3508 (void) journal_file_close(old_file
);
3514 int journal_file_open_reliably(
3519 uint64_t compress_threshold_bytes
,
3521 JournalMetrics
*metrics
,
3522 MMapCache
*mmap_cache
,
3523 Set
*deferred_closes
,
3524 JournalFile
*template,
3525 JournalFile
**ret
) {
3529 _cleanup_free_
char *p
= NULL
;
3531 r
= journal_file_open(-1, fname
, flags
, mode
, compress
, compress_threshold_bytes
, seal
, metrics
, mmap_cache
,
3532 deferred_closes
, template, ret
);
3534 -EBADMSG
, /* Corrupted */
3535 -ENODATA
, /* Truncated */
3536 -EHOSTDOWN
, /* Other machine */
3537 -EPROTONOSUPPORT
, /* Incompatible feature */
3538 -EBUSY
, /* Unclean shutdown */
3539 -ESHUTDOWN
, /* Already archived */
3540 -EIO
, /* IO error, including SIGBUS on mmap */
3541 -EIDRM
, /* File has been deleted */
3542 -ETXTBSY
)) /* File is from the future */
3545 if ((flags
& O_ACCMODE
) == O_RDONLY
)
3548 if (!(flags
& O_CREAT
))
3551 if (!endswith(fname
, ".journal"))
3554 /* The file is corrupted. Rotate it away and try it again (but only once) */
3557 if (asprintf(&p
, "%.*s@%016"PRIx64
"-%016"PRIx64
".journal~",
3559 now(CLOCK_REALTIME
),
3563 if (rename(fname
, p
) < 0)
3566 /* btrfs doesn't cope well with our write pattern and
3567 * fragments heavily. Let's defrag all files we rotate */
3569 (void) chattr_path(p
, 0, FS_NOCOW_FL
);
3570 (void) btrfs_defrag(p
);
3572 log_warning_errno(r
, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname
);
3574 return journal_file_open(-1, fname
, flags
, mode
, compress
, compress_threshold_bytes
, seal
, metrics
, mmap_cache
,
3575 deferred_closes
, template, ret
);
3578 int journal_file_copy_entry(JournalFile
*from
, JournalFile
*to
, Object
*o
, uint64_t p
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
3580 uint64_t q
, xor_hash
= 0;
3593 ts
.monotonic
= le64toh(o
->entry
.monotonic
);
3594 ts
.realtime
= le64toh(o
->entry
.realtime
);
3596 n
= journal_file_entry_n_items(o
);
3597 /* alloca() can't take 0, hence let's allocate at least one */
3598 items
= alloca(sizeof(EntryItem
) * MAX(1u, n
));
3600 for (i
= 0; i
< n
; i
++) {
3607 q
= le64toh(o
->entry
.items
[i
].object_offset
);
3608 le_hash
= o
->entry
.items
[i
].hash
;
3610 r
= journal_file_move_to_object(from
, OBJECT_DATA
, q
, &o
);
3614 if (le_hash
!= o
->data
.hash
)
3617 l
= le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
);
3620 /* We hit the limit on 32bit machines */
3621 if ((uint64_t) t
!= l
)
3624 if (o
->object
.flags
& OBJECT_COMPRESSION_MASK
) {
3625 #if HAVE_XZ || HAVE_LZ4
3628 r
= decompress_blob(o
->object
.flags
& OBJECT_COMPRESSION_MASK
,
3629 o
->data
.payload
, l
, &from
->compress_buffer
, &from
->compress_buffer_size
, &rsize
, 0);
3633 data
= from
->compress_buffer
;
3636 return -EPROTONOSUPPORT
;
3639 data
= o
->data
.payload
;
3641 r
= journal_file_append_data(to
, data
, l
, &u
, &h
);
3645 xor_hash
^= le64toh(u
->data
.hash
);
3646 items
[i
].object_offset
= htole64(h
);
3647 items
[i
].hash
= u
->data
.hash
;
3649 r
= journal_file_move_to_object(from
, OBJECT_ENTRY
, p
, &o
);
3654 r
= journal_file_append_entry_internal(to
, &ts
, xor_hash
, items
, n
, seqnum
, ret
, offset
);
3656 if (mmap_cache_got_sigbus(to
->mmap
, to
->cache_fd
))
3662 void journal_reset_metrics(JournalMetrics
*m
) {
3665 /* Set everything to "pick automatic values". */
3667 *m
= (JournalMetrics
) {
3668 .min_use
= (uint64_t) -1,
3669 .max_use
= (uint64_t) -1,
3670 .min_size
= (uint64_t) -1,
3671 .max_size
= (uint64_t) -1,
3672 .keep_free
= (uint64_t) -1,
3673 .n_max_files
= (uint64_t) -1,
3677 void journal_default_metrics(JournalMetrics
*m
, int fd
) {
3678 char a
[FORMAT_BYTES_MAX
], b
[FORMAT_BYTES_MAX
], c
[FORMAT_BYTES_MAX
], d
[FORMAT_BYTES_MAX
], e
[FORMAT_BYTES_MAX
];
3685 if (fstatvfs(fd
, &ss
) >= 0)
3686 fs_size
= ss
.f_frsize
* ss
.f_blocks
;
3688 log_debug_errno(errno
, "Failed to determine disk size: %m");
3692 if (m
->max_use
== (uint64_t) -1) {
3695 m
->max_use
= PAGE_ALIGN(fs_size
/ 10); /* 10% of file system size */
3697 if (m
->max_use
> DEFAULT_MAX_USE_UPPER
)
3698 m
->max_use
= DEFAULT_MAX_USE_UPPER
;
3700 if (m
->max_use
< DEFAULT_MAX_USE_LOWER
)
3701 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3703 m
->max_use
= DEFAULT_MAX_USE_LOWER
;
3705 m
->max_use
= PAGE_ALIGN(m
->max_use
);
3707 if (m
->max_use
!= 0 && m
->max_use
< JOURNAL_FILE_SIZE_MIN
*2)
3708 m
->max_use
= JOURNAL_FILE_SIZE_MIN
*2;
3711 if (m
->min_use
== (uint64_t) -1)
3712 m
->min_use
= DEFAULT_MIN_USE
;
3714 if (m
->min_use
> m
->max_use
)
3715 m
->min_use
= m
->max_use
;
3717 if (m
->max_size
== (uint64_t) -1) {
3718 m
->max_size
= PAGE_ALIGN(m
->max_use
/ 8); /* 8 chunks */
3720 if (m
->max_size
> DEFAULT_MAX_SIZE_UPPER
)
3721 m
->max_size
= DEFAULT_MAX_SIZE_UPPER
;
3723 m
->max_size
= PAGE_ALIGN(m
->max_size
);
3725 if (m
->max_size
!= 0) {
3726 if (m
->max_size
< JOURNAL_FILE_SIZE_MIN
)
3727 m
->max_size
= JOURNAL_FILE_SIZE_MIN
;
3729 if (m
->max_use
!= 0 && m
->max_size
*2 > m
->max_use
)
3730 m
->max_use
= m
->max_size
*2;
3733 if (m
->min_size
== (uint64_t) -1)
3734 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3736 m
->min_size
= PAGE_ALIGN(m
->min_size
);
3738 if (m
->min_size
< JOURNAL_FILE_SIZE_MIN
)
3739 m
->min_size
= JOURNAL_FILE_SIZE_MIN
;
3741 if (m
->max_size
!= 0 && m
->min_size
> m
->max_size
)
3742 m
->max_size
= m
->min_size
;
3745 if (m
->keep_free
== (uint64_t) -1) {
3748 m
->keep_free
= PAGE_ALIGN(fs_size
* 3 / 20); /* 15% of file system size */
3750 if (m
->keep_free
> DEFAULT_KEEP_FREE_UPPER
)
3751 m
->keep_free
= DEFAULT_KEEP_FREE_UPPER
;
3754 m
->keep_free
= DEFAULT_KEEP_FREE
;
3757 if (m
->n_max_files
== (uint64_t) -1)
3758 m
->n_max_files
= DEFAULT_N_MAX_FILES
;
3760 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64
,
3761 format_bytes(a
, sizeof(a
), m
->min_use
),
3762 format_bytes(b
, sizeof(b
), m
->max_use
),
3763 format_bytes(c
, sizeof(c
), m
->max_size
),
3764 format_bytes(d
, sizeof(d
), m
->min_size
),
3765 format_bytes(e
, sizeof(e
), m
->keep_free
),
3769 int journal_file_get_cutoff_realtime_usec(JournalFile
*f
, usec_t
*from
, usec_t
*to
) {
3775 if (f
->header
->head_entry_realtime
== 0)
3778 *from
= le64toh(f
->header
->head_entry_realtime
);
3782 if (f
->header
->tail_entry_realtime
== 0)
3785 *to
= le64toh(f
->header
->tail_entry_realtime
);
3791 int journal_file_get_cutoff_monotonic_usec(JournalFile
*f
, sd_id128_t boot_id
, usec_t
*from
, usec_t
*to
) {
3799 r
= find_data_object_by_boot_id(f
, boot_id
, &o
, &p
);
3803 if (le64toh(o
->data
.n_entries
) <= 0)
3807 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, le64toh(o
->data
.entry_offset
), &o
);
3811 *from
= le64toh(o
->entry
.monotonic
);
3815 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
3819 r
= generic_array_get_plus_one(f
,
3820 le64toh(o
->data
.entry_offset
),
3821 le64toh(o
->data
.entry_array_offset
),
3822 le64toh(o
->data
.n_entries
)-1,
3827 *to
= le64toh(o
->entry
.monotonic
);
3833 bool journal_file_rotate_suggested(JournalFile
*f
, usec_t max_file_usec
) {
3837 /* If we gained new header fields we gained new features,
3838 * hence suggest a rotation */
3839 if (le64toh(f
->header
->header_size
) < sizeof(Header
)) {
3840 log_debug("%s uses an outdated header, suggesting rotation.", f
->path
);
3844 /* Let's check if the hash tables grew over a certain fill
3845 * level (75%, borrowing this value from Java's hash table
3846 * implementation), and if so suggest a rotation. To calculate
3847 * the fill level we need the n_data field, which only exists
3848 * in newer versions. */
3850 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
))
3851 if (le64toh(f
->header
->n_data
) * 4ULL > (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3852 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items, %llu file size, %"PRIu64
" bytes per hash table item), suggesting rotation.",
3854 100.0 * (double) le64toh(f
->header
->n_data
) / ((double) (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
))),
3855 le64toh(f
->header
->n_data
),
3856 le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
),
3857 (unsigned long long) f
->last_stat
.st_size
,
3858 f
->last_stat
.st_size
/ le64toh(f
->header
->n_data
));
3862 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
))
3863 if (le64toh(f
->header
->n_fields
) * 4ULL > (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
)) * 3ULL) {
3864 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64
" of %"PRIu64
" items), suggesting rotation.",
3866 100.0 * (double) le64toh(f
->header
->n_fields
) / ((double) (le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
))),
3867 le64toh(f
->header
->n_fields
),
3868 le64toh(f
->header
->field_hash_table_size
) / sizeof(HashItem
));
3872 /* Are the data objects properly indexed by field objects? */
3873 if (JOURNAL_HEADER_CONTAINS(f
->header
, n_data
) &&
3874 JOURNAL_HEADER_CONTAINS(f
->header
, n_fields
) &&
3875 le64toh(f
->header
->n_data
) > 0 &&
3876 le64toh(f
->header
->n_fields
) == 0)
3879 if (max_file_usec
> 0) {
3882 h
= le64toh(f
->header
->head_entry_realtime
);
3883 t
= now(CLOCK_REALTIME
);
3885 if (h
> 0 && t
> h
+ max_file_usec
)