1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
35 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
36 #define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)
38 #define DEFAULT_WINDOW_SIZE (128ULL*1024ULL*1024ULL)
40 #define COMPRESSION_SIZE_THRESHOLD (64ULL)
42 static const char signature
[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
44 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
/* Tears down a JournalFile: marks the on-disk header offline when the file
 * was writable, unmaps the mapping windows, closes the file descriptor and
 * releases the compression scratch buffer.
 *
 * NOTE(review): several lines of this function are absent from this
 * listing; any cleanup not shown below is therefore not documented here. */
46 void journal_file_close(JournalFile
*f
) {
/* Only touch the header if it is mapped and the file was opened writable. */
51 if (f
->header
&& f
->writable
)
52 f
->header
->state
= STATE_OFFLINE
;
/* Unmap every window slot that currently holds a mapping. */
55 for (t
= 0; t
< _WINDOW_MAX
; t
++)
56 if (f
->windows
[t
].ptr
)
57 munmap(f
->windows
[t
].ptr
, f
->windows
[t
].size
);
60 close_nointr_nofail(f
->fd
);
65 free(f
->compress_buffer
);
71 static int journal_file_init_header(JournalFile
*f
, JournalFile
*template) {
79 memcpy(h
.signature
, signature
, 8);
80 h
.arena_offset
= htole64(ALIGN64(sizeof(h
)));
82 r
= sd_id128_randomize(&h
.file_id
);
87 h
.seqnum_id
= template->header
->seqnum_id
;
88 h
.seqnum
= template->header
->seqnum
;
90 h
.seqnum_id
= h
.file_id
;
92 k
= pwrite(f
->fd
, &h
, sizeof(h
), 0);
/* Updates the mapped header for the current machine and boot: stores the
 * machine ID, records the current boot ID and flags the file as online.
 *
 * NOTE(review): the checks of 'r' after each sd_id128_*() call and the
 * return statement are not part of this listing. */
102 static int journal_file_refresh_header(JournalFile
*f
) {
/* The machine ID is written straight into the header. */
108 r
= sd_id128_get_machine(&f
->header
->machine_id
);
112 r
= sd_id128_get_boot(&boot_id
);
/* If the header already carries the current boot ID, the stored tail
 * monotonic timestamp is marked as valid. */
116 if (sd_id128_equal(boot_id
, f
->header
->boot_id
))
117 f
->tail_entry_monotonic_valid
= true;
119 f
->header
->boot_id
= boot_id
;
121 f
->header
->state
= STATE_ONLINE
;
125 static int journal_file_verify_header(JournalFile
*f
) {
128 if (memcmp(f
->header
, signature
, 8))
132 if ((le64toh(f
->header
->incompatible_flags
) & ~HEADER_INCOMPATIBLE_COMPRESSED
) != 0)
133 return -EPROTONOSUPPORT
;
135 if (f
->header
->incompatible_flags
!= 0)
136 return -EPROTONOSUPPORT
;
139 if ((uint64_t) f
->last_stat
.st_size
< (le64toh(f
->header
->arena_offset
) + le64toh(f
->header
->arena_size
)))
144 sd_id128_t machine_id
;
147 r
= sd_id128_get_machine(&machine_id
);
151 if (!sd_id128_equal(machine_id
, f
->header
->machine_id
))
154 state
= f
->header
->state
;
156 if (state
== STATE_ONLINE
)
157 log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f
->path
);
158 else if (state
== STATE_ARCHIVED
)
160 else if (state
!= STATE_OFFLINE
)
161 log_debug("Journal file %s has unknown state %u. Ignoring.", f
->path
, state
);
167 static int journal_file_allocate(JournalFile
*f
, uint64_t offset
, uint64_t size
) {
168 uint64_t old_size
, new_size
;
172 /* We assume that this file is not sparse, and we know that
173 * for sure, since we always call posix_fallocate()
177 le64toh(f
->header
->arena_offset
) +
178 le64toh(f
->header
->arena_size
);
180 new_size
= PAGE_ALIGN(offset
+ size
);
181 if (new_size
< le64toh(f
->header
->arena_offset
))
182 new_size
= le64toh(f
->header
->arena_offset
);
184 if (new_size
<= old_size
)
187 if (f
->metrics
.max_size
> 0 &&
188 new_size
> f
->metrics
.max_size
)
191 if (new_size
> f
->metrics
.min_size
&&
192 f
->metrics
.keep_free
> 0) {
195 if (fstatvfs(f
->fd
, &svfs
) >= 0) {
198 available
= svfs
.f_bfree
* svfs
.f_bsize
;
200 if (available
>= f
->metrics
.keep_free
)
201 available
-= f
->metrics
.keep_free
;
205 if (new_size
- old_size
> available
)
210 /* Note that the glibc fallocate() fallback is very
211 inefficient, hence we try to minimize the allocation area
213 if (posix_fallocate(f
->fd
, old_size
, new_size
- old_size
) < 0)
216 if (fstat(f
->fd
, &f
->last_stat
) < 0)
219 f
->header
->arena_size
= new_size
- htole64(f
->header
->arena_offset
);
224 static int journal_file_map(
233 uint64_t woffset
, wsize
;
240 woffset
= offset
& ~((uint64_t) page_size() - 1ULL);
241 wsize
= size
+ (offset
- woffset
);
242 wsize
= PAGE_ALIGN(wsize
);
244 /* Avoid SIGBUS on invalid accesses */
245 if (woffset
+ wsize
> (uint64_t) PAGE_ALIGN(f
->last_stat
.st_size
))
246 return -EADDRNOTAVAIL
;
248 window
= mmap(NULL
, wsize
, f
->prot
, MAP_SHARED
, f
->fd
, woffset
);
249 if (window
== MAP_FAILED
)
261 *ret
= (uint8_t*) window
+ (offset
- woffset
);
266 static int journal_file_move_to(JournalFile
*f
, int wt
, uint64_t offset
, uint64_t size
, void **ret
) {
275 assert(wt
< _WINDOW_MAX
);
279 if (_likely_(w
->ptr
&&
280 w
->offset
<= offset
&&
281 w
->offset
+ w
->size
>= offset
+ size
)) {
283 *ret
= (uint8_t*) w
->ptr
+ (offset
- w
->offset
);
288 if (munmap(w
->ptr
, w
->size
) < 0)
292 w
->size
= w
->offset
= 0;
295 if (size
< DEFAULT_WINDOW_SIZE
) {
296 /* If the default window size is larger than what was
297 * asked for extend the mapping a bit in the hope to
298 * minimize needed remappings later on. We add half
299 * the window space before and half behind the
300 * requested mapping */
302 delta
= PAGE_ALIGN((DEFAULT_WINDOW_SIZE
- size
) / 2);
308 size
+= (DEFAULT_WINDOW_SIZE
- delta
);
312 if (offset
> (uint64_t) f
->last_stat
.st_size
)
313 return -EADDRNOTAVAIL
;
315 if (offset
+ size
> (uint64_t) f
->last_stat
.st_size
)
316 size
= PAGE_ALIGN((uint64_t) f
->last_stat
.st_size
- offset
);
319 return -EADDRNOTAVAIL
;
321 r
= journal_file_map(f
,
323 &w
->ptr
, &w
->offset
, &w
->size
,
329 *ret
= (uint8_t*) p
+ delta
;
/* Recomputes the payload hash of an object and pairs it with the hash stored
 * in the object: h1 is the stored (little-endian) hash, h2 is hash64() over
 * the payload, whose length is the object size minus the payload offset.
 *
 * Handled here: uncompressed OBJECT_DATA objects and OBJECT_FIELD objects.
 * Compressed data objects do not enter the first branch.
 *
 * NOTE(review): the fallback branch and the final comparison/return are
 * missing from this listing. Nothing visible checks that object.size is at
 * least the payload offset before the subtraction. */
333 static bool verify_hash(Object
*o
) {
338 if (o
->object
.type
== OBJECT_DATA
&& !(o
->object
.flags
& OBJECT_COMPRESSED
)) {
339 h1
= le64toh(o
->data
.hash
);
340 h2
= hash64(o
->data
.payload
, le64toh(o
->object
.size
) - offsetof(Object
, data
.payload
));
341 } else if (o
->object
.type
== OBJECT_FIELD
) {
342 h1
= le64toh(o
->field
.hash
);
343 h2
= hash64(o
->field
.payload
, le64toh(o
->object
.size
) - offsetof(Object
, field
.payload
));
350 int journal_file_move_to_object(JournalFile
*f
, int type
, uint64_t offset
, Object
**ret
) {
358 assert(type
< _OBJECT_TYPE_MAX
);
360 r
= journal_file_move_to(f
, type
>= 0 ? type
: WINDOW_UNKNOWN
, offset
, sizeof(ObjectHeader
), &t
);
365 s
= le64toh(o
->object
.size
);
367 if (s
< sizeof(ObjectHeader
))
370 if (type
>= 0 && o
->object
.type
!= type
)
373 if (s
> sizeof(ObjectHeader
)) {
374 r
= journal_file_move_to(f
, o
->object
.type
, offset
, s
, &t
);
/* Produces the next sequence number for this file: starts from the header's
 * seqnum plus one, writes the result back to the header, and also stores it
 * as first_seqnum if that field is still zero.
 *
 * 'seqnum' is an optional external counter; per the comment below it is
 * reconciled with the local value, but the statements doing so are missing
 * from this listing, as is the return statement. */
388 static uint64_t journal_file_seqnum(JournalFile
*f
, uint64_t *seqnum
) {
393 r
= le64toh(f
->header
->seqnum
) + 1;
396 /* If an external seqnum counter was passed, we update
397 * both the local and the external one, and set it to
398 * the maximum of both */
/* Header fields are kept little-endian, hence the htole64() on store. */
406 f
->header
->seqnum
= htole64(r
);
/* A zero first_seqnum is treated as "not set yet". */
408 if (f
->header
->first_seqnum
== 0)
409 f
->header
->first_seqnum
= htole64(r
);
414 static int journal_file_append_object(JournalFile
*f
, int type
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
421 assert(size
>= sizeof(ObjectHeader
));
425 p
= le64toh(f
->header
->tail_object_offset
);
427 p
= le64toh(f
->header
->arena_offset
);
429 r
= journal_file_move_to_object(f
, -1, p
, &tail
);
433 p
+= ALIGN64(le64toh(tail
->object
.size
));
436 r
= journal_file_allocate(f
, p
, size
);
440 r
= journal_file_move_to(f
, type
, p
, size
, &t
);
447 o
->object
.type
= type
;
448 o
->object
.size
= htole64(size
);
450 f
->header
->tail_object_offset
= htole64(p
);
451 f
->header
->n_objects
= htole64(le64toh(f
->header
->n_objects
) + 1);
459 static int journal_file_setup_data_hash_table(JournalFile
*f
) {
466 s
= DEFAULT_DATA_HASH_TABLE_SIZE
;
467 r
= journal_file_append_object(f
,
468 OBJECT_DATA_HASH_TABLE
,
469 offsetof(Object
, hash_table
.items
) + s
,
474 memset(o
->hash_table
.items
, 0, s
);
476 f
->header
->data_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
477 f
->header
->data_hash_table_size
= htole64(s
);
482 static int journal_file_setup_field_hash_table(JournalFile
*f
) {
489 s
= DEFAULT_FIELD_HASH_TABLE_SIZE
;
490 r
= journal_file_append_object(f
,
491 OBJECT_FIELD_HASH_TABLE
,
492 offsetof(Object
, hash_table
.items
) + s
,
497 memset(o
->hash_table
.items
, 0, s
);
499 f
->header
->field_hash_table_offset
= htole64(p
+ offsetof(Object
, hash_table
.items
));
500 f
->header
->field_hash_table_size
= htole64(s
);
505 static int journal_file_map_data_hash_table(JournalFile
*f
) {
512 p
= le64toh(f
->header
->data_hash_table_offset
);
513 s
= le64toh(f
->header
->data_hash_table_size
);
515 r
= journal_file_move_to(f
,
516 WINDOW_DATA_HASH_TABLE
,
522 f
->data_hash_table
= t
;
526 static int journal_file_map_field_hash_table(JournalFile
*f
) {
533 p
= le64toh(f
->header
->field_hash_table_offset
);
534 s
= le64toh(f
->header
->field_hash_table_size
);
536 r
= journal_file_move_to(f
,
537 WINDOW_FIELD_HASH_TABLE
,
543 f
->field_hash_table
= t
;
547 static int journal_file_link_data(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t hash
) {
554 assert(o
->object
.type
== OBJECT_DATA
);
556 o
->data
.next_hash_offset
= o
->data
.next_field_offset
= 0;
557 o
->data
.entry_offset
= o
->data
.entry_array_offset
= 0;
558 o
->data
.n_entries
= 0;
560 h
= hash
% (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
));
561 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
563 /* Only entry in the hash table is easy */
564 f
->data_hash_table
[h
].head_hash_offset
= htole64(offset
);
566 /* Temporarily move back to the previous data object,
567 * to patch in pointer */
569 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
573 o
->data
.next_hash_offset
= htole64(offset
);
575 r
= journal_file_move_to_object(f
, OBJECT_DATA
, offset
, &o
);
580 f
->data_hash_table
[h
].tail_hash_offset
= htole64(offset
);
585 int journal_file_find_data_object_with_hash(
587 const void *data
, uint64_t size
, uint64_t hash
,
588 Object
**ret
, uint64_t *offset
) {
589 uint64_t p
, osize
, h
;
593 assert(data
|| size
== 0);
595 osize
= offsetof(Object
, data
.payload
) + size
;
597 if (f
->header
->data_hash_table_size
== 0)
600 h
= hash
% (le64toh(f
->header
->data_hash_table_size
) / sizeof(HashItem
));
601 p
= le64toh(f
->data_hash_table
[h
].head_hash_offset
);
606 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
610 if (le64toh(o
->data
.hash
) != hash
)
613 if (o
->object
.flags
& OBJECT_COMPRESSED
) {
617 l
= le64toh(o
->object
.size
);
618 if (l
<= offsetof(Object
, data
.payload
))
621 l
-= offsetof(Object
, data
.payload
);
623 if (!uncompress_blob(o
->data
.payload
, l
, &f
->compress_buffer
, &f
->compress_buffer_size
, &rsize
))
627 memcmp(f
->compress_buffer
, data
, size
) == 0) {
638 return -EPROTONOSUPPORT
;
641 } else if (le64toh(o
->object
.size
) == osize
&&
642 memcmp(o
->data
.payload
, data
, size
) == 0) {
654 p
= le64toh(o
->data
.next_hash_offset
);
660 int journal_file_find_data_object(
662 const void *data
, uint64_t size
,
663 Object
**ret
, uint64_t *offset
) {
668 assert(data
|| size
== 0);
670 hash
= hash64(data
, size
);
672 return journal_file_find_data_object_with_hash(f
,
677 static int journal_file_append_data(JournalFile
*f
, const void *data
, uint64_t size
, Object
**ret
, uint64_t *offset
) {
682 bool compressed
= false;
685 assert(data
|| size
== 0);
687 hash
= hash64(data
, size
);
689 r
= journal_file_find_data_object_with_hash(f
, data
, size
, hash
, &o
, &p
);
703 osize
= offsetof(Object
, data
.payload
) + size
;
704 r
= journal_file_append_object(f
, OBJECT_DATA
, osize
, &o
, &p
);
708 o
->data
.hash
= htole64(hash
);
712 size
>= COMPRESSION_SIZE_THRESHOLD
) {
715 compressed
= compress_blob(data
, size
, o
->data
.payload
, &rsize
);
718 o
->object
.size
= htole64(offsetof(Object
, data
.payload
) + rsize
);
719 o
->object
.flags
|= OBJECT_COMPRESSED
;
721 f
->header
->incompatible_flags
= htole32(le32toh(f
->header
->incompatible_flags
) | HEADER_INCOMPATIBLE_COMPRESSED
);
723 log_debug("Compressed data object %lu -> %lu", (unsigned long) size
, (unsigned long) rsize
);
729 memcpy(o
->data
.payload
, data
, size
);
731 r
= journal_file_link_data(f
, o
, p
, hash
);
/* Returns the number of EntryItem slots held by an entry object: the
 * object's little-endian size, minus the offset of entry.items within
 * Object, divided by sizeof(EntryItem).
 *
 * NOTE(review): the assert compares object.type against
 * htole64(OBJECT_ENTRY), while other code in this file compares object.type
 * with the OBJECT_* constants directly. If the type field is narrower than
 * 64 bits, the byte swap would make this check fail on big-endian hosts --
 * confirm against the Object definition.
 *
 * NOTE(review): nothing visible here verifies that object.size is at least
 * offsetof(Object, entry.items); a smaller size would wrap the unsigned
 * subtraction. */
744 uint64_t journal_file_entry_n_items(Object
*o
) {
746 assert(o
->object
.type
== htole64(OBJECT_ENTRY
));
748 return (le64toh(o
->object
.size
) - offsetof(Object
, entry
.items
)) / sizeof(EntryItem
);
/* Returns the number of uint64_t slots held by an entry array object: the
 * object's little-endian size, minus the offset of entry_array.items within
 * Object, divided by sizeof(uint64_t).
 *
 * NOTE(review): the assert compares object.type against
 * htole64(OBJECT_ENTRY_ARRAY), while other code in this file compares
 * object.type with the OBJECT_* constants directly. If the type field is
 * narrower than 64 bits, the byte swap would make this check fail on
 * big-endian hosts -- confirm against the Object definition.
 *
 * NOTE(review): nothing visible here verifies that object.size is at least
 * offsetof(Object, entry_array.items) before the unsigned subtraction. */
751 static uint64_t journal_file_entry_array_n_items(Object
*o
) {
753 assert(o
->object
.type
== htole64(OBJECT_ENTRY_ARRAY
));
755 return (le64toh(o
->object
.size
) - offsetof(Object
, entry_array
.items
)) / sizeof(uint64_t);
758 static int link_entry_into_array(JournalFile
*f
,
763 uint64_t n
= 0, ap
= 0, q
, i
, a
, hidx
;
772 i
= hidx
= le64toh(*idx
);
775 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
779 n
= journal_file_entry_array_n_items(o
);
781 o
->entry_array
.items
[i
] = htole64(p
);
782 *idx
= htole64(hidx
+ 1);
788 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
799 r
= journal_file_append_object(f
, OBJECT_ENTRY_ARRAY
,
800 offsetof(Object
, entry_array
.items
) + n
* sizeof(uint64_t),
805 o
->entry_array
.items
[i
] = htole64(p
);
810 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, ap
, &o
);
814 o
->entry_array
.next_entry_array_offset
= htole64(q
);
817 *idx
= htole64(hidx
+ 1);
822 static int link_entry_into_array_plus_one(JournalFile
*f
,
841 i
= le64toh(*idx
) - 1;
842 r
= link_entry_into_array(f
, first
, &i
, p
);
847 *idx
= htole64(le64toh(*idx
) + 1);
851 static int journal_file_link_entry_item(JournalFile
*f
, Object
*o
, uint64_t offset
, uint64_t i
) {
858 p
= le64toh(o
->entry
.items
[i
].object_offset
);
862 r
= journal_file_move_to_object(f
, OBJECT_DATA
, p
, &o
);
866 return link_entry_into_array_plus_one(f
,
867 &o
->data
.entry_offset
,
868 &o
->data
.entry_array_offset
,
873 static int journal_file_link_entry(JournalFile
*f
, Object
*o
, uint64_t offset
) {
880 assert(o
->object
.type
== OBJECT_ENTRY
);
882 /* Link up the entry itself */
883 r
= link_entry_into_array(f
,
884 &f
->header
->entry_array_offset
,
885 &f
->header
->n_entries
,
890 log_error("=> %s seqnr=%lu n_entries=%lu", f
->path
, (unsigned long) o
->entry
.seqnum
, (unsigned long) f
->header
->n_entries
);
892 if (f
->header
->head_entry_realtime
== 0)
893 f
->header
->head_entry_realtime
= o
->entry
.realtime
;
895 f
->header
->tail_entry_realtime
= o
->entry
.realtime
;
896 f
->header
->tail_entry_monotonic
= o
->entry
.monotonic
;
898 f
->tail_entry_monotonic_valid
= true;
900 /* Link up the items */
901 n
= journal_file_entry_n_items(o
);
902 for (i
= 0; i
< n
; i
++) {
903 r
= journal_file_link_entry_item(f
, o
, offset
, i
);
911 static int journal_file_append_entry_internal(
913 const dual_timestamp
*ts
,
915 const EntryItem items
[], unsigned n_items
,
917 Object
**ret
, uint64_t *offset
) {
924 assert(items
|| n_items
== 0);
927 osize
= offsetof(Object
, entry
.items
) + (n_items
* sizeof(EntryItem
));
929 r
= journal_file_append_object(f
, OBJECT_ENTRY
, osize
, &o
, &np
);
933 o
->entry
.seqnum
= htole64(journal_file_seqnum(f
, seqnum
));
934 memcpy(o
->entry
.items
, items
, n_items
* sizeof(EntryItem
));
935 o
->entry
.realtime
= htole64(ts
->realtime
);
936 o
->entry
.monotonic
= htole64(ts
->monotonic
);
937 o
->entry
.xor_hash
= htole64(xor_hash
);
938 o
->entry
.boot_id
= f
->header
->boot_id
;
940 r
= journal_file_link_entry(f
, o
, np
);
/* Called after the file has been modified (journal_file_append_entry()
 * invokes it once an entry has been appended). Issues a full memory barrier
 * and then ftruncate()s the file to the size recorded in last_stat, i.e. to
 * its current size. A failing ftruncate() is only logged. */
953 static void journal_file_post_change(JournalFile
*f
) {
956 /* inotify() does not receive IN_MODIFY events from file
957 * accesses done via mmap(). After each access we hence
958 * trigger IN_MODIFY by truncating the journal file to its
959 * current size which triggers IN_MODIFY. */
961 __sync_synchronize();
963 if (ftruncate(f
->fd
, f
->last_stat
.st_size
) < 0)
/* NOTE(review): the message below contains a duplicated word ("to to"). */
964 log_error("Failed to to truncate file to its own size: %m");
967 int journal_file_append_entry(JournalFile
*f
, const dual_timestamp
*ts
, const struct iovec iovec
[], unsigned n_iovec
, uint64_t *seqnum
, Object
**ret
, uint64_t *offset
) {
971 uint64_t xor_hash
= 0;
972 struct dual_timestamp _ts
;
975 assert(iovec
|| n_iovec
== 0);
981 dual_timestamp_get(&_ts
);
985 if (f
->tail_entry_monotonic_valid
&&
986 ts
->monotonic
< le64toh(f
->header
->tail_entry_monotonic
))
989 if (ts
->realtime
< le64toh(f
->header
->tail_entry_realtime
))
992 items
= new(EntryItem
, n_iovec
);
996 for (i
= 0; i
< n_iovec
; i
++) {
1000 r
= journal_file_append_data(f
, iovec
[i
].iov_base
, iovec
[i
].iov_len
, &o
, &p
);
1004 xor_hash
^= le64toh(o
->data
.hash
);
1005 items
[i
].object_offset
= htole64(p
);
1006 items
[i
].hash
= o
->data
.hash
;
1009 r
= journal_file_append_entry_internal(f
, ts
, xor_hash
, items
, n_iovec
, seqnum
, ret
, offset
);
1011 journal_file_post_change(f
);
1019 static int generic_array_get(JournalFile
*f
,
1022 Object
**ret
, uint64_t *offset
) {
1034 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &o
);
1038 n
= journal_file_entry_array_n_items(o
);
1040 p
= le64toh(o
->entry_array
.items
[i
]);
1045 a
= le64toh(o
->entry_array
.next_entry_array_offset
);
1048 if (a
<= 0 || p
<= 0)
1051 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1064 static int generic_array_get_plus_one(JournalFile
*f
,
1068 Object
**ret
, uint64_t *offset
) {
1077 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1090 return generic_array_get(f
, first
, i
-1, ret
, offset
);
1099 static int generic_array_bisect(JournalFile
*f
,
1103 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1104 direction_t direction
,
1109 uint64_t a
, p
, t
= 0, i
= 0, last_p
= 0;
1110 bool subtract_one
= false;
1111 Object
*o
, *array
= NULL
;
1115 assert(test_object
);
1119 uint64_t left
, right
, k
, lp
;
1121 r
= journal_file_move_to_object(f
, OBJECT_ENTRY_ARRAY
, a
, &array
);
1125 k
= journal_file_entry_array_n_items(array
);
1131 lp
= p
= le64toh(array
->entry_array
.items
[i
]);
1135 r
= test_object(f
, p
, needle
);
1139 if (r
== TEST_FOUND
)
1140 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1142 if (r
== TEST_RIGHT
) {
1146 if (left
== right
) {
1147 if (direction
== DIRECTION_UP
)
1148 subtract_one
= true;
1154 assert(left
< right
);
1156 i
= (left
+ right
) / 2;
1157 p
= le64toh(array
->entry_array
.items
[i
]);
1161 r
= test_object(f
, p
, needle
);
1165 if (r
== TEST_FOUND
)
1166 r
= direction
== DIRECTION_DOWN
? TEST_RIGHT
: TEST_LEFT
;
1168 if (r
== TEST_RIGHT
)
1182 a
= le64toh(array
->entry_array
.next_entry_array_offset
);
1188 if (subtract_one
&& t
== 0 && i
== 0)
1191 if (subtract_one
&& i
== 0)
1193 else if (subtract_one
)
1194 p
= le64toh(array
->entry_array
.items
[i
-1]);
1196 p
= le64toh(array
->entry_array
.items
[i
]);
1198 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1209 *idx
= t
+ i
- (subtract_one
? 1 : 0);
1214 static int generic_array_bisect_plus_one(JournalFile
*f
,
1219 int (*test_object
)(JournalFile
*f
, uint64_t p
, uint64_t needle
),
1220 direction_t direction
,
1228 assert(test_object
);
1233 /* This bisects the array in object 'first', but first checks
1235 r
= test_object(f
, extra
, needle
);
1238 else if (r
== TEST_FOUND
) {
1241 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, extra
, &o
);
1255 } else if (r
== TEST_RIGHT
)
1258 r
= generic_array_bisect(f
, first
, n
-1, needle
, test_object
, direction
, ret
, offset
, idx
);
1266 static int test_object_seqnum(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1273 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1277 if (le64toh(o
->entry
.seqnum
) == needle
)
1279 else if (le64toh(o
->entry
.seqnum
) < needle
)
1285 int journal_file_move_to_entry_by_seqnum(
1288 direction_t direction
,
1292 return generic_array_bisect(f
,
1293 le64toh(f
->header
->entry_array_offset
),
1294 le64toh(f
->header
->n_entries
),
1301 static int test_object_realtime(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1308 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1312 if (le64toh(o
->entry
.realtime
) == needle
)
1314 else if (le64toh(o
->entry
.realtime
) < needle
)
1320 int journal_file_move_to_entry_by_realtime(
1323 direction_t direction
,
1327 return generic_array_bisect(f
,
1328 le64toh(f
->header
->entry_array_offset
),
1329 le64toh(f
->header
->n_entries
),
1331 test_object_realtime
,
1336 static int test_object_monotonic(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1343 r
= journal_file_move_to_object(f
, OBJECT_ENTRY
, p
, &o
);
1347 if (le64toh(o
->entry
.monotonic
) == needle
)
1349 else if (le64toh(o
->entry
.monotonic
) < needle
)
1355 int journal_file_move_to_entry_by_monotonic(
1359 direction_t direction
,
1363 char t
[8+32+1] = "_BOOT_ID=";
1367 sd_id128_to_string(boot_id
, t
+ 8);
1369 r
= journal_file_find_data_object(f
, t
, strlen(t
), &o
, NULL
);
1375 return generic_array_bisect_plus_one(f
,
1376 le64toh(o
->data
.entry_offset
),
1377 le64toh(o
->data
.entry_array_offset
),
1378 le64toh(o
->data
.n_entries
),
1380 test_object_monotonic
,
1385 static int test_object_offset(JournalFile
*f
, uint64_t p
, uint64_t needle
) {
1391 else if (p
< needle
)
1397 int journal_file_next_entry(
1399 Object
*o
, uint64_t p
,
1400 direction_t direction
,
1401 Object
**ret
, uint64_t *offset
) {
1407 assert(p
> 0 || !o
);
1409 n
= le64toh(f
->header
->n_entries
);
1414 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
1416 if (o
->object
.type
!= OBJECT_ENTRY
)
1419 r
= generic_array_bisect(f
,
1420 le64toh(f
->header
->entry_array_offset
),
1421 le64toh(f
->header
->n_entries
),
1430 if (direction
== DIRECTION_DOWN
) {
1443 /* And jump to it */
1444 return generic_array_get(f
,
1445 le64toh(f
->header
->entry_array_offset
),
1450 int journal_file_skip_entry(
1452 Object
*o
, uint64_t p
,
1454 Object
**ret
, uint64_t *offset
) {
1463 if (o
->object
.type
!= OBJECT_ENTRY
)
1466 r
= generic_array_bisect(f
,
1467 le64toh(f
->header
->entry_array_offset
),
1468 le64toh(f
->header
->n_entries
),
1477 /* Calculate new index */
1479 if ((uint64_t) -skip
>= i
)
1482 i
= i
- (uint64_t) -skip
;
1484 i
+= (uint64_t) skip
;
1486 n
= le64toh(f
->header
->n_entries
);
1493 return generic_array_get(f
,
1494 le64toh(f
->header
->entry_array_offset
),
1499 int journal_file_next_entry_for_data(
1501 Object
*o
, uint64_t p
,
1502 uint64_t data_offset
,
1503 direction_t direction
,
1504 Object
**ret
, uint64_t *offset
) {
1511 assert(p
> 0 || !o
);
1513 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
1517 n
= le64toh(d
->data
.n_entries
);
1522 i
= direction
== DIRECTION_DOWN
? 0 : n
- 1;
1524 if (o
->object
.type
!= OBJECT_ENTRY
)
1527 r
= generic_array_bisect_plus_one(f
,
1528 le64toh(d
->data
.entry_offset
),
1529 le64toh(d
->data
.entry_array_offset
),
1530 le64toh(d
->data
.n_entries
),
1540 if (direction
== DIRECTION_DOWN
) {
1554 return generic_array_get_plus_one(f
,
1555 le64toh(d
->data
.entry_offset
),
1556 le64toh(d
->data
.entry_array_offset
),
1561 int journal_file_move_to_entry_by_seqnum_for_data(
1563 uint64_t data_offset
,
1565 direction_t direction
,
1566 Object
**ret
, uint64_t *offset
) {
1571 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
1575 return generic_array_bisect_plus_one(f
,
1576 le64toh(d
->data
.entry_offset
),
1577 le64toh(d
->data
.entry_array_offset
),
1578 le64toh(d
->data
.n_entries
),
1585 int journal_file_move_to_entry_by_realtime_for_data(
1587 uint64_t data_offset
,
1589 direction_t direction
,
1590 Object
**ret
, uint64_t *offset
) {
1595 r
= journal_file_move_to_object(f
, OBJECT_DATA
, data_offset
, &d
);
1599 return generic_array_bisect_plus_one(f
,
1600 le64toh(d
->data
.entry_offset
),
1601 le64toh(d
->data
.entry_array_offset
),
1602 le64toh(d
->data
.n_entries
),
1604 test_object_realtime
,
1609 void journal_file_dump(JournalFile
*f
) {
1610 char a
[33], b
[33], c
[33];
1617 printf("File Path: %s\n"
1621 "Arena size: %llu\n"
1625 sd_id128_to_string(f
->header
->file_id
, a
),
1626 sd_id128_to_string(f
->header
->machine_id
, b
),
1627 sd_id128_to_string(f
->header
->boot_id
, c
),
1628 (unsigned long long) le64toh(f
->header
->arena_size
),
1629 (unsigned long) le64toh(f
->header
->n_objects
),
1630 (unsigned long) le64toh(f
->header
->n_entries
));
1632 p
= le64toh(f
->header
->arena_offset
);
1634 r
= journal_file_move_to_object(f
, -1, p
, &o
);
1638 switch (o
->object
.type
) {
1641 printf("Type: OBJECT_UNUSED\n");
1645 printf("Type: OBJECT_DATA\n");
1649 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1650 (unsigned long long) le64toh(o
->entry
.seqnum
),
1651 (unsigned long long) le64toh(o
->entry
.monotonic
),
1652 (unsigned long long) le64toh(o
->entry
.realtime
));
1655 case OBJECT_FIELD_HASH_TABLE
:
1656 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1659 case OBJECT_DATA_HASH_TABLE
:
1660 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1663 case OBJECT_ENTRY_ARRAY
:
1664 printf("Type: OBJECT_ENTRY_ARRAY\n");
1668 if (o
->object
.flags
& OBJECT_COMPRESSED
)
1669 printf("Flags: COMPRESSED\n");
1671 if (p
== le64toh(f
->header
->tail_object_offset
))
1674 p
= p
+ ALIGN64(le64toh(o
->object
.size
));
1679 log_error("File corrupt");
1682 int journal_file_open(
1686 JournalFile
*template,
1687 JournalFile
**ret
) {
1691 bool newly_created
= false;
1695 if ((flags
& O_ACCMODE
) != O_RDONLY
&&
1696 (flags
& O_ACCMODE
) != O_RDWR
)
1699 f
= new0(JournalFile
, 1);
1706 f
->writable
= (flags
& O_ACCMODE
) != O_RDONLY
;
1707 f
->prot
= prot_from_flags(flags
);
1709 f
->metrics
.max_size
= DEFAULT_MAX_SIZE
;
1710 f
->metrics
.min_size
= DEFAULT_MIN_SIZE
;
1711 f
->metrics
.keep_free
= DEFAULT_KEEP_FREE
;
1713 f
->path
= strdup(fname
);
1719 f
->fd
= open(f
->path
, f
->flags
|O_CLOEXEC
, f
->mode
);
1725 if (fstat(f
->fd
, &f
->last_stat
) < 0) {
1730 if (f
->last_stat
.st_size
== 0 && f
->writable
) {
1731 newly_created
= true;
1733 r
= journal_file_init_header(f
, template);
1737 if (fstat(f
->fd
, &f
->last_stat
) < 0) {
1743 if (f
->last_stat
.st_size
< (off_t
) sizeof(Header
)) {
1748 f
->header
= mmap(NULL
, PAGE_ALIGN(sizeof(Header
)), prot_from_flags(flags
), MAP_SHARED
, f
->fd
, 0);
1749 if (f
->header
== MAP_FAILED
) {
1755 if (!newly_created
) {
1756 r
= journal_file_verify_header(f
);
1762 r
= journal_file_refresh_header(f
);
1767 if (newly_created
) {
1769 r
= journal_file_setup_field_hash_table(f
);
1773 r
= journal_file_setup_data_hash_table(f
);
1778 r
= journal_file_map_field_hash_table(f
);
1782 r
= journal_file_map_data_hash_table(f
);
1792 journal_file_close(f
);
/* Rotates a writable journal: renames the current file to an archive name,
 * marks it archived, opens a fresh file under the original path (using the
 * old file as template) and closes the old one.
 *
 * Archive name layout, as assembled below: the old path without its
 * ".journal" suffix (8 characters), one separator byte, the 32-character
 * seqnum_id string, then "-<seqnum>-<tail_entry_realtime>.journal" with both
 * numbers printed as 16 hex digits.
 *
 * NOTE(review): error checks, the assignment of old_file, the store back
 * into *f and the return statement are missing from this listing. */
1797 int journal_file_rotate(JournalFile
**f
) {
1800 JournalFile
*old_file
, *new_file
= NULL
;
/* Read-only files and files not named *.journal are not handled by the
 * code that follows; what is returned for them is not visible here. */
1808 if (!old_file
->writable
)
1811 if (!endswith(old_file
->path
, ".journal"))
1814 l
= strlen(old_file
->path
);
/* Room for the whole archive name; the pieces written below add up to
 * exactly l + 68 bytes including the terminating NUL. */
1816 p
= new(char, l
+ 1 + 16 + 1 + 32 + 1 + 16 + 1);
/* Copy the path without the trailing ".journal". */
1820 memcpy(p
, old_file
->path
, l
- 8);
/* NOTE(review): no visible line writes p[l - 8]; presumably the '@'
 * separator that journal_directory_vacuum() looks for is stored there by a
 * statement missing from this listing -- verify. */
1822 sd_id128_to_string(old_file
->header
->seqnum_id
, p
+ l
- 8 + 1);
1823 snprintf(p
+ l
- 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1824 "-%016llx-%016llx.journal",
1825 (unsigned long long) le64toh((*f
)->header
->seqnum
),
1826 (unsigned long long) le64toh((*f
)->header
->tail_entry_realtime
));
1828 r
= rename(old_file
->path
, p
);
/* NOTE(review): other writers of header->state in this file store the
 * STATE_* constant directly; here it is passed through le32toh(), which is
 * also the host-from-little-endian direction rather than the one expected
 * for a store. Confirm the width of 'state' and whether any conversion is
 * wanted at all. */
1834 old_file
->header
->state
= le32toh(STATE_ARCHIVED
);
/* The new file is created under the old (pre-rename) path. */
1836 r
= journal_file_open(old_file
->path
, old_file
->flags
, old_file
->mode
, old_file
, &new_file
);
1837 journal_file_close(old_file
);
1843 struct vacuum_info
{
1848 sd_id128_t seqnum_id
;
/* qsort() comparator for struct vacuum_info, used by
 * journal_directory_vacuum() to order the collected files before deletion.
 *
 * Visible comparison order: when both entries share the same seqnum_id they
 * are compared by seqnum; entries are further compared by realtime; the last
 * resort is a bytewise memcmp() of the two 16-byte seqnum_ids.
 *
 * NOTE(review): the assignments of a/b from _a/_b and the values returned
 * from the individual branches are missing from this listing, so the exact
 * sort direction cannot be confirmed from here. */
1852 static int vacuum_compare(const void *_a
, const void *_b
) {
1853 const struct vacuum_info
*a
, *b
;
1858 if (sd_id128_equal(a
->seqnum_id
, b
->seqnum_id
)) {
1859 if (a
->seqnum
< b
->seqnum
)
1861 else if (a
->seqnum
> b
->seqnum
)
1867 if (a
->realtime
< b
->realtime
)
1869 else if (a
->realtime
> b
->realtime
)
1872 return memcmp(&a
->seqnum_id
, &b
->seqnum_id
, 16);
1875 int journal_directory_vacuum(const char *directory
, uint64_t max_use
, uint64_t min_free
) {
1878 struct vacuum_info
*list
= NULL
;
1879 unsigned n_list
= 0, n_allocated
= 0, i
;
1885 max_use
= DEFAULT_MAX_USE
;
1887 d
= opendir(directory
);
1893 struct dirent buf
, *de
;
1897 unsigned long long seqnum
, realtime
;
1898 sd_id128_t seqnum_id
;
1900 k
= readdir_r(d
, &buf
, &de
);
1909 if (!dirent_is_file_with_suffix(de
, ".journal"))
1912 q
= strlen(de
->d_name
);
1914 if (q
< 1 + 32 + 1 + 16 + 1 + 16 + 8)
1917 if (de
->d_name
[q
-8-16-1] != '-' ||
1918 de
->d_name
[q
-8-16-1-16-1] != '-' ||
1919 de
->d_name
[q
-8-16-1-16-1-32-1] != '@')
1922 if (fstatat(dirfd(d
), de
->d_name
, &st
, AT_SYMLINK_NOFOLLOW
) < 0)
1925 if (!S_ISREG(st
.st_mode
))
1928 p
= strdup(de
->d_name
);
1934 de
->d_name
[q
-8-16-1-16-1] = 0;
1935 if (sd_id128_from_string(de
->d_name
+ q
-8-16-1-16-1-32, &seqnum_id
) < 0) {
1940 if (sscanf(de
->d_name
+ q
-8-16-1-16, "%16llx-%16llx.journal", &seqnum
, &realtime
) != 2) {
1945 if (n_list
>= n_allocated
) {
1946 struct vacuum_info
*j
;
1948 n_allocated
= MAX(n_allocated
* 2U, 8U);
1949 j
= realloc(list
, n_allocated
* sizeof(struct vacuum_info
));
1959 list
[n_list
].filename
= p
;
1960 list
[n_list
].usage
= (uint64_t) st
.st_blksize
* (uint64_t) st
.st_blocks
;
1961 list
[n_list
].seqnum
= seqnum
;
1962 list
[n_list
].realtime
= realtime
;
1963 list
[n_list
].seqnum_id
= seqnum_id
;
1965 sum
+= list
[n_list
].usage
;
1970 qsort(list
, n_list
, sizeof(struct vacuum_info
), vacuum_compare
);
1972 for(i
= 0; i
< n_list
; i
++) {
1975 if (fstatvfs(dirfd(d
), &ss
) < 0) {
1980 if (sum
<= max_use
&&
1981 (uint64_t) ss
.f_bavail
* (uint64_t) ss
.f_bsize
>= min_free
)
1984 if (unlinkat(dirfd(d
), list
[i
].filename
, 0) >= 0) {
1985 log_debug("Deleted archived journal %s/%s.", directory
, list
[i
].filename
);
1986 sum
-= list
[i
].usage
;
1987 } else if (errno
!= ENOENT
)
1988 log_warning("Failed to delete %s/%s: %m", directory
, list
[i
].filename
);
1992 for (i
= 0; i
< n_list
; i
++)
1993 free(list
[i
].filename
);