]> git.ipfire.org Git - thirdparty/systemd.git/blame_incremental - src/journal/journal-file.c
journal: fix error handling when compressing journal objects
[thirdparty/systemd.git] / src / journal / journal-file.c
... / ...
CommitLineData
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
29#include <linux/fs.h>
30
31#include "btrfs-util.h"
32#include "journal-def.h"
33#include "journal-file.h"
34#include "journal-authenticate.h"
35#include "lookup3.h"
36#include "compress.h"
37#include "random-util.h"
38
39#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
41
42#define COMPRESSION_SIZE_THRESHOLD (512ULL)
43
44/* This is the minimum journal file size */
45#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
46
47/* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51
52/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
53#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
54
55/* This is the upper bound if we deduce max_size from max_use */
56#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57
58/* This is the upper bound if we deduce the keep_free value from the
59 * file system size */
60#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61
62/* This is the keep_free value when we can't determine the system
63 * size */
64#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
65
66/* This is the default maximum number of journal files to keep around. */
67#define DEFAULT_N_MAX_FILES (100)
68
69/* n_data was the first entry we added after the initial file format design */
70#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
71
72/* How many entries to keep in the entry array chain cache at max */
73#define CHAIN_CACHE_MAX 20
74
75/* How much to increase the journal file size at once each time we allocate something new. */
76#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
77
78/* Reread fstat() of the file for detecting deletions at least this often */
79#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
80
/* The mmap context to use for the header: we pick the one just above the last defined object type */
82#define CONTEXT_HEADER _OBJECT_TYPE_MAX
83
84static int journal_file_set_online(JournalFile *f) {
85 assert(f);
86
87 if (!f->writable)
88 return -EPERM;
89
90 if (!(f->fd >= 0 && f->header))
91 return -EINVAL;
92
93 if (mmap_cache_got_sigbus(f->mmap, f->fd))
94 return -EIO;
95
96 switch(f->header->state) {
97 case STATE_ONLINE:
98 return 0;
99
100 case STATE_OFFLINE:
101 f->header->state = STATE_ONLINE;
102 fsync(f->fd);
103 return 0;
104
105 default:
106 return -EINVAL;
107 }
108}
109
110int journal_file_set_offline(JournalFile *f) {
111 assert(f);
112
113 if (!f->writable)
114 return -EPERM;
115
116 if (!(f->fd >= 0 && f->header))
117 return -EINVAL;
118
119 if (f->header->state != STATE_ONLINE)
120 return 0;
121
122 fsync(f->fd);
123
124 if (mmap_cache_got_sigbus(f->mmap, f->fd))
125 return -EIO;
126
127 f->header->state = STATE_OFFLINE;
128
129 if (mmap_cache_got_sigbus(f->mmap, f->fd))
130 return -EIO;
131
132 fsync(f->fd);
133
134 return 0;
135}
136
137JournalFile* journal_file_close(JournalFile *f) {
138 assert(f);
139
140#ifdef HAVE_GCRYPT
141 /* Write the final tag */
142 if (f->seal && f->writable)
143 journal_file_append_tag(f);
144#endif
145
146 journal_file_set_offline(f);
147
148 if (f->mmap && f->fd >= 0)
149 mmap_cache_close_fd(f->mmap, f->fd);
150
151 if (f->fd >= 0 && f->defrag_on_close) {
152
153 /* Be friendly to btrfs: turn COW back on again now,
154 * and defragment the file. We won't write to the file
155 * ever again, hence remove all fragmentation, and
156 * reenable all the good bits COW usually provides
157 * (such as data checksumming). */
158
159 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
160 (void) btrfs_defrag_fd(f->fd);
161 }
162
163 safe_close(f->fd);
164 free(f->path);
165
166 if (f->mmap)
167 mmap_cache_unref(f->mmap);
168
169 ordered_hashmap_free_free(f->chain_cache);
170
171#if defined(HAVE_XZ) || defined(HAVE_LZ4)
172 free(f->compress_buffer);
173#endif
174
175#ifdef HAVE_GCRYPT
176 if (f->fss_file)
177 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
178 else
179 free(f->fsprg_state);
180
181 free(f->fsprg_seed);
182
183 if (f->hmac)
184 gcry_md_close(f->hmac);
185#endif
186
187 free(f);
188 return NULL;
189}
190
191static int journal_file_init_header(JournalFile *f, JournalFile *template) {
192 Header h = {};
193 ssize_t k;
194 int r;
195
196 assert(f);
197
198 memcpy(h.signature, HEADER_SIGNATURE, 8);
199 h.header_size = htole64(ALIGN64(sizeof(h)));
200
201 h.incompatible_flags |= htole32(
202 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
203 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
204
205 h.compatible_flags = htole32(
206 f->seal * HEADER_COMPATIBLE_SEALED);
207
208 r = sd_id128_randomize(&h.file_id);
209 if (r < 0)
210 return r;
211
212 if (template) {
213 h.seqnum_id = template->header->seqnum_id;
214 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
215 } else
216 h.seqnum_id = h.file_id;
217
218 k = pwrite(f->fd, &h, sizeof(h), 0);
219 if (k < 0)
220 return -errno;
221
222 if (k != sizeof(h))
223 return -EIO;
224
225 return 0;
226}
227
228static int journal_file_refresh_header(JournalFile *f) {
229 sd_id128_t boot_id;
230 int r;
231
232 assert(f);
233
234 r = sd_id128_get_machine(&f->header->machine_id);
235 if (r < 0)
236 return r;
237
238 r = sd_id128_get_boot(&boot_id);
239 if (r < 0)
240 return r;
241
242 if (sd_id128_equal(boot_id, f->header->boot_id))
243 f->tail_entry_monotonic_valid = true;
244
245 f->header->boot_id = boot_id;
246
247 r = journal_file_set_online(f);
248
249 /* Sync the online state to disk */
250 fsync(f->fd);
251
252 return r;
253}
254
255static int journal_file_verify_header(JournalFile *f) {
256 uint32_t flags;
257
258 assert(f);
259
260 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
261 return -EBADMSG;
262
263 /* In both read and write mode we refuse to open files with
264 * incompatible flags we don't know */
265 flags = le32toh(f->header->incompatible_flags);
266 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
267 if (flags & ~HEADER_INCOMPATIBLE_ANY)
268 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
269 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
270 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
271 if (flags)
272 log_debug("Journal file %s uses incompatible flags %"PRIx32
273 " disabled at compilation time.", f->path, flags);
274 return -EPROTONOSUPPORT;
275 }
276
277 /* When open for writing we refuse to open files with
278 * compatible flags, too */
279 flags = le32toh(f->header->compatible_flags);
280 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
281 if (flags & ~HEADER_COMPATIBLE_ANY)
282 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
283 f->path, flags & ~HEADER_COMPATIBLE_ANY);
284 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
285 if (flags)
286 log_debug("Journal file %s uses compatible flags %"PRIx32
287 " disabled at compilation time.", f->path, flags);
288 return -EPROTONOSUPPORT;
289 }
290
291 if (f->header->state >= _STATE_MAX)
292 return -EBADMSG;
293
294 /* The first addition was n_data, so check that we are at least this large */
295 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
296 return -EBADMSG;
297
298 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
299 return -EBADMSG;
300
301 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
302 return -ENODATA;
303
304 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
305 return -ENODATA;
306
307 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
308 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
309 !VALID64(le64toh(f->header->tail_object_offset)) ||
310 !VALID64(le64toh(f->header->entry_array_offset)))
311 return -ENODATA;
312
313 if (f->writable) {
314 uint8_t state;
315 sd_id128_t machine_id;
316 int r;
317
318 r = sd_id128_get_machine(&machine_id);
319 if (r < 0)
320 return r;
321
322 if (!sd_id128_equal(machine_id, f->header->machine_id))
323 return -EHOSTDOWN;
324
325 state = f->header->state;
326
327 if (state == STATE_ONLINE) {
328 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
329 return -EBUSY;
330 } else if (state == STATE_ARCHIVED)
331 return -ESHUTDOWN;
332 else if (state != STATE_OFFLINE) {
333 log_debug("Journal file %s has unknown state %i.", f->path, state);
334 return -EBUSY;
335 }
336 }
337
338 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
339 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
340
341 f->seal = JOURNAL_HEADER_SEALED(f->header);
342
343 return 0;
344}
345
346static int journal_file_fstat(JournalFile *f) {
347 assert(f);
348 assert(f->fd >= 0);
349
350 if (fstat(f->fd, &f->last_stat) < 0)
351 return -errno;
352
353 f->last_stat_usec = now(CLOCK_MONOTONIC);
354
355 /* Refuse appending to files that are already deleted */
356 if (f->last_stat.st_nlink <= 0)
357 return -EIDRM;
358
359 return 0;
360}
361
362static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
363 uint64_t old_size, new_size;
364 int r;
365
366 assert(f);
367
368 /* We assume that this file is not sparse, and we know that
369 * for sure, since we always call posix_fallocate()
370 * ourselves */
371
372 if (mmap_cache_got_sigbus(f->mmap, f->fd))
373 return -EIO;
374
375 old_size =
376 le64toh(f->header->header_size) +
377 le64toh(f->header->arena_size);
378
379 new_size = PAGE_ALIGN(offset + size);
380 if (new_size < le64toh(f->header->header_size))
381 new_size = le64toh(f->header->header_size);
382
383 if (new_size <= old_size) {
384
385 /* We already pre-allocated enough space, but before
386 * we write to it, let's check with fstat() if the
387 * file got deleted, in order make sure we don't throw
388 * away the data immediately. Don't check fstat() for
389 * all writes though, but only once ever 10s. */
390
391 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
392 return 0;
393
394 return journal_file_fstat(f);
395 }
396
397 /* Allocate more space. */
398
399 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
400 return -E2BIG;
401
402 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
403 struct statvfs svfs;
404
405 if (fstatvfs(f->fd, &svfs) >= 0) {
406 uint64_t available;
407
408 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
409
410 if (new_size - old_size > available)
411 return -E2BIG;
412 }
413 }
414
415 /* Increase by larger blocks at once */
416 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
417 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
418 new_size = f->metrics.max_size;
419
420 /* Note that the glibc fallocate() fallback is very
421 inefficient, hence we try to minimize the allocation area
422 as we can. */
423 r = posix_fallocate(f->fd, old_size, new_size - old_size);
424 if (r != 0)
425 return -r;
426
427 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
428
429 return journal_file_fstat(f);
430}
431
432static unsigned type_to_context(ObjectType type) {
433 /* One context for each type, plus one catch-all for the rest */
434 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
435 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
436 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
437}
438
439static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
440 int r;
441
442 assert(f);
443 assert(ret);
444
445 if (size <= 0)
446 return -EINVAL;
447
448 /* Avoid SIGBUS on invalid accesses */
449 if (offset + size > (uint64_t) f->last_stat.st_size) {
450 /* Hmm, out of range? Let's refresh the fstat() data
451 * first, before we trust that check. */
452
453 r = journal_file_fstat(f);
454 if (r < 0)
455 return r;
456
457 if (offset + size > (uint64_t) f->last_stat.st_size)
458 return -EADDRNOTAVAIL;
459 }
460
461 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
462}
463
464static uint64_t minimum_header_size(Object *o) {
465
466 static const uint64_t table[] = {
467 [OBJECT_DATA] = sizeof(DataObject),
468 [OBJECT_FIELD] = sizeof(FieldObject),
469 [OBJECT_ENTRY] = sizeof(EntryObject),
470 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
471 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
472 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
473 [OBJECT_TAG] = sizeof(TagObject),
474 };
475
476 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
477 return sizeof(ObjectHeader);
478
479 return table[o->object.type];
480}
481
482int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
483 int r;
484 void *t;
485 Object *o;
486 uint64_t s;
487
488 assert(f);
489 assert(ret);
490
491 /* Objects may only be located at multiple of 64 bit */
492 if (!VALID64(offset))
493 return -EFAULT;
494
495 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
496 if (r < 0)
497 return r;
498
499 o = (Object*) t;
500 s = le64toh(o->object.size);
501
502 if (s < sizeof(ObjectHeader))
503 return -EBADMSG;
504
505 if (o->object.type <= OBJECT_UNUSED)
506 return -EBADMSG;
507
508 if (s < minimum_header_size(o))
509 return -EBADMSG;
510
511 if (type > OBJECT_UNUSED && o->object.type != type)
512 return -EBADMSG;
513
514 if (s > sizeof(ObjectHeader)) {
515 r = journal_file_move_to(f, type, false, offset, s, &t);
516 if (r < 0)
517 return r;
518
519 o = (Object*) t;
520 }
521
522 *ret = o;
523 return 0;
524}
525
526static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
527 uint64_t r;
528
529 assert(f);
530
531 r = le64toh(f->header->tail_entry_seqnum) + 1;
532
533 if (seqnum) {
534 /* If an external seqnum counter was passed, we update
535 * both the local and the external one, and set it to
536 * the maximum of both */
537
538 if (*seqnum + 1 > r)
539 r = *seqnum + 1;
540
541 *seqnum = r;
542 }
543
544 f->header->tail_entry_seqnum = htole64(r);
545
546 if (f->header->head_entry_seqnum == 0)
547 f->header->head_entry_seqnum = htole64(r);
548
549 return r;
550}
551
552int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
553 int r;
554 uint64_t p;
555 Object *tail, *o;
556 void *t;
557
558 assert(f);
559 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
560 assert(size >= sizeof(ObjectHeader));
561 assert(offset);
562 assert(ret);
563
564 r = journal_file_set_online(f);
565 if (r < 0)
566 return r;
567
568 p = le64toh(f->header->tail_object_offset);
569 if (p == 0)
570 p = le64toh(f->header->header_size);
571 else {
572 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
573 if (r < 0)
574 return r;
575
576 p += ALIGN64(le64toh(tail->object.size));
577 }
578
579 r = journal_file_allocate(f, p, size);
580 if (r < 0)
581 return r;
582
583 r = journal_file_move_to(f, type, false, p, size, &t);
584 if (r < 0)
585 return r;
586
587 o = (Object*) t;
588
589 zero(o->object);
590 o->object.type = type;
591 o->object.size = htole64(size);
592
593 f->header->tail_object_offset = htole64(p);
594 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
595
596 *ret = o;
597 *offset = p;
598
599 return 0;
600}
601
602static int journal_file_setup_data_hash_table(JournalFile *f) {
603 uint64_t s, p;
604 Object *o;
605 int r;
606
607 assert(f);
608
609 /* We estimate that we need 1 hash table entry per 768 bytes
610 of journal file and we want to make sure we never get
611 beyond 75% fill level. Calculate the hash table size for
612 the maximum file size based on these metrics. */
613
614 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
615 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
616 s = DEFAULT_DATA_HASH_TABLE_SIZE;
617
618 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
619
620 r = journal_file_append_object(f,
621 OBJECT_DATA_HASH_TABLE,
622 offsetof(Object, hash_table.items) + s,
623 &o, &p);
624 if (r < 0)
625 return r;
626
627 memzero(o->hash_table.items, s);
628
629 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
630 f->header->data_hash_table_size = htole64(s);
631
632 return 0;
633}
634
635static int journal_file_setup_field_hash_table(JournalFile *f) {
636 uint64_t s, p;
637 Object *o;
638 int r;
639
640 assert(f);
641
642 /* We use a fixed size hash table for the fields as this
643 * number should grow very slowly only */
644
645 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
646 r = journal_file_append_object(f,
647 OBJECT_FIELD_HASH_TABLE,
648 offsetof(Object, hash_table.items) + s,
649 &o, &p);
650 if (r < 0)
651 return r;
652
653 memzero(o->hash_table.items, s);
654
655 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
656 f->header->field_hash_table_size = htole64(s);
657
658 return 0;
659}
660
661int journal_file_map_data_hash_table(JournalFile *f) {
662 uint64_t s, p;
663 void *t;
664 int r;
665
666 assert(f);
667
668 if (f->data_hash_table)
669 return 0;
670
671 p = le64toh(f->header->data_hash_table_offset);
672 s = le64toh(f->header->data_hash_table_size);
673
674 r = journal_file_move_to(f,
675 OBJECT_DATA_HASH_TABLE,
676 true,
677 p, s,
678 &t);
679 if (r < 0)
680 return r;
681
682 f->data_hash_table = t;
683 return 0;
684}
685
686int journal_file_map_field_hash_table(JournalFile *f) {
687 uint64_t s, p;
688 void *t;
689 int r;
690
691 assert(f);
692
693 if (f->field_hash_table)
694 return 0;
695
696 p = le64toh(f->header->field_hash_table_offset);
697 s = le64toh(f->header->field_hash_table_size);
698
699 r = journal_file_move_to(f,
700 OBJECT_FIELD_HASH_TABLE,
701 true,
702 p, s,
703 &t);
704 if (r < 0)
705 return r;
706
707 f->field_hash_table = t;
708 return 0;
709}
710
711static int journal_file_link_field(
712 JournalFile *f,
713 Object *o,
714 uint64_t offset,
715 uint64_t hash) {
716
717 uint64_t p, h, m;
718 int r;
719
720 assert(f);
721 assert(o);
722 assert(offset > 0);
723
724 if (o->object.type != OBJECT_FIELD)
725 return -EINVAL;
726
727 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
728 if (m <= 0)
729 return -EBADMSG;
730
731 /* This might alter the window we are looking at */
732 o->field.next_hash_offset = o->field.head_data_offset = 0;
733
734 h = hash % m;
735 p = le64toh(f->field_hash_table[h].tail_hash_offset);
736 if (p == 0)
737 f->field_hash_table[h].head_hash_offset = htole64(offset);
738 else {
739 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
740 if (r < 0)
741 return r;
742
743 o->field.next_hash_offset = htole64(offset);
744 }
745
746 f->field_hash_table[h].tail_hash_offset = htole64(offset);
747
748 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
749 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
750
751 return 0;
752}
753
754static int journal_file_link_data(
755 JournalFile *f,
756 Object *o,
757 uint64_t offset,
758 uint64_t hash) {
759
760 uint64_t p, h, m;
761 int r;
762
763 assert(f);
764 assert(o);
765 assert(offset > 0);
766
767 if (o->object.type != OBJECT_DATA)
768 return -EINVAL;
769
770 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
771 if (m <= 0)
772 return -EBADMSG;
773
774 /* This might alter the window we are looking at */
775 o->data.next_hash_offset = o->data.next_field_offset = 0;
776 o->data.entry_offset = o->data.entry_array_offset = 0;
777 o->data.n_entries = 0;
778
779 h = hash % m;
780 p = le64toh(f->data_hash_table[h].tail_hash_offset);
781 if (p == 0)
782 /* Only entry in the hash table is easy */
783 f->data_hash_table[h].head_hash_offset = htole64(offset);
784 else {
785 /* Move back to the previous data object, to patch in
786 * pointer */
787
788 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
789 if (r < 0)
790 return r;
791
792 o->data.next_hash_offset = htole64(offset);
793 }
794
795 f->data_hash_table[h].tail_hash_offset = htole64(offset);
796
797 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
798 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
799
800 return 0;
801}
802
803int journal_file_find_field_object_with_hash(
804 JournalFile *f,
805 const void *field, uint64_t size, uint64_t hash,
806 Object **ret, uint64_t *offset) {
807
808 uint64_t p, osize, h, m;
809 int r;
810
811 assert(f);
812 assert(field && size > 0);
813
814 /* If the field hash table is empty, we can't find anything */
815 if (le64toh(f->header->field_hash_table_size) <= 0)
816 return 0;
817
818 /* Map the field hash table, if it isn't mapped yet. */
819 r = journal_file_map_field_hash_table(f);
820 if (r < 0)
821 return r;
822
823 osize = offsetof(Object, field.payload) + size;
824
825 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
826 if (m <= 0)
827 return -EBADMSG;
828
829 h = hash % m;
830 p = le64toh(f->field_hash_table[h].head_hash_offset);
831
832 while (p > 0) {
833 Object *o;
834
835 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
836 if (r < 0)
837 return r;
838
839 if (le64toh(o->field.hash) == hash &&
840 le64toh(o->object.size) == osize &&
841 memcmp(o->field.payload, field, size) == 0) {
842
843 if (ret)
844 *ret = o;
845 if (offset)
846 *offset = p;
847
848 return 1;
849 }
850
851 p = le64toh(o->field.next_hash_offset);
852 }
853
854 return 0;
855}
856
857int journal_file_find_field_object(
858 JournalFile *f,
859 const void *field, uint64_t size,
860 Object **ret, uint64_t *offset) {
861
862 uint64_t hash;
863
864 assert(f);
865 assert(field && size > 0);
866
867 hash = hash64(field, size);
868
869 return journal_file_find_field_object_with_hash(f,
870 field, size, hash,
871 ret, offset);
872}
873
874int journal_file_find_data_object_with_hash(
875 JournalFile *f,
876 const void *data, uint64_t size, uint64_t hash,
877 Object **ret, uint64_t *offset) {
878
879 uint64_t p, osize, h, m;
880 int r;
881
882 assert(f);
883 assert(data || size == 0);
884
885 /* If there's no data hash table, then there's no entry. */
886 if (le64toh(f->header->data_hash_table_size) <= 0)
887 return 0;
888
889 /* Map the data hash table, if it isn't mapped yet. */
890 r = journal_file_map_data_hash_table(f);
891 if (r < 0)
892 return r;
893
894 osize = offsetof(Object, data.payload) + size;
895
896 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
897 if (m <= 0)
898 return -EBADMSG;
899
900 h = hash % m;
901 p = le64toh(f->data_hash_table[h].head_hash_offset);
902
903 while (p > 0) {
904 Object *o;
905
906 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
907 if (r < 0)
908 return r;
909
910 if (le64toh(o->data.hash) != hash)
911 goto next;
912
913 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
914#if defined(HAVE_XZ) || defined(HAVE_LZ4)
915 uint64_t l;
916 size_t rsize = 0;
917
918 l = le64toh(o->object.size);
919 if (l <= offsetof(Object, data.payload))
920 return -EBADMSG;
921
922 l -= offsetof(Object, data.payload);
923
924 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
925 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
926 if (r < 0)
927 return r;
928
929 if (rsize == size &&
930 memcmp(f->compress_buffer, data, size) == 0) {
931
932 if (ret)
933 *ret = o;
934
935 if (offset)
936 *offset = p;
937
938 return 1;
939 }
940#else
941 return -EPROTONOSUPPORT;
942#endif
943 } else if (le64toh(o->object.size) == osize &&
944 memcmp(o->data.payload, data, size) == 0) {
945
946 if (ret)
947 *ret = o;
948
949 if (offset)
950 *offset = p;
951
952 return 1;
953 }
954
955 next:
956 p = le64toh(o->data.next_hash_offset);
957 }
958
959 return 0;
960}
961
962int journal_file_find_data_object(
963 JournalFile *f,
964 const void *data, uint64_t size,
965 Object **ret, uint64_t *offset) {
966
967 uint64_t hash;
968
969 assert(f);
970 assert(data || size == 0);
971
972 hash = hash64(data, size);
973
974 return journal_file_find_data_object_with_hash(f,
975 data, size, hash,
976 ret, offset);
977}
978
979static int journal_file_append_field(
980 JournalFile *f,
981 const void *field, uint64_t size,
982 Object **ret, uint64_t *offset) {
983
984 uint64_t hash, p;
985 uint64_t osize;
986 Object *o;
987 int r;
988
989 assert(f);
990 assert(field && size > 0);
991
992 hash = hash64(field, size);
993
994 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
995 if (r < 0)
996 return r;
997 else if (r > 0) {
998
999 if (ret)
1000 *ret = o;
1001
1002 if (offset)
1003 *offset = p;
1004
1005 return 0;
1006 }
1007
1008 osize = offsetof(Object, field.payload) + size;
1009 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1010 if (r < 0)
1011 return r;
1012
1013 o->field.hash = htole64(hash);
1014 memcpy(o->field.payload, field, size);
1015
1016 r = journal_file_link_field(f, o, p, hash);
1017 if (r < 0)
1018 return r;
1019
1020 /* The linking might have altered the window, so let's
1021 * refresh our pointer */
1022 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1023 if (r < 0)
1024 return r;
1025
1026#ifdef HAVE_GCRYPT
1027 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1028 if (r < 0)
1029 return r;
1030#endif
1031
1032 if (ret)
1033 *ret = o;
1034
1035 if (offset)
1036 *offset = p;
1037
1038 return 0;
1039}
1040
1041static int journal_file_append_data(
1042 JournalFile *f,
1043 const void *data, uint64_t size,
1044 Object **ret, uint64_t *offset) {
1045
1046 uint64_t hash, p;
1047 uint64_t osize;
1048 Object *o;
1049 int r, compression = 0;
1050 const void *eq;
1051
1052 assert(f);
1053 assert(data || size == 0);
1054
1055 hash = hash64(data, size);
1056
1057 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1058 if (r < 0)
1059 return r;
1060 else if (r > 0) {
1061
1062 if (ret)
1063 *ret = o;
1064
1065 if (offset)
1066 *offset = p;
1067
1068 return 0;
1069 }
1070
1071 osize = offsetof(Object, data.payload) + size;
1072 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1073 if (r < 0)
1074 return r;
1075
1076 o->data.hash = htole64(hash);
1077
1078#if defined(HAVE_XZ) || defined(HAVE_LZ4)
1079 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1080 size_t rsize = 0;
1081
1082 compression = compress_blob(data, size, o->data.payload, &rsize);
1083
1084 if (compression >= 0) {
1085 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1086 o->object.flags |= compression;
1087
1088 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1089 size, rsize, object_compressed_to_string(compression));
1090 } else
1091 /* Compression didn't work, we don't really care why, let's continue without compression */
1092 compression = 0;
1093 }
1094#endif
1095
1096 if (compression == 0 && size > 0)
1097 memcpy(o->data.payload, data, size);
1098
1099 r = journal_file_link_data(f, o, p, hash);
1100 if (r < 0)
1101 return r;
1102
1103 /* The linking might have altered the window, so let's
1104 * refresh our pointer */
1105 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1106 if (r < 0)
1107 return r;
1108
1109 if (!data)
1110 eq = NULL;
1111 else
1112 eq = memchr(data, '=', size);
1113 if (eq && eq > data) {
1114 Object *fo = NULL;
1115 uint64_t fp;
1116
1117 /* Create field object ... */
1118 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1119 if (r < 0)
1120 return r;
1121
1122 /* ... and link it in. */
1123 o->data.next_field_offset = fo->field.head_data_offset;
1124 fo->field.head_data_offset = le64toh(p);
1125 }
1126
1127#ifdef HAVE_GCRYPT
1128 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1129 if (r < 0)
1130 return r;
1131#endif
1132
1133 if (ret)
1134 *ret = o;
1135
1136 if (offset)
1137 *offset = p;
1138
1139 return 0;
1140}
1141
1142uint64_t journal_file_entry_n_items(Object *o) {
1143 assert(o);
1144
1145 if (o->object.type != OBJECT_ENTRY)
1146 return 0;
1147
1148 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1149}
1150
1151uint64_t journal_file_entry_array_n_items(Object *o) {
1152 assert(o);
1153
1154 if (o->object.type != OBJECT_ENTRY_ARRAY)
1155 return 0;
1156
1157 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1158}
1159
1160uint64_t journal_file_hash_table_n_items(Object *o) {
1161 assert(o);
1162
1163 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1164 o->object.type != OBJECT_FIELD_HASH_TABLE)
1165 return 0;
1166
1167 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1168}
1169
1170static int link_entry_into_array(JournalFile *f,
1171 le64_t *first,
1172 le64_t *idx,
1173 uint64_t p) {
1174 int r;
1175 uint64_t n = 0, ap = 0, q, i, a, hidx;
1176 Object *o;
1177
1178 assert(f);
1179 assert(first);
1180 assert(idx);
1181 assert(p > 0);
1182
1183 a = le64toh(*first);
1184 i = hidx = le64toh(*idx);
1185 while (a > 0) {
1186
1187 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1188 if (r < 0)
1189 return r;
1190
1191 n = journal_file_entry_array_n_items(o);
1192 if (i < n) {
1193 o->entry_array.items[i] = htole64(p);
1194 *idx = htole64(hidx + 1);
1195 return 0;
1196 }
1197
1198 i -= n;
1199 ap = a;
1200 a = le64toh(o->entry_array.next_entry_array_offset);
1201 }
1202
1203 if (hidx > n)
1204 n = (hidx+1) * 2;
1205 else
1206 n = n * 2;
1207
1208 if (n < 4)
1209 n = 4;
1210
1211 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1212 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1213 &o, &q);
1214 if (r < 0)
1215 return r;
1216
1217#ifdef HAVE_GCRYPT
1218 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1219 if (r < 0)
1220 return r;
1221#endif
1222
1223 o->entry_array.items[i] = htole64(p);
1224
1225 if (ap == 0)
1226 *first = htole64(q);
1227 else {
1228 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1229 if (r < 0)
1230 return r;
1231
1232 o->entry_array.next_entry_array_offset = htole64(q);
1233 }
1234
1235 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1236 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1237
1238 *idx = htole64(hidx + 1);
1239
1240 return 0;
1241}
1242
1243static int link_entry_into_array_plus_one(JournalFile *f,
1244 le64_t *extra,
1245 le64_t *first,
1246 le64_t *idx,
1247 uint64_t p) {
1248
1249 int r;
1250
1251 assert(f);
1252 assert(extra);
1253 assert(first);
1254 assert(idx);
1255 assert(p > 0);
1256
1257 if (*idx == 0)
1258 *extra = htole64(p);
1259 else {
1260 le64_t i;
1261
1262 i = htole64(le64toh(*idx) - 1);
1263 r = link_entry_into_array(f, first, &i, p);
1264 if (r < 0)
1265 return r;
1266 }
1267
1268 *idx = htole64(le64toh(*idx) + 1);
1269 return 0;
1270}
1271
1272static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1273 uint64_t p;
1274 int r;
1275 assert(f);
1276 assert(o);
1277 assert(offset > 0);
1278
1279 p = le64toh(o->entry.items[i].object_offset);
1280 if (p == 0)
1281 return -EINVAL;
1282
1283 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1284 if (r < 0)
1285 return r;
1286
1287 return link_entry_into_array_plus_one(f,
1288 &o->data.entry_offset,
1289 &o->data.entry_array_offset,
1290 &o->data.n_entries,
1291 offset);
1292}
1293
1294static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1295 uint64_t n, i;
1296 int r;
1297
1298 assert(f);
1299 assert(o);
1300 assert(offset > 0);
1301
1302 if (o->object.type != OBJECT_ENTRY)
1303 return -EINVAL;
1304
1305 __sync_synchronize();
1306
1307 /* Link up the entry itself */
1308 r = link_entry_into_array(f,
1309 &f->header->entry_array_offset,
1310 &f->header->n_entries,
1311 offset);
1312 if (r < 0)
1313 return r;
1314
1315 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1316
1317 if (f->header->head_entry_realtime == 0)
1318 f->header->head_entry_realtime = o->entry.realtime;
1319
1320 f->header->tail_entry_realtime = o->entry.realtime;
1321 f->header->tail_entry_monotonic = o->entry.monotonic;
1322
1323 f->tail_entry_monotonic_valid = true;
1324
1325 /* Link up the items */
1326 n = journal_file_entry_n_items(o);
1327 for (i = 0; i < n; i++) {
1328 r = journal_file_link_entry_item(f, o, offset, i);
1329 if (r < 0)
1330 return r;
1331 }
1332
1333 return 0;
1334}
1335
1336static int journal_file_append_entry_internal(
1337 JournalFile *f,
1338 const dual_timestamp *ts,
1339 uint64_t xor_hash,
1340 const EntryItem items[], unsigned n_items,
1341 uint64_t *seqnum,
1342 Object **ret, uint64_t *offset) {
1343 uint64_t np;
1344 uint64_t osize;
1345 Object *o;
1346 int r;
1347
1348 assert(f);
1349 assert(items || n_items == 0);
1350 assert(ts);
1351
1352 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1353
1354 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1355 if (r < 0)
1356 return r;
1357
1358 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1359 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1360 o->entry.realtime = htole64(ts->realtime);
1361 o->entry.monotonic = htole64(ts->monotonic);
1362 o->entry.xor_hash = htole64(xor_hash);
1363 o->entry.boot_id = f->header->boot_id;
1364
1365#ifdef HAVE_GCRYPT
1366 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1367 if (r < 0)
1368 return r;
1369#endif
1370
1371 r = journal_file_link_entry(f, o, np);
1372 if (r < 0)
1373 return r;
1374
1375 if (ret)
1376 *ret = o;
1377
1378 if (offset)
1379 *offset = np;
1380
1381 return 0;
1382}
1383
1384void journal_file_post_change(JournalFile *f) {
1385 assert(f);
1386
1387 /* inotify() does not receive IN_MODIFY events from file
1388 * accesses done via mmap(). After each access we hence
1389 * trigger IN_MODIFY by truncating the journal file to its
1390 * current size which triggers IN_MODIFY. */
1391
1392 __sync_synchronize();
1393
1394 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1395 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1396}
1397
1398static int entry_item_cmp(const void *_a, const void *_b) {
1399 const EntryItem *a = _a, *b = _b;
1400
1401 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1402 return -1;
1403 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1404 return 1;
1405 return 0;
1406}
1407
1408int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1409 unsigned i;
1410 EntryItem *items;
1411 int r;
1412 uint64_t xor_hash = 0;
1413 struct dual_timestamp _ts;
1414
1415 assert(f);
1416 assert(iovec || n_iovec == 0);
1417
1418 if (!ts) {
1419 dual_timestamp_get(&_ts);
1420 ts = &_ts;
1421 }
1422
1423 if (f->tail_entry_monotonic_valid &&
1424 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1425 return -EINVAL;
1426
1427#ifdef HAVE_GCRYPT
1428 r = journal_file_maybe_append_tag(f, ts->realtime);
1429 if (r < 0)
1430 return r;
1431#endif
1432
1433 /* alloca() can't take 0, hence let's allocate at least one */
1434 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1435
1436 for (i = 0; i < n_iovec; i++) {
1437 uint64_t p;
1438 Object *o;
1439
1440 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1441 if (r < 0)
1442 return r;
1443
1444 xor_hash ^= le64toh(o->data.hash);
1445 items[i].object_offset = htole64(p);
1446 items[i].hash = o->data.hash;
1447 }
1448
1449 /* Order by the position on disk, in order to improve seek
1450 * times for rotating media. */
1451 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1452
1453 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1454
1455 /* If the memory mapping triggered a SIGBUS then we return an
1456 * IO error and ignore the error code passed down to us, since
1457 * it is very likely just an effect of a nullified replacement
1458 * mapping page */
1459
1460 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1461 r = -EIO;
1462
1463 journal_file_post_change(f);
1464
1465 return r;
1466}
1467
/* One cached position inside a chain of entry array objects. Items are kept
 * in the per-file chain cache, keyed by the 'first' field. */
typedef struct ChainCacheItem {
        uint64_t first;      /* offset of the array the chain starts with; the cache key */
        uint64_t array;      /* offset of the array this item remembers */
        uint64_t begin;      /* item 0 of the remembered array */
        uint64_t total;      /* number of items in the chain's arrays preceding the remembered one */
        uint64_t last_index; /* index last looked at in the remembered array, used to keep bisection local */
} ChainCacheItem;
1475
1476static void chain_cache_put(
1477 OrderedHashmap *h,
1478 ChainCacheItem *ci,
1479 uint64_t first,
1480 uint64_t array,
1481 uint64_t begin,
1482 uint64_t total,
1483 uint64_t last_index) {
1484
1485 if (!ci) {
1486 /* If the chain item to cache for this chain is the
1487 * first one it's not worth caching anything */
1488 if (array == first)
1489 return;
1490
1491 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1492 ci = ordered_hashmap_steal_first(h);
1493 assert(ci);
1494 } else {
1495 ci = new(ChainCacheItem, 1);
1496 if (!ci)
1497 return;
1498 }
1499
1500 ci->first = first;
1501
1502 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1503 free(ci);
1504 return;
1505 }
1506 } else
1507 assert(ci->first == first);
1508
1509 ci->array = array;
1510 ci->begin = begin;
1511 ci->total = total;
1512 ci->last_index = last_index;
1513}
1514
1515static int generic_array_get(
1516 JournalFile *f,
1517 uint64_t first,
1518 uint64_t i,
1519 Object **ret, uint64_t *offset) {
1520
1521 Object *o;
1522 uint64_t p = 0, a, t = 0;
1523 int r;
1524 ChainCacheItem *ci;
1525
1526 assert(f);
1527
1528 a = first;
1529
1530 /* Try the chain cache first */
1531 ci = ordered_hashmap_get(f->chain_cache, &first);
1532 if (ci && i > ci->total) {
1533 a = ci->array;
1534 i -= ci->total;
1535 t = ci->total;
1536 }
1537
1538 while (a > 0) {
1539 uint64_t k;
1540
1541 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1542 if (r < 0)
1543 return r;
1544
1545 k = journal_file_entry_array_n_items(o);
1546 if (i < k) {
1547 p = le64toh(o->entry_array.items[i]);
1548 goto found;
1549 }
1550
1551 i -= k;
1552 t += k;
1553 a = le64toh(o->entry_array.next_entry_array_offset);
1554 }
1555
1556 return 0;
1557
1558found:
1559 /* Let's cache this item for the next invocation */
1560 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1561
1562 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1563 if (r < 0)
1564 return r;
1565
1566 if (ret)
1567 *ret = o;
1568
1569 if (offset)
1570 *offset = p;
1571
1572 return 1;
1573}
1574
1575static int generic_array_get_plus_one(
1576 JournalFile *f,
1577 uint64_t extra,
1578 uint64_t first,
1579 uint64_t i,
1580 Object **ret, uint64_t *offset) {
1581
1582 Object *o;
1583
1584 assert(f);
1585
1586 if (i == 0) {
1587 int r;
1588
1589 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1590 if (r < 0)
1591 return r;
1592
1593 if (ret)
1594 *ret = o;
1595
1596 if (offset)
1597 *offset = extra;
1598
1599 return 1;
1600 }
1601
1602 return generic_array_get(f, first, i-1, ret, offset);
1603}
1604
/* Results of the test_object callbacks used when bisecting entry arrays */
enum {
        TEST_FOUND = 0, /* the tested entry matches the needle */
        TEST_LEFT  = 1, /* the tested entry sorts before the needle */
        TEST_RIGHT = 2  /* the tested entry sorts after the needle */
};
1610
1611static int generic_array_bisect(
1612 JournalFile *f,
1613 uint64_t first,
1614 uint64_t n,
1615 uint64_t needle,
1616 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1617 direction_t direction,
1618 Object **ret,
1619 uint64_t *offset,
1620 uint64_t *idx) {
1621
1622 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1623 bool subtract_one = false;
1624 Object *o, *array = NULL;
1625 int r;
1626 ChainCacheItem *ci;
1627
1628 assert(f);
1629 assert(test_object);
1630
1631 /* Start with the first array in the chain */
1632 a = first;
1633
1634 ci = ordered_hashmap_get(f->chain_cache, &first);
1635 if (ci && n > ci->total) {
1636 /* Ah, we have iterated this bisection array chain
1637 * previously! Let's see if we can skip ahead in the
1638 * chain, as far as the last time. But we can't jump
1639 * backwards in the chain, so let's check that
1640 * first. */
1641
1642 r = test_object(f, ci->begin, needle);
1643 if (r < 0)
1644 return r;
1645
1646 if (r == TEST_LEFT) {
1647 /* OK, what we are looking for is right of the
1648 * begin of this EntryArray, so let's jump
1649 * straight to previously cached array in the
1650 * chain */
1651
1652 a = ci->array;
1653 n -= ci->total;
1654 t = ci->total;
1655 last_index = ci->last_index;
1656 }
1657 }
1658
1659 while (a > 0) {
1660 uint64_t left, right, k, lp;
1661
1662 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1663 if (r < 0)
1664 return r;
1665
1666 k = journal_file_entry_array_n_items(array);
1667 right = MIN(k, n);
1668 if (right <= 0)
1669 return 0;
1670
1671 i = right - 1;
1672 lp = p = le64toh(array->entry_array.items[i]);
1673 if (p <= 0)
1674 return -EBADMSG;
1675
1676 r = test_object(f, p, needle);
1677 if (r < 0)
1678 return r;
1679
1680 if (r == TEST_FOUND)
1681 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1682
1683 if (r == TEST_RIGHT) {
1684 left = 0;
1685 right -= 1;
1686
1687 if (last_index != (uint64_t) -1) {
1688 assert(last_index <= right);
1689
1690 /* If we cached the last index we
1691 * looked at, let's try to not to jump
1692 * too wildly around and see if we can
1693 * limit the range to look at early to
1694 * the immediate neighbors of the last
1695 * index we looked at. */
1696
1697 if (last_index > 0) {
1698 uint64_t x = last_index - 1;
1699
1700 p = le64toh(array->entry_array.items[x]);
1701 if (p <= 0)
1702 return -EBADMSG;
1703
1704 r = test_object(f, p, needle);
1705 if (r < 0)
1706 return r;
1707
1708 if (r == TEST_FOUND)
1709 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1710
1711 if (r == TEST_RIGHT)
1712 right = x;
1713 else
1714 left = x + 1;
1715 }
1716
1717 if (last_index < right) {
1718 uint64_t y = last_index + 1;
1719
1720 p = le64toh(array->entry_array.items[y]);
1721 if (p <= 0)
1722 return -EBADMSG;
1723
1724 r = test_object(f, p, needle);
1725 if (r < 0)
1726 return r;
1727
1728 if (r == TEST_FOUND)
1729 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1730
1731 if (r == TEST_RIGHT)
1732 right = y;
1733 else
1734 left = y + 1;
1735 }
1736 }
1737
1738 for (;;) {
1739 if (left == right) {
1740 if (direction == DIRECTION_UP)
1741 subtract_one = true;
1742
1743 i = left;
1744 goto found;
1745 }
1746
1747 assert(left < right);
1748 i = (left + right) / 2;
1749
1750 p = le64toh(array->entry_array.items[i]);
1751 if (p <= 0)
1752 return -EBADMSG;
1753
1754 r = test_object(f, p, needle);
1755 if (r < 0)
1756 return r;
1757
1758 if (r == TEST_FOUND)
1759 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1760
1761 if (r == TEST_RIGHT)
1762 right = i;
1763 else
1764 left = i + 1;
1765 }
1766 }
1767
1768 if (k >= n) {
1769 if (direction == DIRECTION_UP) {
1770 i = n;
1771 subtract_one = true;
1772 goto found;
1773 }
1774
1775 return 0;
1776 }
1777
1778 last_p = lp;
1779
1780 n -= k;
1781 t += k;
1782 last_index = (uint64_t) -1;
1783 a = le64toh(array->entry_array.next_entry_array_offset);
1784 }
1785
1786 return 0;
1787
1788found:
1789 if (subtract_one && t == 0 && i == 0)
1790 return 0;
1791
1792 /* Let's cache this item for the next invocation */
1793 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1794
1795 if (subtract_one && i == 0)
1796 p = last_p;
1797 else if (subtract_one)
1798 p = le64toh(array->entry_array.items[i-1]);
1799 else
1800 p = le64toh(array->entry_array.items[i]);
1801
1802 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1803 if (r < 0)
1804 return r;
1805
1806 if (ret)
1807 *ret = o;
1808
1809 if (offset)
1810 *offset = p;
1811
1812 if (idx)
1813 *idx = t + i + (subtract_one ? -1 : 0);
1814
1815 return 1;
1816}
1817
1818static int generic_array_bisect_plus_one(
1819 JournalFile *f,
1820 uint64_t extra,
1821 uint64_t first,
1822 uint64_t n,
1823 uint64_t needle,
1824 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1825 direction_t direction,
1826 Object **ret,
1827 uint64_t *offset,
1828 uint64_t *idx) {
1829
1830 int r;
1831 bool step_back = false;
1832 Object *o;
1833
1834 assert(f);
1835 assert(test_object);
1836
1837 if (n <= 0)
1838 return 0;
1839
1840 /* This bisects the array in object 'first', but first checks
1841 * an extra */
1842 r = test_object(f, extra, needle);
1843 if (r < 0)
1844 return r;
1845
1846 if (r == TEST_FOUND)
1847 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1848
1849 /* if we are looking with DIRECTION_UP then we need to first
1850 see if in the actual array there is a matching entry, and
1851 return the last one of that. But if there isn't any we need
1852 to return this one. Hence remember this, and return it
1853 below. */
1854 if (r == TEST_LEFT)
1855 step_back = direction == DIRECTION_UP;
1856
1857 if (r == TEST_RIGHT) {
1858 if (direction == DIRECTION_DOWN)
1859 goto found;
1860 else
1861 return 0;
1862 }
1863
1864 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1865
1866 if (r == 0 && step_back)
1867 goto found;
1868
1869 if (r > 0 && idx)
1870 (*idx) ++;
1871
1872 return r;
1873
1874found:
1875 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1876 if (r < 0)
1877 return r;
1878
1879 if (ret)
1880 *ret = o;
1881
1882 if (offset)
1883 *offset = extra;
1884
1885 if (idx)
1886 *idx = 0;
1887
1888 return 1;
1889}
1890
1891_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1892 assert(f);
1893 assert(p > 0);
1894
1895 if (p == needle)
1896 return TEST_FOUND;
1897 else if (p < needle)
1898 return TEST_LEFT;
1899 else
1900 return TEST_RIGHT;
1901}
1902
1903static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1904 Object *o;
1905 int r;
1906
1907 assert(f);
1908 assert(p > 0);
1909
1910 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1911 if (r < 0)
1912 return r;
1913
1914 if (le64toh(o->entry.seqnum) == needle)
1915 return TEST_FOUND;
1916 else if (le64toh(o->entry.seqnum) < needle)
1917 return TEST_LEFT;
1918 else
1919 return TEST_RIGHT;
1920}
1921
1922int journal_file_move_to_entry_by_seqnum(
1923 JournalFile *f,
1924 uint64_t seqnum,
1925 direction_t direction,
1926 Object **ret,
1927 uint64_t *offset) {
1928
1929 return generic_array_bisect(f,
1930 le64toh(f->header->entry_array_offset),
1931 le64toh(f->header->n_entries),
1932 seqnum,
1933 test_object_seqnum,
1934 direction,
1935 ret, offset, NULL);
1936}
1937
1938static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1939 Object *o;
1940 int r;
1941
1942 assert(f);
1943 assert(p > 0);
1944
1945 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1946 if (r < 0)
1947 return r;
1948
1949 if (le64toh(o->entry.realtime) == needle)
1950 return TEST_FOUND;
1951 else if (le64toh(o->entry.realtime) < needle)
1952 return TEST_LEFT;
1953 else
1954 return TEST_RIGHT;
1955}
1956
1957int journal_file_move_to_entry_by_realtime(
1958 JournalFile *f,
1959 uint64_t realtime,
1960 direction_t direction,
1961 Object **ret,
1962 uint64_t *offset) {
1963
1964 return generic_array_bisect(f,
1965 le64toh(f->header->entry_array_offset),
1966 le64toh(f->header->n_entries),
1967 realtime,
1968 test_object_realtime,
1969 direction,
1970 ret, offset, NULL);
1971}
1972
1973static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1974 Object *o;
1975 int r;
1976
1977 assert(f);
1978 assert(p > 0);
1979
1980 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1981 if (r < 0)
1982 return r;
1983
1984 if (le64toh(o->entry.monotonic) == needle)
1985 return TEST_FOUND;
1986 else if (le64toh(o->entry.monotonic) < needle)
1987 return TEST_LEFT;
1988 else
1989 return TEST_RIGHT;
1990}
1991
1992static int find_data_object_by_boot_id(
1993 JournalFile *f,
1994 sd_id128_t boot_id,
1995 Object **o,
1996 uint64_t *b) {
1997
1998 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1999
2000 sd_id128_to_string(boot_id, t + 9);
2001 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2002}
2003
2004int journal_file_move_to_entry_by_monotonic(
2005 JournalFile *f,
2006 sd_id128_t boot_id,
2007 uint64_t monotonic,
2008 direction_t direction,
2009 Object **ret,
2010 uint64_t *offset) {
2011
2012 Object *o;
2013 int r;
2014
2015 assert(f);
2016
2017 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2018 if (r < 0)
2019 return r;
2020 if (r == 0)
2021 return -ENOENT;
2022
2023 return generic_array_bisect_plus_one(f,
2024 le64toh(o->data.entry_offset),
2025 le64toh(o->data.entry_array_offset),
2026 le64toh(o->data.n_entries),
2027 monotonic,
2028 test_object_monotonic,
2029 direction,
2030 ret, offset, NULL);
2031}
2032
2033void journal_file_reset_location(JournalFile *f) {
2034 f->location_type = LOCATION_HEAD;
2035 f->current_offset = 0;
2036 f->current_seqnum = 0;
2037 f->current_realtime = 0;
2038 f->current_monotonic = 0;
2039 zero(f->current_boot_id);
2040 f->current_xor_hash = 0;
2041}
2042
2043void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2044 f->location_type = LOCATION_SEEK;
2045 f->current_offset = offset;
2046 f->current_seqnum = le64toh(o->entry.seqnum);
2047 f->current_realtime = le64toh(o->entry.realtime);
2048 f->current_monotonic = le64toh(o->entry.monotonic);
2049 f->current_boot_id = o->entry.boot_id;
2050 f->current_xor_hash = le64toh(o->entry.xor_hash);
2051}
2052
2053int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2054 assert(af);
2055 assert(bf);
2056 assert(af->location_type == LOCATION_SEEK);
2057 assert(bf->location_type == LOCATION_SEEK);
2058
2059 /* If contents and timestamps match, these entries are
2060 * identical, even if the seqnum does not match */
2061 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2062 af->current_monotonic == bf->current_monotonic &&
2063 af->current_realtime == bf->current_realtime &&
2064 af->current_xor_hash == bf->current_xor_hash)
2065 return 0;
2066
2067 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2068
2069 /* If this is from the same seqnum source, compare
2070 * seqnums */
2071 if (af->current_seqnum < bf->current_seqnum)
2072 return -1;
2073 if (af->current_seqnum > bf->current_seqnum)
2074 return 1;
2075
2076 /* Wow! This is weird, different data but the same
2077 * seqnums? Something is borked, but let's make the
2078 * best of it and compare by time. */
2079 }
2080
2081 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2082
2083 /* If the boot id matches, compare monotonic time */
2084 if (af->current_monotonic < bf->current_monotonic)
2085 return -1;
2086 if (af->current_monotonic > bf->current_monotonic)
2087 return 1;
2088 }
2089
2090 /* Otherwise, compare UTC time */
2091 if (af->current_realtime < bf->current_realtime)
2092 return -1;
2093 if (af->current_realtime > bf->current_realtime)
2094 return 1;
2095
2096 /* Finally, compare by contents */
2097 if (af->current_xor_hash < bf->current_xor_hash)
2098 return -1;
2099 if (af->current_xor_hash > bf->current_xor_hash)
2100 return 1;
2101
2102 return 0;
2103}
2104
2105int journal_file_next_entry(
2106 JournalFile *f,
2107 uint64_t p,
2108 direction_t direction,
2109 Object **ret, uint64_t *offset) {
2110
2111 uint64_t i, n, ofs;
2112 int r;
2113
2114 assert(f);
2115
2116 n = le64toh(f->header->n_entries);
2117 if (n <= 0)
2118 return 0;
2119
2120 if (p == 0)
2121 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2122 else {
2123 r = generic_array_bisect(f,
2124 le64toh(f->header->entry_array_offset),
2125 le64toh(f->header->n_entries),
2126 p,
2127 test_object_offset,
2128 DIRECTION_DOWN,
2129 NULL, NULL,
2130 &i);
2131 if (r <= 0)
2132 return r;
2133
2134 if (direction == DIRECTION_DOWN) {
2135 if (i >= n - 1)
2136 return 0;
2137
2138 i++;
2139 } else {
2140 if (i <= 0)
2141 return 0;
2142
2143 i--;
2144 }
2145 }
2146
2147 /* And jump to it */
2148 r = generic_array_get(f,
2149 le64toh(f->header->entry_array_offset),
2150 i,
2151 ret, &ofs);
2152 if (r <= 0)
2153 return r;
2154
2155 if (p > 0 &&
2156 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2157 log_debug("%s: entry array corrupted at entry %"PRIu64,
2158 f->path, i);
2159 return -EBADMSG;
2160 }
2161
2162 if (offset)
2163 *offset = ofs;
2164
2165 return 1;
2166}
2167
2168int journal_file_next_entry_for_data(
2169 JournalFile *f,
2170 Object *o, uint64_t p,
2171 uint64_t data_offset,
2172 direction_t direction,
2173 Object **ret, uint64_t *offset) {
2174
2175 uint64_t n, i;
2176 int r;
2177 Object *d;
2178
2179 assert(f);
2180 assert(p > 0 || !o);
2181
2182 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2183 if (r < 0)
2184 return r;
2185
2186 n = le64toh(d->data.n_entries);
2187 if (n <= 0)
2188 return n;
2189
2190 if (!o)
2191 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2192 else {
2193 if (o->object.type != OBJECT_ENTRY)
2194 return -EINVAL;
2195
2196 r = generic_array_bisect_plus_one(f,
2197 le64toh(d->data.entry_offset),
2198 le64toh(d->data.entry_array_offset),
2199 le64toh(d->data.n_entries),
2200 p,
2201 test_object_offset,
2202 DIRECTION_DOWN,
2203 NULL, NULL,
2204 &i);
2205
2206 if (r <= 0)
2207 return r;
2208
2209 if (direction == DIRECTION_DOWN) {
2210 if (i >= n - 1)
2211 return 0;
2212
2213 i++;
2214 } else {
2215 if (i <= 0)
2216 return 0;
2217
2218 i--;
2219 }
2220
2221 }
2222
2223 return generic_array_get_plus_one(f,
2224 le64toh(d->data.entry_offset),
2225 le64toh(d->data.entry_array_offset),
2226 i,
2227 ret, offset);
2228}
2229
2230int journal_file_move_to_entry_by_offset_for_data(
2231 JournalFile *f,
2232 uint64_t data_offset,
2233 uint64_t p,
2234 direction_t direction,
2235 Object **ret, uint64_t *offset) {
2236
2237 int r;
2238 Object *d;
2239
2240 assert(f);
2241
2242 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2243 if (r < 0)
2244 return r;
2245
2246 return generic_array_bisect_plus_one(f,
2247 le64toh(d->data.entry_offset),
2248 le64toh(d->data.entry_array_offset),
2249 le64toh(d->data.n_entries),
2250 p,
2251 test_object_offset,
2252 direction,
2253 ret, offset, NULL);
2254}
2255
2256int journal_file_move_to_entry_by_monotonic_for_data(
2257 JournalFile *f,
2258 uint64_t data_offset,
2259 sd_id128_t boot_id,
2260 uint64_t monotonic,
2261 direction_t direction,
2262 Object **ret, uint64_t *offset) {
2263
2264 Object *o, *d;
2265 int r;
2266 uint64_t b, z;
2267
2268 assert(f);
2269
2270 /* First, seek by time */
2271 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2272 if (r < 0)
2273 return r;
2274 if (r == 0)
2275 return -ENOENT;
2276
2277 r = generic_array_bisect_plus_one(f,
2278 le64toh(o->data.entry_offset),
2279 le64toh(o->data.entry_array_offset),
2280 le64toh(o->data.n_entries),
2281 monotonic,
2282 test_object_monotonic,
2283 direction,
2284 NULL, &z, NULL);
2285 if (r <= 0)
2286 return r;
2287
2288 /* And now, continue seeking until we find an entry that
2289 * exists in both bisection arrays */
2290
2291 for (;;) {
2292 Object *qo;
2293 uint64_t p, q;
2294
2295 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2296 if (r < 0)
2297 return r;
2298
2299 r = generic_array_bisect_plus_one(f,
2300 le64toh(d->data.entry_offset),
2301 le64toh(d->data.entry_array_offset),
2302 le64toh(d->data.n_entries),
2303 z,
2304 test_object_offset,
2305 direction,
2306 NULL, &p, NULL);
2307 if (r <= 0)
2308 return r;
2309
2310 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2311 if (r < 0)
2312 return r;
2313
2314 r = generic_array_bisect_plus_one(f,
2315 le64toh(o->data.entry_offset),
2316 le64toh(o->data.entry_array_offset),
2317 le64toh(o->data.n_entries),
2318 p,
2319 test_object_offset,
2320 direction,
2321 &qo, &q, NULL);
2322
2323 if (r <= 0)
2324 return r;
2325
2326 if (p == q) {
2327 if (ret)
2328 *ret = qo;
2329 if (offset)
2330 *offset = q;
2331
2332 return 1;
2333 }
2334
2335 z = q;
2336 }
2337}
2338
2339int journal_file_move_to_entry_by_seqnum_for_data(
2340 JournalFile *f,
2341 uint64_t data_offset,
2342 uint64_t seqnum,
2343 direction_t direction,
2344 Object **ret, uint64_t *offset) {
2345
2346 Object *d;
2347 int r;
2348
2349 assert(f);
2350
2351 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2352 if (r < 0)
2353 return r;
2354
2355 return generic_array_bisect_plus_one(f,
2356 le64toh(d->data.entry_offset),
2357 le64toh(d->data.entry_array_offset),
2358 le64toh(d->data.n_entries),
2359 seqnum,
2360 test_object_seqnum,
2361 direction,
2362 ret, offset, NULL);
2363}
2364
2365int journal_file_move_to_entry_by_realtime_for_data(
2366 JournalFile *f,
2367 uint64_t data_offset,
2368 uint64_t realtime,
2369 direction_t direction,
2370 Object **ret, uint64_t *offset) {
2371
2372 Object *d;
2373 int r;
2374
2375 assert(f);
2376
2377 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2378 if (r < 0)
2379 return r;
2380
2381 return generic_array_bisect_plus_one(f,
2382 le64toh(d->data.entry_offset),
2383 le64toh(d->data.entry_array_offset),
2384 le64toh(d->data.n_entries),
2385 realtime,
2386 test_object_realtime,
2387 direction,
2388 ret, offset, NULL);
2389}
2390
2391void journal_file_dump(JournalFile *f) {
2392 Object *o;
2393 int r;
2394 uint64_t p;
2395
2396 assert(f);
2397
2398 journal_file_print_header(f);
2399
2400 p = le64toh(f->header->header_size);
2401 while (p != 0) {
2402 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2403 if (r < 0)
2404 goto fail;
2405
2406 switch (o->object.type) {
2407
2408 case OBJECT_UNUSED:
2409 printf("Type: OBJECT_UNUSED\n");
2410 break;
2411
2412 case OBJECT_DATA:
2413 printf("Type: OBJECT_DATA\n");
2414 break;
2415
2416 case OBJECT_FIELD:
2417 printf("Type: OBJECT_FIELD\n");
2418 break;
2419
2420 case OBJECT_ENTRY:
2421 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2422 le64toh(o->entry.seqnum),
2423 le64toh(o->entry.monotonic),
2424 le64toh(o->entry.realtime));
2425 break;
2426
2427 case OBJECT_FIELD_HASH_TABLE:
2428 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2429 break;
2430
2431 case OBJECT_DATA_HASH_TABLE:
2432 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2433 break;
2434
2435 case OBJECT_ENTRY_ARRAY:
2436 printf("Type: OBJECT_ENTRY_ARRAY\n");
2437 break;
2438
2439 case OBJECT_TAG:
2440 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2441 le64toh(o->tag.seqnum),
2442 le64toh(o->tag.epoch));
2443 break;
2444
2445 default:
2446 printf("Type: unknown (%i)\n", o->object.type);
2447 break;
2448 }
2449
2450 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2451 printf("Flags: %s\n",
2452 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2453
2454 if (p == le64toh(f->header->tail_object_offset))
2455 p = 0;
2456 else
2457 p = p + ALIGN64(le64toh(o->object.size));
2458 }
2459
2460 return;
2461fail:
2462 log_error("File corrupt");
2463}
2464
2465static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2466 const char *x;
2467
2468 x = format_timestamp(buf, l, t);
2469 if (x)
2470 return x;
2471 return " --- ";
2472}
2473
2474void journal_file_print_header(JournalFile *f) {
2475 char a[33], b[33], c[33], d[33];
2476 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2477 struct stat st;
2478 char bytes[FORMAT_BYTES_MAX];
2479
2480 assert(f);
2481
2482 printf("File Path: %s\n"
2483 "File ID: %s\n"
2484 "Machine ID: %s\n"
2485 "Boot ID: %s\n"
2486 "Sequential Number ID: %s\n"
2487 "State: %s\n"
2488 "Compatible Flags:%s%s\n"
2489 "Incompatible Flags:%s%s%s\n"
2490 "Header size: %"PRIu64"\n"
2491 "Arena size: %"PRIu64"\n"
2492 "Data Hash Table Size: %"PRIu64"\n"
2493 "Field Hash Table Size: %"PRIu64"\n"
2494 "Rotate Suggested: %s\n"
2495 "Head Sequential Number: %"PRIu64"\n"
2496 "Tail Sequential Number: %"PRIu64"\n"
2497 "Head Realtime Timestamp: %s\n"
2498 "Tail Realtime Timestamp: %s\n"
2499 "Tail Monotonic Timestamp: %s\n"
2500 "Objects: %"PRIu64"\n"
2501 "Entry Objects: %"PRIu64"\n",
2502 f->path,
2503 sd_id128_to_string(f->header->file_id, a),
2504 sd_id128_to_string(f->header->machine_id, b),
2505 sd_id128_to_string(f->header->boot_id, c),
2506 sd_id128_to_string(f->header->seqnum_id, d),
2507 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2508 f->header->state == STATE_ONLINE ? "ONLINE" :
2509 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2510 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2511 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2512 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2513 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2514 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2515 le64toh(f->header->header_size),
2516 le64toh(f->header->arena_size),
2517 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2518 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2519 yes_no(journal_file_rotate_suggested(f, 0)),
2520 le64toh(f->header->head_entry_seqnum),
2521 le64toh(f->header->tail_entry_seqnum),
2522 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2523 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2524 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2525 le64toh(f->header->n_objects),
2526 le64toh(f->header->n_entries));
2527
2528 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2529 printf("Data Objects: %"PRIu64"\n"
2530 "Data Hash Table Fill: %.1f%%\n",
2531 le64toh(f->header->n_data),
2532 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2533
2534 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2535 printf("Field Objects: %"PRIu64"\n"
2536 "Field Hash Table Fill: %.1f%%\n",
2537 le64toh(f->header->n_fields),
2538 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2539
2540 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2541 printf("Tag Objects: %"PRIu64"\n",
2542 le64toh(f->header->n_tags));
2543 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2544 printf("Entry Array Objects: %"PRIu64"\n",
2545 le64toh(f->header->n_entry_arrays));
2546
2547 if (fstat(f->fd, &st) >= 0)
2548 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2549}
2550
2551static int journal_file_warn_btrfs(JournalFile *f) {
2552 unsigned attrs;
2553 int r;
2554
2555 assert(f);
2556
2557 /* Before we write anything, check if the COW logic is turned
2558 * off on btrfs. Given our write pattern that is quite
2559 * unfriendly to COW file systems this should greatly improve
2560 * performance on COW file systems, such as btrfs, at the
2561 * expense of data integrity features (which shouldn't be too
2562 * bad, given that we do our own checksumming). */
2563
2564 r = btrfs_is_filesystem(f->fd);
2565 if (r < 0)
2566 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2567 if (!r)
2568 return 0;
2569
2570 r = read_attr_fd(f->fd, &attrs);
2571 if (r < 0)
2572 return log_warning_errno(r, "Failed to read file attributes: %m");
2573
2574 if (attrs & FS_NOCOW_FL) {
2575 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2576 return 0;
2577 }
2578
2579 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2580 "This is likely to slow down journal access substantially, please consider turning "
2581 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2582
2583 return 1;
2584}
2585
2586int journal_file_open(
2587 const char *fname,
2588 int flags,
2589 mode_t mode,
2590 bool compress,
2591 bool seal,
2592 JournalMetrics *metrics,
2593 MMapCache *mmap_cache,
2594 JournalFile *template,
2595 JournalFile **ret) {
2596
2597 bool newly_created = false;
2598 JournalFile *f;
2599 void *h;
2600 int r;
2601
2602 assert(fname);
2603 assert(ret);
2604
2605 if ((flags & O_ACCMODE) != O_RDONLY &&
2606 (flags & O_ACCMODE) != O_RDWR)
2607 return -EINVAL;
2608
2609 if (!endswith(fname, ".journal") &&
2610 !endswith(fname, ".journal~"))
2611 return -EINVAL;
2612
2613 f = new0(JournalFile, 1);
2614 if (!f)
2615 return -ENOMEM;
2616
2617 f->fd = -1;
2618 f->mode = mode;
2619
2620 f->flags = flags;
2621 f->prot = prot_from_flags(flags);
2622 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2623#if defined(HAVE_LZ4)
2624 f->compress_lz4 = compress;
2625#elif defined(HAVE_XZ)
2626 f->compress_xz = compress;
2627#endif
2628#ifdef HAVE_GCRYPT
2629 f->seal = seal;
2630#endif
2631
2632 if (mmap_cache)
2633 f->mmap = mmap_cache_ref(mmap_cache);
2634 else {
2635 f->mmap = mmap_cache_new();
2636 if (!f->mmap) {
2637 r = -ENOMEM;
2638 goto fail;
2639 }
2640 }
2641
2642 f->path = strdup(fname);
2643 if (!f->path) {
2644 r = -ENOMEM;
2645 goto fail;
2646 }
2647
2648 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2649 if (!f->chain_cache) {
2650 r = -ENOMEM;
2651 goto fail;
2652 }
2653
2654 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2655 if (f->fd < 0) {
2656 r = -errno;
2657 goto fail;
2658 }
2659
2660 r = journal_file_fstat(f);
2661 if (r < 0)
2662 goto fail;
2663
2664 if (f->last_stat.st_size == 0 && f->writable) {
2665
2666 (void) journal_file_warn_btrfs(f);
2667
2668 /* Let's attach the creation time to the journal file,
2669 * so that the vacuuming code knows the age of this
2670 * file even if the file might end up corrupted one
2671 * day... Ideally we'd just use the creation time many
2672 * file systems maintain for each file, but there is
2673 * currently no usable API to query this, hence let's
2674 * emulate this via extended attributes. If extended
2675 * attributes are not supported we'll just skip this,
2676 * and rely solely on mtime/atime/ctime of the file. */
2677
2678 fd_setcrtime(f->fd, 0);
2679
2680#ifdef HAVE_GCRYPT
2681 /* Try to load the FSPRG state, and if we can't, then
2682 * just don't do sealing */
2683 if (f->seal) {
2684 r = journal_file_fss_load(f);
2685 if (r < 0)
2686 f->seal = false;
2687 }
2688#endif
2689
2690 r = journal_file_init_header(f, template);
2691 if (r < 0)
2692 goto fail;
2693
2694 r = journal_file_fstat(f);
2695 if (r < 0)
2696 goto fail;
2697
2698 newly_created = true;
2699 }
2700
2701 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2702 r = -EIO;
2703 goto fail;
2704 }
2705
2706 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2707 if (r < 0)
2708 goto fail;
2709
2710 f->header = h;
2711
2712 if (!newly_created) {
2713 r = journal_file_verify_header(f);
2714 if (r < 0)
2715 goto fail;
2716 }
2717
2718#ifdef HAVE_GCRYPT
2719 if (!newly_created && f->writable) {
2720 r = journal_file_fss_load(f);
2721 if (r < 0)
2722 goto fail;
2723 }
2724#endif
2725
2726 if (f->writable) {
2727 if (metrics) {
2728 journal_default_metrics(metrics, f->fd);
2729 f->metrics = *metrics;
2730 } else if (template)
2731 f->metrics = template->metrics;
2732
2733 r = journal_file_refresh_header(f);
2734 if (r < 0)
2735 goto fail;
2736 }
2737
2738#ifdef HAVE_GCRYPT
2739 r = journal_file_hmac_setup(f);
2740 if (r < 0)
2741 goto fail;
2742#endif
2743
2744 if (newly_created) {
2745 r = journal_file_setup_field_hash_table(f);
2746 if (r < 0)
2747 goto fail;
2748
2749 r = journal_file_setup_data_hash_table(f);
2750 if (r < 0)
2751 goto fail;
2752
2753#ifdef HAVE_GCRYPT
2754 r = journal_file_append_first_tag(f);
2755 if (r < 0)
2756 goto fail;
2757#endif
2758 }
2759
2760 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2761 r = -EIO;
2762 goto fail;
2763 }
2764
2765 *ret = f;
2766 return 0;
2767
2768fail:
2769 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2770 r = -EIO;
2771
2772 journal_file_close(f);
2773
2774 return r;
2775}
2776
2777int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2778 _cleanup_free_ char *p = NULL;
2779 size_t l;
2780 JournalFile *old_file, *new_file = NULL;
2781 int r;
2782
2783 assert(f);
2784 assert(*f);
2785
2786 old_file = *f;
2787
2788 if (!old_file->writable)
2789 return -EINVAL;
2790
2791 if (!endswith(old_file->path, ".journal"))
2792 return -EINVAL;
2793
2794 l = strlen(old_file->path);
2795 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2796 (int) l - 8, old_file->path,
2797 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2798 le64toh((*f)->header->head_entry_seqnum),
2799 le64toh((*f)->header->head_entry_realtime));
2800 if (r < 0)
2801 return -ENOMEM;
2802
2803 /* Try to rename the file to the archived version. If the file
2804 * already was deleted, we'll get ENOENT, let's ignore that
2805 * case. */
2806 r = rename(old_file->path, p);
2807 if (r < 0 && errno != ENOENT)
2808 return -errno;
2809
2810 old_file->header->state = STATE_ARCHIVED;
2811
2812 /* Currently, btrfs is not very good with out write patterns
2813 * and fragments heavily. Let's defrag our journal files when
2814 * we archive them */
2815 old_file->defrag_on_close = true;
2816
2817 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2818 journal_file_close(old_file);
2819
2820 *f = new_file;
2821 return r;
2822}
2823
2824int journal_file_open_reliably(
2825 const char *fname,
2826 int flags,
2827 mode_t mode,
2828 bool compress,
2829 bool seal,
2830 JournalMetrics *metrics,
2831 MMapCache *mmap_cache,
2832 JournalFile *template,
2833 JournalFile **ret) {
2834
2835 int r;
2836 size_t l;
2837 _cleanup_free_ char *p = NULL;
2838
2839 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2840 if (!IN_SET(r,
2841 -EBADMSG, /* corrupted */
2842 -ENODATA, /* truncated */
2843 -EHOSTDOWN, /* other machine */
2844 -EPROTONOSUPPORT, /* incompatible feature */
2845 -EBUSY, /* unclean shutdown */
2846 -ESHUTDOWN, /* already archived */
2847 -EIO, /* IO error, including SIGBUS on mmap */
2848 -EIDRM /* File has been deleted */))
2849 return r;
2850
2851 if ((flags & O_ACCMODE) == O_RDONLY)
2852 return r;
2853
2854 if (!(flags & O_CREAT))
2855 return r;
2856
2857 if (!endswith(fname, ".journal"))
2858 return r;
2859
2860 /* The file is corrupted. Rotate it away and try it again (but only once) */
2861
2862 l = strlen(fname);
2863 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2864 (int) l - 8, fname,
2865 now(CLOCK_REALTIME),
2866 random_u64()) < 0)
2867 return -ENOMEM;
2868
2869 if (rename(fname, p) < 0)
2870 return -errno;
2871
2872 /* btrfs doesn't cope well with our write pattern and
2873 * fragments heavily. Let's defrag all files we rotate */
2874
2875 (void) chattr_path(p, false, FS_NOCOW_FL);
2876 (void) btrfs_defrag(p);
2877
2878 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2879
2880 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2881}
2882
2883int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2884 uint64_t i, n;
2885 uint64_t q, xor_hash = 0;
2886 int r;
2887 EntryItem *items;
2888 dual_timestamp ts;
2889
2890 assert(from);
2891 assert(to);
2892 assert(o);
2893 assert(p);
2894
2895 if (!to->writable)
2896 return -EPERM;
2897
2898 ts.monotonic = le64toh(o->entry.monotonic);
2899 ts.realtime = le64toh(o->entry.realtime);
2900
2901 n = journal_file_entry_n_items(o);
2902 /* alloca() can't take 0, hence let's allocate at least one */
2903 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2904
2905 for (i = 0; i < n; i++) {
2906 uint64_t l, h;
2907 le64_t le_hash;
2908 size_t t;
2909 void *data;
2910 Object *u;
2911
2912 q = le64toh(o->entry.items[i].object_offset);
2913 le_hash = o->entry.items[i].hash;
2914
2915 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2916 if (r < 0)
2917 return r;
2918
2919 if (le_hash != o->data.hash)
2920 return -EBADMSG;
2921
2922 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2923 t = (size_t) l;
2924
2925 /* We hit the limit on 32bit machines */
2926 if ((uint64_t) t != l)
2927 return -E2BIG;
2928
2929 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2930#if defined(HAVE_XZ) || defined(HAVE_LZ4)
2931 size_t rsize = 0;
2932
2933 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2934 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2935 if (r < 0)
2936 return r;
2937
2938 data = from->compress_buffer;
2939 l = rsize;
2940#else
2941 return -EPROTONOSUPPORT;
2942#endif
2943 } else
2944 data = o->data.payload;
2945
2946 r = journal_file_append_data(to, data, l, &u, &h);
2947 if (r < 0)
2948 return r;
2949
2950 xor_hash ^= le64toh(u->data.hash);
2951 items[i].object_offset = htole64(h);
2952 items[i].hash = u->data.hash;
2953
2954 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2955 if (r < 0)
2956 return r;
2957 }
2958
2959 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2960
2961 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2962 return -EIO;
2963
2964 return r;
2965}
2966
2967void journal_reset_metrics(JournalMetrics *m) {
2968 assert(m);
2969
2970 /* Set everything to "pick automatic values". */
2971
2972 *m = (JournalMetrics) {
2973 .min_use = (uint64_t) -1,
2974 .max_use = (uint64_t) -1,
2975 .min_size = (uint64_t) -1,
2976 .max_size = (uint64_t) -1,
2977 .keep_free = (uint64_t) -1,
2978 .n_max_files = (uint64_t) -1,
2979 };
2980}
2981
2982void journal_default_metrics(JournalMetrics *m, int fd) {
2983 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
2984 struct statvfs ss;
2985 uint64_t fs_size;
2986
2987 assert(m);
2988 assert(fd >= 0);
2989
2990 if (fstatvfs(fd, &ss) >= 0)
2991 fs_size = ss.f_frsize * ss.f_blocks;
2992 else {
2993 log_debug_errno(errno, "Failed to detremine disk size: %m");
2994 fs_size = 0;
2995 }
2996
2997 if (m->max_use == (uint64_t) -1) {
2998
2999 if (fs_size > 0) {
3000 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3001
3002 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3003 m->max_use = DEFAULT_MAX_USE_UPPER;
3004
3005 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3006 m->max_use = DEFAULT_MAX_USE_LOWER;
3007 } else
3008 m->max_use = DEFAULT_MAX_USE_LOWER;
3009 } else {
3010 m->max_use = PAGE_ALIGN(m->max_use);
3011
3012 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3013 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3014 }
3015
3016 if (m->min_use == (uint64_t) -1)
3017 m->min_use = DEFAULT_MIN_USE;
3018
3019 if (m->min_use > m->max_use)
3020 m->min_use = m->max_use;
3021
3022 if (m->max_size == (uint64_t) -1) {
3023 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3024
3025 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3026 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3027 } else
3028 m->max_size = PAGE_ALIGN(m->max_size);
3029
3030 if (m->max_size != 0) {
3031 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3032 m->max_size = JOURNAL_FILE_SIZE_MIN;
3033
3034 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3035 m->max_use = m->max_size*2;
3036 }
3037
3038 if (m->min_size == (uint64_t) -1)
3039 m->min_size = JOURNAL_FILE_SIZE_MIN;
3040 else {
3041 m->min_size = PAGE_ALIGN(m->min_size);
3042
3043 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3044 m->min_size = JOURNAL_FILE_SIZE_MIN;
3045
3046 if (m->max_size != 0 && m->min_size > m->max_size)
3047 m->max_size = m->min_size;
3048 }
3049
3050 if (m->keep_free == (uint64_t) -1) {
3051
3052 if (fs_size > 0) {
3053 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3054
3055 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3056 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3057
3058 } else
3059 m->keep_free = DEFAULT_KEEP_FREE;
3060 }
3061
3062 if (m->n_max_files == (uint64_t) -1)
3063 m->n_max_files = DEFAULT_N_MAX_FILES;
3064
3065 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3066 format_bytes(a, sizeof(a), m->min_use),
3067 format_bytes(b, sizeof(b), m->max_use),
3068 format_bytes(c, sizeof(c), m->max_size),
3069 format_bytes(d, sizeof(d), m->min_size),
3070 format_bytes(e, sizeof(e), m->keep_free),
3071 m->n_max_files);
3072}
3073
3074int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3075 assert(f);
3076 assert(from || to);
3077
3078 if (from) {
3079 if (f->header->head_entry_realtime == 0)
3080 return -ENOENT;
3081
3082 *from = le64toh(f->header->head_entry_realtime);
3083 }
3084
3085 if (to) {
3086 if (f->header->tail_entry_realtime == 0)
3087 return -ENOENT;
3088
3089 *to = le64toh(f->header->tail_entry_realtime);
3090 }
3091
3092 return 1;
3093}
3094
3095int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3096 Object *o;
3097 uint64_t p;
3098 int r;
3099
3100 assert(f);
3101 assert(from || to);
3102
3103 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3104 if (r <= 0)
3105 return r;
3106
3107 if (le64toh(o->data.n_entries) <= 0)
3108 return 0;
3109
3110 if (from) {
3111 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3112 if (r < 0)
3113 return r;
3114
3115 *from = le64toh(o->entry.monotonic);
3116 }
3117
3118 if (to) {
3119 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3120 if (r < 0)
3121 return r;
3122
3123 r = generic_array_get_plus_one(f,
3124 le64toh(o->data.entry_offset),
3125 le64toh(o->data.entry_array_offset),
3126 le64toh(o->data.n_entries)-1,
3127 &o, NULL);
3128 if (r <= 0)
3129 return r;
3130
3131 *to = le64toh(o->entry.monotonic);
3132 }
3133
3134 return 1;
3135}
3136
3137bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3138 assert(f);
3139
3140 /* If we gained new header fields we gained new features,
3141 * hence suggest a rotation */
3142 if (le64toh(f->header->header_size) < sizeof(Header)) {
3143 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3144 return true;
3145 }
3146
3147 /* Let's check if the hash tables grew over a certain fill
3148 * level (75%, borrowing this value from Java's hash table
3149 * implementation), and if so suggest a rotation. To calculate
3150 * the fill level we need the n_data field, which only exists
3151 * in newer versions. */
3152
3153 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3154 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3155 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3156 f->path,
3157 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3158 le64toh(f->header->n_data),
3159 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3160 (unsigned long long) f->last_stat.st_size,
3161 f->last_stat.st_size / le64toh(f->header->n_data));
3162 return true;
3163 }
3164
3165 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3166 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3167 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3168 f->path,
3169 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3170 le64toh(f->header->n_fields),
3171 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3172 return true;
3173 }
3174
3175 /* Are the data objects properly indexed by field objects? */
3176 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3177 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3178 le64toh(f->header->n_data) > 0 &&
3179 le64toh(f->header->n_fields) == 0)
3180 return true;
3181
3182 if (max_file_usec > 0) {
3183 usec_t t, h;
3184
3185 h = le64toh(f->header->head_entry_realtime);
3186 t = now(CLOCK_REALTIME);
3187
3188 if (h > 0 && t > h + max_file_usec)
3189 return true;
3190 }
3191
3192 return false;
3193}