]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
util-lib: split our string related calls from util.[ch] into its own file string...
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <linux/fs.h>
25 #include <stddef.h>
26 #include <sys/mman.h>
27 #include <sys/statvfs.h>
28 #include <sys/uio.h>
29 #include <unistd.h>
30
31 #include "btrfs-util.h"
32 #include "compress.h"
33 #include "journal-authenticate.h"
34 #include "journal-def.h"
35 #include "lookup3.h"
36 #include "random-util.h"
37 #include "string-util.h"
38 #include "journal-file.h"
39
40 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem)) /* lower bound, in bytes, for the data hash table */
41 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem)) /* fixed size, in bytes, of the field hash table */
42
43 #define COMPRESSION_SIZE_THRESHOLD (512ULL) /* data payloads smaller than this many bytes are never compressed */
44
45 /* This is the minimum journal file size */
46 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
47
48 /* These are the lower and upper bounds if we deduce the max_use value
49 * from the file system size */
50 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
51 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
52
53 /* This is the default minimal use limit, i.e. how much we'll use even if keep_free suggests otherwise. */
54 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
55
56 /* This is the upper bound if we deduce max_size from max_use */
57 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
58
59 /* This is the upper bound if we deduce the keep_free value from the
60 * file system size */
61 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
62
63 /* This is the keep_free value when we can't determine the file system
64 * size */
65 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
66
67 /* This is the default maximum number of journal files to keep around. */
68 #define DEFAULT_N_MAX_FILES (100)
69
70 /* n_data was the first entry we added after the initial file format design */
71 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
72
73 /* How many entries to keep in the entry array chain cache at max */
74 #define CHAIN_CACHE_MAX 20
75
76 /* How much to increase the journal file size at once each time we allocate something new. */
77 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
78
79 /* Reread fstat() of the file for detecting deletions at least this often */
80 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
81
82 /* The mmap context to use for the header; we pick the one just above the last defined object type */
83 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
84
85 static int journal_file_set_online(JournalFile *f) {
86 assert(f);
87
88 if (!f->writable)
89 return -EPERM;
90
91 if (!(f->fd >= 0 && f->header))
92 return -EINVAL;
93
94 if (mmap_cache_got_sigbus(f->mmap, f->fd))
95 return -EIO;
96
97 switch(f->header->state) {
98 case STATE_ONLINE:
99 return 0;
100
101 case STATE_OFFLINE:
102 f->header->state = STATE_ONLINE;
103 fsync(f->fd);
104 return 0;
105
106 default:
107 return -EINVAL;
108 }
109 }
110
111 int journal_file_set_offline(JournalFile *f) {
112 assert(f);
113
114 if (!f->writable)
115 return -EPERM;
116
117 if (!(f->fd >= 0 && f->header))
118 return -EINVAL;
119
120 if (f->header->state != STATE_ONLINE)
121 return 0;
122
123 fsync(f->fd);
124
125 if (mmap_cache_got_sigbus(f->mmap, f->fd))
126 return -EIO;
127
128 f->header->state = STATE_OFFLINE;
129
130 if (mmap_cache_got_sigbus(f->mmap, f->fd))
131 return -EIO;
132
133 fsync(f->fd);
134
135 return 0;
136 }
137
138 JournalFile* journal_file_close(JournalFile *f) {
139 assert(f);
140
141 #ifdef HAVE_GCRYPT
142 /* Write the final tag */
143 if (f->seal && f->writable)
144 journal_file_append_tag(f);
145 #endif
146
147 journal_file_set_offline(f);
148
149 if (f->mmap && f->fd >= 0)
150 mmap_cache_close_fd(f->mmap, f->fd);
151
152 if (f->fd >= 0 && f->defrag_on_close) {
153
154 /* Be friendly to btrfs: turn COW back on again now,
155 * and defragment the file. We won't write to the file
156 * ever again, hence remove all fragmentation, and
157 * reenable all the good bits COW usually provides
158 * (such as data checksumming). */
159
160 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
161 (void) btrfs_defrag_fd(f->fd);
162 }
163
164 safe_close(f->fd);
165 free(f->path);
166
167 if (f->mmap)
168 mmap_cache_unref(f->mmap);
169
170 ordered_hashmap_free_free(f->chain_cache);
171
172 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
173 free(f->compress_buffer);
174 #endif
175
176 #ifdef HAVE_GCRYPT
177 if (f->fss_file)
178 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
179 else
180 free(f->fsprg_state);
181
182 free(f->fsprg_seed);
183
184 if (f->hmac)
185 gcry_md_close(f->hmac);
186 #endif
187
188 free(f);
189 return NULL;
190 }
191
192 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
193 Header h = {};
194 ssize_t k;
195 int r;
196
197 assert(f);
198
199 memcpy(h.signature, HEADER_SIGNATURE, 8);
200 h.header_size = htole64(ALIGN64(sizeof(h)));
201
202 h.incompatible_flags |= htole32(
203 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
204 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
205
206 h.compatible_flags = htole32(
207 f->seal * HEADER_COMPATIBLE_SEALED);
208
209 r = sd_id128_randomize(&h.file_id);
210 if (r < 0)
211 return r;
212
213 if (template) {
214 h.seqnum_id = template->header->seqnum_id;
215 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
216 } else
217 h.seqnum_id = h.file_id;
218
219 k = pwrite(f->fd, &h, sizeof(h), 0);
220 if (k < 0)
221 return -errno;
222
223 if (k != sizeof(h))
224 return -EIO;
225
226 return 0;
227 }
228
229 static int journal_file_refresh_header(JournalFile *f) {
230 sd_id128_t boot_id;
231 int r;
232
233 assert(f);
234
235 r = sd_id128_get_machine(&f->header->machine_id);
236 if (r < 0)
237 return r;
238
239 r = sd_id128_get_boot(&boot_id);
240 if (r < 0)
241 return r;
242
243 if (sd_id128_equal(boot_id, f->header->boot_id))
244 f->tail_entry_monotonic_valid = true;
245
246 f->header->boot_id = boot_id;
247
248 r = journal_file_set_online(f);
249
250 /* Sync the online state to disk */
251 fsync(f->fd);
252
253 return r;
254 }
255
256 static int journal_file_verify_header(JournalFile *f) {
257 uint32_t flags;
258
259 assert(f);
260
261 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
262 return -EBADMSG;
263
264 /* In both read and write mode we refuse to open files with
265 * incompatible flags we don't know */
266 flags = le32toh(f->header->incompatible_flags);
267 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
268 if (flags & ~HEADER_INCOMPATIBLE_ANY)
269 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
270 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
271 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
272 if (flags)
273 log_debug("Journal file %s uses incompatible flags %"PRIx32
274 " disabled at compilation time.", f->path, flags);
275 return -EPROTONOSUPPORT;
276 }
277
278 /* When open for writing we refuse to open files with
279 * compatible flags, too */
280 flags = le32toh(f->header->compatible_flags);
281 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
282 if (flags & ~HEADER_COMPATIBLE_ANY)
283 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
284 f->path, flags & ~HEADER_COMPATIBLE_ANY);
285 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
286 if (flags)
287 log_debug("Journal file %s uses compatible flags %"PRIx32
288 " disabled at compilation time.", f->path, flags);
289 return -EPROTONOSUPPORT;
290 }
291
292 if (f->header->state >= _STATE_MAX)
293 return -EBADMSG;
294
295 /* The first addition was n_data, so check that we are at least this large */
296 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
297 return -EBADMSG;
298
299 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
300 return -EBADMSG;
301
302 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
303 return -ENODATA;
304
305 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
306 return -ENODATA;
307
308 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
309 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
310 !VALID64(le64toh(f->header->tail_object_offset)) ||
311 !VALID64(le64toh(f->header->entry_array_offset)))
312 return -ENODATA;
313
314 if (f->writable) {
315 uint8_t state;
316 sd_id128_t machine_id;
317 int r;
318
319 r = sd_id128_get_machine(&machine_id);
320 if (r < 0)
321 return r;
322
323 if (!sd_id128_equal(machine_id, f->header->machine_id))
324 return -EHOSTDOWN;
325
326 state = f->header->state;
327
328 if (state == STATE_ONLINE) {
329 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
330 return -EBUSY;
331 } else if (state == STATE_ARCHIVED)
332 return -ESHUTDOWN;
333 else if (state != STATE_OFFLINE) {
334 log_debug("Journal file %s has unknown state %i.", f->path, state);
335 return -EBUSY;
336 }
337 }
338
339 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
340 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
341
342 f->seal = JOURNAL_HEADER_SEALED(f->header);
343
344 return 0;
345 }
346
347 static int journal_file_fstat(JournalFile *f) {
348 assert(f);
349 assert(f->fd >= 0);
350
351 if (fstat(f->fd, &f->last_stat) < 0)
352 return -errno;
353
354 f->last_stat_usec = now(CLOCK_MONOTONIC);
355
356 /* Refuse appending to files that are already deleted */
357 if (f->last_stat.st_nlink <= 0)
358 return -EIDRM;
359
360 return 0;
361 }
362
363 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
364 uint64_t old_size, new_size;
365 int r;
366
367 assert(f);
368
369 /* We assume that this file is not sparse, and we know that
370 * for sure, since we always call posix_fallocate()
371 * ourselves */
372
373 if (mmap_cache_got_sigbus(f->mmap, f->fd))
374 return -EIO;
375
376 old_size =
377 le64toh(f->header->header_size) +
378 le64toh(f->header->arena_size);
379
380 new_size = PAGE_ALIGN(offset + size);
381 if (new_size < le64toh(f->header->header_size))
382 new_size = le64toh(f->header->header_size);
383
384 if (new_size <= old_size) {
385
386 /* We already pre-allocated enough space, but before
387 * we write to it, let's check with fstat() if the
388 * file got deleted, in order make sure we don't throw
389 * away the data immediately. Don't check fstat() for
390 * all writes though, but only once ever 10s. */
391
392 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
393 return 0;
394
395 return journal_file_fstat(f);
396 }
397
398 /* Allocate more space. */
399
400 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
401 return -E2BIG;
402
403 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
404 struct statvfs svfs;
405
406 if (fstatvfs(f->fd, &svfs) >= 0) {
407 uint64_t available;
408
409 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
410
411 if (new_size - old_size > available)
412 return -E2BIG;
413 }
414 }
415
416 /* Increase by larger blocks at once */
417 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
418 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
419 new_size = f->metrics.max_size;
420
421 /* Note that the glibc fallocate() fallback is very
422 inefficient, hence we try to minimize the allocation area
423 as we can. */
424 r = posix_fallocate(f->fd, old_size, new_size - old_size);
425 if (r != 0)
426 return -r;
427
428 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
429
430 return journal_file_fstat(f);
431 }
432
433 static unsigned type_to_context(ObjectType type) {
434 /* One context for each type, plus one catch-all for the rest */
435 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
436 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
437 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
438 }
439
440 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
441 int r;
442
443 assert(f);
444 assert(ret);
445
446 if (size <= 0)
447 return -EINVAL;
448
449 /* Avoid SIGBUS on invalid accesses */
450 if (offset + size > (uint64_t) f->last_stat.st_size) {
451 /* Hmm, out of range? Let's refresh the fstat() data
452 * first, before we trust that check. */
453
454 r = journal_file_fstat(f);
455 if (r < 0)
456 return r;
457
458 if (offset + size > (uint64_t) f->last_stat.st_size)
459 return -EADDRNOTAVAIL;
460 }
461
462 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
463 }
464
465 static uint64_t minimum_header_size(Object *o) {
466
467 static const uint64_t table[] = {
468 [OBJECT_DATA] = sizeof(DataObject),
469 [OBJECT_FIELD] = sizeof(FieldObject),
470 [OBJECT_ENTRY] = sizeof(EntryObject),
471 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
472 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
473 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
474 [OBJECT_TAG] = sizeof(TagObject),
475 };
476
477 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
478 return sizeof(ObjectHeader);
479
480 return table[o->object.type];
481 }
482
483 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
484 int r;
485 void *t;
486 Object *o;
487 uint64_t s;
488
489 assert(f);
490 assert(ret);
491
492 /* Objects may only be located at multiple of 64 bit */
493 if (!VALID64(offset))
494 return -EFAULT;
495
496 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
497 if (r < 0)
498 return r;
499
500 o = (Object*) t;
501 s = le64toh(o->object.size);
502
503 if (s < sizeof(ObjectHeader))
504 return -EBADMSG;
505
506 if (o->object.type <= OBJECT_UNUSED)
507 return -EBADMSG;
508
509 if (s < minimum_header_size(o))
510 return -EBADMSG;
511
512 if (type > OBJECT_UNUSED && o->object.type != type)
513 return -EBADMSG;
514
515 if (s > sizeof(ObjectHeader)) {
516 r = journal_file_move_to(f, type, false, offset, s, &t);
517 if (r < 0)
518 return r;
519
520 o = (Object*) t;
521 }
522
523 *ret = o;
524 return 0;
525 }
526
527 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
528 uint64_t r;
529
530 assert(f);
531
532 r = le64toh(f->header->tail_entry_seqnum) + 1;
533
534 if (seqnum) {
535 /* If an external seqnum counter was passed, we update
536 * both the local and the external one, and set it to
537 * the maximum of both */
538
539 if (*seqnum + 1 > r)
540 r = *seqnum + 1;
541
542 *seqnum = r;
543 }
544
545 f->header->tail_entry_seqnum = htole64(r);
546
547 if (f->header->head_entry_seqnum == 0)
548 f->header->head_entry_seqnum = htole64(r);
549
550 return r;
551 }
552
553 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
554 int r;
555 uint64_t p;
556 Object *tail, *o;
557 void *t;
558
559 assert(f);
560 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
561 assert(size >= sizeof(ObjectHeader));
562 assert(offset);
563 assert(ret);
564
565 r = journal_file_set_online(f);
566 if (r < 0)
567 return r;
568
569 p = le64toh(f->header->tail_object_offset);
570 if (p == 0)
571 p = le64toh(f->header->header_size);
572 else {
573 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
574 if (r < 0)
575 return r;
576
577 p += ALIGN64(le64toh(tail->object.size));
578 }
579
580 r = journal_file_allocate(f, p, size);
581 if (r < 0)
582 return r;
583
584 r = journal_file_move_to(f, type, false, p, size, &t);
585 if (r < 0)
586 return r;
587
588 o = (Object*) t;
589
590 zero(o->object);
591 o->object.type = type;
592 o->object.size = htole64(size);
593
594 f->header->tail_object_offset = htole64(p);
595 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
596
597 *ret = o;
598 *offset = p;
599
600 return 0;
601 }
602
603 static int journal_file_setup_data_hash_table(JournalFile *f) {
604 uint64_t s, p;
605 Object *o;
606 int r;
607
608 assert(f);
609
610 /* We estimate that we need 1 hash table entry per 768 bytes
611 of journal file and we want to make sure we never get
612 beyond 75% fill level. Calculate the hash table size for
613 the maximum file size based on these metrics. */
614
615 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
616 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
617 s = DEFAULT_DATA_HASH_TABLE_SIZE;
618
619 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
620
621 r = journal_file_append_object(f,
622 OBJECT_DATA_HASH_TABLE,
623 offsetof(Object, hash_table.items) + s,
624 &o, &p);
625 if (r < 0)
626 return r;
627
628 memzero(o->hash_table.items, s);
629
630 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
631 f->header->data_hash_table_size = htole64(s);
632
633 return 0;
634 }
635
636 static int journal_file_setup_field_hash_table(JournalFile *f) {
637 uint64_t s, p;
638 Object *o;
639 int r;
640
641 assert(f);
642
643 /* We use a fixed size hash table for the fields as this
644 * number should grow very slowly only */
645
646 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
647 r = journal_file_append_object(f,
648 OBJECT_FIELD_HASH_TABLE,
649 offsetof(Object, hash_table.items) + s,
650 &o, &p);
651 if (r < 0)
652 return r;
653
654 memzero(o->hash_table.items, s);
655
656 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
657 f->header->field_hash_table_size = htole64(s);
658
659 return 0;
660 }
661
662 int journal_file_map_data_hash_table(JournalFile *f) {
663 uint64_t s, p;
664 void *t;
665 int r;
666
667 assert(f);
668
669 if (f->data_hash_table)
670 return 0;
671
672 p = le64toh(f->header->data_hash_table_offset);
673 s = le64toh(f->header->data_hash_table_size);
674
675 r = journal_file_move_to(f,
676 OBJECT_DATA_HASH_TABLE,
677 true,
678 p, s,
679 &t);
680 if (r < 0)
681 return r;
682
683 f->data_hash_table = t;
684 return 0;
685 }
686
687 int journal_file_map_field_hash_table(JournalFile *f) {
688 uint64_t s, p;
689 void *t;
690 int r;
691
692 assert(f);
693
694 if (f->field_hash_table)
695 return 0;
696
697 p = le64toh(f->header->field_hash_table_offset);
698 s = le64toh(f->header->field_hash_table_size);
699
700 r = journal_file_move_to(f,
701 OBJECT_FIELD_HASH_TABLE,
702 true,
703 p, s,
704 &t);
705 if (r < 0)
706 return r;
707
708 f->field_hash_table = t;
709 return 0;
710 }
711
712 static int journal_file_link_field(
713 JournalFile *f,
714 Object *o,
715 uint64_t offset,
716 uint64_t hash) {
717
718 uint64_t p, h, m;
719 int r;
720
721 assert(f);
722 assert(o);
723 assert(offset > 0);
724
725 if (o->object.type != OBJECT_FIELD)
726 return -EINVAL;
727
728 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
729 if (m <= 0)
730 return -EBADMSG;
731
732 /* This might alter the window we are looking at */
733 o->field.next_hash_offset = o->field.head_data_offset = 0;
734
735 h = hash % m;
736 p = le64toh(f->field_hash_table[h].tail_hash_offset);
737 if (p == 0)
738 f->field_hash_table[h].head_hash_offset = htole64(offset);
739 else {
740 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
741 if (r < 0)
742 return r;
743
744 o->field.next_hash_offset = htole64(offset);
745 }
746
747 f->field_hash_table[h].tail_hash_offset = htole64(offset);
748
749 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
750 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
751
752 return 0;
753 }
754
755 static int journal_file_link_data(
756 JournalFile *f,
757 Object *o,
758 uint64_t offset,
759 uint64_t hash) {
760
761 uint64_t p, h, m;
762 int r;
763
764 assert(f);
765 assert(o);
766 assert(offset > 0);
767
768 if (o->object.type != OBJECT_DATA)
769 return -EINVAL;
770
771 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
772 if (m <= 0)
773 return -EBADMSG;
774
775 /* This might alter the window we are looking at */
776 o->data.next_hash_offset = o->data.next_field_offset = 0;
777 o->data.entry_offset = o->data.entry_array_offset = 0;
778 o->data.n_entries = 0;
779
780 h = hash % m;
781 p = le64toh(f->data_hash_table[h].tail_hash_offset);
782 if (p == 0)
783 /* Only entry in the hash table is easy */
784 f->data_hash_table[h].head_hash_offset = htole64(offset);
785 else {
786 /* Move back to the previous data object, to patch in
787 * pointer */
788
789 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
790 if (r < 0)
791 return r;
792
793 o->data.next_hash_offset = htole64(offset);
794 }
795
796 f->data_hash_table[h].tail_hash_offset = htole64(offset);
797
798 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
799 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
800
801 return 0;
802 }
803
804 int journal_file_find_field_object_with_hash(
805 JournalFile *f,
806 const void *field, uint64_t size, uint64_t hash,
807 Object **ret, uint64_t *offset) {
808
809 uint64_t p, osize, h, m;
810 int r;
811
812 assert(f);
813 assert(field && size > 0);
814
815 /* If the field hash table is empty, we can't find anything */
816 if (le64toh(f->header->field_hash_table_size) <= 0)
817 return 0;
818
819 /* Map the field hash table, if it isn't mapped yet. */
820 r = journal_file_map_field_hash_table(f);
821 if (r < 0)
822 return r;
823
824 osize = offsetof(Object, field.payload) + size;
825
826 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
827 if (m <= 0)
828 return -EBADMSG;
829
830 h = hash % m;
831 p = le64toh(f->field_hash_table[h].head_hash_offset);
832
833 while (p > 0) {
834 Object *o;
835
836 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
837 if (r < 0)
838 return r;
839
840 if (le64toh(o->field.hash) == hash &&
841 le64toh(o->object.size) == osize &&
842 memcmp(o->field.payload, field, size) == 0) {
843
844 if (ret)
845 *ret = o;
846 if (offset)
847 *offset = p;
848
849 return 1;
850 }
851
852 p = le64toh(o->field.next_hash_offset);
853 }
854
855 return 0;
856 }
857
858 int journal_file_find_field_object(
859 JournalFile *f,
860 const void *field, uint64_t size,
861 Object **ret, uint64_t *offset) {
862
863 uint64_t hash;
864
865 assert(f);
866 assert(field && size > 0);
867
868 hash = hash64(field, size);
869
870 return journal_file_find_field_object_with_hash(f,
871 field, size, hash,
872 ret, offset);
873 }
874
875 int journal_file_find_data_object_with_hash(
876 JournalFile *f,
877 const void *data, uint64_t size, uint64_t hash,
878 Object **ret, uint64_t *offset) {
879
880 uint64_t p, osize, h, m;
881 int r;
882
883 assert(f);
884 assert(data || size == 0);
885
886 /* If there's no data hash table, then there's no entry. */
887 if (le64toh(f->header->data_hash_table_size) <= 0)
888 return 0;
889
890 /* Map the data hash table, if it isn't mapped yet. */
891 r = journal_file_map_data_hash_table(f);
892 if (r < 0)
893 return r;
894
895 osize = offsetof(Object, data.payload) + size;
896
897 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
898 if (m <= 0)
899 return -EBADMSG;
900
901 h = hash % m;
902 p = le64toh(f->data_hash_table[h].head_hash_offset);
903
904 while (p > 0) {
905 Object *o;
906
907 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
908 if (r < 0)
909 return r;
910
911 if (le64toh(o->data.hash) != hash)
912 goto next;
913
914 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
915 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
916 uint64_t l;
917 size_t rsize = 0;
918
919 l = le64toh(o->object.size);
920 if (l <= offsetof(Object, data.payload))
921 return -EBADMSG;
922
923 l -= offsetof(Object, data.payload);
924
925 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
926 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
927 if (r < 0)
928 return r;
929
930 if (rsize == size &&
931 memcmp(f->compress_buffer, data, size) == 0) {
932
933 if (ret)
934 *ret = o;
935
936 if (offset)
937 *offset = p;
938
939 return 1;
940 }
941 #else
942 return -EPROTONOSUPPORT;
943 #endif
944 } else if (le64toh(o->object.size) == osize &&
945 memcmp(o->data.payload, data, size) == 0) {
946
947 if (ret)
948 *ret = o;
949
950 if (offset)
951 *offset = p;
952
953 return 1;
954 }
955
956 next:
957 p = le64toh(o->data.next_hash_offset);
958 }
959
960 return 0;
961 }
962
963 int journal_file_find_data_object(
964 JournalFile *f,
965 const void *data, uint64_t size,
966 Object **ret, uint64_t *offset) {
967
968 uint64_t hash;
969
970 assert(f);
971 assert(data || size == 0);
972
973 hash = hash64(data, size);
974
975 return journal_file_find_data_object_with_hash(f,
976 data, size, hash,
977 ret, offset);
978 }
979
980 static int journal_file_append_field(
981 JournalFile *f,
982 const void *field, uint64_t size,
983 Object **ret, uint64_t *offset) {
984
985 uint64_t hash, p;
986 uint64_t osize;
987 Object *o;
988 int r;
989
990 assert(f);
991 assert(field && size > 0);
992
993 hash = hash64(field, size);
994
995 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
996 if (r < 0)
997 return r;
998 else if (r > 0) {
999
1000 if (ret)
1001 *ret = o;
1002
1003 if (offset)
1004 *offset = p;
1005
1006 return 0;
1007 }
1008
1009 osize = offsetof(Object, field.payload) + size;
1010 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1011 if (r < 0)
1012 return r;
1013
1014 o->field.hash = htole64(hash);
1015 memcpy(o->field.payload, field, size);
1016
1017 r = journal_file_link_field(f, o, p, hash);
1018 if (r < 0)
1019 return r;
1020
1021 /* The linking might have altered the window, so let's
1022 * refresh our pointer */
1023 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1024 if (r < 0)
1025 return r;
1026
1027 #ifdef HAVE_GCRYPT
1028 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1029 if (r < 0)
1030 return r;
1031 #endif
1032
1033 if (ret)
1034 *ret = o;
1035
1036 if (offset)
1037 *offset = p;
1038
1039 return 0;
1040 }
1041
1042 static int journal_file_append_data(
1043 JournalFile *f,
1044 const void *data, uint64_t size,
1045 Object **ret, uint64_t *offset) {
1046
1047 uint64_t hash, p;
1048 uint64_t osize;
1049 Object *o;
1050 int r, compression = 0;
1051 const void *eq;
1052
1053 assert(f);
1054 assert(data || size == 0);
1055
1056 hash = hash64(data, size);
1057
1058 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1059 if (r < 0)
1060 return r;
1061 else if (r > 0) {
1062
1063 if (ret)
1064 *ret = o;
1065
1066 if (offset)
1067 *offset = p;
1068
1069 return 0;
1070 }
1071
1072 osize = offsetof(Object, data.payload) + size;
1073 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1074 if (r < 0)
1075 return r;
1076
1077 o->data.hash = htole64(hash);
1078
1079 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1080 if (f->compress_xz &&
1081 size >= COMPRESSION_SIZE_THRESHOLD) {
1082 size_t rsize = 0;
1083
1084 compression = compress_blob(data, size, o->data.payload, &rsize);
1085
1086 if (compression) {
1087 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1088 o->object.flags |= compression;
1089
1090 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1091 size, rsize, object_compressed_to_string(compression));
1092 }
1093 }
1094 #endif
1095
1096 if (!compression && size > 0)
1097 memcpy(o->data.payload, data, size);
1098
1099 r = journal_file_link_data(f, o, p, hash);
1100 if (r < 0)
1101 return r;
1102
1103 /* The linking might have altered the window, so let's
1104 * refresh our pointer */
1105 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1106 if (r < 0)
1107 return r;
1108
1109 if (!data)
1110 eq = NULL;
1111 else
1112 eq = memchr(data, '=', size);
1113 if (eq && eq > data) {
1114 Object *fo = NULL;
1115 uint64_t fp;
1116
1117 /* Create field object ... */
1118 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1119 if (r < 0)
1120 return r;
1121
1122 /* ... and link it in. */
1123 o->data.next_field_offset = fo->field.head_data_offset;
1124 fo->field.head_data_offset = le64toh(p);
1125 }
1126
1127 #ifdef HAVE_GCRYPT
1128 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1129 if (r < 0)
1130 return r;
1131 #endif
1132
1133 if (ret)
1134 *ret = o;
1135
1136 if (offset)
1137 *offset = p;
1138
1139 return 0;
1140 }
1141
1142 uint64_t journal_file_entry_n_items(Object *o) {
1143 assert(o);
1144
1145 if (o->object.type != OBJECT_ENTRY)
1146 return 0;
1147
1148 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1149 }
1150
1151 uint64_t journal_file_entry_array_n_items(Object *o) {
1152 assert(o);
1153
1154 if (o->object.type != OBJECT_ENTRY_ARRAY)
1155 return 0;
1156
1157 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1158 }
1159
1160 uint64_t journal_file_hash_table_n_items(Object *o) {
1161 assert(o);
1162
1163 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1164 o->object.type != OBJECT_FIELD_HASH_TABLE)
1165 return 0;
1166
1167 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1168 }
1169
1170 static int link_entry_into_array(JournalFile *f,
1171 le64_t *first,
1172 le64_t *idx,
1173 uint64_t p) {
1174 int r;
1175 uint64_t n = 0, ap = 0, q, i, a, hidx;
1176 Object *o;
1177
1178 assert(f);
1179 assert(first);
1180 assert(idx);
1181 assert(p > 0);
1182
1183 a = le64toh(*first);
1184 i = hidx = le64toh(*idx);
1185 while (a > 0) {
1186
1187 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1188 if (r < 0)
1189 return r;
1190
1191 n = journal_file_entry_array_n_items(o);
1192 if (i < n) {
1193 o->entry_array.items[i] = htole64(p);
1194 *idx = htole64(hidx + 1);
1195 return 0;
1196 }
1197
1198 i -= n;
1199 ap = a;
1200 a = le64toh(o->entry_array.next_entry_array_offset);
1201 }
1202
1203 if (hidx > n)
1204 n = (hidx+1) * 2;
1205 else
1206 n = n * 2;
1207
1208 if (n < 4)
1209 n = 4;
1210
1211 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1212 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1213 &o, &q);
1214 if (r < 0)
1215 return r;
1216
1217 #ifdef HAVE_GCRYPT
1218 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1219 if (r < 0)
1220 return r;
1221 #endif
1222
1223 o->entry_array.items[i] = htole64(p);
1224
1225 if (ap == 0)
1226 *first = htole64(q);
1227 else {
1228 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1229 if (r < 0)
1230 return r;
1231
1232 o->entry_array.next_entry_array_offset = htole64(q);
1233 }
1234
1235 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1236 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1237
1238 *idx = htole64(hidx + 1);
1239
1240 return 0;
1241 }
1242
1243 static int link_entry_into_array_plus_one(JournalFile *f,
1244 le64_t *extra,
1245 le64_t *first,
1246 le64_t *idx,
1247 uint64_t p) {
1248
1249 int r;
1250
1251 assert(f);
1252 assert(extra);
1253 assert(first);
1254 assert(idx);
1255 assert(p > 0);
1256
1257 if (*idx == 0)
1258 *extra = htole64(p);
1259 else {
1260 le64_t i;
1261
1262 i = htole64(le64toh(*idx) - 1);
1263 r = link_entry_into_array(f, first, &i, p);
1264 if (r < 0)
1265 return r;
1266 }
1267
1268 *idx = htole64(le64toh(*idx) + 1);
1269 return 0;
1270 }
1271
1272 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1273 uint64_t p;
1274 int r;
1275 assert(f);
1276 assert(o);
1277 assert(offset > 0);
1278
1279 p = le64toh(o->entry.items[i].object_offset);
1280 if (p == 0)
1281 return -EINVAL;
1282
1283 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1284 if (r < 0)
1285 return r;
1286
1287 return link_entry_into_array_plus_one(f,
1288 &o->data.entry_offset,
1289 &o->data.entry_array_offset,
1290 &o->data.n_entries,
1291 offset);
1292 }
1293
1294 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1295 uint64_t n, i;
1296 int r;
1297
1298 assert(f);
1299 assert(o);
1300 assert(offset > 0);
1301
1302 if (o->object.type != OBJECT_ENTRY)
1303 return -EINVAL;
1304
1305 __sync_synchronize();
1306
1307 /* Link up the entry itself */
1308 r = link_entry_into_array(f,
1309 &f->header->entry_array_offset,
1310 &f->header->n_entries,
1311 offset);
1312 if (r < 0)
1313 return r;
1314
1315 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1316
1317 if (f->header->head_entry_realtime == 0)
1318 f->header->head_entry_realtime = o->entry.realtime;
1319
1320 f->header->tail_entry_realtime = o->entry.realtime;
1321 f->header->tail_entry_monotonic = o->entry.monotonic;
1322
1323 f->tail_entry_monotonic_valid = true;
1324
1325 /* Link up the items */
1326 n = journal_file_entry_n_items(o);
1327 for (i = 0; i < n; i++) {
1328 r = journal_file_link_entry_item(f, o, offset, i);
1329 if (r < 0)
1330 return r;
1331 }
1332
1333 return 0;
1334 }
1335
1336 static int journal_file_append_entry_internal(
1337 JournalFile *f,
1338 const dual_timestamp *ts,
1339 uint64_t xor_hash,
1340 const EntryItem items[], unsigned n_items,
1341 uint64_t *seqnum,
1342 Object **ret, uint64_t *offset) {
1343 uint64_t np;
1344 uint64_t osize;
1345 Object *o;
1346 int r;
1347
1348 assert(f);
1349 assert(items || n_items == 0);
1350 assert(ts);
1351
1352 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1353
1354 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1355 if (r < 0)
1356 return r;
1357
1358 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1359 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1360 o->entry.realtime = htole64(ts->realtime);
1361 o->entry.monotonic = htole64(ts->monotonic);
1362 o->entry.xor_hash = htole64(xor_hash);
1363 o->entry.boot_id = f->header->boot_id;
1364
1365 #ifdef HAVE_GCRYPT
1366 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1367 if (r < 0)
1368 return r;
1369 #endif
1370
1371 r = journal_file_link_entry(f, o, np);
1372 if (r < 0)
1373 return r;
1374
1375 if (ret)
1376 *ret = o;
1377
1378 if (offset)
1379 *offset = np;
1380
1381 return 0;
1382 }
1383
1384 void journal_file_post_change(JournalFile *f) {
1385 assert(f);
1386
1387 /* inotify() does not receive IN_MODIFY events from file
1388 * accesses done via mmap(). After each access we hence
1389 * trigger IN_MODIFY by truncating the journal file to its
1390 * current size which triggers IN_MODIFY. */
1391
1392 __sync_synchronize();
1393
1394 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1395 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1396 }
1397
1398 static int entry_item_cmp(const void *_a, const void *_b) {
1399 const EntryItem *a = _a, *b = _b;
1400
1401 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1402 return -1;
1403 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1404 return 1;
1405 return 0;
1406 }
1407
1408 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1409 unsigned i;
1410 EntryItem *items;
1411 int r;
1412 uint64_t xor_hash = 0;
1413 struct dual_timestamp _ts;
1414
1415 assert(f);
1416 assert(iovec || n_iovec == 0);
1417
1418 if (!ts) {
1419 dual_timestamp_get(&_ts);
1420 ts = &_ts;
1421 }
1422
1423 if (f->tail_entry_monotonic_valid &&
1424 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1425 return -EINVAL;
1426
1427 #ifdef HAVE_GCRYPT
1428 r = journal_file_maybe_append_tag(f, ts->realtime);
1429 if (r < 0)
1430 return r;
1431 #endif
1432
1433 /* alloca() can't take 0, hence let's allocate at least one */
1434 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1435
1436 for (i = 0; i < n_iovec; i++) {
1437 uint64_t p;
1438 Object *o;
1439
1440 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1441 if (r < 0)
1442 return r;
1443
1444 xor_hash ^= le64toh(o->data.hash);
1445 items[i].object_offset = htole64(p);
1446 items[i].hash = o->data.hash;
1447 }
1448
1449 /* Order by the position on disk, in order to improve seek
1450 * times for rotating media. */
1451 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1452
1453 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1454
1455 /* If the memory mapping triggered a SIGBUS then we return an
1456 * IO error and ignore the error code passed down to us, since
1457 * it is very likely just an effect of a nullified replacement
1458 * mapping page */
1459
1460 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1461 r = -EIO;
1462
1463 journal_file_post_change(f);
1464
1465 return r;
1466 }
1467
/* Remembers a position inside an entry array chain so that a later lookup in
 * the same chain can start there instead of at the head of the chain. */
typedef struct ChainCacheItem ChainCacheItem;

struct ChainCacheItem {
        uint64_t first;      /* offset of the array at the head of the chain */
        uint64_t array;      /* offset of the array that is cached */
        uint64_t begin;      /* first item stored in the cached array */
        uint64_t total;      /* number of items in all arrays of the chain preceding the cached one */
        uint64_t last_index; /* index looked at last, used to keep bisection local */
};
1475
1476 static void chain_cache_put(
1477 OrderedHashmap *h,
1478 ChainCacheItem *ci,
1479 uint64_t first,
1480 uint64_t array,
1481 uint64_t begin,
1482 uint64_t total,
1483 uint64_t last_index) {
1484
1485 if (!ci) {
1486 /* If the chain item to cache for this chain is the
1487 * first one it's not worth caching anything */
1488 if (array == first)
1489 return;
1490
1491 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1492 ci = ordered_hashmap_steal_first(h);
1493 assert(ci);
1494 } else {
1495 ci = new(ChainCacheItem, 1);
1496 if (!ci)
1497 return;
1498 }
1499
1500 ci->first = first;
1501
1502 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1503 free(ci);
1504 return;
1505 }
1506 } else
1507 assert(ci->first == first);
1508
1509 ci->array = array;
1510 ci->begin = begin;
1511 ci->total = total;
1512 ci->last_index = last_index;
1513 }
1514
1515 static int generic_array_get(
1516 JournalFile *f,
1517 uint64_t first,
1518 uint64_t i,
1519 Object **ret, uint64_t *offset) {
1520
1521 Object *o;
1522 uint64_t p = 0, a, t = 0;
1523 int r;
1524 ChainCacheItem *ci;
1525
1526 assert(f);
1527
1528 a = first;
1529
1530 /* Try the chain cache first */
1531 ci = ordered_hashmap_get(f->chain_cache, &first);
1532 if (ci && i > ci->total) {
1533 a = ci->array;
1534 i -= ci->total;
1535 t = ci->total;
1536 }
1537
1538 while (a > 0) {
1539 uint64_t k;
1540
1541 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1542 if (r < 0)
1543 return r;
1544
1545 k = journal_file_entry_array_n_items(o);
1546 if (i < k) {
1547 p = le64toh(o->entry_array.items[i]);
1548 goto found;
1549 }
1550
1551 i -= k;
1552 t += k;
1553 a = le64toh(o->entry_array.next_entry_array_offset);
1554 }
1555
1556 return 0;
1557
1558 found:
1559 /* Let's cache this item for the next invocation */
1560 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1561
1562 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1563 if (r < 0)
1564 return r;
1565
1566 if (ret)
1567 *ret = o;
1568
1569 if (offset)
1570 *offset = p;
1571
1572 return 1;
1573 }
1574
1575 static int generic_array_get_plus_one(
1576 JournalFile *f,
1577 uint64_t extra,
1578 uint64_t first,
1579 uint64_t i,
1580 Object **ret, uint64_t *offset) {
1581
1582 Object *o;
1583
1584 assert(f);
1585
1586 if (i == 0) {
1587 int r;
1588
1589 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1590 if (r < 0)
1591 return r;
1592
1593 if (ret)
1594 *ret = o;
1595
1596 if (offset)
1597 *offset = extra;
1598
1599 return 1;
1600 }
1601
1602 return generic_array_get(f, first, i-1, ret, offset);
1603 }
1604
/* Results of the test_object callbacks used when bisecting entry arrays */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
1610
1611 static int generic_array_bisect(
1612 JournalFile *f,
1613 uint64_t first,
1614 uint64_t n,
1615 uint64_t needle,
1616 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1617 direction_t direction,
1618 Object **ret,
1619 uint64_t *offset,
1620 uint64_t *idx) {
1621
1622 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1623 bool subtract_one = false;
1624 Object *o, *array = NULL;
1625 int r;
1626 ChainCacheItem *ci;
1627
1628 assert(f);
1629 assert(test_object);
1630
1631 /* Start with the first array in the chain */
1632 a = first;
1633
1634 ci = ordered_hashmap_get(f->chain_cache, &first);
1635 if (ci && n > ci->total) {
1636 /* Ah, we have iterated this bisection array chain
1637 * previously! Let's see if we can skip ahead in the
1638 * chain, as far as the last time. But we can't jump
1639 * backwards in the chain, so let's check that
1640 * first. */
1641
1642 r = test_object(f, ci->begin, needle);
1643 if (r < 0)
1644 return r;
1645
1646 if (r == TEST_LEFT) {
1647 /* OK, what we are looking for is right of the
1648 * begin of this EntryArray, so let's jump
1649 * straight to previously cached array in the
1650 * chain */
1651
1652 a = ci->array;
1653 n -= ci->total;
1654 t = ci->total;
1655 last_index = ci->last_index;
1656 }
1657 }
1658
1659 while (a > 0) {
1660 uint64_t left, right, k, lp;
1661
1662 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1663 if (r < 0)
1664 return r;
1665
1666 k = journal_file_entry_array_n_items(array);
1667 right = MIN(k, n);
1668 if (right <= 0)
1669 return 0;
1670
1671 i = right - 1;
1672 lp = p = le64toh(array->entry_array.items[i]);
1673 if (p <= 0)
1674 return -EBADMSG;
1675
1676 r = test_object(f, p, needle);
1677 if (r < 0)
1678 return r;
1679
1680 if (r == TEST_FOUND)
1681 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1682
1683 if (r == TEST_RIGHT) {
1684 left = 0;
1685 right -= 1;
1686
1687 if (last_index != (uint64_t) -1) {
1688 assert(last_index <= right);
1689
1690 /* If we cached the last index we
1691 * looked at, let's try to not to jump
1692 * too wildly around and see if we can
1693 * limit the range to look at early to
1694 * the immediate neighbors of the last
1695 * index we looked at. */
1696
1697 if (last_index > 0) {
1698 uint64_t x = last_index - 1;
1699
1700 p = le64toh(array->entry_array.items[x]);
1701 if (p <= 0)
1702 return -EBADMSG;
1703
1704 r = test_object(f, p, needle);
1705 if (r < 0)
1706 return r;
1707
1708 if (r == TEST_FOUND)
1709 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1710
1711 if (r == TEST_RIGHT)
1712 right = x;
1713 else
1714 left = x + 1;
1715 }
1716
1717 if (last_index < right) {
1718 uint64_t y = last_index + 1;
1719
1720 p = le64toh(array->entry_array.items[y]);
1721 if (p <= 0)
1722 return -EBADMSG;
1723
1724 r = test_object(f, p, needle);
1725 if (r < 0)
1726 return r;
1727
1728 if (r == TEST_FOUND)
1729 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1730
1731 if (r == TEST_RIGHT)
1732 right = y;
1733 else
1734 left = y + 1;
1735 }
1736 }
1737
1738 for (;;) {
1739 if (left == right) {
1740 if (direction == DIRECTION_UP)
1741 subtract_one = true;
1742
1743 i = left;
1744 goto found;
1745 }
1746
1747 assert(left < right);
1748 i = (left + right) / 2;
1749
1750 p = le64toh(array->entry_array.items[i]);
1751 if (p <= 0)
1752 return -EBADMSG;
1753
1754 r = test_object(f, p, needle);
1755 if (r < 0)
1756 return r;
1757
1758 if (r == TEST_FOUND)
1759 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1760
1761 if (r == TEST_RIGHT)
1762 right = i;
1763 else
1764 left = i + 1;
1765 }
1766 }
1767
1768 if (k >= n) {
1769 if (direction == DIRECTION_UP) {
1770 i = n;
1771 subtract_one = true;
1772 goto found;
1773 }
1774
1775 return 0;
1776 }
1777
1778 last_p = lp;
1779
1780 n -= k;
1781 t += k;
1782 last_index = (uint64_t) -1;
1783 a = le64toh(array->entry_array.next_entry_array_offset);
1784 }
1785
1786 return 0;
1787
1788 found:
1789 if (subtract_one && t == 0 && i == 0)
1790 return 0;
1791
1792 /* Let's cache this item for the next invocation */
1793 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1794
1795 if (subtract_one && i == 0)
1796 p = last_p;
1797 else if (subtract_one)
1798 p = le64toh(array->entry_array.items[i-1]);
1799 else
1800 p = le64toh(array->entry_array.items[i]);
1801
1802 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1803 if (r < 0)
1804 return r;
1805
1806 if (ret)
1807 *ret = o;
1808
1809 if (offset)
1810 *offset = p;
1811
1812 if (idx)
1813 *idx = t + i + (subtract_one ? -1 : 0);
1814
1815 return 1;
1816 }
1817
1818 static int generic_array_bisect_plus_one(
1819 JournalFile *f,
1820 uint64_t extra,
1821 uint64_t first,
1822 uint64_t n,
1823 uint64_t needle,
1824 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1825 direction_t direction,
1826 Object **ret,
1827 uint64_t *offset,
1828 uint64_t *idx) {
1829
1830 int r;
1831 bool step_back = false;
1832 Object *o;
1833
1834 assert(f);
1835 assert(test_object);
1836
1837 if (n <= 0)
1838 return 0;
1839
1840 /* This bisects the array in object 'first', but first checks
1841 * an extra */
1842 r = test_object(f, extra, needle);
1843 if (r < 0)
1844 return r;
1845
1846 if (r == TEST_FOUND)
1847 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1848
1849 /* if we are looking with DIRECTION_UP then we need to first
1850 see if in the actual array there is a matching entry, and
1851 return the last one of that. But if there isn't any we need
1852 to return this one. Hence remember this, and return it
1853 below. */
1854 if (r == TEST_LEFT)
1855 step_back = direction == DIRECTION_UP;
1856
1857 if (r == TEST_RIGHT) {
1858 if (direction == DIRECTION_DOWN)
1859 goto found;
1860 else
1861 return 0;
1862 }
1863
1864 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1865
1866 if (r == 0 && step_back)
1867 goto found;
1868
1869 if (r > 0 && idx)
1870 (*idx) ++;
1871
1872 return r;
1873
1874 found:
1875 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1876 if (r < 0)
1877 return r;
1878
1879 if (ret)
1880 *ret = o;
1881
1882 if (offset)
1883 *offset = extra;
1884
1885 if (idx)
1886 *idx = 0;
1887
1888 return 1;
1889 }
1890
1891 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1892 assert(f);
1893 assert(p > 0);
1894
1895 if (p == needle)
1896 return TEST_FOUND;
1897 else if (p < needle)
1898 return TEST_LEFT;
1899 else
1900 return TEST_RIGHT;
1901 }
1902
1903 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1904 Object *o;
1905 int r;
1906
1907 assert(f);
1908 assert(p > 0);
1909
1910 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1911 if (r < 0)
1912 return r;
1913
1914 if (le64toh(o->entry.seqnum) == needle)
1915 return TEST_FOUND;
1916 else if (le64toh(o->entry.seqnum) < needle)
1917 return TEST_LEFT;
1918 else
1919 return TEST_RIGHT;
1920 }
1921
1922 int journal_file_move_to_entry_by_seqnum(
1923 JournalFile *f,
1924 uint64_t seqnum,
1925 direction_t direction,
1926 Object **ret,
1927 uint64_t *offset) {
1928
1929 return generic_array_bisect(f,
1930 le64toh(f->header->entry_array_offset),
1931 le64toh(f->header->n_entries),
1932 seqnum,
1933 test_object_seqnum,
1934 direction,
1935 ret, offset, NULL);
1936 }
1937
1938 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1939 Object *o;
1940 int r;
1941
1942 assert(f);
1943 assert(p > 0);
1944
1945 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1946 if (r < 0)
1947 return r;
1948
1949 if (le64toh(o->entry.realtime) == needle)
1950 return TEST_FOUND;
1951 else if (le64toh(o->entry.realtime) < needle)
1952 return TEST_LEFT;
1953 else
1954 return TEST_RIGHT;
1955 }
1956
1957 int journal_file_move_to_entry_by_realtime(
1958 JournalFile *f,
1959 uint64_t realtime,
1960 direction_t direction,
1961 Object **ret,
1962 uint64_t *offset) {
1963
1964 return generic_array_bisect(f,
1965 le64toh(f->header->entry_array_offset),
1966 le64toh(f->header->n_entries),
1967 realtime,
1968 test_object_realtime,
1969 direction,
1970 ret, offset, NULL);
1971 }
1972
1973 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1974 Object *o;
1975 int r;
1976
1977 assert(f);
1978 assert(p > 0);
1979
1980 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1981 if (r < 0)
1982 return r;
1983
1984 if (le64toh(o->entry.monotonic) == needle)
1985 return TEST_FOUND;
1986 else if (le64toh(o->entry.monotonic) < needle)
1987 return TEST_LEFT;
1988 else
1989 return TEST_RIGHT;
1990 }
1991
1992 static int find_data_object_by_boot_id(
1993 JournalFile *f,
1994 sd_id128_t boot_id,
1995 Object **o,
1996 uint64_t *b) {
1997
1998 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1999
2000 sd_id128_to_string(boot_id, t + 9);
2001 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2002 }
2003
2004 int journal_file_move_to_entry_by_monotonic(
2005 JournalFile *f,
2006 sd_id128_t boot_id,
2007 uint64_t monotonic,
2008 direction_t direction,
2009 Object **ret,
2010 uint64_t *offset) {
2011
2012 Object *o;
2013 int r;
2014
2015 assert(f);
2016
2017 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2018 if (r < 0)
2019 return r;
2020 if (r == 0)
2021 return -ENOENT;
2022
2023 return generic_array_bisect_plus_one(f,
2024 le64toh(o->data.entry_offset),
2025 le64toh(o->data.entry_array_offset),
2026 le64toh(o->data.n_entries),
2027 monotonic,
2028 test_object_monotonic,
2029 direction,
2030 ret, offset, NULL);
2031 }
2032
2033 void journal_file_reset_location(JournalFile *f) {
2034 f->location_type = LOCATION_HEAD;
2035 f->current_offset = 0;
2036 f->current_seqnum = 0;
2037 f->current_realtime = 0;
2038 f->current_monotonic = 0;
2039 zero(f->current_boot_id);
2040 f->current_xor_hash = 0;
2041 }
2042
2043 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2044 f->location_type = LOCATION_SEEK;
2045 f->current_offset = offset;
2046 f->current_seqnum = le64toh(o->entry.seqnum);
2047 f->current_realtime = le64toh(o->entry.realtime);
2048 f->current_monotonic = le64toh(o->entry.monotonic);
2049 f->current_boot_id = o->entry.boot_id;
2050 f->current_xor_hash = le64toh(o->entry.xor_hash);
2051 }
2052
2053 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2054 assert(af);
2055 assert(bf);
2056 assert(af->location_type == LOCATION_SEEK);
2057 assert(bf->location_type == LOCATION_SEEK);
2058
2059 /* If contents and timestamps match, these entries are
2060 * identical, even if the seqnum does not match */
2061 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2062 af->current_monotonic == bf->current_monotonic &&
2063 af->current_realtime == bf->current_realtime &&
2064 af->current_xor_hash == bf->current_xor_hash)
2065 return 0;
2066
2067 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2068
2069 /* If this is from the same seqnum source, compare
2070 * seqnums */
2071 if (af->current_seqnum < bf->current_seqnum)
2072 return -1;
2073 if (af->current_seqnum > bf->current_seqnum)
2074 return 1;
2075
2076 /* Wow! This is weird, different data but the same
2077 * seqnums? Something is borked, but let's make the
2078 * best of it and compare by time. */
2079 }
2080
2081 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2082
2083 /* If the boot id matches, compare monotonic time */
2084 if (af->current_monotonic < bf->current_monotonic)
2085 return -1;
2086 if (af->current_monotonic > bf->current_monotonic)
2087 return 1;
2088 }
2089
2090 /* Otherwise, compare UTC time */
2091 if (af->current_realtime < bf->current_realtime)
2092 return -1;
2093 if (af->current_realtime > bf->current_realtime)
2094 return 1;
2095
2096 /* Finally, compare by contents */
2097 if (af->current_xor_hash < bf->current_xor_hash)
2098 return -1;
2099 if (af->current_xor_hash > bf->current_xor_hash)
2100 return 1;
2101
2102 return 0;
2103 }
2104
2105 int journal_file_next_entry(
2106 JournalFile *f,
2107 uint64_t p,
2108 direction_t direction,
2109 Object **ret, uint64_t *offset) {
2110
2111 uint64_t i, n, ofs;
2112 int r;
2113
2114 assert(f);
2115
2116 n = le64toh(f->header->n_entries);
2117 if (n <= 0)
2118 return 0;
2119
2120 if (p == 0)
2121 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2122 else {
2123 r = generic_array_bisect(f,
2124 le64toh(f->header->entry_array_offset),
2125 le64toh(f->header->n_entries),
2126 p,
2127 test_object_offset,
2128 DIRECTION_DOWN,
2129 NULL, NULL,
2130 &i);
2131 if (r <= 0)
2132 return r;
2133
2134 if (direction == DIRECTION_DOWN) {
2135 if (i >= n - 1)
2136 return 0;
2137
2138 i++;
2139 } else {
2140 if (i <= 0)
2141 return 0;
2142
2143 i--;
2144 }
2145 }
2146
2147 /* And jump to it */
2148 r = generic_array_get(f,
2149 le64toh(f->header->entry_array_offset),
2150 i,
2151 ret, &ofs);
2152 if (r <= 0)
2153 return r;
2154
2155 if (p > 0 &&
2156 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2157 log_debug("%s: entry array corrupted at entry %"PRIu64,
2158 f->path, i);
2159 return -EBADMSG;
2160 }
2161
2162 if (offset)
2163 *offset = ofs;
2164
2165 return 1;
2166 }
2167
2168 int journal_file_next_entry_for_data(
2169 JournalFile *f,
2170 Object *o, uint64_t p,
2171 uint64_t data_offset,
2172 direction_t direction,
2173 Object **ret, uint64_t *offset) {
2174
2175 uint64_t n, i;
2176 int r;
2177 Object *d;
2178
2179 assert(f);
2180 assert(p > 0 || !o);
2181
2182 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2183 if (r < 0)
2184 return r;
2185
2186 n = le64toh(d->data.n_entries);
2187 if (n <= 0)
2188 return n;
2189
2190 if (!o)
2191 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2192 else {
2193 if (o->object.type != OBJECT_ENTRY)
2194 return -EINVAL;
2195
2196 r = generic_array_bisect_plus_one(f,
2197 le64toh(d->data.entry_offset),
2198 le64toh(d->data.entry_array_offset),
2199 le64toh(d->data.n_entries),
2200 p,
2201 test_object_offset,
2202 DIRECTION_DOWN,
2203 NULL, NULL,
2204 &i);
2205
2206 if (r <= 0)
2207 return r;
2208
2209 if (direction == DIRECTION_DOWN) {
2210 if (i >= n - 1)
2211 return 0;
2212
2213 i++;
2214 } else {
2215 if (i <= 0)
2216 return 0;
2217
2218 i--;
2219 }
2220
2221 }
2222
2223 return generic_array_get_plus_one(f,
2224 le64toh(d->data.entry_offset),
2225 le64toh(d->data.entry_array_offset),
2226 i,
2227 ret, offset);
2228 }
2229
2230 int journal_file_move_to_entry_by_offset_for_data(
2231 JournalFile *f,
2232 uint64_t data_offset,
2233 uint64_t p,
2234 direction_t direction,
2235 Object **ret, uint64_t *offset) {
2236
2237 int r;
2238 Object *d;
2239
2240 assert(f);
2241
2242 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2243 if (r < 0)
2244 return r;
2245
2246 return generic_array_bisect_plus_one(f,
2247 le64toh(d->data.entry_offset),
2248 le64toh(d->data.entry_array_offset),
2249 le64toh(d->data.n_entries),
2250 p,
2251 test_object_offset,
2252 direction,
2253 ret, offset, NULL);
2254 }
2255
2256 int journal_file_move_to_entry_by_monotonic_for_data(
2257 JournalFile *f,
2258 uint64_t data_offset,
2259 sd_id128_t boot_id,
2260 uint64_t monotonic,
2261 direction_t direction,
2262 Object **ret, uint64_t *offset) {
2263
2264 Object *o, *d;
2265 int r;
2266 uint64_t b, z;
2267
2268 assert(f);
2269
2270 /* First, seek by time */
2271 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2272 if (r < 0)
2273 return r;
2274 if (r == 0)
2275 return -ENOENT;
2276
2277 r = generic_array_bisect_plus_one(f,
2278 le64toh(o->data.entry_offset),
2279 le64toh(o->data.entry_array_offset),
2280 le64toh(o->data.n_entries),
2281 monotonic,
2282 test_object_monotonic,
2283 direction,
2284 NULL, &z, NULL);
2285 if (r <= 0)
2286 return r;
2287
2288 /* And now, continue seeking until we find an entry that
2289 * exists in both bisection arrays */
2290
2291 for (;;) {
2292 Object *qo;
2293 uint64_t p, q;
2294
2295 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2296 if (r < 0)
2297 return r;
2298
2299 r = generic_array_bisect_plus_one(f,
2300 le64toh(d->data.entry_offset),
2301 le64toh(d->data.entry_array_offset),
2302 le64toh(d->data.n_entries),
2303 z,
2304 test_object_offset,
2305 direction,
2306 NULL, &p, NULL);
2307 if (r <= 0)
2308 return r;
2309
2310 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2311 if (r < 0)
2312 return r;
2313
2314 r = generic_array_bisect_plus_one(f,
2315 le64toh(o->data.entry_offset),
2316 le64toh(o->data.entry_array_offset),
2317 le64toh(o->data.n_entries),
2318 p,
2319 test_object_offset,
2320 direction,
2321 &qo, &q, NULL);
2322
2323 if (r <= 0)
2324 return r;
2325
2326 if (p == q) {
2327 if (ret)
2328 *ret = qo;
2329 if (offset)
2330 *offset = q;
2331
2332 return 1;
2333 }
2334
2335 z = q;
2336 }
2337 }
2338
2339 int journal_file_move_to_entry_by_seqnum_for_data(
2340 JournalFile *f,
2341 uint64_t data_offset,
2342 uint64_t seqnum,
2343 direction_t direction,
2344 Object **ret, uint64_t *offset) {
2345
2346 Object *d;
2347 int r;
2348
2349 assert(f);
2350
2351 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2352 if (r < 0)
2353 return r;
2354
2355 return generic_array_bisect_plus_one(f,
2356 le64toh(d->data.entry_offset),
2357 le64toh(d->data.entry_array_offset),
2358 le64toh(d->data.n_entries),
2359 seqnum,
2360 test_object_seqnum,
2361 direction,
2362 ret, offset, NULL);
2363 }
2364
2365 int journal_file_move_to_entry_by_realtime_for_data(
2366 JournalFile *f,
2367 uint64_t data_offset,
2368 uint64_t realtime,
2369 direction_t direction,
2370 Object **ret, uint64_t *offset) {
2371
2372 Object *d;
2373 int r;
2374
2375 assert(f);
2376
2377 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2378 if (r < 0)
2379 return r;
2380
2381 return generic_array_bisect_plus_one(f,
2382 le64toh(d->data.entry_offset),
2383 le64toh(d->data.entry_array_offset),
2384 le64toh(d->data.n_entries),
2385 realtime,
2386 test_object_realtime,
2387 direction,
2388 ret, offset, NULL);
2389 }
2390
2391 void journal_file_dump(JournalFile *f) {
2392 Object *o;
2393 int r;
2394 uint64_t p;
2395
2396 assert(f);
2397
2398 journal_file_print_header(f);
2399
2400 p = le64toh(f->header->header_size);
2401 while (p != 0) {
2402 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2403 if (r < 0)
2404 goto fail;
2405
2406 switch (o->object.type) {
2407
2408 case OBJECT_UNUSED:
2409 printf("Type: OBJECT_UNUSED\n");
2410 break;
2411
2412 case OBJECT_DATA:
2413 printf("Type: OBJECT_DATA\n");
2414 break;
2415
2416 case OBJECT_FIELD:
2417 printf("Type: OBJECT_FIELD\n");
2418 break;
2419
2420 case OBJECT_ENTRY:
2421 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2422 le64toh(o->entry.seqnum),
2423 le64toh(o->entry.monotonic),
2424 le64toh(o->entry.realtime));
2425 break;
2426
2427 case OBJECT_FIELD_HASH_TABLE:
2428 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2429 break;
2430
2431 case OBJECT_DATA_HASH_TABLE:
2432 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2433 break;
2434
2435 case OBJECT_ENTRY_ARRAY:
2436 printf("Type: OBJECT_ENTRY_ARRAY\n");
2437 break;
2438
2439 case OBJECT_TAG:
2440 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2441 le64toh(o->tag.seqnum),
2442 le64toh(o->tag.epoch));
2443 break;
2444
2445 default:
2446 printf("Type: unknown (%i)\n", o->object.type);
2447 break;
2448 }
2449
2450 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2451 printf("Flags: %s\n",
2452 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2453
2454 if (p == le64toh(f->header->tail_object_offset))
2455 p = 0;
2456 else
2457 p = p + ALIGN64(le64toh(o->object.size));
2458 }
2459
2460 return;
2461 fail:
2462 log_error("File corrupt");
2463 }
2464
2465 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2466 const char *x;
2467
2468 x = format_timestamp(buf, l, t);
2469 if (x)
2470 return x;
2471 return " --- ";
2472 }
2473
2474 void journal_file_print_header(JournalFile *f) {
2475 char a[33], b[33], c[33], d[33];
2476 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2477 struct stat st;
2478 char bytes[FORMAT_BYTES_MAX];
2479
2480 assert(f);
2481
2482 printf("File Path: %s\n"
2483 "File ID: %s\n"
2484 "Machine ID: %s\n"
2485 "Boot ID: %s\n"
2486 "Sequential Number ID: %s\n"
2487 "State: %s\n"
2488 "Compatible Flags:%s%s\n"
2489 "Incompatible Flags:%s%s%s\n"
2490 "Header size: %"PRIu64"\n"
2491 "Arena size: %"PRIu64"\n"
2492 "Data Hash Table Size: %"PRIu64"\n"
2493 "Field Hash Table Size: %"PRIu64"\n"
2494 "Rotate Suggested: %s\n"
2495 "Head Sequential Number: %"PRIu64"\n"
2496 "Tail Sequential Number: %"PRIu64"\n"
2497 "Head Realtime Timestamp: %s\n"
2498 "Tail Realtime Timestamp: %s\n"
2499 "Tail Monotonic Timestamp: %s\n"
2500 "Objects: %"PRIu64"\n"
2501 "Entry Objects: %"PRIu64"\n",
2502 f->path,
2503 sd_id128_to_string(f->header->file_id, a),
2504 sd_id128_to_string(f->header->machine_id, b),
2505 sd_id128_to_string(f->header->boot_id, c),
2506 sd_id128_to_string(f->header->seqnum_id, d),
2507 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2508 f->header->state == STATE_ONLINE ? "ONLINE" :
2509 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2510 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2511 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2512 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2513 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2514 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2515 le64toh(f->header->header_size),
2516 le64toh(f->header->arena_size),
2517 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2518 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2519 yes_no(journal_file_rotate_suggested(f, 0)),
2520 le64toh(f->header->head_entry_seqnum),
2521 le64toh(f->header->tail_entry_seqnum),
2522 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2523 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2524 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2525 le64toh(f->header->n_objects),
2526 le64toh(f->header->n_entries));
2527
2528 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2529 printf("Data Objects: %"PRIu64"\n"
2530 "Data Hash Table Fill: %.1f%%\n",
2531 le64toh(f->header->n_data),
2532 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2533
2534 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2535 printf("Field Objects: %"PRIu64"\n"
2536 "Field Hash Table Fill: %.1f%%\n",
2537 le64toh(f->header->n_fields),
2538 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2539
2540 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2541 printf("Tag Objects: %"PRIu64"\n",
2542 le64toh(f->header->n_tags));
2543 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2544 printf("Entry Array Objects: %"PRIu64"\n",
2545 le64toh(f->header->n_entry_arrays));
2546
2547 if (fstat(f->fd, &st) >= 0)
2548 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2549 }
2550
2551 static int journal_file_warn_btrfs(JournalFile *f) {
2552 unsigned attrs;
2553 int r;
2554
2555 assert(f);
2556
2557 /* Before we write anything, check if the COW logic is turned
2558 * off on btrfs. Given our write pattern that is quite
2559 * unfriendly to COW file systems this should greatly improve
2560 * performance on COW file systems, such as btrfs, at the
2561 * expense of data integrity features (which shouldn't be too
2562 * bad, given that we do our own checksumming). */
2563
2564 r = btrfs_is_filesystem(f->fd);
2565 if (r < 0)
2566 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2567 if (!r)
2568 return 0;
2569
2570 r = read_attr_fd(f->fd, &attrs);
2571 if (r < 0)
2572 return log_warning_errno(r, "Failed to read file attributes: %m");
2573
2574 if (attrs & FS_NOCOW_FL) {
2575 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2576 return 0;
2577 }
2578
2579 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2580 "This is likely to slow down journal access substantially, please consider turning "
2581 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2582
2583 return 1;
2584 }
2585
2586 int journal_file_open(
2587 const char *fname,
2588 int flags,
2589 mode_t mode,
2590 bool compress,
2591 bool seal,
2592 JournalMetrics *metrics,
2593 MMapCache *mmap_cache,
2594 JournalFile *template,
2595 JournalFile **ret) {
2596
2597 bool newly_created = false;
2598 JournalFile *f;
2599 void *h;
2600 int r;
2601
2602 assert(fname);
2603 assert(ret);
2604
2605 if ((flags & O_ACCMODE) != O_RDONLY &&
2606 (flags & O_ACCMODE) != O_RDWR)
2607 return -EINVAL;
2608
2609 if (!endswith(fname, ".journal") &&
2610 !endswith(fname, ".journal~"))
2611 return -EINVAL;
2612
2613 f = new0(JournalFile, 1);
2614 if (!f)
2615 return -ENOMEM;
2616
2617 f->fd = -1;
2618 f->mode = mode;
2619
2620 f->flags = flags;
2621 f->prot = prot_from_flags(flags);
2622 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2623 #if defined(HAVE_LZ4)
2624 f->compress_lz4 = compress;
2625 #elif defined(HAVE_XZ)
2626 f->compress_xz = compress;
2627 #endif
2628 #ifdef HAVE_GCRYPT
2629 f->seal = seal;
2630 #endif
2631
2632 if (mmap_cache)
2633 f->mmap = mmap_cache_ref(mmap_cache);
2634 else {
2635 f->mmap = mmap_cache_new();
2636 if (!f->mmap) {
2637 r = -ENOMEM;
2638 goto fail;
2639 }
2640 }
2641
2642 f->path = strdup(fname);
2643 if (!f->path) {
2644 r = -ENOMEM;
2645 goto fail;
2646 }
2647
2648 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2649 if (!f->chain_cache) {
2650 r = -ENOMEM;
2651 goto fail;
2652 }
2653
2654 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2655 if (f->fd < 0) {
2656 r = -errno;
2657 goto fail;
2658 }
2659
2660 r = journal_file_fstat(f);
2661 if (r < 0)
2662 goto fail;
2663
2664 if (f->last_stat.st_size == 0 && f->writable) {
2665
2666 (void) journal_file_warn_btrfs(f);
2667
2668 /* Let's attach the creation time to the journal file,
2669 * so that the vacuuming code knows the age of this
2670 * file even if the file might end up corrupted one
2671 * day... Ideally we'd just use the creation time many
2672 * file systems maintain for each file, but there is
2673 * currently no usable API to query this, hence let's
2674 * emulate this via extended attributes. If extended
2675 * attributes are not supported we'll just skip this,
2676 * and rely solely on mtime/atime/ctime of the file. */
2677
2678 fd_setcrtime(f->fd, 0);
2679
2680 #ifdef HAVE_GCRYPT
2681 /* Try to load the FSPRG state, and if we can't, then
2682 * just don't do sealing */
2683 if (f->seal) {
2684 r = journal_file_fss_load(f);
2685 if (r < 0)
2686 f->seal = false;
2687 }
2688 #endif
2689
2690 r = journal_file_init_header(f, template);
2691 if (r < 0)
2692 goto fail;
2693
2694 r = journal_file_fstat(f);
2695 if (r < 0)
2696 goto fail;
2697
2698 newly_created = true;
2699 }
2700
2701 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2702 r = -EIO;
2703 goto fail;
2704 }
2705
2706 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2707 if (r < 0)
2708 goto fail;
2709
2710 f->header = h;
2711
2712 if (!newly_created) {
2713 r = journal_file_verify_header(f);
2714 if (r < 0)
2715 goto fail;
2716 }
2717
2718 #ifdef HAVE_GCRYPT
2719 if (!newly_created && f->writable) {
2720 r = journal_file_fss_load(f);
2721 if (r < 0)
2722 goto fail;
2723 }
2724 #endif
2725
2726 if (f->writable) {
2727 if (metrics) {
2728 journal_default_metrics(metrics, f->fd);
2729 f->metrics = *metrics;
2730 } else if (template)
2731 f->metrics = template->metrics;
2732
2733 r = journal_file_refresh_header(f);
2734 if (r < 0)
2735 goto fail;
2736 }
2737
2738 #ifdef HAVE_GCRYPT
2739 r = journal_file_hmac_setup(f);
2740 if (r < 0)
2741 goto fail;
2742 #endif
2743
2744 if (newly_created) {
2745 r = journal_file_setup_field_hash_table(f);
2746 if (r < 0)
2747 goto fail;
2748
2749 r = journal_file_setup_data_hash_table(f);
2750 if (r < 0)
2751 goto fail;
2752
2753 #ifdef HAVE_GCRYPT
2754 r = journal_file_append_first_tag(f);
2755 if (r < 0)
2756 goto fail;
2757 #endif
2758 }
2759
2760 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2761 r = -EIO;
2762 goto fail;
2763 }
2764
2765 *ret = f;
2766 return 0;
2767
2768 fail:
2769 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2770 r = -EIO;
2771
2772 journal_file_close(f);
2773
2774 return r;
2775 }
2776
2777 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2778 _cleanup_free_ char *p = NULL;
2779 size_t l;
2780 JournalFile *old_file, *new_file = NULL;
2781 int r;
2782
2783 assert(f);
2784 assert(*f);
2785
2786 old_file = *f;
2787
2788 if (!old_file->writable)
2789 return -EINVAL;
2790
2791 if (!endswith(old_file->path, ".journal"))
2792 return -EINVAL;
2793
2794 l = strlen(old_file->path);
2795 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2796 (int) l - 8, old_file->path,
2797 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2798 le64toh((*f)->header->head_entry_seqnum),
2799 le64toh((*f)->header->head_entry_realtime));
2800 if (r < 0)
2801 return -ENOMEM;
2802
2803 /* Try to rename the file to the archived version. If the file
2804 * already was deleted, we'll get ENOENT, let's ignore that
2805 * case. */
2806 r = rename(old_file->path, p);
2807 if (r < 0 && errno != ENOENT)
2808 return -errno;
2809
2810 old_file->header->state = STATE_ARCHIVED;
2811
2812 /* Currently, btrfs is not very good with out write patterns
2813 * and fragments heavily. Let's defrag our journal files when
2814 * we archive them */
2815 old_file->defrag_on_close = true;
2816
2817 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2818 journal_file_close(old_file);
2819
2820 *f = new_file;
2821 return r;
2822 }
2823
2824 int journal_file_open_reliably(
2825 const char *fname,
2826 int flags,
2827 mode_t mode,
2828 bool compress,
2829 bool seal,
2830 JournalMetrics *metrics,
2831 MMapCache *mmap_cache,
2832 JournalFile *template,
2833 JournalFile **ret) {
2834
2835 int r;
2836 size_t l;
2837 _cleanup_free_ char *p = NULL;
2838
2839 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2840 if (!IN_SET(r,
2841 -EBADMSG, /* corrupted */
2842 -ENODATA, /* truncated */
2843 -EHOSTDOWN, /* other machine */
2844 -EPROTONOSUPPORT, /* incompatible feature */
2845 -EBUSY, /* unclean shutdown */
2846 -ESHUTDOWN, /* already archived */
2847 -EIO, /* IO error, including SIGBUS on mmap */
2848 -EIDRM /* File has been deleted */))
2849 return r;
2850
2851 if ((flags & O_ACCMODE) == O_RDONLY)
2852 return r;
2853
2854 if (!(flags & O_CREAT))
2855 return r;
2856
2857 if (!endswith(fname, ".journal"))
2858 return r;
2859
2860 /* The file is corrupted. Rotate it away and try it again (but only once) */
2861
2862 l = strlen(fname);
2863 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2864 (int) l - 8, fname,
2865 now(CLOCK_REALTIME),
2866 random_u64()) < 0)
2867 return -ENOMEM;
2868
2869 if (rename(fname, p) < 0)
2870 return -errno;
2871
2872 /* btrfs doesn't cope well with our write pattern and
2873 * fragments heavily. Let's defrag all files we rotate */
2874
2875 (void) chattr_path(p, false, FS_NOCOW_FL);
2876 (void) btrfs_defrag(p);
2877
2878 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2879
2880 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2881 }
2882
2883 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2884 uint64_t i, n;
2885 uint64_t q, xor_hash = 0;
2886 int r;
2887 EntryItem *items;
2888 dual_timestamp ts;
2889
2890 assert(from);
2891 assert(to);
2892 assert(o);
2893 assert(p);
2894
2895 if (!to->writable)
2896 return -EPERM;
2897
2898 ts.monotonic = le64toh(o->entry.monotonic);
2899 ts.realtime = le64toh(o->entry.realtime);
2900
2901 n = journal_file_entry_n_items(o);
2902 /* alloca() can't take 0, hence let's allocate at least one */
2903 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2904
2905 for (i = 0; i < n; i++) {
2906 uint64_t l, h;
2907 le64_t le_hash;
2908 size_t t;
2909 void *data;
2910 Object *u;
2911
2912 q = le64toh(o->entry.items[i].object_offset);
2913 le_hash = o->entry.items[i].hash;
2914
2915 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2916 if (r < 0)
2917 return r;
2918
2919 if (le_hash != o->data.hash)
2920 return -EBADMSG;
2921
2922 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2923 t = (size_t) l;
2924
2925 /* We hit the limit on 32bit machines */
2926 if ((uint64_t) t != l)
2927 return -E2BIG;
2928
2929 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2930 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2931 size_t rsize = 0;
2932
2933 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2934 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2935 if (r < 0)
2936 return r;
2937
2938 data = from->compress_buffer;
2939 l = rsize;
2940 #else
2941 return -EPROTONOSUPPORT;
2942 #endif
2943 } else
2944 data = o->data.payload;
2945
2946 r = journal_file_append_data(to, data, l, &u, &h);
2947 if (r < 0)
2948 return r;
2949
2950 xor_hash ^= le64toh(u->data.hash);
2951 items[i].object_offset = htole64(h);
2952 items[i].hash = u->data.hash;
2953
2954 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2955 if (r < 0)
2956 return r;
2957 }
2958
2959 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2960
2961 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2962 return -EIO;
2963
2964 return r;
2965 }
2966
2967 void journal_reset_metrics(JournalMetrics *m) {
2968 assert(m);
2969
2970 /* Set everything to "pick automatic values". */
2971
2972 *m = (JournalMetrics) {
2973 .min_use = (uint64_t) -1,
2974 .max_use = (uint64_t) -1,
2975 .min_size = (uint64_t) -1,
2976 .max_size = (uint64_t) -1,
2977 .keep_free = (uint64_t) -1,
2978 .n_max_files = (uint64_t) -1,
2979 };
2980 }
2981
2982 void journal_default_metrics(JournalMetrics *m, int fd) {
2983 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
2984 struct statvfs ss;
2985 uint64_t fs_size;
2986
2987 assert(m);
2988 assert(fd >= 0);
2989
2990 if (fstatvfs(fd, &ss) >= 0)
2991 fs_size = ss.f_frsize * ss.f_blocks;
2992 else {
2993 log_debug_errno(errno, "Failed to detremine disk size: %m");
2994 fs_size = 0;
2995 }
2996
2997 if (m->max_use == (uint64_t) -1) {
2998
2999 if (fs_size > 0) {
3000 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3001
3002 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3003 m->max_use = DEFAULT_MAX_USE_UPPER;
3004
3005 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3006 m->max_use = DEFAULT_MAX_USE_LOWER;
3007 } else
3008 m->max_use = DEFAULT_MAX_USE_LOWER;
3009 } else {
3010 m->max_use = PAGE_ALIGN(m->max_use);
3011
3012 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3013 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3014 }
3015
3016 if (m->min_use == (uint64_t) -1)
3017 m->min_use = DEFAULT_MIN_USE;
3018
3019 if (m->min_use > m->max_use)
3020 m->min_use = m->max_use;
3021
3022 if (m->max_size == (uint64_t) -1) {
3023 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3024
3025 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3026 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3027 } else
3028 m->max_size = PAGE_ALIGN(m->max_size);
3029
3030 if (m->max_size != 0) {
3031 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3032 m->max_size = JOURNAL_FILE_SIZE_MIN;
3033
3034 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3035 m->max_use = m->max_size*2;
3036 }
3037
3038 if (m->min_size == (uint64_t) -1)
3039 m->min_size = JOURNAL_FILE_SIZE_MIN;
3040 else {
3041 m->min_size = PAGE_ALIGN(m->min_size);
3042
3043 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3044 m->min_size = JOURNAL_FILE_SIZE_MIN;
3045
3046 if (m->max_size != 0 && m->min_size > m->max_size)
3047 m->max_size = m->min_size;
3048 }
3049
3050 if (m->keep_free == (uint64_t) -1) {
3051
3052 if (fs_size > 0) {
3053 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3054
3055 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3056 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3057
3058 } else
3059 m->keep_free = DEFAULT_KEEP_FREE;
3060 }
3061
3062 if (m->n_max_files == (uint64_t) -1)
3063 m->n_max_files = DEFAULT_N_MAX_FILES;
3064
3065 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3066 format_bytes(a, sizeof(a), m->min_use),
3067 format_bytes(b, sizeof(b), m->max_use),
3068 format_bytes(c, sizeof(c), m->max_size),
3069 format_bytes(d, sizeof(d), m->min_size),
3070 format_bytes(e, sizeof(e), m->keep_free),
3071 m->n_max_files);
3072 }
3073
3074 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3075 assert(f);
3076 assert(from || to);
3077
3078 if (from) {
3079 if (f->header->head_entry_realtime == 0)
3080 return -ENOENT;
3081
3082 *from = le64toh(f->header->head_entry_realtime);
3083 }
3084
3085 if (to) {
3086 if (f->header->tail_entry_realtime == 0)
3087 return -ENOENT;
3088
3089 *to = le64toh(f->header->tail_entry_realtime);
3090 }
3091
3092 return 1;
3093 }
3094
3095 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3096 Object *o;
3097 uint64_t p;
3098 int r;
3099
3100 assert(f);
3101 assert(from || to);
3102
3103 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3104 if (r <= 0)
3105 return r;
3106
3107 if (le64toh(o->data.n_entries) <= 0)
3108 return 0;
3109
3110 if (from) {
3111 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3112 if (r < 0)
3113 return r;
3114
3115 *from = le64toh(o->entry.monotonic);
3116 }
3117
3118 if (to) {
3119 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3120 if (r < 0)
3121 return r;
3122
3123 r = generic_array_get_plus_one(f,
3124 le64toh(o->data.entry_offset),
3125 le64toh(o->data.entry_array_offset),
3126 le64toh(o->data.n_entries)-1,
3127 &o, NULL);
3128 if (r <= 0)
3129 return r;
3130
3131 *to = le64toh(o->entry.monotonic);
3132 }
3133
3134 return 1;
3135 }
3136
3137 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3138 assert(f);
3139
3140 /* If we gained new header fields we gained new features,
3141 * hence suggest a rotation */
3142 if (le64toh(f->header->header_size) < sizeof(Header)) {
3143 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3144 return true;
3145 }
3146
3147 /* Let's check if the hash tables grew over a certain fill
3148 * level (75%, borrowing this value from Java's hash table
3149 * implementation), and if so suggest a rotation. To calculate
3150 * the fill level we need the n_data field, which only exists
3151 * in newer versions. */
3152
3153 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3154 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3155 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3156 f->path,
3157 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3158 le64toh(f->header->n_data),
3159 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3160 (unsigned long long) f->last_stat.st_size,
3161 f->last_stat.st_size / le64toh(f->header->n_data));
3162 return true;
3163 }
3164
3165 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3166 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3167 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3168 f->path,
3169 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3170 le64toh(f->header->n_fields),
3171 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3172 return true;
3173 }
3174
3175 /* Are the data objects properly indexed by field objects? */
3176 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3177 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3178 le64toh(f->header->n_data) > 0 &&
3179 le64toh(f->header->n_fields) == 0)
3180 return true;
3181
3182 if (max_file_usec > 0) {
3183 usec_t t, h;
3184
3185 h = le64toh(f->header->head_entry_realtime);
3186 t = now(CLOCK_REALTIME);
3187
3188 if (h > 0 && t > h + max_file_usec)
3189 return true;
3190 }
3191
3192 return false;
3193 }