]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
Merge pull request #1467 from jacob-keller/master
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <linux/fs.h>
30
31 #include "btrfs-util.h"
32 #include "journal-def.h"
33 #include "journal-file.h"
34 #include "journal-authenticate.h"
35 #include "lookup3.h"
36 #include "compress.h"
37 #include "random-util.h"
38
/* Sizes, in bytes, of the data and field hash tables. The data value
 * is the lower bound used when the table is sized from max_size, the
 * field value is used as-is. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads smaller than this many bytes are never compressed */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often (every 5s) */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header: we pick the one right above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
83
84 static int journal_file_set_online(JournalFile *f) {
85 assert(f);
86
87 if (!f->writable)
88 return -EPERM;
89
90 if (!(f->fd >= 0 && f->header))
91 return -EINVAL;
92
93 if (mmap_cache_got_sigbus(f->mmap, f->fd))
94 return -EIO;
95
96 switch(f->header->state) {
97 case STATE_ONLINE:
98 return 0;
99
100 case STATE_OFFLINE:
101 f->header->state = STATE_ONLINE;
102 fsync(f->fd);
103 return 0;
104
105 default:
106 return -EINVAL;
107 }
108 }
109
110 int journal_file_set_offline(JournalFile *f) {
111 assert(f);
112
113 if (!f->writable)
114 return -EPERM;
115
116 if (!(f->fd >= 0 && f->header))
117 return -EINVAL;
118
119 if (f->header->state != STATE_ONLINE)
120 return 0;
121
122 fsync(f->fd);
123
124 if (mmap_cache_got_sigbus(f->mmap, f->fd))
125 return -EIO;
126
127 f->header->state = STATE_OFFLINE;
128
129 if (mmap_cache_got_sigbus(f->mmap, f->fd))
130 return -EIO;
131
132 fsync(f->fd);
133
134 return 0;
135 }
136
137 JournalFile* journal_file_close(JournalFile *f) {
138 assert(f);
139
140 #ifdef HAVE_GCRYPT
141 /* Write the final tag */
142 if (f->seal && f->writable)
143 journal_file_append_tag(f);
144 #endif
145
146 journal_file_set_offline(f);
147
148 if (f->mmap && f->fd >= 0)
149 mmap_cache_close_fd(f->mmap, f->fd);
150
151 if (f->fd >= 0 && f->defrag_on_close) {
152
153 /* Be friendly to btrfs: turn COW back on again now,
154 * and defragment the file. We won't write to the file
155 * ever again, hence remove all fragmentation, and
156 * reenable all the good bits COW usually provides
157 * (such as data checksumming). */
158
159 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
160 (void) btrfs_defrag_fd(f->fd);
161 }
162
163 safe_close(f->fd);
164 free(f->path);
165
166 if (f->mmap)
167 mmap_cache_unref(f->mmap);
168
169 ordered_hashmap_free_free(f->chain_cache);
170
171 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
172 free(f->compress_buffer);
173 #endif
174
175 #ifdef HAVE_GCRYPT
176 if (f->fss_file)
177 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
178 else
179 free(f->fsprg_state);
180
181 free(f->fsprg_seed);
182
183 if (f->hmac)
184 gcry_md_close(f->hmac);
185 #endif
186
187 free(f);
188 return NULL;
189 }
190
191 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
192 Header h = {};
193 ssize_t k;
194 int r;
195
196 assert(f);
197
198 memcpy(h.signature, HEADER_SIGNATURE, 8);
199 h.header_size = htole64(ALIGN64(sizeof(h)));
200
201 h.incompatible_flags |= htole32(
202 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
203 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
204
205 h.compatible_flags = htole32(
206 f->seal * HEADER_COMPATIBLE_SEALED);
207
208 r = sd_id128_randomize(&h.file_id);
209 if (r < 0)
210 return r;
211
212 if (template) {
213 h.seqnum_id = template->header->seqnum_id;
214 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
215 } else
216 h.seqnum_id = h.file_id;
217
218 k = pwrite(f->fd, &h, sizeof(h), 0);
219 if (k < 0)
220 return -errno;
221
222 if (k != sizeof(h))
223 return -EIO;
224
225 return 0;
226 }
227
228 static int journal_file_refresh_header(JournalFile *f) {
229 sd_id128_t boot_id;
230 int r;
231
232 assert(f);
233
234 r = sd_id128_get_machine(&f->header->machine_id);
235 if (r < 0)
236 return r;
237
238 r = sd_id128_get_boot(&boot_id);
239 if (r < 0)
240 return r;
241
242 if (sd_id128_equal(boot_id, f->header->boot_id))
243 f->tail_entry_monotonic_valid = true;
244
245 f->header->boot_id = boot_id;
246
247 r = journal_file_set_online(f);
248
249 /* Sync the online state to disk */
250 fsync(f->fd);
251
252 return r;
253 }
254
255 static int journal_file_verify_header(JournalFile *f) {
256 uint32_t flags;
257
258 assert(f);
259
260 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
261 return -EBADMSG;
262
263 /* In both read and write mode we refuse to open files with
264 * incompatible flags we don't know */
265 flags = le32toh(f->header->incompatible_flags);
266 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
267 if (flags & ~HEADER_INCOMPATIBLE_ANY)
268 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
269 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
270 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
271 if (flags)
272 log_debug("Journal file %s uses incompatible flags %"PRIx32
273 " disabled at compilation time.", f->path, flags);
274 return -EPROTONOSUPPORT;
275 }
276
277 /* When open for writing we refuse to open files with
278 * compatible flags, too */
279 flags = le32toh(f->header->compatible_flags);
280 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
281 if (flags & ~HEADER_COMPATIBLE_ANY)
282 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
283 f->path, flags & ~HEADER_COMPATIBLE_ANY);
284 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
285 if (flags)
286 log_debug("Journal file %s uses compatible flags %"PRIx32
287 " disabled at compilation time.", f->path, flags);
288 return -EPROTONOSUPPORT;
289 }
290
291 if (f->header->state >= _STATE_MAX)
292 return -EBADMSG;
293
294 /* The first addition was n_data, so check that we are at least this large */
295 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
296 return -EBADMSG;
297
298 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
299 return -EBADMSG;
300
301 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
302 return -ENODATA;
303
304 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
305 return -ENODATA;
306
307 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
308 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
309 !VALID64(le64toh(f->header->tail_object_offset)) ||
310 !VALID64(le64toh(f->header->entry_array_offset)))
311 return -ENODATA;
312
313 if (f->writable) {
314 uint8_t state;
315 sd_id128_t machine_id;
316 int r;
317
318 r = sd_id128_get_machine(&machine_id);
319 if (r < 0)
320 return r;
321
322 if (!sd_id128_equal(machine_id, f->header->machine_id))
323 return -EHOSTDOWN;
324
325 state = f->header->state;
326
327 if (state == STATE_ONLINE) {
328 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
329 return -EBUSY;
330 } else if (state == STATE_ARCHIVED)
331 return -ESHUTDOWN;
332 else if (state != STATE_OFFLINE) {
333 log_debug("Journal file %s has unknown state %i.", f->path, state);
334 return -EBUSY;
335 }
336 }
337
338 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
339 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
340
341 f->seal = JOURNAL_HEADER_SEALED(f->header);
342
343 return 0;
344 }
345
346 static int journal_file_fstat(JournalFile *f) {
347 assert(f);
348 assert(f->fd >= 0);
349
350 if (fstat(f->fd, &f->last_stat) < 0)
351 return -errno;
352
353 f->last_stat_usec = now(CLOCK_MONOTONIC);
354
355 /* Refuse appending to files that are already deleted */
356 if (f->last_stat.st_nlink <= 0)
357 return -EIDRM;
358
359 return 0;
360 }
361
362 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
363 uint64_t old_size, new_size;
364 int r;
365
366 assert(f);
367
368 /* We assume that this file is not sparse, and we know that
369 * for sure, since we always call posix_fallocate()
370 * ourselves */
371
372 if (mmap_cache_got_sigbus(f->mmap, f->fd))
373 return -EIO;
374
375 old_size =
376 le64toh(f->header->header_size) +
377 le64toh(f->header->arena_size);
378
379 new_size = PAGE_ALIGN(offset + size);
380 if (new_size < le64toh(f->header->header_size))
381 new_size = le64toh(f->header->header_size);
382
383 if (new_size <= old_size) {
384
385 /* We already pre-allocated enough space, but before
386 * we write to it, let's check with fstat() if the
387 * file got deleted, in order make sure we don't throw
388 * away the data immediately. Don't check fstat() for
389 * all writes though, but only once ever 10s. */
390
391 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
392 return 0;
393
394 return journal_file_fstat(f);
395 }
396
397 /* Allocate more space. */
398
399 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
400 return -E2BIG;
401
402 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
403 struct statvfs svfs;
404
405 if (fstatvfs(f->fd, &svfs) >= 0) {
406 uint64_t available;
407
408 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
409
410 if (new_size - old_size > available)
411 return -E2BIG;
412 }
413 }
414
415 /* Increase by larger blocks at once */
416 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
417 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
418 new_size = f->metrics.max_size;
419
420 /* Note that the glibc fallocate() fallback is very
421 inefficient, hence we try to minimize the allocation area
422 as we can. */
423 r = posix_fallocate(f->fd, old_size, new_size - old_size);
424 if (r != 0)
425 return -r;
426
427 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
428
429 return journal_file_fstat(f);
430 }
431
432 static unsigned type_to_context(ObjectType type) {
433 /* One context for each type, plus one catch-all for the rest */
434 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
435 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
436 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
437 }
438
439 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
440 int r;
441
442 assert(f);
443 assert(ret);
444
445 if (size <= 0)
446 return -EINVAL;
447
448 /* Avoid SIGBUS on invalid accesses */
449 if (offset + size > (uint64_t) f->last_stat.st_size) {
450 /* Hmm, out of range? Let's refresh the fstat() data
451 * first, before we trust that check. */
452
453 r = journal_file_fstat(f);
454 if (r < 0)
455 return r;
456
457 if (offset + size > (uint64_t) f->last_stat.st_size)
458 return -EADDRNOTAVAIL;
459 }
460
461 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
462 }
463
464 static uint64_t minimum_header_size(Object *o) {
465
466 static const uint64_t table[] = {
467 [OBJECT_DATA] = sizeof(DataObject),
468 [OBJECT_FIELD] = sizeof(FieldObject),
469 [OBJECT_ENTRY] = sizeof(EntryObject),
470 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
471 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
472 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
473 [OBJECT_TAG] = sizeof(TagObject),
474 };
475
476 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
477 return sizeof(ObjectHeader);
478
479 return table[o->object.type];
480 }
481
482 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
483 int r;
484 void *t;
485 Object *o;
486 uint64_t s;
487
488 assert(f);
489 assert(ret);
490
491 /* Objects may only be located at multiple of 64 bit */
492 if (!VALID64(offset))
493 return -EFAULT;
494
495 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
496 if (r < 0)
497 return r;
498
499 o = (Object*) t;
500 s = le64toh(o->object.size);
501
502 if (s < sizeof(ObjectHeader))
503 return -EBADMSG;
504
505 if (o->object.type <= OBJECT_UNUSED)
506 return -EBADMSG;
507
508 if (s < minimum_header_size(o))
509 return -EBADMSG;
510
511 if (type > OBJECT_UNUSED && o->object.type != type)
512 return -EBADMSG;
513
514 if (s > sizeof(ObjectHeader)) {
515 r = journal_file_move_to(f, type, false, offset, s, &t);
516 if (r < 0)
517 return r;
518
519 o = (Object*) t;
520 }
521
522 *ret = o;
523 return 0;
524 }
525
526 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
527 uint64_t r;
528
529 assert(f);
530
531 r = le64toh(f->header->tail_entry_seqnum) + 1;
532
533 if (seqnum) {
534 /* If an external seqnum counter was passed, we update
535 * both the local and the external one, and set it to
536 * the maximum of both */
537
538 if (*seqnum + 1 > r)
539 r = *seqnum + 1;
540
541 *seqnum = r;
542 }
543
544 f->header->tail_entry_seqnum = htole64(r);
545
546 if (f->header->head_entry_seqnum == 0)
547 f->header->head_entry_seqnum = htole64(r);
548
549 return r;
550 }
551
552 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
553 int r;
554 uint64_t p;
555 Object *tail, *o;
556 void *t;
557
558 assert(f);
559 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
560 assert(size >= sizeof(ObjectHeader));
561 assert(offset);
562 assert(ret);
563
564 r = journal_file_set_online(f);
565 if (r < 0)
566 return r;
567
568 p = le64toh(f->header->tail_object_offset);
569 if (p == 0)
570 p = le64toh(f->header->header_size);
571 else {
572 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
573 if (r < 0)
574 return r;
575
576 p += ALIGN64(le64toh(tail->object.size));
577 }
578
579 r = journal_file_allocate(f, p, size);
580 if (r < 0)
581 return r;
582
583 r = journal_file_move_to(f, type, false, p, size, &t);
584 if (r < 0)
585 return r;
586
587 o = (Object*) t;
588
589 zero(o->object);
590 o->object.type = type;
591 o->object.size = htole64(size);
592
593 f->header->tail_object_offset = htole64(p);
594 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
595
596 *ret = o;
597 *offset = p;
598
599 return 0;
600 }
601
602 static int journal_file_setup_data_hash_table(JournalFile *f) {
603 uint64_t s, p;
604 Object *o;
605 int r;
606
607 assert(f);
608
609 /* We estimate that we need 1 hash table entry per 768 bytes
610 of journal file and we want to make sure we never get
611 beyond 75% fill level. Calculate the hash table size for
612 the maximum file size based on these metrics. */
613
614 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
615 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
616 s = DEFAULT_DATA_HASH_TABLE_SIZE;
617
618 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
619
620 r = journal_file_append_object(f,
621 OBJECT_DATA_HASH_TABLE,
622 offsetof(Object, hash_table.items) + s,
623 &o, &p);
624 if (r < 0)
625 return r;
626
627 memzero(o->hash_table.items, s);
628
629 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
630 f->header->data_hash_table_size = htole64(s);
631
632 return 0;
633 }
634
635 static int journal_file_setup_field_hash_table(JournalFile *f) {
636 uint64_t s, p;
637 Object *o;
638 int r;
639
640 assert(f);
641
642 /* We use a fixed size hash table for the fields as this
643 * number should grow very slowly only */
644
645 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
646 r = journal_file_append_object(f,
647 OBJECT_FIELD_HASH_TABLE,
648 offsetof(Object, hash_table.items) + s,
649 &o, &p);
650 if (r < 0)
651 return r;
652
653 memzero(o->hash_table.items, s);
654
655 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
656 f->header->field_hash_table_size = htole64(s);
657
658 return 0;
659 }
660
661 int journal_file_map_data_hash_table(JournalFile *f) {
662 uint64_t s, p;
663 void *t;
664 int r;
665
666 assert(f);
667
668 if (f->data_hash_table)
669 return 0;
670
671 p = le64toh(f->header->data_hash_table_offset);
672 s = le64toh(f->header->data_hash_table_size);
673
674 r = journal_file_move_to(f,
675 OBJECT_DATA_HASH_TABLE,
676 true,
677 p, s,
678 &t);
679 if (r < 0)
680 return r;
681
682 f->data_hash_table = t;
683 return 0;
684 }
685
686 int journal_file_map_field_hash_table(JournalFile *f) {
687 uint64_t s, p;
688 void *t;
689 int r;
690
691 assert(f);
692
693 if (f->field_hash_table)
694 return 0;
695
696 p = le64toh(f->header->field_hash_table_offset);
697 s = le64toh(f->header->field_hash_table_size);
698
699 r = journal_file_move_to(f,
700 OBJECT_FIELD_HASH_TABLE,
701 true,
702 p, s,
703 &t);
704 if (r < 0)
705 return r;
706
707 f->field_hash_table = t;
708 return 0;
709 }
710
711 static int journal_file_link_field(
712 JournalFile *f,
713 Object *o,
714 uint64_t offset,
715 uint64_t hash) {
716
717 uint64_t p, h, m;
718 int r;
719
720 assert(f);
721 assert(o);
722 assert(offset > 0);
723
724 if (o->object.type != OBJECT_FIELD)
725 return -EINVAL;
726
727 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
728 if (m <= 0)
729 return -EBADMSG;
730
731 /* This might alter the window we are looking at */
732 o->field.next_hash_offset = o->field.head_data_offset = 0;
733
734 h = hash % m;
735 p = le64toh(f->field_hash_table[h].tail_hash_offset);
736 if (p == 0)
737 f->field_hash_table[h].head_hash_offset = htole64(offset);
738 else {
739 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
740 if (r < 0)
741 return r;
742
743 o->field.next_hash_offset = htole64(offset);
744 }
745
746 f->field_hash_table[h].tail_hash_offset = htole64(offset);
747
748 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
749 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
750
751 return 0;
752 }
753
754 static int journal_file_link_data(
755 JournalFile *f,
756 Object *o,
757 uint64_t offset,
758 uint64_t hash) {
759
760 uint64_t p, h, m;
761 int r;
762
763 assert(f);
764 assert(o);
765 assert(offset > 0);
766
767 if (o->object.type != OBJECT_DATA)
768 return -EINVAL;
769
770 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
771 if (m <= 0)
772 return -EBADMSG;
773
774 /* This might alter the window we are looking at */
775 o->data.next_hash_offset = o->data.next_field_offset = 0;
776 o->data.entry_offset = o->data.entry_array_offset = 0;
777 o->data.n_entries = 0;
778
779 h = hash % m;
780 p = le64toh(f->data_hash_table[h].tail_hash_offset);
781 if (p == 0)
782 /* Only entry in the hash table is easy */
783 f->data_hash_table[h].head_hash_offset = htole64(offset);
784 else {
785 /* Move back to the previous data object, to patch in
786 * pointer */
787
788 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
789 if (r < 0)
790 return r;
791
792 o->data.next_hash_offset = htole64(offset);
793 }
794
795 f->data_hash_table[h].tail_hash_offset = htole64(offset);
796
797 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
798 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
799
800 return 0;
801 }
802
803 int journal_file_find_field_object_with_hash(
804 JournalFile *f,
805 const void *field, uint64_t size, uint64_t hash,
806 Object **ret, uint64_t *offset) {
807
808 uint64_t p, osize, h, m;
809 int r;
810
811 assert(f);
812 assert(field && size > 0);
813
814 /* If the field hash table is empty, we can't find anything */
815 if (le64toh(f->header->field_hash_table_size) <= 0)
816 return 0;
817
818 /* Map the field hash table, if it isn't mapped yet. */
819 r = journal_file_map_field_hash_table(f);
820 if (r < 0)
821 return r;
822
823 osize = offsetof(Object, field.payload) + size;
824
825 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
826 if (m <= 0)
827 return -EBADMSG;
828
829 h = hash % m;
830 p = le64toh(f->field_hash_table[h].head_hash_offset);
831
832 while (p > 0) {
833 Object *o;
834
835 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
836 if (r < 0)
837 return r;
838
839 if (le64toh(o->field.hash) == hash &&
840 le64toh(o->object.size) == osize &&
841 memcmp(o->field.payload, field, size) == 0) {
842
843 if (ret)
844 *ret = o;
845 if (offset)
846 *offset = p;
847
848 return 1;
849 }
850
851 p = le64toh(o->field.next_hash_offset);
852 }
853
854 return 0;
855 }
856
857 int journal_file_find_field_object(
858 JournalFile *f,
859 const void *field, uint64_t size,
860 Object **ret, uint64_t *offset) {
861
862 uint64_t hash;
863
864 assert(f);
865 assert(field && size > 0);
866
867 hash = hash64(field, size);
868
869 return journal_file_find_field_object_with_hash(f,
870 field, size, hash,
871 ret, offset);
872 }
873
874 int journal_file_find_data_object_with_hash(
875 JournalFile *f,
876 const void *data, uint64_t size, uint64_t hash,
877 Object **ret, uint64_t *offset) {
878
879 uint64_t p, osize, h, m;
880 int r;
881
882 assert(f);
883 assert(data || size == 0);
884
885 /* If there's no data hash table, then there's no entry. */
886 if (le64toh(f->header->data_hash_table_size) <= 0)
887 return 0;
888
889 /* Map the data hash table, if it isn't mapped yet. */
890 r = journal_file_map_data_hash_table(f);
891 if (r < 0)
892 return r;
893
894 osize = offsetof(Object, data.payload) + size;
895
896 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
897 if (m <= 0)
898 return -EBADMSG;
899
900 h = hash % m;
901 p = le64toh(f->data_hash_table[h].head_hash_offset);
902
903 while (p > 0) {
904 Object *o;
905
906 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
907 if (r < 0)
908 return r;
909
910 if (le64toh(o->data.hash) != hash)
911 goto next;
912
913 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
914 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
915 uint64_t l;
916 size_t rsize = 0;
917
918 l = le64toh(o->object.size);
919 if (l <= offsetof(Object, data.payload))
920 return -EBADMSG;
921
922 l -= offsetof(Object, data.payload);
923
924 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
925 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
926 if (r < 0)
927 return r;
928
929 if (rsize == size &&
930 memcmp(f->compress_buffer, data, size) == 0) {
931
932 if (ret)
933 *ret = o;
934
935 if (offset)
936 *offset = p;
937
938 return 1;
939 }
940 #else
941 return -EPROTONOSUPPORT;
942 #endif
943 } else if (le64toh(o->object.size) == osize &&
944 memcmp(o->data.payload, data, size) == 0) {
945
946 if (ret)
947 *ret = o;
948
949 if (offset)
950 *offset = p;
951
952 return 1;
953 }
954
955 next:
956 p = le64toh(o->data.next_hash_offset);
957 }
958
959 return 0;
960 }
961
962 int journal_file_find_data_object(
963 JournalFile *f,
964 const void *data, uint64_t size,
965 Object **ret, uint64_t *offset) {
966
967 uint64_t hash;
968
969 assert(f);
970 assert(data || size == 0);
971
972 hash = hash64(data, size);
973
974 return journal_file_find_data_object_with_hash(f,
975 data, size, hash,
976 ret, offset);
977 }
978
979 static int journal_file_append_field(
980 JournalFile *f,
981 const void *field, uint64_t size,
982 Object **ret, uint64_t *offset) {
983
984 uint64_t hash, p;
985 uint64_t osize;
986 Object *o;
987 int r;
988
989 assert(f);
990 assert(field && size > 0);
991
992 hash = hash64(field, size);
993
994 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
995 if (r < 0)
996 return r;
997 else if (r > 0) {
998
999 if (ret)
1000 *ret = o;
1001
1002 if (offset)
1003 *offset = p;
1004
1005 return 0;
1006 }
1007
1008 osize = offsetof(Object, field.payload) + size;
1009 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1010 if (r < 0)
1011 return r;
1012
1013 o->field.hash = htole64(hash);
1014 memcpy(o->field.payload, field, size);
1015
1016 r = journal_file_link_field(f, o, p, hash);
1017 if (r < 0)
1018 return r;
1019
1020 /* The linking might have altered the window, so let's
1021 * refresh our pointer */
1022 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1023 if (r < 0)
1024 return r;
1025
1026 #ifdef HAVE_GCRYPT
1027 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1028 if (r < 0)
1029 return r;
1030 #endif
1031
1032 if (ret)
1033 *ret = o;
1034
1035 if (offset)
1036 *offset = p;
1037
1038 return 0;
1039 }
1040
1041 static int journal_file_append_data(
1042 JournalFile *f,
1043 const void *data, uint64_t size,
1044 Object **ret, uint64_t *offset) {
1045
1046 uint64_t hash, p;
1047 uint64_t osize;
1048 Object *o;
1049 int r, compression = 0;
1050 const void *eq;
1051
1052 assert(f);
1053 assert(data || size == 0);
1054
1055 hash = hash64(data, size);
1056
1057 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1058 if (r < 0)
1059 return r;
1060 else if (r > 0) {
1061
1062 if (ret)
1063 *ret = o;
1064
1065 if (offset)
1066 *offset = p;
1067
1068 return 0;
1069 }
1070
1071 osize = offsetof(Object, data.payload) + size;
1072 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1073 if (r < 0)
1074 return r;
1075
1076 o->data.hash = htole64(hash);
1077
1078 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1079 if (f->compress_xz &&
1080 size >= COMPRESSION_SIZE_THRESHOLD) {
1081 size_t rsize = 0;
1082
1083 compression = compress_blob(data, size, o->data.payload, &rsize);
1084
1085 if (compression) {
1086 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1087 o->object.flags |= compression;
1088
1089 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1090 size, rsize, object_compressed_to_string(compression));
1091 }
1092 }
1093 #endif
1094
1095 if (!compression && size > 0)
1096 memcpy(o->data.payload, data, size);
1097
1098 r = journal_file_link_data(f, o, p, hash);
1099 if (r < 0)
1100 return r;
1101
1102 /* The linking might have altered the window, so let's
1103 * refresh our pointer */
1104 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1105 if (r < 0)
1106 return r;
1107
1108 if (!data)
1109 eq = NULL;
1110 else
1111 eq = memchr(data, '=', size);
1112 if (eq && eq > data) {
1113 Object *fo = NULL;
1114 uint64_t fp;
1115
1116 /* Create field object ... */
1117 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1118 if (r < 0)
1119 return r;
1120
1121 /* ... and link it in. */
1122 o->data.next_field_offset = fo->field.head_data_offset;
1123 fo->field.head_data_offset = le64toh(p);
1124 }
1125
1126 #ifdef HAVE_GCRYPT
1127 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1128 if (r < 0)
1129 return r;
1130 #endif
1131
1132 if (ret)
1133 *ret = o;
1134
1135 if (offset)
1136 *offset = p;
1137
1138 return 0;
1139 }
1140
1141 uint64_t journal_file_entry_n_items(Object *o) {
1142 assert(o);
1143
1144 if (o->object.type != OBJECT_ENTRY)
1145 return 0;
1146
1147 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1148 }
1149
1150 uint64_t journal_file_entry_array_n_items(Object *o) {
1151 assert(o);
1152
1153 if (o->object.type != OBJECT_ENTRY_ARRAY)
1154 return 0;
1155
1156 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1157 }
1158
1159 uint64_t journal_file_hash_table_n_items(Object *o) {
1160 assert(o);
1161
1162 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1163 o->object.type != OBJECT_FIELD_HASH_TABLE)
1164 return 0;
1165
1166 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1167 }
1168
1169 static int link_entry_into_array(JournalFile *f,
1170 le64_t *first,
1171 le64_t *idx,
1172 uint64_t p) {
1173 int r;
1174 uint64_t n = 0, ap = 0, q, i, a, hidx;
1175 Object *o;
1176
1177 assert(f);
1178 assert(first);
1179 assert(idx);
1180 assert(p > 0);
1181
1182 a = le64toh(*first);
1183 i = hidx = le64toh(*idx);
1184 while (a > 0) {
1185
1186 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1187 if (r < 0)
1188 return r;
1189
1190 n = journal_file_entry_array_n_items(o);
1191 if (i < n) {
1192 o->entry_array.items[i] = htole64(p);
1193 *idx = htole64(hidx + 1);
1194 return 0;
1195 }
1196
1197 i -= n;
1198 ap = a;
1199 a = le64toh(o->entry_array.next_entry_array_offset);
1200 }
1201
1202 if (hidx > n)
1203 n = (hidx+1) * 2;
1204 else
1205 n = n * 2;
1206
1207 if (n < 4)
1208 n = 4;
1209
1210 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1211 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1212 &o, &q);
1213 if (r < 0)
1214 return r;
1215
1216 #ifdef HAVE_GCRYPT
1217 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1218 if (r < 0)
1219 return r;
1220 #endif
1221
1222 o->entry_array.items[i] = htole64(p);
1223
1224 if (ap == 0)
1225 *first = htole64(q);
1226 else {
1227 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1228 if (r < 0)
1229 return r;
1230
1231 o->entry_array.next_entry_array_offset = htole64(q);
1232 }
1233
1234 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1235 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1236
1237 *idx = htole64(hidx + 1);
1238
1239 return 0;
1240 }
1241
1242 static int link_entry_into_array_plus_one(JournalFile *f,
1243 le64_t *extra,
1244 le64_t *first,
1245 le64_t *idx,
1246 uint64_t p) {
1247
1248 int r;
1249
1250 assert(f);
1251 assert(extra);
1252 assert(first);
1253 assert(idx);
1254 assert(p > 0);
1255
1256 if (*idx == 0)
1257 *extra = htole64(p);
1258 else {
1259 le64_t i;
1260
1261 i = htole64(le64toh(*idx) - 1);
1262 r = link_entry_into_array(f, first, &i, p);
1263 if (r < 0)
1264 return r;
1265 }
1266
1267 *idx = htole64(le64toh(*idx) + 1);
1268 return 0;
1269 }
1270
1271 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1272 uint64_t p;
1273 int r;
1274 assert(f);
1275 assert(o);
1276 assert(offset > 0);
1277
1278 p = le64toh(o->entry.items[i].object_offset);
1279 if (p == 0)
1280 return -EINVAL;
1281
1282 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1283 if (r < 0)
1284 return r;
1285
1286 return link_entry_into_array_plus_one(f,
1287 &o->data.entry_offset,
1288 &o->data.entry_array_offset,
1289 &o->data.n_entries,
1290 offset);
1291 }
1292
1293 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1294 uint64_t n, i;
1295 int r;
1296
1297 assert(f);
1298 assert(o);
1299 assert(offset > 0);
1300
1301 if (o->object.type != OBJECT_ENTRY)
1302 return -EINVAL;
1303
1304 __sync_synchronize();
1305
1306 /* Link up the entry itself */
1307 r = link_entry_into_array(f,
1308 &f->header->entry_array_offset,
1309 &f->header->n_entries,
1310 offset);
1311 if (r < 0)
1312 return r;
1313
1314 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1315
1316 if (f->header->head_entry_realtime == 0)
1317 f->header->head_entry_realtime = o->entry.realtime;
1318
1319 f->header->tail_entry_realtime = o->entry.realtime;
1320 f->header->tail_entry_monotonic = o->entry.monotonic;
1321
1322 f->tail_entry_monotonic_valid = true;
1323
1324 /* Link up the items */
1325 n = journal_file_entry_n_items(o);
1326 for (i = 0; i < n; i++) {
1327 r = journal_file_link_entry_item(f, o, offset, i);
1328 if (r < 0)
1329 return r;
1330 }
1331
1332 return 0;
1333 }
1334
1335 static int journal_file_append_entry_internal(
1336 JournalFile *f,
1337 const dual_timestamp *ts,
1338 uint64_t xor_hash,
1339 const EntryItem items[], unsigned n_items,
1340 uint64_t *seqnum,
1341 Object **ret, uint64_t *offset) {
1342 uint64_t np;
1343 uint64_t osize;
1344 Object *o;
1345 int r;
1346
1347 assert(f);
1348 assert(items || n_items == 0);
1349 assert(ts);
1350
1351 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1352
1353 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1354 if (r < 0)
1355 return r;
1356
1357 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1358 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1359 o->entry.realtime = htole64(ts->realtime);
1360 o->entry.monotonic = htole64(ts->monotonic);
1361 o->entry.xor_hash = htole64(xor_hash);
1362 o->entry.boot_id = f->header->boot_id;
1363
1364 #ifdef HAVE_GCRYPT
1365 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1366 if (r < 0)
1367 return r;
1368 #endif
1369
1370 r = journal_file_link_entry(f, o, np);
1371 if (r < 0)
1372 return r;
1373
1374 if (ret)
1375 *ret = o;
1376
1377 if (offset)
1378 *offset = np;
1379
1380 return 0;
1381 }
1382
1383 void journal_file_post_change(JournalFile *f) {
1384 assert(f);
1385
1386 /* inotify() does not receive IN_MODIFY events from file
1387 * accesses done via mmap(). After each access we hence
1388 * trigger IN_MODIFY by truncating the journal file to its
1389 * current size which triggers IN_MODIFY. */
1390
1391 __sync_synchronize();
1392
1393 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1394 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1395 }
1396
1397 static int entry_item_cmp(const void *_a, const void *_b) {
1398 const EntryItem *a = _a, *b = _b;
1399
1400 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1401 return -1;
1402 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1403 return 1;
1404 return 0;
1405 }
1406
1407 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1408 unsigned i;
1409 EntryItem *items;
1410 int r;
1411 uint64_t xor_hash = 0;
1412 struct dual_timestamp _ts;
1413
1414 assert(f);
1415 assert(iovec || n_iovec == 0);
1416
1417 if (!ts) {
1418 dual_timestamp_get(&_ts);
1419 ts = &_ts;
1420 }
1421
1422 if (f->tail_entry_monotonic_valid &&
1423 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1424 return -EINVAL;
1425
1426 #ifdef HAVE_GCRYPT
1427 r = journal_file_maybe_append_tag(f, ts->realtime);
1428 if (r < 0)
1429 return r;
1430 #endif
1431
1432 /* alloca() can't take 0, hence let's allocate at least one */
1433 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1434
1435 for (i = 0; i < n_iovec; i++) {
1436 uint64_t p;
1437 Object *o;
1438
1439 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1440 if (r < 0)
1441 return r;
1442
1443 xor_hash ^= le64toh(o->data.hash);
1444 items[i].object_offset = htole64(p);
1445 items[i].hash = o->data.hash;
1446 }
1447
1448 /* Order by the position on disk, in order to improve seek
1449 * times for rotating media. */
1450 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1451
1452 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1453
1454 /* If the memory mapping triggered a SIGBUS then we return an
1455 * IO error and ignore the error code passed down to us, since
1456 * it is very likely just an effect of a nullified replacement
1457 * mapping page */
1458
1459 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1460 r = -EIO;
1461
1462 journal_file_post_change(f);
1463
1464 return r;
1465 }
1466
/* One cached position within a chain of entry array objects. */
typedef struct ChainCacheItem {
        /* Offset of the array that starts the chain */
        uint64_t first;
        /* Offset of the array this item caches */
        uint64_t array;
        /* First item stored in the cached array */
        uint64_t begin;
        /* Number of items in all arrays of the chain before the cached one */
        uint64_t total;
        /* Last index looked at, used to improve locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1474
1475 static void chain_cache_put(
1476 OrderedHashmap *h,
1477 ChainCacheItem *ci,
1478 uint64_t first,
1479 uint64_t array,
1480 uint64_t begin,
1481 uint64_t total,
1482 uint64_t last_index) {
1483
1484 if (!ci) {
1485 /* If the chain item to cache for this chain is the
1486 * first one it's not worth caching anything */
1487 if (array == first)
1488 return;
1489
1490 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1491 ci = ordered_hashmap_steal_first(h);
1492 assert(ci);
1493 } else {
1494 ci = new(ChainCacheItem, 1);
1495 if (!ci)
1496 return;
1497 }
1498
1499 ci->first = first;
1500
1501 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1502 free(ci);
1503 return;
1504 }
1505 } else
1506 assert(ci->first == first);
1507
1508 ci->array = array;
1509 ci->begin = begin;
1510 ci->total = total;
1511 ci->last_index = last_index;
1512 }
1513
1514 static int generic_array_get(
1515 JournalFile *f,
1516 uint64_t first,
1517 uint64_t i,
1518 Object **ret, uint64_t *offset) {
1519
1520 Object *o;
1521 uint64_t p = 0, a, t = 0;
1522 int r;
1523 ChainCacheItem *ci;
1524
1525 assert(f);
1526
1527 a = first;
1528
1529 /* Try the chain cache first */
1530 ci = ordered_hashmap_get(f->chain_cache, &first);
1531 if (ci && i > ci->total) {
1532 a = ci->array;
1533 i -= ci->total;
1534 t = ci->total;
1535 }
1536
1537 while (a > 0) {
1538 uint64_t k;
1539
1540 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1541 if (r < 0)
1542 return r;
1543
1544 k = journal_file_entry_array_n_items(o);
1545 if (i < k) {
1546 p = le64toh(o->entry_array.items[i]);
1547 goto found;
1548 }
1549
1550 i -= k;
1551 t += k;
1552 a = le64toh(o->entry_array.next_entry_array_offset);
1553 }
1554
1555 return 0;
1556
1557 found:
1558 /* Let's cache this item for the next invocation */
1559 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1560
1561 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1562 if (r < 0)
1563 return r;
1564
1565 if (ret)
1566 *ret = o;
1567
1568 if (offset)
1569 *offset = p;
1570
1571 return 1;
1572 }
1573
1574 static int generic_array_get_plus_one(
1575 JournalFile *f,
1576 uint64_t extra,
1577 uint64_t first,
1578 uint64_t i,
1579 Object **ret, uint64_t *offset) {
1580
1581 Object *o;
1582
1583 assert(f);
1584
1585 if (i == 0) {
1586 int r;
1587
1588 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1589 if (r < 0)
1590 return r;
1591
1592 if (ret)
1593 *ret = o;
1594
1595 if (offset)
1596 *offset = extra;
1597
1598 return 1;
1599 }
1600
1601 return generic_array_get(f, first, i-1, ret, offset);
1602 }
1603
/* Results returned by the test_object callbacks used for bisection */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
1609
1610 static int generic_array_bisect(
1611 JournalFile *f,
1612 uint64_t first,
1613 uint64_t n,
1614 uint64_t needle,
1615 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1616 direction_t direction,
1617 Object **ret,
1618 uint64_t *offset,
1619 uint64_t *idx) {
1620
1621 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1622 bool subtract_one = false;
1623 Object *o, *array = NULL;
1624 int r;
1625 ChainCacheItem *ci;
1626
1627 assert(f);
1628 assert(test_object);
1629
1630 /* Start with the first array in the chain */
1631 a = first;
1632
1633 ci = ordered_hashmap_get(f->chain_cache, &first);
1634 if (ci && n > ci->total) {
1635 /* Ah, we have iterated this bisection array chain
1636 * previously! Let's see if we can skip ahead in the
1637 * chain, as far as the last time. But we can't jump
1638 * backwards in the chain, so let's check that
1639 * first. */
1640
1641 r = test_object(f, ci->begin, needle);
1642 if (r < 0)
1643 return r;
1644
1645 if (r == TEST_LEFT) {
1646 /* OK, what we are looking for is right of the
1647 * begin of this EntryArray, so let's jump
1648 * straight to previously cached array in the
1649 * chain */
1650
1651 a = ci->array;
1652 n -= ci->total;
1653 t = ci->total;
1654 last_index = ci->last_index;
1655 }
1656 }
1657
1658 while (a > 0) {
1659 uint64_t left, right, k, lp;
1660
1661 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1662 if (r < 0)
1663 return r;
1664
1665 k = journal_file_entry_array_n_items(array);
1666 right = MIN(k, n);
1667 if (right <= 0)
1668 return 0;
1669
1670 i = right - 1;
1671 lp = p = le64toh(array->entry_array.items[i]);
1672 if (p <= 0)
1673 return -EBADMSG;
1674
1675 r = test_object(f, p, needle);
1676 if (r < 0)
1677 return r;
1678
1679 if (r == TEST_FOUND)
1680 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1681
1682 if (r == TEST_RIGHT) {
1683 left = 0;
1684 right -= 1;
1685
1686 if (last_index != (uint64_t) -1) {
1687 assert(last_index <= right);
1688
1689 /* If we cached the last index we
1690 * looked at, let's try to not to jump
1691 * too wildly around and see if we can
1692 * limit the range to look at early to
1693 * the immediate neighbors of the last
1694 * index we looked at. */
1695
1696 if (last_index > 0) {
1697 uint64_t x = last_index - 1;
1698
1699 p = le64toh(array->entry_array.items[x]);
1700 if (p <= 0)
1701 return -EBADMSG;
1702
1703 r = test_object(f, p, needle);
1704 if (r < 0)
1705 return r;
1706
1707 if (r == TEST_FOUND)
1708 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1709
1710 if (r == TEST_RIGHT)
1711 right = x;
1712 else
1713 left = x + 1;
1714 }
1715
1716 if (last_index < right) {
1717 uint64_t y = last_index + 1;
1718
1719 p = le64toh(array->entry_array.items[y]);
1720 if (p <= 0)
1721 return -EBADMSG;
1722
1723 r = test_object(f, p, needle);
1724 if (r < 0)
1725 return r;
1726
1727 if (r == TEST_FOUND)
1728 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1729
1730 if (r == TEST_RIGHT)
1731 right = y;
1732 else
1733 left = y + 1;
1734 }
1735 }
1736
1737 for (;;) {
1738 if (left == right) {
1739 if (direction == DIRECTION_UP)
1740 subtract_one = true;
1741
1742 i = left;
1743 goto found;
1744 }
1745
1746 assert(left < right);
1747 i = (left + right) / 2;
1748
1749 p = le64toh(array->entry_array.items[i]);
1750 if (p <= 0)
1751 return -EBADMSG;
1752
1753 r = test_object(f, p, needle);
1754 if (r < 0)
1755 return r;
1756
1757 if (r == TEST_FOUND)
1758 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1759
1760 if (r == TEST_RIGHT)
1761 right = i;
1762 else
1763 left = i + 1;
1764 }
1765 }
1766
1767 if (k >= n) {
1768 if (direction == DIRECTION_UP) {
1769 i = n;
1770 subtract_one = true;
1771 goto found;
1772 }
1773
1774 return 0;
1775 }
1776
1777 last_p = lp;
1778
1779 n -= k;
1780 t += k;
1781 last_index = (uint64_t) -1;
1782 a = le64toh(array->entry_array.next_entry_array_offset);
1783 }
1784
1785 return 0;
1786
1787 found:
1788 if (subtract_one && t == 0 && i == 0)
1789 return 0;
1790
1791 /* Let's cache this item for the next invocation */
1792 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1793
1794 if (subtract_one && i == 0)
1795 p = last_p;
1796 else if (subtract_one)
1797 p = le64toh(array->entry_array.items[i-1]);
1798 else
1799 p = le64toh(array->entry_array.items[i]);
1800
1801 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1802 if (r < 0)
1803 return r;
1804
1805 if (ret)
1806 *ret = o;
1807
1808 if (offset)
1809 *offset = p;
1810
1811 if (idx)
1812 *idx = t + i + (subtract_one ? -1 : 0);
1813
1814 return 1;
1815 }
1816
1817 static int generic_array_bisect_plus_one(
1818 JournalFile *f,
1819 uint64_t extra,
1820 uint64_t first,
1821 uint64_t n,
1822 uint64_t needle,
1823 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1824 direction_t direction,
1825 Object **ret,
1826 uint64_t *offset,
1827 uint64_t *idx) {
1828
1829 int r;
1830 bool step_back = false;
1831 Object *o;
1832
1833 assert(f);
1834 assert(test_object);
1835
1836 if (n <= 0)
1837 return 0;
1838
1839 /* This bisects the array in object 'first', but first checks
1840 * an extra */
1841 r = test_object(f, extra, needle);
1842 if (r < 0)
1843 return r;
1844
1845 if (r == TEST_FOUND)
1846 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1847
1848 /* if we are looking with DIRECTION_UP then we need to first
1849 see if in the actual array there is a matching entry, and
1850 return the last one of that. But if there isn't any we need
1851 to return this one. Hence remember this, and return it
1852 below. */
1853 if (r == TEST_LEFT)
1854 step_back = direction == DIRECTION_UP;
1855
1856 if (r == TEST_RIGHT) {
1857 if (direction == DIRECTION_DOWN)
1858 goto found;
1859 else
1860 return 0;
1861 }
1862
1863 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1864
1865 if (r == 0 && step_back)
1866 goto found;
1867
1868 if (r > 0 && idx)
1869 (*idx) ++;
1870
1871 return r;
1872
1873 found:
1874 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1875 if (r < 0)
1876 return r;
1877
1878 if (ret)
1879 *ret = o;
1880
1881 if (offset)
1882 *offset = extra;
1883
1884 if (idx)
1885 *idx = 0;
1886
1887 return 1;
1888 }
1889
1890 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1891 assert(f);
1892 assert(p > 0);
1893
1894 if (p == needle)
1895 return TEST_FOUND;
1896 else if (p < needle)
1897 return TEST_LEFT;
1898 else
1899 return TEST_RIGHT;
1900 }
1901
1902 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1903 Object *o;
1904 int r;
1905
1906 assert(f);
1907 assert(p > 0);
1908
1909 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1910 if (r < 0)
1911 return r;
1912
1913 if (le64toh(o->entry.seqnum) == needle)
1914 return TEST_FOUND;
1915 else if (le64toh(o->entry.seqnum) < needle)
1916 return TEST_LEFT;
1917 else
1918 return TEST_RIGHT;
1919 }
1920
1921 int journal_file_move_to_entry_by_seqnum(
1922 JournalFile *f,
1923 uint64_t seqnum,
1924 direction_t direction,
1925 Object **ret,
1926 uint64_t *offset) {
1927
1928 return generic_array_bisect(f,
1929 le64toh(f->header->entry_array_offset),
1930 le64toh(f->header->n_entries),
1931 seqnum,
1932 test_object_seqnum,
1933 direction,
1934 ret, offset, NULL);
1935 }
1936
1937 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1938 Object *o;
1939 int r;
1940
1941 assert(f);
1942 assert(p > 0);
1943
1944 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1945 if (r < 0)
1946 return r;
1947
1948 if (le64toh(o->entry.realtime) == needle)
1949 return TEST_FOUND;
1950 else if (le64toh(o->entry.realtime) < needle)
1951 return TEST_LEFT;
1952 else
1953 return TEST_RIGHT;
1954 }
1955
1956 int journal_file_move_to_entry_by_realtime(
1957 JournalFile *f,
1958 uint64_t realtime,
1959 direction_t direction,
1960 Object **ret,
1961 uint64_t *offset) {
1962
1963 return generic_array_bisect(f,
1964 le64toh(f->header->entry_array_offset),
1965 le64toh(f->header->n_entries),
1966 realtime,
1967 test_object_realtime,
1968 direction,
1969 ret, offset, NULL);
1970 }
1971
1972 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1973 Object *o;
1974 int r;
1975
1976 assert(f);
1977 assert(p > 0);
1978
1979 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1980 if (r < 0)
1981 return r;
1982
1983 if (le64toh(o->entry.monotonic) == needle)
1984 return TEST_FOUND;
1985 else if (le64toh(o->entry.monotonic) < needle)
1986 return TEST_LEFT;
1987 else
1988 return TEST_RIGHT;
1989 }
1990
1991 static int find_data_object_by_boot_id(
1992 JournalFile *f,
1993 sd_id128_t boot_id,
1994 Object **o,
1995 uint64_t *b) {
1996
1997 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1998
1999 sd_id128_to_string(boot_id, t + 9);
2000 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2001 }
2002
2003 int journal_file_move_to_entry_by_monotonic(
2004 JournalFile *f,
2005 sd_id128_t boot_id,
2006 uint64_t monotonic,
2007 direction_t direction,
2008 Object **ret,
2009 uint64_t *offset) {
2010
2011 Object *o;
2012 int r;
2013
2014 assert(f);
2015
2016 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2017 if (r < 0)
2018 return r;
2019 if (r == 0)
2020 return -ENOENT;
2021
2022 return generic_array_bisect_plus_one(f,
2023 le64toh(o->data.entry_offset),
2024 le64toh(o->data.entry_array_offset),
2025 le64toh(o->data.n_entries),
2026 monotonic,
2027 test_object_monotonic,
2028 direction,
2029 ret, offset, NULL);
2030 }
2031
2032 void journal_file_reset_location(JournalFile *f) {
2033 f->location_type = LOCATION_HEAD;
2034 f->current_offset = 0;
2035 f->current_seqnum = 0;
2036 f->current_realtime = 0;
2037 f->current_monotonic = 0;
2038 zero(f->current_boot_id);
2039 f->current_xor_hash = 0;
2040 }
2041
2042 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2043 f->location_type = LOCATION_SEEK;
2044 f->current_offset = offset;
2045 f->current_seqnum = le64toh(o->entry.seqnum);
2046 f->current_realtime = le64toh(o->entry.realtime);
2047 f->current_monotonic = le64toh(o->entry.monotonic);
2048 f->current_boot_id = o->entry.boot_id;
2049 f->current_xor_hash = le64toh(o->entry.xor_hash);
2050 }
2051
2052 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2053 assert(af);
2054 assert(bf);
2055 assert(af->location_type == LOCATION_SEEK);
2056 assert(bf->location_type == LOCATION_SEEK);
2057
2058 /* If contents and timestamps match, these entries are
2059 * identical, even if the seqnum does not match */
2060 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2061 af->current_monotonic == bf->current_monotonic &&
2062 af->current_realtime == bf->current_realtime &&
2063 af->current_xor_hash == bf->current_xor_hash)
2064 return 0;
2065
2066 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2067
2068 /* If this is from the same seqnum source, compare
2069 * seqnums */
2070 if (af->current_seqnum < bf->current_seqnum)
2071 return -1;
2072 if (af->current_seqnum > bf->current_seqnum)
2073 return 1;
2074
2075 /* Wow! This is weird, different data but the same
2076 * seqnums? Something is borked, but let's make the
2077 * best of it and compare by time. */
2078 }
2079
2080 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2081
2082 /* If the boot id matches, compare monotonic time */
2083 if (af->current_monotonic < bf->current_monotonic)
2084 return -1;
2085 if (af->current_monotonic > bf->current_monotonic)
2086 return 1;
2087 }
2088
2089 /* Otherwise, compare UTC time */
2090 if (af->current_realtime < bf->current_realtime)
2091 return -1;
2092 if (af->current_realtime > bf->current_realtime)
2093 return 1;
2094
2095 /* Finally, compare by contents */
2096 if (af->current_xor_hash < bf->current_xor_hash)
2097 return -1;
2098 if (af->current_xor_hash > bf->current_xor_hash)
2099 return 1;
2100
2101 return 0;
2102 }
2103
2104 int journal_file_next_entry(
2105 JournalFile *f,
2106 uint64_t p,
2107 direction_t direction,
2108 Object **ret, uint64_t *offset) {
2109
2110 uint64_t i, n, ofs;
2111 int r;
2112
2113 assert(f);
2114
2115 n = le64toh(f->header->n_entries);
2116 if (n <= 0)
2117 return 0;
2118
2119 if (p == 0)
2120 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2121 else {
2122 r = generic_array_bisect(f,
2123 le64toh(f->header->entry_array_offset),
2124 le64toh(f->header->n_entries),
2125 p,
2126 test_object_offset,
2127 DIRECTION_DOWN,
2128 NULL, NULL,
2129 &i);
2130 if (r <= 0)
2131 return r;
2132
2133 if (direction == DIRECTION_DOWN) {
2134 if (i >= n - 1)
2135 return 0;
2136
2137 i++;
2138 } else {
2139 if (i <= 0)
2140 return 0;
2141
2142 i--;
2143 }
2144 }
2145
2146 /* And jump to it */
2147 r = generic_array_get(f,
2148 le64toh(f->header->entry_array_offset),
2149 i,
2150 ret, &ofs);
2151 if (r <= 0)
2152 return r;
2153
2154 if (p > 0 &&
2155 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2156 log_debug("%s: entry array corrupted at entry %"PRIu64,
2157 f->path, i);
2158 return -EBADMSG;
2159 }
2160
2161 if (offset)
2162 *offset = ofs;
2163
2164 return 1;
2165 }
2166
2167 int journal_file_next_entry_for_data(
2168 JournalFile *f,
2169 Object *o, uint64_t p,
2170 uint64_t data_offset,
2171 direction_t direction,
2172 Object **ret, uint64_t *offset) {
2173
2174 uint64_t n, i;
2175 int r;
2176 Object *d;
2177
2178 assert(f);
2179 assert(p > 0 || !o);
2180
2181 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2182 if (r < 0)
2183 return r;
2184
2185 n = le64toh(d->data.n_entries);
2186 if (n <= 0)
2187 return n;
2188
2189 if (!o)
2190 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2191 else {
2192 if (o->object.type != OBJECT_ENTRY)
2193 return -EINVAL;
2194
2195 r = generic_array_bisect_plus_one(f,
2196 le64toh(d->data.entry_offset),
2197 le64toh(d->data.entry_array_offset),
2198 le64toh(d->data.n_entries),
2199 p,
2200 test_object_offset,
2201 DIRECTION_DOWN,
2202 NULL, NULL,
2203 &i);
2204
2205 if (r <= 0)
2206 return r;
2207
2208 if (direction == DIRECTION_DOWN) {
2209 if (i >= n - 1)
2210 return 0;
2211
2212 i++;
2213 } else {
2214 if (i <= 0)
2215 return 0;
2216
2217 i--;
2218 }
2219
2220 }
2221
2222 return generic_array_get_plus_one(f,
2223 le64toh(d->data.entry_offset),
2224 le64toh(d->data.entry_array_offset),
2225 i,
2226 ret, offset);
2227 }
2228
2229 int journal_file_move_to_entry_by_offset_for_data(
2230 JournalFile *f,
2231 uint64_t data_offset,
2232 uint64_t p,
2233 direction_t direction,
2234 Object **ret, uint64_t *offset) {
2235
2236 int r;
2237 Object *d;
2238
2239 assert(f);
2240
2241 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2242 if (r < 0)
2243 return r;
2244
2245 return generic_array_bisect_plus_one(f,
2246 le64toh(d->data.entry_offset),
2247 le64toh(d->data.entry_array_offset),
2248 le64toh(d->data.n_entries),
2249 p,
2250 test_object_offset,
2251 direction,
2252 ret, offset, NULL);
2253 }
2254
2255 int journal_file_move_to_entry_by_monotonic_for_data(
2256 JournalFile *f,
2257 uint64_t data_offset,
2258 sd_id128_t boot_id,
2259 uint64_t monotonic,
2260 direction_t direction,
2261 Object **ret, uint64_t *offset) {
2262
2263 Object *o, *d;
2264 int r;
2265 uint64_t b, z;
2266
2267 assert(f);
2268
2269 /* First, seek by time */
2270 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2271 if (r < 0)
2272 return r;
2273 if (r == 0)
2274 return -ENOENT;
2275
2276 r = generic_array_bisect_plus_one(f,
2277 le64toh(o->data.entry_offset),
2278 le64toh(o->data.entry_array_offset),
2279 le64toh(o->data.n_entries),
2280 monotonic,
2281 test_object_monotonic,
2282 direction,
2283 NULL, &z, NULL);
2284 if (r <= 0)
2285 return r;
2286
2287 /* And now, continue seeking until we find an entry that
2288 * exists in both bisection arrays */
2289
2290 for (;;) {
2291 Object *qo;
2292 uint64_t p, q;
2293
2294 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2295 if (r < 0)
2296 return r;
2297
2298 r = generic_array_bisect_plus_one(f,
2299 le64toh(d->data.entry_offset),
2300 le64toh(d->data.entry_array_offset),
2301 le64toh(d->data.n_entries),
2302 z,
2303 test_object_offset,
2304 direction,
2305 NULL, &p, NULL);
2306 if (r <= 0)
2307 return r;
2308
2309 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2310 if (r < 0)
2311 return r;
2312
2313 r = generic_array_bisect_plus_one(f,
2314 le64toh(o->data.entry_offset),
2315 le64toh(o->data.entry_array_offset),
2316 le64toh(o->data.n_entries),
2317 p,
2318 test_object_offset,
2319 direction,
2320 &qo, &q, NULL);
2321
2322 if (r <= 0)
2323 return r;
2324
2325 if (p == q) {
2326 if (ret)
2327 *ret = qo;
2328 if (offset)
2329 *offset = q;
2330
2331 return 1;
2332 }
2333
2334 z = q;
2335 }
2336 }
2337
2338 int journal_file_move_to_entry_by_seqnum_for_data(
2339 JournalFile *f,
2340 uint64_t data_offset,
2341 uint64_t seqnum,
2342 direction_t direction,
2343 Object **ret, uint64_t *offset) {
2344
2345 Object *d;
2346 int r;
2347
2348 assert(f);
2349
2350 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2351 if (r < 0)
2352 return r;
2353
2354 return generic_array_bisect_plus_one(f,
2355 le64toh(d->data.entry_offset),
2356 le64toh(d->data.entry_array_offset),
2357 le64toh(d->data.n_entries),
2358 seqnum,
2359 test_object_seqnum,
2360 direction,
2361 ret, offset, NULL);
2362 }
2363
2364 int journal_file_move_to_entry_by_realtime_for_data(
2365 JournalFile *f,
2366 uint64_t data_offset,
2367 uint64_t realtime,
2368 direction_t direction,
2369 Object **ret, uint64_t *offset) {
2370
2371 Object *d;
2372 int r;
2373
2374 assert(f);
2375
2376 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2377 if (r < 0)
2378 return r;
2379
2380 return generic_array_bisect_plus_one(f,
2381 le64toh(d->data.entry_offset),
2382 le64toh(d->data.entry_array_offset),
2383 le64toh(d->data.n_entries),
2384 realtime,
2385 test_object_realtime,
2386 direction,
2387 ret, offset, NULL);
2388 }
2389
2390 void journal_file_dump(JournalFile *f) {
2391 Object *o;
2392 int r;
2393 uint64_t p;
2394
2395 assert(f);
2396
2397 journal_file_print_header(f);
2398
2399 p = le64toh(f->header->header_size);
2400 while (p != 0) {
2401 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2402 if (r < 0)
2403 goto fail;
2404
2405 switch (o->object.type) {
2406
2407 case OBJECT_UNUSED:
2408 printf("Type: OBJECT_UNUSED\n");
2409 break;
2410
2411 case OBJECT_DATA:
2412 printf("Type: OBJECT_DATA\n");
2413 break;
2414
2415 case OBJECT_FIELD:
2416 printf("Type: OBJECT_FIELD\n");
2417 break;
2418
2419 case OBJECT_ENTRY:
2420 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2421 le64toh(o->entry.seqnum),
2422 le64toh(o->entry.monotonic),
2423 le64toh(o->entry.realtime));
2424 break;
2425
2426 case OBJECT_FIELD_HASH_TABLE:
2427 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2428 break;
2429
2430 case OBJECT_DATA_HASH_TABLE:
2431 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2432 break;
2433
2434 case OBJECT_ENTRY_ARRAY:
2435 printf("Type: OBJECT_ENTRY_ARRAY\n");
2436 break;
2437
2438 case OBJECT_TAG:
2439 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2440 le64toh(o->tag.seqnum),
2441 le64toh(o->tag.epoch));
2442 break;
2443
2444 default:
2445 printf("Type: unknown (%i)\n", o->object.type);
2446 break;
2447 }
2448
2449 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2450 printf("Flags: %s\n",
2451 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2452
2453 if (p == le64toh(f->header->tail_object_offset))
2454 p = 0;
2455 else
2456 p = p + ALIGN64(le64toh(o->object.size));
2457 }
2458
2459 return;
2460 fail:
2461 log_error("File corrupt");
2462 }
2463
2464 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2465 const char *x;
2466
2467 x = format_timestamp(buf, l, t);
2468 if (x)
2469 return x;
2470 return " --- ";
2471 }
2472
2473 void journal_file_print_header(JournalFile *f) {
2474 char a[33], b[33], c[33], d[33];
2475 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2476 struct stat st;
2477 char bytes[FORMAT_BYTES_MAX];
2478
2479 assert(f);
2480
2481 printf("File Path: %s\n"
2482 "File ID: %s\n"
2483 "Machine ID: %s\n"
2484 "Boot ID: %s\n"
2485 "Sequential Number ID: %s\n"
2486 "State: %s\n"
2487 "Compatible Flags:%s%s\n"
2488 "Incompatible Flags:%s%s%s\n"
2489 "Header size: %"PRIu64"\n"
2490 "Arena size: %"PRIu64"\n"
2491 "Data Hash Table Size: %"PRIu64"\n"
2492 "Field Hash Table Size: %"PRIu64"\n"
2493 "Rotate Suggested: %s\n"
2494 "Head Sequential Number: %"PRIu64"\n"
2495 "Tail Sequential Number: %"PRIu64"\n"
2496 "Head Realtime Timestamp: %s\n"
2497 "Tail Realtime Timestamp: %s\n"
2498 "Tail Monotonic Timestamp: %s\n"
2499 "Objects: %"PRIu64"\n"
2500 "Entry Objects: %"PRIu64"\n",
2501 f->path,
2502 sd_id128_to_string(f->header->file_id, a),
2503 sd_id128_to_string(f->header->machine_id, b),
2504 sd_id128_to_string(f->header->boot_id, c),
2505 sd_id128_to_string(f->header->seqnum_id, d),
2506 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2507 f->header->state == STATE_ONLINE ? "ONLINE" :
2508 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2509 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2510 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2511 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2512 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2513 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2514 le64toh(f->header->header_size),
2515 le64toh(f->header->arena_size),
2516 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2517 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2518 yes_no(journal_file_rotate_suggested(f, 0)),
2519 le64toh(f->header->head_entry_seqnum),
2520 le64toh(f->header->tail_entry_seqnum),
2521 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2522 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2523 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2524 le64toh(f->header->n_objects),
2525 le64toh(f->header->n_entries));
2526
2527 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2528 printf("Data Objects: %"PRIu64"\n"
2529 "Data Hash Table Fill: %.1f%%\n",
2530 le64toh(f->header->n_data),
2531 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2532
2533 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2534 printf("Field Objects: %"PRIu64"\n"
2535 "Field Hash Table Fill: %.1f%%\n",
2536 le64toh(f->header->n_fields),
2537 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2538
2539 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2540 printf("Tag Objects: %"PRIu64"\n",
2541 le64toh(f->header->n_tags));
2542 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2543 printf("Entry Array Objects: %"PRIu64"\n",
2544 le64toh(f->header->n_entry_arrays));
2545
2546 if (fstat(f->fd, &st) >= 0)
2547 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2548 }
2549
2550 static int journal_file_warn_btrfs(JournalFile *f) {
2551 unsigned attrs;
2552 int r;
2553
2554 assert(f);
2555
2556 /* Before we write anything, check if the COW logic is turned
2557 * off on btrfs. Given our write pattern that is quite
2558 * unfriendly to COW file systems this should greatly improve
2559 * performance on COW file systems, such as btrfs, at the
2560 * expense of data integrity features (which shouldn't be too
2561 * bad, given that we do our own checksumming). */
2562
2563 r = btrfs_is_filesystem(f->fd);
2564 if (r < 0)
2565 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2566 if (!r)
2567 return 0;
2568
2569 r = read_attr_fd(f->fd, &attrs);
2570 if (r < 0)
2571 return log_warning_errno(r, "Failed to read file attributes: %m");
2572
2573 if (attrs & FS_NOCOW_FL) {
2574 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2575 return 0;
2576 }
2577
2578 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2579 "This is likely to slow down journal access substantially, please consider turning "
2580 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2581
2582 return 1;
2583 }
2584
2585 int journal_file_open(
2586 const char *fname,
2587 int flags,
2588 mode_t mode,
2589 bool compress,
2590 bool seal,
2591 JournalMetrics *metrics,
2592 MMapCache *mmap_cache,
2593 JournalFile *template,
2594 JournalFile **ret) {
2595
2596 bool newly_created = false;
2597 JournalFile *f;
2598 void *h;
2599 int r;
2600
2601 assert(fname);
2602 assert(ret);
2603
2604 if ((flags & O_ACCMODE) != O_RDONLY &&
2605 (flags & O_ACCMODE) != O_RDWR)
2606 return -EINVAL;
2607
2608 if (!endswith(fname, ".journal") &&
2609 !endswith(fname, ".journal~"))
2610 return -EINVAL;
2611
2612 f = new0(JournalFile, 1);
2613 if (!f)
2614 return -ENOMEM;
2615
2616 f->fd = -1;
2617 f->mode = mode;
2618
2619 f->flags = flags;
2620 f->prot = prot_from_flags(flags);
2621 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2622 #if defined(HAVE_LZ4)
2623 f->compress_lz4 = compress;
2624 #elif defined(HAVE_XZ)
2625 f->compress_xz = compress;
2626 #endif
2627 #ifdef HAVE_GCRYPT
2628 f->seal = seal;
2629 #endif
2630
2631 if (mmap_cache)
2632 f->mmap = mmap_cache_ref(mmap_cache);
2633 else {
2634 f->mmap = mmap_cache_new();
2635 if (!f->mmap) {
2636 r = -ENOMEM;
2637 goto fail;
2638 }
2639 }
2640
2641 f->path = strdup(fname);
2642 if (!f->path) {
2643 r = -ENOMEM;
2644 goto fail;
2645 }
2646
2647 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2648 if (!f->chain_cache) {
2649 r = -ENOMEM;
2650 goto fail;
2651 }
2652
2653 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2654 if (f->fd < 0) {
2655 r = -errno;
2656 goto fail;
2657 }
2658
2659 r = journal_file_fstat(f);
2660 if (r < 0)
2661 goto fail;
2662
2663 if (f->last_stat.st_size == 0 && f->writable) {
2664
2665 (void) journal_file_warn_btrfs(f);
2666
2667 /* Let's attach the creation time to the journal file,
2668 * so that the vacuuming code knows the age of this
2669 * file even if the file might end up corrupted one
2670 * day... Ideally we'd just use the creation time many
2671 * file systems maintain for each file, but there is
2672 * currently no usable API to query this, hence let's
2673 * emulate this via extended attributes. If extended
2674 * attributes are not supported we'll just skip this,
2675 * and rely solely on mtime/atime/ctime of the file. */
2676
2677 fd_setcrtime(f->fd, 0);
2678
2679 #ifdef HAVE_GCRYPT
2680 /* Try to load the FSPRG state, and if we can't, then
2681 * just don't do sealing */
2682 if (f->seal) {
2683 r = journal_file_fss_load(f);
2684 if (r < 0)
2685 f->seal = false;
2686 }
2687 #endif
2688
2689 r = journal_file_init_header(f, template);
2690 if (r < 0)
2691 goto fail;
2692
2693 r = journal_file_fstat(f);
2694 if (r < 0)
2695 goto fail;
2696
2697 newly_created = true;
2698 }
2699
2700 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2701 r = -EIO;
2702 goto fail;
2703 }
2704
2705 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2706 if (r < 0)
2707 goto fail;
2708
2709 f->header = h;
2710
2711 if (!newly_created) {
2712 r = journal_file_verify_header(f);
2713 if (r < 0)
2714 goto fail;
2715 }
2716
2717 #ifdef HAVE_GCRYPT
2718 if (!newly_created && f->writable) {
2719 r = journal_file_fss_load(f);
2720 if (r < 0)
2721 goto fail;
2722 }
2723 #endif
2724
2725 if (f->writable) {
2726 if (metrics) {
2727 journal_default_metrics(metrics, f->fd);
2728 f->metrics = *metrics;
2729 } else if (template)
2730 f->metrics = template->metrics;
2731
2732 r = journal_file_refresh_header(f);
2733 if (r < 0)
2734 goto fail;
2735 }
2736
2737 #ifdef HAVE_GCRYPT
2738 r = journal_file_hmac_setup(f);
2739 if (r < 0)
2740 goto fail;
2741 #endif
2742
2743 if (newly_created) {
2744 r = journal_file_setup_field_hash_table(f);
2745 if (r < 0)
2746 goto fail;
2747
2748 r = journal_file_setup_data_hash_table(f);
2749 if (r < 0)
2750 goto fail;
2751
2752 #ifdef HAVE_GCRYPT
2753 r = journal_file_append_first_tag(f);
2754 if (r < 0)
2755 goto fail;
2756 #endif
2757 }
2758
2759 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2760 r = -EIO;
2761 goto fail;
2762 }
2763
2764 *ret = f;
2765 return 0;
2766
2767 fail:
2768 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2769 r = -EIO;
2770
2771 journal_file_close(f);
2772
2773 return r;
2774 }
2775
2776 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2777 _cleanup_free_ char *p = NULL;
2778 size_t l;
2779 JournalFile *old_file, *new_file = NULL;
2780 int r;
2781
2782 assert(f);
2783 assert(*f);
2784
2785 old_file = *f;
2786
2787 if (!old_file->writable)
2788 return -EINVAL;
2789
2790 if (!endswith(old_file->path, ".journal"))
2791 return -EINVAL;
2792
2793 l = strlen(old_file->path);
2794 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2795 (int) l - 8, old_file->path,
2796 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2797 le64toh((*f)->header->head_entry_seqnum),
2798 le64toh((*f)->header->head_entry_realtime));
2799 if (r < 0)
2800 return -ENOMEM;
2801
2802 /* Try to rename the file to the archived version. If the file
2803 * already was deleted, we'll get ENOENT, let's ignore that
2804 * case. */
2805 r = rename(old_file->path, p);
2806 if (r < 0 && errno != ENOENT)
2807 return -errno;
2808
2809 old_file->header->state = STATE_ARCHIVED;
2810
2811 /* Currently, btrfs is not very good with out write patterns
2812 * and fragments heavily. Let's defrag our journal files when
2813 * we archive them */
2814 old_file->defrag_on_close = true;
2815
2816 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2817 journal_file_close(old_file);
2818
2819 *f = new_file;
2820 return r;
2821 }
2822
2823 int journal_file_open_reliably(
2824 const char *fname,
2825 int flags,
2826 mode_t mode,
2827 bool compress,
2828 bool seal,
2829 JournalMetrics *metrics,
2830 MMapCache *mmap_cache,
2831 JournalFile *template,
2832 JournalFile **ret) {
2833
2834 int r;
2835 size_t l;
2836 _cleanup_free_ char *p = NULL;
2837
2838 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2839 if (!IN_SET(r,
2840 -EBADMSG, /* corrupted */
2841 -ENODATA, /* truncated */
2842 -EHOSTDOWN, /* other machine */
2843 -EPROTONOSUPPORT, /* incompatible feature */
2844 -EBUSY, /* unclean shutdown */
2845 -ESHUTDOWN, /* already archived */
2846 -EIO, /* IO error, including SIGBUS on mmap */
2847 -EIDRM /* File has been deleted */))
2848 return r;
2849
2850 if ((flags & O_ACCMODE) == O_RDONLY)
2851 return r;
2852
2853 if (!(flags & O_CREAT))
2854 return r;
2855
2856 if (!endswith(fname, ".journal"))
2857 return r;
2858
2859 /* The file is corrupted. Rotate it away and try it again (but only once) */
2860
2861 l = strlen(fname);
2862 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2863 (int) l - 8, fname,
2864 now(CLOCK_REALTIME),
2865 random_u64()) < 0)
2866 return -ENOMEM;
2867
2868 if (rename(fname, p) < 0)
2869 return -errno;
2870
2871 /* btrfs doesn't cope well with our write pattern and
2872 * fragments heavily. Let's defrag all files we rotate */
2873
2874 (void) chattr_path(p, false, FS_NOCOW_FL);
2875 (void) btrfs_defrag(p);
2876
2877 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2878
2879 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2880 }
2881
2882 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2883 uint64_t i, n;
2884 uint64_t q, xor_hash = 0;
2885 int r;
2886 EntryItem *items;
2887 dual_timestamp ts;
2888
2889 assert(from);
2890 assert(to);
2891 assert(o);
2892 assert(p);
2893
2894 if (!to->writable)
2895 return -EPERM;
2896
2897 ts.monotonic = le64toh(o->entry.monotonic);
2898 ts.realtime = le64toh(o->entry.realtime);
2899
2900 n = journal_file_entry_n_items(o);
2901 /* alloca() can't take 0, hence let's allocate at least one */
2902 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2903
2904 for (i = 0; i < n; i++) {
2905 uint64_t l, h;
2906 le64_t le_hash;
2907 size_t t;
2908 void *data;
2909 Object *u;
2910
2911 q = le64toh(o->entry.items[i].object_offset);
2912 le_hash = o->entry.items[i].hash;
2913
2914 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2915 if (r < 0)
2916 return r;
2917
2918 if (le_hash != o->data.hash)
2919 return -EBADMSG;
2920
2921 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2922 t = (size_t) l;
2923
2924 /* We hit the limit on 32bit machines */
2925 if ((uint64_t) t != l)
2926 return -E2BIG;
2927
2928 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2929 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2930 size_t rsize = 0;
2931
2932 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2933 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2934 if (r < 0)
2935 return r;
2936
2937 data = from->compress_buffer;
2938 l = rsize;
2939 #else
2940 return -EPROTONOSUPPORT;
2941 #endif
2942 } else
2943 data = o->data.payload;
2944
2945 r = journal_file_append_data(to, data, l, &u, &h);
2946 if (r < 0)
2947 return r;
2948
2949 xor_hash ^= le64toh(u->data.hash);
2950 items[i].object_offset = htole64(h);
2951 items[i].hash = u->data.hash;
2952
2953 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2954 if (r < 0)
2955 return r;
2956 }
2957
2958 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2959
2960 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2961 return -EIO;
2962
2963 return r;
2964 }
2965
2966 void journal_reset_metrics(JournalMetrics *m) {
2967 assert(m);
2968
2969 /* Set everything to "pick automatic values". */
2970
2971 *m = (JournalMetrics) {
2972 .min_use = (uint64_t) -1,
2973 .max_use = (uint64_t) -1,
2974 .min_size = (uint64_t) -1,
2975 .max_size = (uint64_t) -1,
2976 .keep_free = (uint64_t) -1,
2977 .n_max_files = (uint64_t) -1,
2978 };
2979 }
2980
2981 void journal_default_metrics(JournalMetrics *m, int fd) {
2982 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
2983 struct statvfs ss;
2984 uint64_t fs_size;
2985
2986 assert(m);
2987 assert(fd >= 0);
2988
2989 if (fstatvfs(fd, &ss) >= 0)
2990 fs_size = ss.f_frsize * ss.f_blocks;
2991 else {
2992 log_debug_errno(errno, "Failed to detremine disk size: %m");
2993 fs_size = 0;
2994 }
2995
2996 if (m->max_use == (uint64_t) -1) {
2997
2998 if (fs_size > 0) {
2999 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3000
3001 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3002 m->max_use = DEFAULT_MAX_USE_UPPER;
3003
3004 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3005 m->max_use = DEFAULT_MAX_USE_LOWER;
3006 } else
3007 m->max_use = DEFAULT_MAX_USE_LOWER;
3008 } else {
3009 m->max_use = PAGE_ALIGN(m->max_use);
3010
3011 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3012 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3013 }
3014
3015 if (m->min_use == (uint64_t) -1)
3016 m->min_use = DEFAULT_MIN_USE;
3017
3018 if (m->min_use > m->max_use)
3019 m->min_use = m->max_use;
3020
3021 if (m->max_size == (uint64_t) -1) {
3022 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3023
3024 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3025 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3026 } else
3027 m->max_size = PAGE_ALIGN(m->max_size);
3028
3029 if (m->max_size != 0) {
3030 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3031 m->max_size = JOURNAL_FILE_SIZE_MIN;
3032
3033 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3034 m->max_use = m->max_size*2;
3035 }
3036
3037 if (m->min_size == (uint64_t) -1)
3038 m->min_size = JOURNAL_FILE_SIZE_MIN;
3039 else {
3040 m->min_size = PAGE_ALIGN(m->min_size);
3041
3042 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3043 m->min_size = JOURNAL_FILE_SIZE_MIN;
3044
3045 if (m->max_size != 0 && m->min_size > m->max_size)
3046 m->max_size = m->min_size;
3047 }
3048
3049 if (m->keep_free == (uint64_t) -1) {
3050
3051 if (fs_size > 0) {
3052 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3053
3054 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3055 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3056
3057 } else
3058 m->keep_free = DEFAULT_KEEP_FREE;
3059 }
3060
3061 if (m->n_max_files == (uint64_t) -1)
3062 m->n_max_files = DEFAULT_N_MAX_FILES;
3063
3064 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3065 format_bytes(a, sizeof(a), m->min_use),
3066 format_bytes(b, sizeof(b), m->max_use),
3067 format_bytes(c, sizeof(c), m->max_size),
3068 format_bytes(d, sizeof(d), m->min_size),
3069 format_bytes(e, sizeof(e), m->keep_free),
3070 m->n_max_files);
3071 }
3072
3073 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3074 assert(f);
3075 assert(from || to);
3076
3077 if (from) {
3078 if (f->header->head_entry_realtime == 0)
3079 return -ENOENT;
3080
3081 *from = le64toh(f->header->head_entry_realtime);
3082 }
3083
3084 if (to) {
3085 if (f->header->tail_entry_realtime == 0)
3086 return -ENOENT;
3087
3088 *to = le64toh(f->header->tail_entry_realtime);
3089 }
3090
3091 return 1;
3092 }
3093
3094 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3095 Object *o;
3096 uint64_t p;
3097 int r;
3098
3099 assert(f);
3100 assert(from || to);
3101
3102 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3103 if (r <= 0)
3104 return r;
3105
3106 if (le64toh(o->data.n_entries) <= 0)
3107 return 0;
3108
3109 if (from) {
3110 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3111 if (r < 0)
3112 return r;
3113
3114 *from = le64toh(o->entry.monotonic);
3115 }
3116
3117 if (to) {
3118 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3119 if (r < 0)
3120 return r;
3121
3122 r = generic_array_get_plus_one(f,
3123 le64toh(o->data.entry_offset),
3124 le64toh(o->data.entry_array_offset),
3125 le64toh(o->data.n_entries)-1,
3126 &o, NULL);
3127 if (r <= 0)
3128 return r;
3129
3130 *to = le64toh(o->entry.monotonic);
3131 }
3132
3133 return 1;
3134 }
3135
3136 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3137 assert(f);
3138
3139 /* If we gained new header fields we gained new features,
3140 * hence suggest a rotation */
3141 if (le64toh(f->header->header_size) < sizeof(Header)) {
3142 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3143 return true;
3144 }
3145
3146 /* Let's check if the hash tables grew over a certain fill
3147 * level (75%, borrowing this value from Java's hash table
3148 * implementation), and if so suggest a rotation. To calculate
3149 * the fill level we need the n_data field, which only exists
3150 * in newer versions. */
3151
3152 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3153 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3154 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3155 f->path,
3156 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3157 le64toh(f->header->n_data),
3158 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3159 (unsigned long long) f->last_stat.st_size,
3160 f->last_stat.st_size / le64toh(f->header->n_data));
3161 return true;
3162 }
3163
3164 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3165 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3166 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3167 f->path,
3168 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3169 le64toh(f->header->n_fields),
3170 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3171 return true;
3172 }
3173
3174 /* Are the data objects properly indexed by field objects? */
3175 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3176 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3177 le64toh(f->header->n_data) > 0 &&
3178 le64toh(f->header->n_fields) == 0)
3179 return true;
3180
3181 if (max_file_usec > 0) {
3182 usec_t t, h;
3183
3184 h = le64toh(f->header->head_entry_realtime);
3185 t = now(CLOCK_REALTIME);
3186
3187 if (h > 0 && t > h + max_file_usec)
3188 return true;
3189 }
3190
3191 return false;
3192 }