]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
Merge pull request #1654 from poettering/util-lib
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <linux/fs.h>
25 #include <stddef.h>
26 #include <sys/mman.h>
27 #include <sys/statvfs.h>
28 #include <sys/uio.h>
29 #include <unistd.h>
30
31 #include "btrfs-util.h"
32 #include "compress.h"
33 #include "fd-util.h"
34 #include "journal-authenticate.h"
35 #include "journal-def.h"
36 #include "journal-file.h"
37 #include "lookup3.h"
38 #include "random-util.h"
39 #include "string-util.h"
40
/* Default sizes, in bytes, of the data and field hash tables */
#define DEFAULT_DATA_HASH_TABLE_SIZE (sizeof(HashItem)*2047ULL)
#define DEFAULT_FIELD_HASH_TABLE_SIZE (sizeof(HashItem)*333ULL)

/* Data payloads have to be at least this many bytes before compression is attempted */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* The minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL << 20) /* 4 MiB */

/* Lower and upper bounds applied when the max_use value is deduced
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL << 20) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL << 30) /* 4 GiB */

/* The default minimal use limit: how much we'll use even if keep_free suggests otherwise */
#define DEFAULT_MIN_USE (1ULL << 20) /* 1 MiB */

/* Upper bound applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL << 20) /* 128 MiB */

/* Upper bound applied when the keep_free value is deduced from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL << 30) /* 4 GiB */

/* The keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1ULL << 20) /* 1 MiB */

/* The default maximum number of journal files to keep around */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first field added after the initial file format design,
 * hence everything before it is the smallest header we accept */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* Maximum number of entries to keep in the entry array chain cache */
#define CHAIN_CACHE_MAX 20

/* Step by which the journal file grows each time new space is allocated */
#define FILE_SIZE_INCREASE (8ULL << 20) /* 8 MiB */

/* Re-run fstat() on the file to detect deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context used for the header: one above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
85
86 static int journal_file_set_online(JournalFile *f) {
87 assert(f);
88
89 if (!f->writable)
90 return -EPERM;
91
92 if (!(f->fd >= 0 && f->header))
93 return -EINVAL;
94
95 if (mmap_cache_got_sigbus(f->mmap, f->fd))
96 return -EIO;
97
98 switch(f->header->state) {
99 case STATE_ONLINE:
100 return 0;
101
102 case STATE_OFFLINE:
103 f->header->state = STATE_ONLINE;
104 fsync(f->fd);
105 return 0;
106
107 default:
108 return -EINVAL;
109 }
110 }
111
112 int journal_file_set_offline(JournalFile *f) {
113 assert(f);
114
115 if (!f->writable)
116 return -EPERM;
117
118 if (!(f->fd >= 0 && f->header))
119 return -EINVAL;
120
121 if (f->header->state != STATE_ONLINE)
122 return 0;
123
124 fsync(f->fd);
125
126 if (mmap_cache_got_sigbus(f->mmap, f->fd))
127 return -EIO;
128
129 f->header->state = STATE_OFFLINE;
130
131 if (mmap_cache_got_sigbus(f->mmap, f->fd))
132 return -EIO;
133
134 fsync(f->fd);
135
136 return 0;
137 }
138
139 JournalFile* journal_file_close(JournalFile *f) {
140 assert(f);
141
142 #ifdef HAVE_GCRYPT
143 /* Write the final tag */
144 if (f->seal && f->writable)
145 journal_file_append_tag(f);
146 #endif
147
148 journal_file_set_offline(f);
149
150 if (f->mmap && f->fd >= 0)
151 mmap_cache_close_fd(f->mmap, f->fd);
152
153 if (f->fd >= 0 && f->defrag_on_close) {
154
155 /* Be friendly to btrfs: turn COW back on again now,
156 * and defragment the file. We won't write to the file
157 * ever again, hence remove all fragmentation, and
158 * reenable all the good bits COW usually provides
159 * (such as data checksumming). */
160
161 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
162 (void) btrfs_defrag_fd(f->fd);
163 }
164
165 safe_close(f->fd);
166 free(f->path);
167
168 if (f->mmap)
169 mmap_cache_unref(f->mmap);
170
171 ordered_hashmap_free_free(f->chain_cache);
172
173 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
174 free(f->compress_buffer);
175 #endif
176
177 #ifdef HAVE_GCRYPT
178 if (f->fss_file)
179 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
180 else
181 free(f->fsprg_state);
182
183 free(f->fsprg_seed);
184
185 if (f->hmac)
186 gcry_md_close(f->hmac);
187 #endif
188
189 free(f);
190 return NULL;
191 }
192
193 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
194 Header h = {};
195 ssize_t k;
196 int r;
197
198 assert(f);
199
200 memcpy(h.signature, HEADER_SIGNATURE, 8);
201 h.header_size = htole64(ALIGN64(sizeof(h)));
202
203 h.incompatible_flags |= htole32(
204 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
205 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
206
207 h.compatible_flags = htole32(
208 f->seal * HEADER_COMPATIBLE_SEALED);
209
210 r = sd_id128_randomize(&h.file_id);
211 if (r < 0)
212 return r;
213
214 if (template) {
215 h.seqnum_id = template->header->seqnum_id;
216 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
217 } else
218 h.seqnum_id = h.file_id;
219
220 k = pwrite(f->fd, &h, sizeof(h), 0);
221 if (k < 0)
222 return -errno;
223
224 if (k != sizeof(h))
225 return -EIO;
226
227 return 0;
228 }
229
230 static int journal_file_refresh_header(JournalFile *f) {
231 sd_id128_t boot_id;
232 int r;
233
234 assert(f);
235
236 r = sd_id128_get_machine(&f->header->machine_id);
237 if (r < 0)
238 return r;
239
240 r = sd_id128_get_boot(&boot_id);
241 if (r < 0)
242 return r;
243
244 if (sd_id128_equal(boot_id, f->header->boot_id))
245 f->tail_entry_monotonic_valid = true;
246
247 f->header->boot_id = boot_id;
248
249 r = journal_file_set_online(f);
250
251 /* Sync the online state to disk */
252 fsync(f->fd);
253
254 return r;
255 }
256
257 static int journal_file_verify_header(JournalFile *f) {
258 uint32_t flags;
259
260 assert(f);
261
262 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
263 return -EBADMSG;
264
265 /* In both read and write mode we refuse to open files with
266 * incompatible flags we don't know */
267 flags = le32toh(f->header->incompatible_flags);
268 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
269 if (flags & ~HEADER_INCOMPATIBLE_ANY)
270 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
271 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
272 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
273 if (flags)
274 log_debug("Journal file %s uses incompatible flags %"PRIx32
275 " disabled at compilation time.", f->path, flags);
276 return -EPROTONOSUPPORT;
277 }
278
279 /* When open for writing we refuse to open files with
280 * compatible flags, too */
281 flags = le32toh(f->header->compatible_flags);
282 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
283 if (flags & ~HEADER_COMPATIBLE_ANY)
284 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
285 f->path, flags & ~HEADER_COMPATIBLE_ANY);
286 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
287 if (flags)
288 log_debug("Journal file %s uses compatible flags %"PRIx32
289 " disabled at compilation time.", f->path, flags);
290 return -EPROTONOSUPPORT;
291 }
292
293 if (f->header->state >= _STATE_MAX)
294 return -EBADMSG;
295
296 /* The first addition was n_data, so check that we are at least this large */
297 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
298 return -EBADMSG;
299
300 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
301 return -EBADMSG;
302
303 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
304 return -ENODATA;
305
306 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
307 return -ENODATA;
308
309 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
310 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
311 !VALID64(le64toh(f->header->tail_object_offset)) ||
312 !VALID64(le64toh(f->header->entry_array_offset)))
313 return -ENODATA;
314
315 if (f->writable) {
316 uint8_t state;
317 sd_id128_t machine_id;
318 int r;
319
320 r = sd_id128_get_machine(&machine_id);
321 if (r < 0)
322 return r;
323
324 if (!sd_id128_equal(machine_id, f->header->machine_id))
325 return -EHOSTDOWN;
326
327 state = f->header->state;
328
329 if (state == STATE_ONLINE) {
330 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
331 return -EBUSY;
332 } else if (state == STATE_ARCHIVED)
333 return -ESHUTDOWN;
334 else if (state != STATE_OFFLINE) {
335 log_debug("Journal file %s has unknown state %i.", f->path, state);
336 return -EBUSY;
337 }
338 }
339
340 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
341 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
342
343 f->seal = JOURNAL_HEADER_SEALED(f->header);
344
345 return 0;
346 }
347
348 static int journal_file_fstat(JournalFile *f) {
349 assert(f);
350 assert(f->fd >= 0);
351
352 if (fstat(f->fd, &f->last_stat) < 0)
353 return -errno;
354
355 f->last_stat_usec = now(CLOCK_MONOTONIC);
356
357 /* Refuse appending to files that are already deleted */
358 if (f->last_stat.st_nlink <= 0)
359 return -EIDRM;
360
361 return 0;
362 }
363
364 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
365 uint64_t old_size, new_size;
366 int r;
367
368 assert(f);
369
370 /* We assume that this file is not sparse, and we know that
371 * for sure, since we always call posix_fallocate()
372 * ourselves */
373
374 if (mmap_cache_got_sigbus(f->mmap, f->fd))
375 return -EIO;
376
377 old_size =
378 le64toh(f->header->header_size) +
379 le64toh(f->header->arena_size);
380
381 new_size = PAGE_ALIGN(offset + size);
382 if (new_size < le64toh(f->header->header_size))
383 new_size = le64toh(f->header->header_size);
384
385 if (new_size <= old_size) {
386
387 /* We already pre-allocated enough space, but before
388 * we write to it, let's check with fstat() if the
389 * file got deleted, in order make sure we don't throw
390 * away the data immediately. Don't check fstat() for
391 * all writes though, but only once ever 10s. */
392
393 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
394 return 0;
395
396 return journal_file_fstat(f);
397 }
398
399 /* Allocate more space. */
400
401 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
402 return -E2BIG;
403
404 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
405 struct statvfs svfs;
406
407 if (fstatvfs(f->fd, &svfs) >= 0) {
408 uint64_t available;
409
410 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
411
412 if (new_size - old_size > available)
413 return -E2BIG;
414 }
415 }
416
417 /* Increase by larger blocks at once */
418 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
419 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
420 new_size = f->metrics.max_size;
421
422 /* Note that the glibc fallocate() fallback is very
423 inefficient, hence we try to minimize the allocation area
424 as we can. */
425 r = posix_fallocate(f->fd, old_size, new_size - old_size);
426 if (r != 0)
427 return -r;
428
429 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
430
431 return journal_file_fstat(f);
432 }
433
434 static unsigned type_to_context(ObjectType type) {
435 /* One context for each type, plus one catch-all for the rest */
436 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
437 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
438 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
439 }
440
441 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
442 int r;
443
444 assert(f);
445 assert(ret);
446
447 if (size <= 0)
448 return -EINVAL;
449
450 /* Avoid SIGBUS on invalid accesses */
451 if (offset + size > (uint64_t) f->last_stat.st_size) {
452 /* Hmm, out of range? Let's refresh the fstat() data
453 * first, before we trust that check. */
454
455 r = journal_file_fstat(f);
456 if (r < 0)
457 return r;
458
459 if (offset + size > (uint64_t) f->last_stat.st_size)
460 return -EADDRNOTAVAIL;
461 }
462
463 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
464 }
465
466 static uint64_t minimum_header_size(Object *o) {
467
468 static const uint64_t table[] = {
469 [OBJECT_DATA] = sizeof(DataObject),
470 [OBJECT_FIELD] = sizeof(FieldObject),
471 [OBJECT_ENTRY] = sizeof(EntryObject),
472 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
473 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
474 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
475 [OBJECT_TAG] = sizeof(TagObject),
476 };
477
478 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
479 return sizeof(ObjectHeader);
480
481 return table[o->object.type];
482 }
483
484 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
485 int r;
486 void *t;
487 Object *o;
488 uint64_t s;
489
490 assert(f);
491 assert(ret);
492
493 /* Objects may only be located at multiple of 64 bit */
494 if (!VALID64(offset))
495 return -EFAULT;
496
497 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
498 if (r < 0)
499 return r;
500
501 o = (Object*) t;
502 s = le64toh(o->object.size);
503
504 if (s < sizeof(ObjectHeader))
505 return -EBADMSG;
506
507 if (o->object.type <= OBJECT_UNUSED)
508 return -EBADMSG;
509
510 if (s < minimum_header_size(o))
511 return -EBADMSG;
512
513 if (type > OBJECT_UNUSED && o->object.type != type)
514 return -EBADMSG;
515
516 if (s > sizeof(ObjectHeader)) {
517 r = journal_file_move_to(f, type, false, offset, s, &t);
518 if (r < 0)
519 return r;
520
521 o = (Object*) t;
522 }
523
524 *ret = o;
525 return 0;
526 }
527
528 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
529 uint64_t r;
530
531 assert(f);
532
533 r = le64toh(f->header->tail_entry_seqnum) + 1;
534
535 if (seqnum) {
536 /* If an external seqnum counter was passed, we update
537 * both the local and the external one, and set it to
538 * the maximum of both */
539
540 if (*seqnum + 1 > r)
541 r = *seqnum + 1;
542
543 *seqnum = r;
544 }
545
546 f->header->tail_entry_seqnum = htole64(r);
547
548 if (f->header->head_entry_seqnum == 0)
549 f->header->head_entry_seqnum = htole64(r);
550
551 return r;
552 }
553
554 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
555 int r;
556 uint64_t p;
557 Object *tail, *o;
558 void *t;
559
560 assert(f);
561 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
562 assert(size >= sizeof(ObjectHeader));
563 assert(offset);
564 assert(ret);
565
566 r = journal_file_set_online(f);
567 if (r < 0)
568 return r;
569
570 p = le64toh(f->header->tail_object_offset);
571 if (p == 0)
572 p = le64toh(f->header->header_size);
573 else {
574 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
575 if (r < 0)
576 return r;
577
578 p += ALIGN64(le64toh(tail->object.size));
579 }
580
581 r = journal_file_allocate(f, p, size);
582 if (r < 0)
583 return r;
584
585 r = journal_file_move_to(f, type, false, p, size, &t);
586 if (r < 0)
587 return r;
588
589 o = (Object*) t;
590
591 zero(o->object);
592 o->object.type = type;
593 o->object.size = htole64(size);
594
595 f->header->tail_object_offset = htole64(p);
596 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
597
598 *ret = o;
599 *offset = p;
600
601 return 0;
602 }
603
604 static int journal_file_setup_data_hash_table(JournalFile *f) {
605 uint64_t s, p;
606 Object *o;
607 int r;
608
609 assert(f);
610
611 /* We estimate that we need 1 hash table entry per 768 bytes
612 of journal file and we want to make sure we never get
613 beyond 75% fill level. Calculate the hash table size for
614 the maximum file size based on these metrics. */
615
616 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
617 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
618 s = DEFAULT_DATA_HASH_TABLE_SIZE;
619
620 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
621
622 r = journal_file_append_object(f,
623 OBJECT_DATA_HASH_TABLE,
624 offsetof(Object, hash_table.items) + s,
625 &o, &p);
626 if (r < 0)
627 return r;
628
629 memzero(o->hash_table.items, s);
630
631 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
632 f->header->data_hash_table_size = htole64(s);
633
634 return 0;
635 }
636
637 static int journal_file_setup_field_hash_table(JournalFile *f) {
638 uint64_t s, p;
639 Object *o;
640 int r;
641
642 assert(f);
643
644 /* We use a fixed size hash table for the fields as this
645 * number should grow very slowly only */
646
647 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
648 r = journal_file_append_object(f,
649 OBJECT_FIELD_HASH_TABLE,
650 offsetof(Object, hash_table.items) + s,
651 &o, &p);
652 if (r < 0)
653 return r;
654
655 memzero(o->hash_table.items, s);
656
657 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
658 f->header->field_hash_table_size = htole64(s);
659
660 return 0;
661 }
662
663 int journal_file_map_data_hash_table(JournalFile *f) {
664 uint64_t s, p;
665 void *t;
666 int r;
667
668 assert(f);
669
670 if (f->data_hash_table)
671 return 0;
672
673 p = le64toh(f->header->data_hash_table_offset);
674 s = le64toh(f->header->data_hash_table_size);
675
676 r = journal_file_move_to(f,
677 OBJECT_DATA_HASH_TABLE,
678 true,
679 p, s,
680 &t);
681 if (r < 0)
682 return r;
683
684 f->data_hash_table = t;
685 return 0;
686 }
687
688 int journal_file_map_field_hash_table(JournalFile *f) {
689 uint64_t s, p;
690 void *t;
691 int r;
692
693 assert(f);
694
695 if (f->field_hash_table)
696 return 0;
697
698 p = le64toh(f->header->field_hash_table_offset);
699 s = le64toh(f->header->field_hash_table_size);
700
701 r = journal_file_move_to(f,
702 OBJECT_FIELD_HASH_TABLE,
703 true,
704 p, s,
705 &t);
706 if (r < 0)
707 return r;
708
709 f->field_hash_table = t;
710 return 0;
711 }
712
713 static int journal_file_link_field(
714 JournalFile *f,
715 Object *o,
716 uint64_t offset,
717 uint64_t hash) {
718
719 uint64_t p, h, m;
720 int r;
721
722 assert(f);
723 assert(o);
724 assert(offset > 0);
725
726 if (o->object.type != OBJECT_FIELD)
727 return -EINVAL;
728
729 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
730 if (m <= 0)
731 return -EBADMSG;
732
733 /* This might alter the window we are looking at */
734 o->field.next_hash_offset = o->field.head_data_offset = 0;
735
736 h = hash % m;
737 p = le64toh(f->field_hash_table[h].tail_hash_offset);
738 if (p == 0)
739 f->field_hash_table[h].head_hash_offset = htole64(offset);
740 else {
741 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
742 if (r < 0)
743 return r;
744
745 o->field.next_hash_offset = htole64(offset);
746 }
747
748 f->field_hash_table[h].tail_hash_offset = htole64(offset);
749
750 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
751 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
752
753 return 0;
754 }
755
756 static int journal_file_link_data(
757 JournalFile *f,
758 Object *o,
759 uint64_t offset,
760 uint64_t hash) {
761
762 uint64_t p, h, m;
763 int r;
764
765 assert(f);
766 assert(o);
767 assert(offset > 0);
768
769 if (o->object.type != OBJECT_DATA)
770 return -EINVAL;
771
772 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
773 if (m <= 0)
774 return -EBADMSG;
775
776 /* This might alter the window we are looking at */
777 o->data.next_hash_offset = o->data.next_field_offset = 0;
778 o->data.entry_offset = o->data.entry_array_offset = 0;
779 o->data.n_entries = 0;
780
781 h = hash % m;
782 p = le64toh(f->data_hash_table[h].tail_hash_offset);
783 if (p == 0)
784 /* Only entry in the hash table is easy */
785 f->data_hash_table[h].head_hash_offset = htole64(offset);
786 else {
787 /* Move back to the previous data object, to patch in
788 * pointer */
789
790 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
791 if (r < 0)
792 return r;
793
794 o->data.next_hash_offset = htole64(offset);
795 }
796
797 f->data_hash_table[h].tail_hash_offset = htole64(offset);
798
799 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
800 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
801
802 return 0;
803 }
804
805 int journal_file_find_field_object_with_hash(
806 JournalFile *f,
807 const void *field, uint64_t size, uint64_t hash,
808 Object **ret, uint64_t *offset) {
809
810 uint64_t p, osize, h, m;
811 int r;
812
813 assert(f);
814 assert(field && size > 0);
815
816 /* If the field hash table is empty, we can't find anything */
817 if (le64toh(f->header->field_hash_table_size) <= 0)
818 return 0;
819
820 /* Map the field hash table, if it isn't mapped yet. */
821 r = journal_file_map_field_hash_table(f);
822 if (r < 0)
823 return r;
824
825 osize = offsetof(Object, field.payload) + size;
826
827 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
828 if (m <= 0)
829 return -EBADMSG;
830
831 h = hash % m;
832 p = le64toh(f->field_hash_table[h].head_hash_offset);
833
834 while (p > 0) {
835 Object *o;
836
837 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
838 if (r < 0)
839 return r;
840
841 if (le64toh(o->field.hash) == hash &&
842 le64toh(o->object.size) == osize &&
843 memcmp(o->field.payload, field, size) == 0) {
844
845 if (ret)
846 *ret = o;
847 if (offset)
848 *offset = p;
849
850 return 1;
851 }
852
853 p = le64toh(o->field.next_hash_offset);
854 }
855
856 return 0;
857 }
858
859 int journal_file_find_field_object(
860 JournalFile *f,
861 const void *field, uint64_t size,
862 Object **ret, uint64_t *offset) {
863
864 uint64_t hash;
865
866 assert(f);
867 assert(field && size > 0);
868
869 hash = hash64(field, size);
870
871 return journal_file_find_field_object_with_hash(f,
872 field, size, hash,
873 ret, offset);
874 }
875
876 int journal_file_find_data_object_with_hash(
877 JournalFile *f,
878 const void *data, uint64_t size, uint64_t hash,
879 Object **ret, uint64_t *offset) {
880
881 uint64_t p, osize, h, m;
882 int r;
883
884 assert(f);
885 assert(data || size == 0);
886
887 /* If there's no data hash table, then there's no entry. */
888 if (le64toh(f->header->data_hash_table_size) <= 0)
889 return 0;
890
891 /* Map the data hash table, if it isn't mapped yet. */
892 r = journal_file_map_data_hash_table(f);
893 if (r < 0)
894 return r;
895
896 osize = offsetof(Object, data.payload) + size;
897
898 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
899 if (m <= 0)
900 return -EBADMSG;
901
902 h = hash % m;
903 p = le64toh(f->data_hash_table[h].head_hash_offset);
904
905 while (p > 0) {
906 Object *o;
907
908 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
909 if (r < 0)
910 return r;
911
912 if (le64toh(o->data.hash) != hash)
913 goto next;
914
915 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
916 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
917 uint64_t l;
918 size_t rsize = 0;
919
920 l = le64toh(o->object.size);
921 if (l <= offsetof(Object, data.payload))
922 return -EBADMSG;
923
924 l -= offsetof(Object, data.payload);
925
926 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
927 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
928 if (r < 0)
929 return r;
930
931 if (rsize == size &&
932 memcmp(f->compress_buffer, data, size) == 0) {
933
934 if (ret)
935 *ret = o;
936
937 if (offset)
938 *offset = p;
939
940 return 1;
941 }
942 #else
943 return -EPROTONOSUPPORT;
944 #endif
945 } else if (le64toh(o->object.size) == osize &&
946 memcmp(o->data.payload, data, size) == 0) {
947
948 if (ret)
949 *ret = o;
950
951 if (offset)
952 *offset = p;
953
954 return 1;
955 }
956
957 next:
958 p = le64toh(o->data.next_hash_offset);
959 }
960
961 return 0;
962 }
963
964 int journal_file_find_data_object(
965 JournalFile *f,
966 const void *data, uint64_t size,
967 Object **ret, uint64_t *offset) {
968
969 uint64_t hash;
970
971 assert(f);
972 assert(data || size == 0);
973
974 hash = hash64(data, size);
975
976 return journal_file_find_data_object_with_hash(f,
977 data, size, hash,
978 ret, offset);
979 }
980
981 static int journal_file_append_field(
982 JournalFile *f,
983 const void *field, uint64_t size,
984 Object **ret, uint64_t *offset) {
985
986 uint64_t hash, p;
987 uint64_t osize;
988 Object *o;
989 int r;
990
991 assert(f);
992 assert(field && size > 0);
993
994 hash = hash64(field, size);
995
996 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
997 if (r < 0)
998 return r;
999 else if (r > 0) {
1000
1001 if (ret)
1002 *ret = o;
1003
1004 if (offset)
1005 *offset = p;
1006
1007 return 0;
1008 }
1009
1010 osize = offsetof(Object, field.payload) + size;
1011 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1012 if (r < 0)
1013 return r;
1014
1015 o->field.hash = htole64(hash);
1016 memcpy(o->field.payload, field, size);
1017
1018 r = journal_file_link_field(f, o, p, hash);
1019 if (r < 0)
1020 return r;
1021
1022 /* The linking might have altered the window, so let's
1023 * refresh our pointer */
1024 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1025 if (r < 0)
1026 return r;
1027
1028 #ifdef HAVE_GCRYPT
1029 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1030 if (r < 0)
1031 return r;
1032 #endif
1033
1034 if (ret)
1035 *ret = o;
1036
1037 if (offset)
1038 *offset = p;
1039
1040 return 0;
1041 }
1042
1043 static int journal_file_append_data(
1044 JournalFile *f,
1045 const void *data, uint64_t size,
1046 Object **ret, uint64_t *offset) {
1047
1048 uint64_t hash, p;
1049 uint64_t osize;
1050 Object *o;
1051 int r, compression = 0;
1052 const void *eq;
1053
1054 assert(f);
1055 assert(data || size == 0);
1056
1057 hash = hash64(data, size);
1058
1059 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1060 if (r < 0)
1061 return r;
1062 if (r > 0) {
1063
1064 if (ret)
1065 *ret = o;
1066
1067 if (offset)
1068 *offset = p;
1069
1070 return 0;
1071 }
1072
1073 osize = offsetof(Object, data.payload) + size;
1074 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1075 if (r < 0)
1076 return r;
1077
1078 o->data.hash = htole64(hash);
1079
1080 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1081 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1082 size_t rsize = 0;
1083
1084 compression = compress_blob(data, size, o->data.payload, &rsize);
1085
1086 if (compression >= 0) {
1087 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1088 o->object.flags |= compression;
1089
1090 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1091 size, rsize, object_compressed_to_string(compression));
1092 } else
1093 /* Compression didn't work, we don't really care why, let's continue without compression */
1094 compression = 0;
1095 }
1096 #endif
1097
1098 if (compression == 0 && size > 0)
1099 memcpy(o->data.payload, data, size);
1100
1101 r = journal_file_link_data(f, o, p, hash);
1102 if (r < 0)
1103 return r;
1104
1105 /* The linking might have altered the window, so let's
1106 * refresh our pointer */
1107 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1108 if (r < 0)
1109 return r;
1110
1111 if (!data)
1112 eq = NULL;
1113 else
1114 eq = memchr(data, '=', size);
1115 if (eq && eq > data) {
1116 Object *fo = NULL;
1117 uint64_t fp;
1118
1119 /* Create field object ... */
1120 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1121 if (r < 0)
1122 return r;
1123
1124 /* ... and link it in. */
1125 o->data.next_field_offset = fo->field.head_data_offset;
1126 fo->field.head_data_offset = le64toh(p);
1127 }
1128
1129 #ifdef HAVE_GCRYPT
1130 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1131 if (r < 0)
1132 return r;
1133 #endif
1134
1135 if (ret)
1136 *ret = o;
1137
1138 if (offset)
1139 *offset = p;
1140
1141 return 0;
1142 }
1143
1144 uint64_t journal_file_entry_n_items(Object *o) {
1145 assert(o);
1146
1147 if (o->object.type != OBJECT_ENTRY)
1148 return 0;
1149
1150 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1151 }
1152
1153 uint64_t journal_file_entry_array_n_items(Object *o) {
1154 assert(o);
1155
1156 if (o->object.type != OBJECT_ENTRY_ARRAY)
1157 return 0;
1158
1159 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1160 }
1161
1162 uint64_t journal_file_hash_table_n_items(Object *o) {
1163 assert(o);
1164
1165 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1166 o->object.type != OBJECT_FIELD_HASH_TABLE)
1167 return 0;
1168
1169 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1170 }
1171
1172 static int link_entry_into_array(JournalFile *f,
1173 le64_t *first,
1174 le64_t *idx,
1175 uint64_t p) {
1176 int r;
1177 uint64_t n = 0, ap = 0, q, i, a, hidx;
1178 Object *o;
1179
1180 assert(f);
1181 assert(first);
1182 assert(idx);
1183 assert(p > 0);
1184
1185 a = le64toh(*first);
1186 i = hidx = le64toh(*idx);
1187 while (a > 0) {
1188
1189 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1190 if (r < 0)
1191 return r;
1192
1193 n = journal_file_entry_array_n_items(o);
1194 if (i < n) {
1195 o->entry_array.items[i] = htole64(p);
1196 *idx = htole64(hidx + 1);
1197 return 0;
1198 }
1199
1200 i -= n;
1201 ap = a;
1202 a = le64toh(o->entry_array.next_entry_array_offset);
1203 }
1204
1205 if (hidx > n)
1206 n = (hidx+1) * 2;
1207 else
1208 n = n * 2;
1209
1210 if (n < 4)
1211 n = 4;
1212
1213 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1214 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1215 &o, &q);
1216 if (r < 0)
1217 return r;
1218
1219 #ifdef HAVE_GCRYPT
1220 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1221 if (r < 0)
1222 return r;
1223 #endif
1224
1225 o->entry_array.items[i] = htole64(p);
1226
1227 if (ap == 0)
1228 *first = htole64(q);
1229 else {
1230 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1231 if (r < 0)
1232 return r;
1233
1234 o->entry_array.next_entry_array_offset = htole64(q);
1235 }
1236
1237 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1238 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1239
1240 *idx = htole64(hidx + 1);
1241
1242 return 0;
1243 }
1244
1245 static int link_entry_into_array_plus_one(JournalFile *f,
1246 le64_t *extra,
1247 le64_t *first,
1248 le64_t *idx,
1249 uint64_t p) {
1250
1251 int r;
1252
1253 assert(f);
1254 assert(extra);
1255 assert(first);
1256 assert(idx);
1257 assert(p > 0);
1258
1259 if (*idx == 0)
1260 *extra = htole64(p);
1261 else {
1262 le64_t i;
1263
1264 i = htole64(le64toh(*idx) - 1);
1265 r = link_entry_into_array(f, first, &i, p);
1266 if (r < 0)
1267 return r;
1268 }
1269
1270 *idx = htole64(le64toh(*idx) + 1);
1271 return 0;
1272 }
1273
1274 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1275 uint64_t p;
1276 int r;
1277 assert(f);
1278 assert(o);
1279 assert(offset > 0);
1280
1281 p = le64toh(o->entry.items[i].object_offset);
1282 if (p == 0)
1283 return -EINVAL;
1284
1285 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1286 if (r < 0)
1287 return r;
1288
1289 return link_entry_into_array_plus_one(f,
1290 &o->data.entry_offset,
1291 &o->data.entry_array_offset,
1292 &o->data.n_entries,
1293 offset);
1294 }
1295
1296 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1297 uint64_t n, i;
1298 int r;
1299
1300 assert(f);
1301 assert(o);
1302 assert(offset > 0);
1303
1304 if (o->object.type != OBJECT_ENTRY)
1305 return -EINVAL;
1306
1307 __sync_synchronize();
1308
1309 /* Link up the entry itself */
1310 r = link_entry_into_array(f,
1311 &f->header->entry_array_offset,
1312 &f->header->n_entries,
1313 offset);
1314 if (r < 0)
1315 return r;
1316
1317 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1318
1319 if (f->header->head_entry_realtime == 0)
1320 f->header->head_entry_realtime = o->entry.realtime;
1321
1322 f->header->tail_entry_realtime = o->entry.realtime;
1323 f->header->tail_entry_monotonic = o->entry.monotonic;
1324
1325 f->tail_entry_monotonic_valid = true;
1326
1327 /* Link up the items */
1328 n = journal_file_entry_n_items(o);
1329 for (i = 0; i < n; i++) {
1330 r = journal_file_link_entry_item(f, o, offset, i);
1331 if (r < 0)
1332 return r;
1333 }
1334
1335 return 0;
1336 }
1337
1338 static int journal_file_append_entry_internal(
1339 JournalFile *f,
1340 const dual_timestamp *ts,
1341 uint64_t xor_hash,
1342 const EntryItem items[], unsigned n_items,
1343 uint64_t *seqnum,
1344 Object **ret, uint64_t *offset) {
1345 uint64_t np;
1346 uint64_t osize;
1347 Object *o;
1348 int r;
1349
1350 assert(f);
1351 assert(items || n_items == 0);
1352 assert(ts);
1353
1354 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1355
1356 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1357 if (r < 0)
1358 return r;
1359
1360 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1361 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1362 o->entry.realtime = htole64(ts->realtime);
1363 o->entry.monotonic = htole64(ts->monotonic);
1364 o->entry.xor_hash = htole64(xor_hash);
1365 o->entry.boot_id = f->header->boot_id;
1366
1367 #ifdef HAVE_GCRYPT
1368 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1369 if (r < 0)
1370 return r;
1371 #endif
1372
1373 r = journal_file_link_entry(f, o, np);
1374 if (r < 0)
1375 return r;
1376
1377 if (ret)
1378 *ret = o;
1379
1380 if (offset)
1381 *offset = np;
1382
1383 return 0;
1384 }
1385
1386 void journal_file_post_change(JournalFile *f) {
1387 assert(f);
1388
1389 /* inotify() does not receive IN_MODIFY events from file
1390 * accesses done via mmap(). After each access we hence
1391 * trigger IN_MODIFY by truncating the journal file to its
1392 * current size which triggers IN_MODIFY. */
1393
1394 __sync_synchronize();
1395
1396 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1397 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1398 }
1399
1400 static int entry_item_cmp(const void *_a, const void *_b) {
1401 const EntryItem *a = _a, *b = _b;
1402
1403 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1404 return -1;
1405 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1406 return 1;
1407 return 0;
1408 }
1409
1410 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1411 unsigned i;
1412 EntryItem *items;
1413 int r;
1414 uint64_t xor_hash = 0;
1415 struct dual_timestamp _ts;
1416
1417 assert(f);
1418 assert(iovec || n_iovec == 0);
1419
1420 if (!ts) {
1421 dual_timestamp_get(&_ts);
1422 ts = &_ts;
1423 }
1424
1425 if (f->tail_entry_monotonic_valid &&
1426 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1427 return -EINVAL;
1428
1429 #ifdef HAVE_GCRYPT
1430 r = journal_file_maybe_append_tag(f, ts->realtime);
1431 if (r < 0)
1432 return r;
1433 #endif
1434
1435 /* alloca() can't take 0, hence let's allocate at least one */
1436 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1437
1438 for (i = 0; i < n_iovec; i++) {
1439 uint64_t p;
1440 Object *o;
1441
1442 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1443 if (r < 0)
1444 return r;
1445
1446 xor_hash ^= le64toh(o->data.hash);
1447 items[i].object_offset = htole64(p);
1448 items[i].hash = o->data.hash;
1449 }
1450
1451 /* Order by the position on disk, in order to improve seek
1452 * times for rotating media. */
1453 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1454
1455 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1456
1457 /* If the memory mapping triggered a SIGBUS then we return an
1458 * IO error and ignore the error code passed down to us, since
1459 * it is very likely just an effect of a nullified replacement
1460 * mapping page */
1461
1462 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1463 r = -EIO;
1464
1465 journal_file_post_change(f);
1466
1467 return r;
1468 }
1469
/* One remembered position within a chain of entry array objects. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1477
1478 static void chain_cache_put(
1479 OrderedHashmap *h,
1480 ChainCacheItem *ci,
1481 uint64_t first,
1482 uint64_t array,
1483 uint64_t begin,
1484 uint64_t total,
1485 uint64_t last_index) {
1486
1487 if (!ci) {
1488 /* If the chain item to cache for this chain is the
1489 * first one it's not worth caching anything */
1490 if (array == first)
1491 return;
1492
1493 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1494 ci = ordered_hashmap_steal_first(h);
1495 assert(ci);
1496 } else {
1497 ci = new(ChainCacheItem, 1);
1498 if (!ci)
1499 return;
1500 }
1501
1502 ci->first = first;
1503
1504 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1505 free(ci);
1506 return;
1507 }
1508 } else
1509 assert(ci->first == first);
1510
1511 ci->array = array;
1512 ci->begin = begin;
1513 ci->total = total;
1514 ci->last_index = last_index;
1515 }
1516
1517 static int generic_array_get(
1518 JournalFile *f,
1519 uint64_t first,
1520 uint64_t i,
1521 Object **ret, uint64_t *offset) {
1522
1523 Object *o;
1524 uint64_t p = 0, a, t = 0;
1525 int r;
1526 ChainCacheItem *ci;
1527
1528 assert(f);
1529
1530 a = first;
1531
1532 /* Try the chain cache first */
1533 ci = ordered_hashmap_get(f->chain_cache, &first);
1534 if (ci && i > ci->total) {
1535 a = ci->array;
1536 i -= ci->total;
1537 t = ci->total;
1538 }
1539
1540 while (a > 0) {
1541 uint64_t k;
1542
1543 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1544 if (r < 0)
1545 return r;
1546
1547 k = journal_file_entry_array_n_items(o);
1548 if (i < k) {
1549 p = le64toh(o->entry_array.items[i]);
1550 goto found;
1551 }
1552
1553 i -= k;
1554 t += k;
1555 a = le64toh(o->entry_array.next_entry_array_offset);
1556 }
1557
1558 return 0;
1559
1560 found:
1561 /* Let's cache this item for the next invocation */
1562 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1563
1564 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1565 if (r < 0)
1566 return r;
1567
1568 if (ret)
1569 *ret = o;
1570
1571 if (offset)
1572 *offset = p;
1573
1574 return 1;
1575 }
1576
1577 static int generic_array_get_plus_one(
1578 JournalFile *f,
1579 uint64_t extra,
1580 uint64_t first,
1581 uint64_t i,
1582 Object **ret, uint64_t *offset) {
1583
1584 Object *o;
1585
1586 assert(f);
1587
1588 if (i == 0) {
1589 int r;
1590
1591 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1592 if (r < 0)
1593 return r;
1594
1595 if (ret)
1596 *ret = o;
1597
1598 if (offset)
1599 *offset = extra;
1600
1601 return 1;
1602 }
1603
1604 return generic_array_get(f, first, i-1, ret, offset);
1605 }
1606
/* Results of the test_object callbacks used for bisection */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2,
};
1612
/* Bisects the chain of entry array objects starting at offset 'first' for
 * 'needle'. At most 'n' items of the chain are considered. 'test_object' is
 * called with the offset stored in an array slot and the needle, and is
 * expected to return TEST_FOUND, TEST_LEFT, TEST_RIGHT or a negative error.
 *
 * A TEST_FOUND verdict is treated like TEST_RIGHT for DIRECTION_DOWN and
 * like TEST_LEFT otherwise. For DIRECTION_UP the item just before the
 * bisection point is the one returned (see 'subtract_one').
 *
 * Returns 1 if an entry was located; *ret, *offset and *idx (each optional)
 * then receive the entry object, its file offset and its index counted over
 * the whole chain. Returns 0 if nothing matched, -EBADMSG if an array slot
 * that was looked at holds offset 0, or any other negative error passed up
 * from test_object() or journal_file_move_to_object(). */
1613 static int generic_array_bisect(
1614 JournalFile *f,
1615 uint64_t first,
1616 uint64_t n,
1617 uint64_t needle,
1618 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1619 direction_t direction,
1620 Object **ret,
1621 uint64_t *offset,
1622 uint64_t *idx) {
1623
/* a: offset of the array currently examined; t: number of items in the
 * arrays skipped so far; i: index within the current array; last_p: offset
 * stored in the last examined slot of the previous array. */
1624 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1625 bool subtract_one = false;
1626 Object *o, *array = NULL;
1627 int r;
1628 ChainCacheItem *ci;
1629
1630 assert(f);
1631 assert(test_object);
1632
1633 /* Start with the first array in the chain */
1634 a = first;
1635
1636 ci = ordered_hashmap_get(f->chain_cache, &first);
1637 if (ci && n > ci->total) {
1638 /* Ah, we have iterated this bisection array chain
1639 * previously! Let's see if we can skip ahead in the
1640 * chain, as far as the last time. But we can't jump
1641 * backwards in the chain, so let's check that
1642 * first. */
1643
1644 r = test_object(f, ci->begin, needle);
1645 if (r < 0)
1646 return r;
1647
1648 if (r == TEST_LEFT) {
1649 /* OK, what we are looking for is right of the
1650 * begin of this EntryArray, so let's jump
1651 * straight to previously cached array in the
1652 * chain */
1653
1654 a = ci->array;
1655 n -= ci->total;
1656 t = ci->total;
1657 last_index = ci->last_index;
1658 }
1659 }
1660
1661 while (a > 0) {
1662 uint64_t left, right, k, lp;
1663
1664 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1665 if (r < 0)
1666 return r;
1667
1668 k = journal_file_entry_array_n_items(array);
1669 right = MIN(k, n);
1670 if (right <= 0)
1671 return 0;
1672
/* Probe the last slot of this array that is in use first: only if
 * it tests TEST_RIGHT do we need to bisect inside this array. */
1673 i = right - 1;
1674 lp = p = le64toh(array->entry_array.items[i]);
1675 if (p <= 0)
1676 return -EBADMSG;
1677
1678 r = test_object(f, p, needle);
1679 if (r < 0)
1680 return r;
1681
1682 if (r == TEST_FOUND)
1683 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1684
1685 if (r == TEST_RIGHT) {
1686 left = 0;
1687 right -= 1;
1688
1689 if (last_index != (uint64_t) -1) {
1690 assert(last_index <= right);
1691
1692 /* If we cached the last index we
1693 * looked at, let's try to not to jump
1694 * too wildly around and see if we can
1695 * limit the range to look at early to
1696 * the immediate neighbors of the last
1697 * index we looked at. */
1698
1699 if (last_index > 0) {
1700 uint64_t x = last_index - 1;
1701
1702 p = le64toh(array->entry_array.items[x]);
1703 if (p <= 0)
1704 return -EBADMSG;
1705
1706 r = test_object(f, p, needle);
1707 if (r < 0)
1708 return r;
1709
1710 if (r == TEST_FOUND)
1711 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1712
1713 if (r == TEST_RIGHT)
1714 right = x;
1715 else
1716 left = x + 1;
1717 }
1718
1719 if (last_index < right) {
1720 uint64_t y = last_index + 1;
1721
1722 p = le64toh(array->entry_array.items[y]);
1723 if (p <= 0)
1724 return -EBADMSG;
1725
1726 r = test_object(f, p, needle);
1727 if (r < 0)
1728 return r;
1729
1730 if (r == TEST_FOUND)
1731 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1732
1733 if (r == TEST_RIGHT)
1734 right = y;
1735 else
1736 left = y + 1;
1737 }
1738 }
1739
/* Plain bisection over [left, right); this loop is only left
 * via "goto found" or an error return. */
1740 for (;;) {
1741 if (left == right) {
1742 if (direction == DIRECTION_UP)
1743 subtract_one = true;
1744
1745 i = left;
1746 goto found;
1747 }
1748
1749 assert(left < right);
1750 i = (left + right) / 2;
1751
1752 p = le64toh(array->entry_array.items[i]);
1753 if (p <= 0)
1754 return -EBADMSG;
1755
1756 r = test_object(f, p, needle);
1757 if (r < 0)
1758 return r;
1759
1760 if (r == TEST_FOUND)
1761 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1762
1763 if (r == TEST_RIGHT)
1764 right = i;
1765 else
1766 left = i + 1;
1767 }
1768 }
1769
/* Everything in this array tested left of the needle. If this array
 * already covers the remaining n items there is nothing further to
 * look at. */
1770 if (k >= n) {
1771 if (direction == DIRECTION_UP) {
1772 i = n;
1773 subtract_one = true;
1774 goto found;
1775 }
1776
1777 return 0;
1778 }
1779
1780 last_p = lp;
1781
1782 n -= k;
1783 t += k;
1784 last_index = (uint64_t) -1;
1785 a = le64toh(array->entry_array.next_entry_array_offset);
1786 }
1787
1788 return 0;
1789
1790 found:
/* Asked for the item before the very first one of the chain: no match */
1791 if (subtract_one && t == 0 && i == 0)
1792 return 0;
1793
1794 /* Let's cache this item for the next invocation */
1795 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1796
/* With subtract_one and i == 0 the wanted item is the last one examined
 * in the previous array, whose offset was saved in last_p. */
1797 if (subtract_one && i == 0)
1798 p = last_p;
1799 else if (subtract_one)
1800 p = le64toh(array->entry_array.items[i-1]);
1801 else
1802 p = le64toh(array->entry_array.items[i]);
1803
1804 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1805 if (r < 0)
1806 return r;
1807
1808 if (ret)
1809 *ret = o;
1810
1811 if (offset)
1812 *offset = p;
1813
1814 if (idx)
1815 *idx = t + i + (subtract_one ? -1 : 0);
1816
1817 return 1;
1818 }
1819
1820 static int generic_array_bisect_plus_one(
1821 JournalFile *f,
1822 uint64_t extra,
1823 uint64_t first,
1824 uint64_t n,
1825 uint64_t needle,
1826 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1827 direction_t direction,
1828 Object **ret,
1829 uint64_t *offset,
1830 uint64_t *idx) {
1831
1832 int r;
1833 bool step_back = false;
1834 Object *o;
1835
1836 assert(f);
1837 assert(test_object);
1838
1839 if (n <= 0)
1840 return 0;
1841
1842 /* This bisects the array in object 'first', but first checks
1843 * an extra */
1844 r = test_object(f, extra, needle);
1845 if (r < 0)
1846 return r;
1847
1848 if (r == TEST_FOUND)
1849 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1850
1851 /* if we are looking with DIRECTION_UP then we need to first
1852 see if in the actual array there is a matching entry, and
1853 return the last one of that. But if there isn't any we need
1854 to return this one. Hence remember this, and return it
1855 below. */
1856 if (r == TEST_LEFT)
1857 step_back = direction == DIRECTION_UP;
1858
1859 if (r == TEST_RIGHT) {
1860 if (direction == DIRECTION_DOWN)
1861 goto found;
1862 else
1863 return 0;
1864 }
1865
1866 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1867
1868 if (r == 0 && step_back)
1869 goto found;
1870
1871 if (r > 0 && idx)
1872 (*idx) ++;
1873
1874 return r;
1875
1876 found:
1877 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1878 if (r < 0)
1879 return r;
1880
1881 if (ret)
1882 *ret = o;
1883
1884 if (offset)
1885 *offset = extra;
1886
1887 if (idx)
1888 *idx = 0;
1889
1890 return 1;
1891 }
1892
1893 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1894 assert(f);
1895 assert(p > 0);
1896
1897 if (p == needle)
1898 return TEST_FOUND;
1899 else if (p < needle)
1900 return TEST_LEFT;
1901 else
1902 return TEST_RIGHT;
1903 }
1904
1905 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1906 Object *o;
1907 int r;
1908
1909 assert(f);
1910 assert(p > 0);
1911
1912 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1913 if (r < 0)
1914 return r;
1915
1916 if (le64toh(o->entry.seqnum) == needle)
1917 return TEST_FOUND;
1918 else if (le64toh(o->entry.seqnum) < needle)
1919 return TEST_LEFT;
1920 else
1921 return TEST_RIGHT;
1922 }
1923
1924 int journal_file_move_to_entry_by_seqnum(
1925 JournalFile *f,
1926 uint64_t seqnum,
1927 direction_t direction,
1928 Object **ret,
1929 uint64_t *offset) {
1930
1931 return generic_array_bisect(f,
1932 le64toh(f->header->entry_array_offset),
1933 le64toh(f->header->n_entries),
1934 seqnum,
1935 test_object_seqnum,
1936 direction,
1937 ret, offset, NULL);
1938 }
1939
1940 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1941 Object *o;
1942 int r;
1943
1944 assert(f);
1945 assert(p > 0);
1946
1947 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1948 if (r < 0)
1949 return r;
1950
1951 if (le64toh(o->entry.realtime) == needle)
1952 return TEST_FOUND;
1953 else if (le64toh(o->entry.realtime) < needle)
1954 return TEST_LEFT;
1955 else
1956 return TEST_RIGHT;
1957 }
1958
1959 int journal_file_move_to_entry_by_realtime(
1960 JournalFile *f,
1961 uint64_t realtime,
1962 direction_t direction,
1963 Object **ret,
1964 uint64_t *offset) {
1965
1966 return generic_array_bisect(f,
1967 le64toh(f->header->entry_array_offset),
1968 le64toh(f->header->n_entries),
1969 realtime,
1970 test_object_realtime,
1971 direction,
1972 ret, offset, NULL);
1973 }
1974
1975 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1976 Object *o;
1977 int r;
1978
1979 assert(f);
1980 assert(p > 0);
1981
1982 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1983 if (r < 0)
1984 return r;
1985
1986 if (le64toh(o->entry.monotonic) == needle)
1987 return TEST_FOUND;
1988 else if (le64toh(o->entry.monotonic) < needle)
1989 return TEST_LEFT;
1990 else
1991 return TEST_RIGHT;
1992 }
1993
1994 static int find_data_object_by_boot_id(
1995 JournalFile *f,
1996 sd_id128_t boot_id,
1997 Object **o,
1998 uint64_t *b) {
1999
2000 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2001
2002 sd_id128_to_string(boot_id, t + 9);
2003 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2004 }
2005
2006 int journal_file_move_to_entry_by_monotonic(
2007 JournalFile *f,
2008 sd_id128_t boot_id,
2009 uint64_t monotonic,
2010 direction_t direction,
2011 Object **ret,
2012 uint64_t *offset) {
2013
2014 Object *o;
2015 int r;
2016
2017 assert(f);
2018
2019 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2020 if (r < 0)
2021 return r;
2022 if (r == 0)
2023 return -ENOENT;
2024
2025 return generic_array_bisect_plus_one(f,
2026 le64toh(o->data.entry_offset),
2027 le64toh(o->data.entry_array_offset),
2028 le64toh(o->data.n_entries),
2029 monotonic,
2030 test_object_monotonic,
2031 direction,
2032 ret, offset, NULL);
2033 }
2034
2035 void journal_file_reset_location(JournalFile *f) {
2036 f->location_type = LOCATION_HEAD;
2037 f->current_offset = 0;
2038 f->current_seqnum = 0;
2039 f->current_realtime = 0;
2040 f->current_monotonic = 0;
2041 zero(f->current_boot_id);
2042 f->current_xor_hash = 0;
2043 }
2044
2045 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2046 f->location_type = LOCATION_SEEK;
2047 f->current_offset = offset;
2048 f->current_seqnum = le64toh(o->entry.seqnum);
2049 f->current_realtime = le64toh(o->entry.realtime);
2050 f->current_monotonic = le64toh(o->entry.monotonic);
2051 f->current_boot_id = o->entry.boot_id;
2052 f->current_xor_hash = le64toh(o->entry.xor_hash);
2053 }
2054
2055 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2056 assert(af);
2057 assert(bf);
2058 assert(af->location_type == LOCATION_SEEK);
2059 assert(bf->location_type == LOCATION_SEEK);
2060
2061 /* If contents and timestamps match, these entries are
2062 * identical, even if the seqnum does not match */
2063 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2064 af->current_monotonic == bf->current_monotonic &&
2065 af->current_realtime == bf->current_realtime &&
2066 af->current_xor_hash == bf->current_xor_hash)
2067 return 0;
2068
2069 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2070
2071 /* If this is from the same seqnum source, compare
2072 * seqnums */
2073 if (af->current_seqnum < bf->current_seqnum)
2074 return -1;
2075 if (af->current_seqnum > bf->current_seqnum)
2076 return 1;
2077
2078 /* Wow! This is weird, different data but the same
2079 * seqnums? Something is borked, but let's make the
2080 * best of it and compare by time. */
2081 }
2082
2083 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2084
2085 /* If the boot id matches, compare monotonic time */
2086 if (af->current_monotonic < bf->current_monotonic)
2087 return -1;
2088 if (af->current_monotonic > bf->current_monotonic)
2089 return 1;
2090 }
2091
2092 /* Otherwise, compare UTC time */
2093 if (af->current_realtime < bf->current_realtime)
2094 return -1;
2095 if (af->current_realtime > bf->current_realtime)
2096 return 1;
2097
2098 /* Finally, compare by contents */
2099 if (af->current_xor_hash < bf->current_xor_hash)
2100 return -1;
2101 if (af->current_xor_hash > bf->current_xor_hash)
2102 return 1;
2103
2104 return 0;
2105 }
2106
2107 int journal_file_next_entry(
2108 JournalFile *f,
2109 uint64_t p,
2110 direction_t direction,
2111 Object **ret, uint64_t *offset) {
2112
2113 uint64_t i, n, ofs;
2114 int r;
2115
2116 assert(f);
2117
2118 n = le64toh(f->header->n_entries);
2119 if (n <= 0)
2120 return 0;
2121
2122 if (p == 0)
2123 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2124 else {
2125 r = generic_array_bisect(f,
2126 le64toh(f->header->entry_array_offset),
2127 le64toh(f->header->n_entries),
2128 p,
2129 test_object_offset,
2130 DIRECTION_DOWN,
2131 NULL, NULL,
2132 &i);
2133 if (r <= 0)
2134 return r;
2135
2136 if (direction == DIRECTION_DOWN) {
2137 if (i >= n - 1)
2138 return 0;
2139
2140 i++;
2141 } else {
2142 if (i <= 0)
2143 return 0;
2144
2145 i--;
2146 }
2147 }
2148
2149 /* And jump to it */
2150 r = generic_array_get(f,
2151 le64toh(f->header->entry_array_offset),
2152 i,
2153 ret, &ofs);
2154 if (r <= 0)
2155 return r;
2156
2157 if (p > 0 &&
2158 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2159 log_debug("%s: entry array corrupted at entry %"PRIu64,
2160 f->path, i);
2161 return -EBADMSG;
2162 }
2163
2164 if (offset)
2165 *offset = ofs;
2166
2167 return 1;
2168 }
2169
2170 int journal_file_next_entry_for_data(
2171 JournalFile *f,
2172 Object *o, uint64_t p,
2173 uint64_t data_offset,
2174 direction_t direction,
2175 Object **ret, uint64_t *offset) {
2176
2177 uint64_t n, i;
2178 int r;
2179 Object *d;
2180
2181 assert(f);
2182 assert(p > 0 || !o);
2183
2184 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2185 if (r < 0)
2186 return r;
2187
2188 n = le64toh(d->data.n_entries);
2189 if (n <= 0)
2190 return n;
2191
2192 if (!o)
2193 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2194 else {
2195 if (o->object.type != OBJECT_ENTRY)
2196 return -EINVAL;
2197
2198 r = generic_array_bisect_plus_one(f,
2199 le64toh(d->data.entry_offset),
2200 le64toh(d->data.entry_array_offset),
2201 le64toh(d->data.n_entries),
2202 p,
2203 test_object_offset,
2204 DIRECTION_DOWN,
2205 NULL, NULL,
2206 &i);
2207
2208 if (r <= 0)
2209 return r;
2210
2211 if (direction == DIRECTION_DOWN) {
2212 if (i >= n - 1)
2213 return 0;
2214
2215 i++;
2216 } else {
2217 if (i <= 0)
2218 return 0;
2219
2220 i--;
2221 }
2222
2223 }
2224
2225 return generic_array_get_plus_one(f,
2226 le64toh(d->data.entry_offset),
2227 le64toh(d->data.entry_array_offset),
2228 i,
2229 ret, offset);
2230 }
2231
2232 int journal_file_move_to_entry_by_offset_for_data(
2233 JournalFile *f,
2234 uint64_t data_offset,
2235 uint64_t p,
2236 direction_t direction,
2237 Object **ret, uint64_t *offset) {
2238
2239 int r;
2240 Object *d;
2241
2242 assert(f);
2243
2244 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2245 if (r < 0)
2246 return r;
2247
2248 return generic_array_bisect_plus_one(f,
2249 le64toh(d->data.entry_offset),
2250 le64toh(d->data.entry_array_offset),
2251 le64toh(d->data.n_entries),
2252 p,
2253 test_object_offset,
2254 direction,
2255 ret, offset, NULL);
2256 }
2257
2258 int journal_file_move_to_entry_by_monotonic_for_data(
2259 JournalFile *f,
2260 uint64_t data_offset,
2261 sd_id128_t boot_id,
2262 uint64_t monotonic,
2263 direction_t direction,
2264 Object **ret, uint64_t *offset) {
2265
2266 Object *o, *d;
2267 int r;
2268 uint64_t b, z;
2269
2270 assert(f);
2271
2272 /* First, seek by time */
2273 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2274 if (r < 0)
2275 return r;
2276 if (r == 0)
2277 return -ENOENT;
2278
2279 r = generic_array_bisect_plus_one(f,
2280 le64toh(o->data.entry_offset),
2281 le64toh(o->data.entry_array_offset),
2282 le64toh(o->data.n_entries),
2283 monotonic,
2284 test_object_monotonic,
2285 direction,
2286 NULL, &z, NULL);
2287 if (r <= 0)
2288 return r;
2289
2290 /* And now, continue seeking until we find an entry that
2291 * exists in both bisection arrays */
2292
2293 for (;;) {
2294 Object *qo;
2295 uint64_t p, q;
2296
2297 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2298 if (r < 0)
2299 return r;
2300
2301 r = generic_array_bisect_plus_one(f,
2302 le64toh(d->data.entry_offset),
2303 le64toh(d->data.entry_array_offset),
2304 le64toh(d->data.n_entries),
2305 z,
2306 test_object_offset,
2307 direction,
2308 NULL, &p, NULL);
2309 if (r <= 0)
2310 return r;
2311
2312 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2313 if (r < 0)
2314 return r;
2315
2316 r = generic_array_bisect_plus_one(f,
2317 le64toh(o->data.entry_offset),
2318 le64toh(o->data.entry_array_offset),
2319 le64toh(o->data.n_entries),
2320 p,
2321 test_object_offset,
2322 direction,
2323 &qo, &q, NULL);
2324
2325 if (r <= 0)
2326 return r;
2327
2328 if (p == q) {
2329 if (ret)
2330 *ret = qo;
2331 if (offset)
2332 *offset = q;
2333
2334 return 1;
2335 }
2336
2337 z = q;
2338 }
2339 }
2340
2341 int journal_file_move_to_entry_by_seqnum_for_data(
2342 JournalFile *f,
2343 uint64_t data_offset,
2344 uint64_t seqnum,
2345 direction_t direction,
2346 Object **ret, uint64_t *offset) {
2347
2348 Object *d;
2349 int r;
2350
2351 assert(f);
2352
2353 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2354 if (r < 0)
2355 return r;
2356
2357 return generic_array_bisect_plus_one(f,
2358 le64toh(d->data.entry_offset),
2359 le64toh(d->data.entry_array_offset),
2360 le64toh(d->data.n_entries),
2361 seqnum,
2362 test_object_seqnum,
2363 direction,
2364 ret, offset, NULL);
2365 }
2366
2367 int journal_file_move_to_entry_by_realtime_for_data(
2368 JournalFile *f,
2369 uint64_t data_offset,
2370 uint64_t realtime,
2371 direction_t direction,
2372 Object **ret, uint64_t *offset) {
2373
2374 Object *d;
2375 int r;
2376
2377 assert(f);
2378
2379 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2380 if (r < 0)
2381 return r;
2382
2383 return generic_array_bisect_plus_one(f,
2384 le64toh(d->data.entry_offset),
2385 le64toh(d->data.entry_array_offset),
2386 le64toh(d->data.n_entries),
2387 realtime,
2388 test_object_realtime,
2389 direction,
2390 ret, offset, NULL);
2391 }
2392
2393 void journal_file_dump(JournalFile *f) {
2394 Object *o;
2395 int r;
2396 uint64_t p;
2397
2398 assert(f);
2399
2400 journal_file_print_header(f);
2401
2402 p = le64toh(f->header->header_size);
2403 while (p != 0) {
2404 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2405 if (r < 0)
2406 goto fail;
2407
2408 switch (o->object.type) {
2409
2410 case OBJECT_UNUSED:
2411 printf("Type: OBJECT_UNUSED\n");
2412 break;
2413
2414 case OBJECT_DATA:
2415 printf("Type: OBJECT_DATA\n");
2416 break;
2417
2418 case OBJECT_FIELD:
2419 printf("Type: OBJECT_FIELD\n");
2420 break;
2421
2422 case OBJECT_ENTRY:
2423 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2424 le64toh(o->entry.seqnum),
2425 le64toh(o->entry.monotonic),
2426 le64toh(o->entry.realtime));
2427 break;
2428
2429 case OBJECT_FIELD_HASH_TABLE:
2430 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2431 break;
2432
2433 case OBJECT_DATA_HASH_TABLE:
2434 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2435 break;
2436
2437 case OBJECT_ENTRY_ARRAY:
2438 printf("Type: OBJECT_ENTRY_ARRAY\n");
2439 break;
2440
2441 case OBJECT_TAG:
2442 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2443 le64toh(o->tag.seqnum),
2444 le64toh(o->tag.epoch));
2445 break;
2446
2447 default:
2448 printf("Type: unknown (%i)\n", o->object.type);
2449 break;
2450 }
2451
2452 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2453 printf("Flags: %s\n",
2454 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2455
2456 if (p == le64toh(f->header->tail_object_offset))
2457 p = 0;
2458 else
2459 p = p + ALIGN64(le64toh(o->object.size));
2460 }
2461
2462 return;
2463 fail:
2464 log_error("File corrupt");
2465 }
2466
2467 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2468 const char *x;
2469
2470 x = format_timestamp(buf, l, t);
2471 if (x)
2472 return x;
2473 return " --- ";
2474 }
2475
2476 void journal_file_print_header(JournalFile *f) {
2477 char a[33], b[33], c[33], d[33];
2478 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2479 struct stat st;
2480 char bytes[FORMAT_BYTES_MAX];
2481
2482 assert(f);
2483
2484 printf("File Path: %s\n"
2485 "File ID: %s\n"
2486 "Machine ID: %s\n"
2487 "Boot ID: %s\n"
2488 "Sequential Number ID: %s\n"
2489 "State: %s\n"
2490 "Compatible Flags:%s%s\n"
2491 "Incompatible Flags:%s%s%s\n"
2492 "Header size: %"PRIu64"\n"
2493 "Arena size: %"PRIu64"\n"
2494 "Data Hash Table Size: %"PRIu64"\n"
2495 "Field Hash Table Size: %"PRIu64"\n"
2496 "Rotate Suggested: %s\n"
2497 "Head Sequential Number: %"PRIu64"\n"
2498 "Tail Sequential Number: %"PRIu64"\n"
2499 "Head Realtime Timestamp: %s\n"
2500 "Tail Realtime Timestamp: %s\n"
2501 "Tail Monotonic Timestamp: %s\n"
2502 "Objects: %"PRIu64"\n"
2503 "Entry Objects: %"PRIu64"\n",
2504 f->path,
2505 sd_id128_to_string(f->header->file_id, a),
2506 sd_id128_to_string(f->header->machine_id, b),
2507 sd_id128_to_string(f->header->boot_id, c),
2508 sd_id128_to_string(f->header->seqnum_id, d),
2509 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2510 f->header->state == STATE_ONLINE ? "ONLINE" :
2511 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2512 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2513 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2514 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2515 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2516 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2517 le64toh(f->header->header_size),
2518 le64toh(f->header->arena_size),
2519 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2520 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2521 yes_no(journal_file_rotate_suggested(f, 0)),
2522 le64toh(f->header->head_entry_seqnum),
2523 le64toh(f->header->tail_entry_seqnum),
2524 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2525 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2526 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2527 le64toh(f->header->n_objects),
2528 le64toh(f->header->n_entries));
2529
2530 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2531 printf("Data Objects: %"PRIu64"\n"
2532 "Data Hash Table Fill: %.1f%%\n",
2533 le64toh(f->header->n_data),
2534 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2535
2536 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2537 printf("Field Objects: %"PRIu64"\n"
2538 "Field Hash Table Fill: %.1f%%\n",
2539 le64toh(f->header->n_fields),
2540 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2541
2542 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2543 printf("Tag Objects: %"PRIu64"\n",
2544 le64toh(f->header->n_tags));
2545 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2546 printf("Entry Array Objects: %"PRIu64"\n",
2547 le64toh(f->header->n_entry_arrays));
2548
2549 if (fstat(f->fd, &st) >= 0)
2550 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2551 }
2552
2553 static int journal_file_warn_btrfs(JournalFile *f) {
2554 unsigned attrs;
2555 int r;
2556
2557 assert(f);
2558
2559 /* Before we write anything, check if the COW logic is turned
2560 * off on btrfs. Given our write pattern that is quite
2561 * unfriendly to COW file systems this should greatly improve
2562 * performance on COW file systems, such as btrfs, at the
2563 * expense of data integrity features (which shouldn't be too
2564 * bad, given that we do our own checksumming). */
2565
2566 r = btrfs_is_filesystem(f->fd);
2567 if (r < 0)
2568 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2569 if (!r)
2570 return 0;
2571
2572 r = read_attr_fd(f->fd, &attrs);
2573 if (r < 0)
2574 return log_warning_errno(r, "Failed to read file attributes: %m");
2575
2576 if (attrs & FS_NOCOW_FL) {
2577 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2578 return 0;
2579 }
2580
2581 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2582 "This is likely to slow down journal access substantially, please consider turning "
2583 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2584
2585 return 1;
2586 }
2587
2588 int journal_file_open(
2589 const char *fname,
2590 int flags,
2591 mode_t mode,
2592 bool compress,
2593 bool seal,
2594 JournalMetrics *metrics,
2595 MMapCache *mmap_cache,
2596 JournalFile *template,
2597 JournalFile **ret) {
2598
2599 bool newly_created = false;
2600 JournalFile *f;
2601 void *h;
2602 int r;
2603
2604 assert(fname);
2605 assert(ret);
2606
2607 if ((flags & O_ACCMODE) != O_RDONLY &&
2608 (flags & O_ACCMODE) != O_RDWR)
2609 return -EINVAL;
2610
2611 if (!endswith(fname, ".journal") &&
2612 !endswith(fname, ".journal~"))
2613 return -EINVAL;
2614
2615 f = new0(JournalFile, 1);
2616 if (!f)
2617 return -ENOMEM;
2618
2619 f->fd = -1;
2620 f->mode = mode;
2621
2622 f->flags = flags;
2623 f->prot = prot_from_flags(flags);
2624 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2625 #if defined(HAVE_LZ4)
2626 f->compress_lz4 = compress;
2627 #elif defined(HAVE_XZ)
2628 f->compress_xz = compress;
2629 #endif
2630 #ifdef HAVE_GCRYPT
2631 f->seal = seal;
2632 #endif
2633
2634 if (mmap_cache)
2635 f->mmap = mmap_cache_ref(mmap_cache);
2636 else {
2637 f->mmap = mmap_cache_new();
2638 if (!f->mmap) {
2639 r = -ENOMEM;
2640 goto fail;
2641 }
2642 }
2643
2644 f->path = strdup(fname);
2645 if (!f->path) {
2646 r = -ENOMEM;
2647 goto fail;
2648 }
2649
2650 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2651 if (!f->chain_cache) {
2652 r = -ENOMEM;
2653 goto fail;
2654 }
2655
2656 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2657 if (f->fd < 0) {
2658 r = -errno;
2659 goto fail;
2660 }
2661
2662 r = journal_file_fstat(f);
2663 if (r < 0)
2664 goto fail;
2665
2666 if (f->last_stat.st_size == 0 && f->writable) {
2667
2668 (void) journal_file_warn_btrfs(f);
2669
2670 /* Let's attach the creation time to the journal file,
2671 * so that the vacuuming code knows the age of this
2672 * file even if the file might end up corrupted one
2673 * day... Ideally we'd just use the creation time many
2674 * file systems maintain for each file, but there is
2675 * currently no usable API to query this, hence let's
2676 * emulate this via extended attributes. If extended
2677 * attributes are not supported we'll just skip this,
2678 * and rely solely on mtime/atime/ctime of the file. */
2679
2680 fd_setcrtime(f->fd, 0);
2681
2682 #ifdef HAVE_GCRYPT
2683 /* Try to load the FSPRG state, and if we can't, then
2684 * just don't do sealing */
2685 if (f->seal) {
2686 r = journal_file_fss_load(f);
2687 if (r < 0)
2688 f->seal = false;
2689 }
2690 #endif
2691
2692 r = journal_file_init_header(f, template);
2693 if (r < 0)
2694 goto fail;
2695
2696 r = journal_file_fstat(f);
2697 if (r < 0)
2698 goto fail;
2699
2700 newly_created = true;
2701 }
2702
2703 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2704 r = -EIO;
2705 goto fail;
2706 }
2707
2708 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2709 if (r < 0)
2710 goto fail;
2711
2712 f->header = h;
2713
2714 if (!newly_created) {
2715 r = journal_file_verify_header(f);
2716 if (r < 0)
2717 goto fail;
2718 }
2719
2720 #ifdef HAVE_GCRYPT
2721 if (!newly_created && f->writable) {
2722 r = journal_file_fss_load(f);
2723 if (r < 0)
2724 goto fail;
2725 }
2726 #endif
2727
2728 if (f->writable) {
2729 if (metrics) {
2730 journal_default_metrics(metrics, f->fd);
2731 f->metrics = *metrics;
2732 } else if (template)
2733 f->metrics = template->metrics;
2734
2735 r = journal_file_refresh_header(f);
2736 if (r < 0)
2737 goto fail;
2738 }
2739
2740 #ifdef HAVE_GCRYPT
2741 r = journal_file_hmac_setup(f);
2742 if (r < 0)
2743 goto fail;
2744 #endif
2745
2746 if (newly_created) {
2747 r = journal_file_setup_field_hash_table(f);
2748 if (r < 0)
2749 goto fail;
2750
2751 r = journal_file_setup_data_hash_table(f);
2752 if (r < 0)
2753 goto fail;
2754
2755 #ifdef HAVE_GCRYPT
2756 r = journal_file_append_first_tag(f);
2757 if (r < 0)
2758 goto fail;
2759 #endif
2760 }
2761
2762 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2763 r = -EIO;
2764 goto fail;
2765 }
2766
2767 *ret = f;
2768 return 0;
2769
2770 fail:
2771 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2772 r = -EIO;
2773
2774 journal_file_close(f);
2775
2776 return r;
2777 }
2778
2779 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2780 _cleanup_free_ char *p = NULL;
2781 size_t l;
2782 JournalFile *old_file, *new_file = NULL;
2783 int r;
2784
2785 assert(f);
2786 assert(*f);
2787
2788 old_file = *f;
2789
2790 if (!old_file->writable)
2791 return -EINVAL;
2792
2793 if (!endswith(old_file->path, ".journal"))
2794 return -EINVAL;
2795
2796 l = strlen(old_file->path);
2797 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2798 (int) l - 8, old_file->path,
2799 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2800 le64toh((*f)->header->head_entry_seqnum),
2801 le64toh((*f)->header->head_entry_realtime));
2802 if (r < 0)
2803 return -ENOMEM;
2804
2805 /* Try to rename the file to the archived version. If the file
2806 * already was deleted, we'll get ENOENT, let's ignore that
2807 * case. */
2808 r = rename(old_file->path, p);
2809 if (r < 0 && errno != ENOENT)
2810 return -errno;
2811
2812 old_file->header->state = STATE_ARCHIVED;
2813
2814 /* Currently, btrfs is not very good with out write patterns
2815 * and fragments heavily. Let's defrag our journal files when
2816 * we archive them */
2817 old_file->defrag_on_close = true;
2818
2819 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2820 journal_file_close(old_file);
2821
2822 *f = new_file;
2823 return r;
2824 }
2825
2826 int journal_file_open_reliably(
2827 const char *fname,
2828 int flags,
2829 mode_t mode,
2830 bool compress,
2831 bool seal,
2832 JournalMetrics *metrics,
2833 MMapCache *mmap_cache,
2834 JournalFile *template,
2835 JournalFile **ret) {
2836
2837 int r;
2838 size_t l;
2839 _cleanup_free_ char *p = NULL;
2840
2841 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2842 if (!IN_SET(r,
2843 -EBADMSG, /* corrupted */
2844 -ENODATA, /* truncated */
2845 -EHOSTDOWN, /* other machine */
2846 -EPROTONOSUPPORT, /* incompatible feature */
2847 -EBUSY, /* unclean shutdown */
2848 -ESHUTDOWN, /* already archived */
2849 -EIO, /* IO error, including SIGBUS on mmap */
2850 -EIDRM /* File has been deleted */))
2851 return r;
2852
2853 if ((flags & O_ACCMODE) == O_RDONLY)
2854 return r;
2855
2856 if (!(flags & O_CREAT))
2857 return r;
2858
2859 if (!endswith(fname, ".journal"))
2860 return r;
2861
2862 /* The file is corrupted. Rotate it away and try it again (but only once) */
2863
2864 l = strlen(fname);
2865 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2866 (int) l - 8, fname,
2867 now(CLOCK_REALTIME),
2868 random_u64()) < 0)
2869 return -ENOMEM;
2870
2871 if (rename(fname, p) < 0)
2872 return -errno;
2873
2874 /* btrfs doesn't cope well with our write pattern and
2875 * fragments heavily. Let's defrag all files we rotate */
2876
2877 (void) chattr_path(p, false, FS_NOCOW_FL);
2878 (void) btrfs_defrag(p);
2879
2880 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2881
2882 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2883 }
2884
2885 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2886 uint64_t i, n;
2887 uint64_t q, xor_hash = 0;
2888 int r;
2889 EntryItem *items;
2890 dual_timestamp ts;
2891
2892 assert(from);
2893 assert(to);
2894 assert(o);
2895 assert(p);
2896
2897 if (!to->writable)
2898 return -EPERM;
2899
2900 ts.monotonic = le64toh(o->entry.monotonic);
2901 ts.realtime = le64toh(o->entry.realtime);
2902
2903 n = journal_file_entry_n_items(o);
2904 /* alloca() can't take 0, hence let's allocate at least one */
2905 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2906
2907 for (i = 0; i < n; i++) {
2908 uint64_t l, h;
2909 le64_t le_hash;
2910 size_t t;
2911 void *data;
2912 Object *u;
2913
2914 q = le64toh(o->entry.items[i].object_offset);
2915 le_hash = o->entry.items[i].hash;
2916
2917 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2918 if (r < 0)
2919 return r;
2920
2921 if (le_hash != o->data.hash)
2922 return -EBADMSG;
2923
2924 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2925 t = (size_t) l;
2926
2927 /* We hit the limit on 32bit machines */
2928 if ((uint64_t) t != l)
2929 return -E2BIG;
2930
2931 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2932 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2933 size_t rsize = 0;
2934
2935 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2936 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2937 if (r < 0)
2938 return r;
2939
2940 data = from->compress_buffer;
2941 l = rsize;
2942 #else
2943 return -EPROTONOSUPPORT;
2944 #endif
2945 } else
2946 data = o->data.payload;
2947
2948 r = journal_file_append_data(to, data, l, &u, &h);
2949 if (r < 0)
2950 return r;
2951
2952 xor_hash ^= le64toh(u->data.hash);
2953 items[i].object_offset = htole64(h);
2954 items[i].hash = u->data.hash;
2955
2956 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2957 if (r < 0)
2958 return r;
2959 }
2960
2961 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2962
2963 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2964 return -EIO;
2965
2966 return r;
2967 }
2968
2969 void journal_reset_metrics(JournalMetrics *m) {
2970 assert(m);
2971
2972 /* Set everything to "pick automatic values". */
2973
2974 *m = (JournalMetrics) {
2975 .min_use = (uint64_t) -1,
2976 .max_use = (uint64_t) -1,
2977 .min_size = (uint64_t) -1,
2978 .max_size = (uint64_t) -1,
2979 .keep_free = (uint64_t) -1,
2980 .n_max_files = (uint64_t) -1,
2981 };
2982 }
2983
2984 void journal_default_metrics(JournalMetrics *m, int fd) {
2985 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
2986 struct statvfs ss;
2987 uint64_t fs_size;
2988
2989 assert(m);
2990 assert(fd >= 0);
2991
2992 if (fstatvfs(fd, &ss) >= 0)
2993 fs_size = ss.f_frsize * ss.f_blocks;
2994 else {
2995 log_debug_errno(errno, "Failed to detremine disk size: %m");
2996 fs_size = 0;
2997 }
2998
2999 if (m->max_use == (uint64_t) -1) {
3000
3001 if (fs_size > 0) {
3002 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3003
3004 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3005 m->max_use = DEFAULT_MAX_USE_UPPER;
3006
3007 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3008 m->max_use = DEFAULT_MAX_USE_LOWER;
3009 } else
3010 m->max_use = DEFAULT_MAX_USE_LOWER;
3011 } else {
3012 m->max_use = PAGE_ALIGN(m->max_use);
3013
3014 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3015 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3016 }
3017
3018 if (m->min_use == (uint64_t) -1)
3019 m->min_use = DEFAULT_MIN_USE;
3020
3021 if (m->min_use > m->max_use)
3022 m->min_use = m->max_use;
3023
3024 if (m->max_size == (uint64_t) -1) {
3025 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3026
3027 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3028 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3029 } else
3030 m->max_size = PAGE_ALIGN(m->max_size);
3031
3032 if (m->max_size != 0) {
3033 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3034 m->max_size = JOURNAL_FILE_SIZE_MIN;
3035
3036 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3037 m->max_use = m->max_size*2;
3038 }
3039
3040 if (m->min_size == (uint64_t) -1)
3041 m->min_size = JOURNAL_FILE_SIZE_MIN;
3042 else {
3043 m->min_size = PAGE_ALIGN(m->min_size);
3044
3045 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3046 m->min_size = JOURNAL_FILE_SIZE_MIN;
3047
3048 if (m->max_size != 0 && m->min_size > m->max_size)
3049 m->max_size = m->min_size;
3050 }
3051
3052 if (m->keep_free == (uint64_t) -1) {
3053
3054 if (fs_size > 0) {
3055 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3056
3057 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3058 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3059
3060 } else
3061 m->keep_free = DEFAULT_KEEP_FREE;
3062 }
3063
3064 if (m->n_max_files == (uint64_t) -1)
3065 m->n_max_files = DEFAULT_N_MAX_FILES;
3066
3067 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3068 format_bytes(a, sizeof(a), m->min_use),
3069 format_bytes(b, sizeof(b), m->max_use),
3070 format_bytes(c, sizeof(c), m->max_size),
3071 format_bytes(d, sizeof(d), m->min_size),
3072 format_bytes(e, sizeof(e), m->keep_free),
3073 m->n_max_files);
3074 }
3075
3076 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3077 assert(f);
3078 assert(from || to);
3079
3080 if (from) {
3081 if (f->header->head_entry_realtime == 0)
3082 return -ENOENT;
3083
3084 *from = le64toh(f->header->head_entry_realtime);
3085 }
3086
3087 if (to) {
3088 if (f->header->tail_entry_realtime == 0)
3089 return -ENOENT;
3090
3091 *to = le64toh(f->header->tail_entry_realtime);
3092 }
3093
3094 return 1;
3095 }
3096
3097 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3098 Object *o;
3099 uint64_t p;
3100 int r;
3101
3102 assert(f);
3103 assert(from || to);
3104
3105 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3106 if (r <= 0)
3107 return r;
3108
3109 if (le64toh(o->data.n_entries) <= 0)
3110 return 0;
3111
3112 if (from) {
3113 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3114 if (r < 0)
3115 return r;
3116
3117 *from = le64toh(o->entry.monotonic);
3118 }
3119
3120 if (to) {
3121 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3122 if (r < 0)
3123 return r;
3124
3125 r = generic_array_get_plus_one(f,
3126 le64toh(o->data.entry_offset),
3127 le64toh(o->data.entry_array_offset),
3128 le64toh(o->data.n_entries)-1,
3129 &o, NULL);
3130 if (r <= 0)
3131 return r;
3132
3133 *to = le64toh(o->entry.monotonic);
3134 }
3135
3136 return 1;
3137 }
3138
3139 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3140 assert(f);
3141
3142 /* If we gained new header fields we gained new features,
3143 * hence suggest a rotation */
3144 if (le64toh(f->header->header_size) < sizeof(Header)) {
3145 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3146 return true;
3147 }
3148
3149 /* Let's check if the hash tables grew over a certain fill
3150 * level (75%, borrowing this value from Java's hash table
3151 * implementation), and if so suggest a rotation. To calculate
3152 * the fill level we need the n_data field, which only exists
3153 * in newer versions. */
3154
3155 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3156 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3157 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3158 f->path,
3159 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3160 le64toh(f->header->n_data),
3161 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3162 (unsigned long long) f->last_stat.st_size,
3163 f->last_stat.st_size / le64toh(f->header->n_data));
3164 return true;
3165 }
3166
3167 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3168 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3169 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3170 f->path,
3171 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3172 le64toh(f->header->n_fields),
3173 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3174 return true;
3175 }
3176
3177 /* Are the data objects properly indexed by field objects? */
3178 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3179 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3180 le64toh(f->header->n_data) > 0 &&
3181 le64toh(f->header->n_fields) == 0)
3182 return true;
3183
3184 if (max_file_usec > 0) {
3185 usec_t t, h;
3186
3187 h = le64toh(f->header->head_entry_realtime);
3188 t = now(CLOCK_REALTIME);
3189
3190 if (h > 0 && t > h + max_file_usec)
3191 return true;
3192 }
3193
3194 return false;
3195 }