]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
journal: assume that next entry is after previous entry
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
43
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
45
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
48
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
53
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
56
57 /* This is the upper bound if we deduce the keep_free value from the
58 * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61 /* This is the keep_free value when we can't determine the system
62 * size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
64
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
70
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
73
74 static int journal_file_set_online(JournalFile *f) {
75 assert(f);
76
77 if (!f->writable)
78 return -EPERM;
79
80 if (!(f->fd >= 0 && f->header))
81 return -EINVAL;
82
83 switch(f->header->state) {
84 case STATE_ONLINE:
85 return 0;
86
87 case STATE_OFFLINE:
88 f->header->state = STATE_ONLINE;
89 fsync(f->fd);
90 return 0;
91
92 default:
93 return -EINVAL;
94 }
95 }
96
97 int journal_file_set_offline(JournalFile *f) {
98 assert(f);
99
100 if (!f->writable)
101 return -EPERM;
102
103 if (!(f->fd >= 0 && f->header))
104 return -EINVAL;
105
106 if (f->header->state != STATE_ONLINE)
107 return 0;
108
109 fsync(f->fd);
110
111 f->header->state = STATE_OFFLINE;
112
113 fsync(f->fd);
114
115 return 0;
116 }
117
118 void journal_file_close(JournalFile *f) {
119 assert(f);
120
121 #ifdef HAVE_GCRYPT
122 /* Write the final tag */
123 if (f->seal && f->writable)
124 journal_file_append_tag(f);
125 #endif
126
127 /* Sync everything to disk, before we mark the file offline */
128 if (f->mmap && f->fd >= 0)
129 mmap_cache_close_fd(f->mmap, f->fd);
130
131 journal_file_set_offline(f);
132
133 if (f->header)
134 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
135
136 if (f->fd >= 0)
137 close_nointr_nofail(f->fd);
138
139 free(f->path);
140
141 if (f->mmap)
142 mmap_cache_unref(f->mmap);
143
144 hashmap_free_free(f->chain_cache);
145
146 #ifdef HAVE_XZ
147 free(f->compress_buffer);
148 #endif
149
150 #ifdef HAVE_GCRYPT
151 if (f->fss_file)
152 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153 else if (f->fsprg_state)
154 free(f->fsprg_state);
155
156 free(f->fsprg_seed);
157
158 if (f->hmac)
159 gcry_md_close(f->hmac);
160 #endif
161
162 free(f);
163 }
164
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
166 Header h;
167 ssize_t k;
168 int r;
169
170 assert(f);
171
172 zero(h);
173 memcpy(h.signature, HEADER_SIGNATURE, 8);
174 h.header_size = htole64(ALIGN64(sizeof(h)));
175
176 h.incompatible_flags =
177 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
178
179 h.compatible_flags =
180 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
181
182 r = sd_id128_randomize(&h.file_id);
183 if (r < 0)
184 return r;
185
186 if (template) {
187 h.seqnum_id = template->header->seqnum_id;
188 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
189 } else
190 h.seqnum_id = h.file_id;
191
192 k = pwrite(f->fd, &h, sizeof(h), 0);
193 if (k < 0)
194 return -errno;
195
196 if (k != sizeof(h))
197 return -EIO;
198
199 return 0;
200 }
201
202 static int journal_file_refresh_header(JournalFile *f) {
203 int r;
204 sd_id128_t boot_id;
205
206 assert(f);
207
208 r = sd_id128_get_machine(&f->header->machine_id);
209 if (r < 0)
210 return r;
211
212 r = sd_id128_get_boot(&boot_id);
213 if (r < 0)
214 return r;
215
216 if (sd_id128_equal(boot_id, f->header->boot_id))
217 f->tail_entry_monotonic_valid = true;
218
219 f->header->boot_id = boot_id;
220
221 journal_file_set_online(f);
222
223 /* Sync the online state to disk */
224 fsync(f->fd);
225
226 return 0;
227 }
228
229 static int journal_file_verify_header(JournalFile *f) {
230 assert(f);
231
232 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
233 return -EBADMSG;
234
235 /* In both read and write mode we refuse to open files with
236 * incompatible flags we don't know */
237 #ifdef HAVE_XZ
238 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
239 return -EPROTONOSUPPORT;
240 #else
241 if (f->header->incompatible_flags != 0)
242 return -EPROTONOSUPPORT;
243 #endif
244
245 /* When open for writing we refuse to open files with
246 * compatible flags, too */
247 if (f->writable) {
248 #ifdef HAVE_GCRYPT
249 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
250 return -EPROTONOSUPPORT;
251 #else
252 if (f->header->compatible_flags != 0)
253 return -EPROTONOSUPPORT;
254 #endif
255 }
256
257 if (f->header->state >= _STATE_MAX)
258 return -EBADMSG;
259
260 /* The first addition was n_data, so check that we are at least this large */
261 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
262 return -EBADMSG;
263
264 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
265 return -EBADMSG;
266
267 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268 return -ENODATA;
269
270 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
271 return -ENODATA;
272
273 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275 !VALID64(le64toh(f->header->tail_object_offset)) ||
276 !VALID64(le64toh(f->header->entry_array_offset)))
277 return -ENODATA;
278
279 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
283 return -ENODATA;
284
285 if (f->writable) {
286 uint8_t state;
287 sd_id128_t machine_id;
288 int r;
289
290 r = sd_id128_get_machine(&machine_id);
291 if (r < 0)
292 return r;
293
294 if (!sd_id128_equal(machine_id, f->header->machine_id))
295 return -EHOSTDOWN;
296
297 state = f->header->state;
298
299 if (state == STATE_ONLINE) {
300 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
301 return -EBUSY;
302 } else if (state == STATE_ARCHIVED)
303 return -ESHUTDOWN;
304 else if (state != STATE_OFFLINE) {
305 log_debug("Journal file %s has unknown state %u.", f->path, state);
306 return -EBUSY;
307 }
308 }
309
310 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
311
312 f->seal = JOURNAL_HEADER_SEALED(f->header);
313
314 return 0;
315 }
316
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318 uint64_t old_size, new_size;
319 int r;
320
321 assert(f);
322
323 /* We assume that this file is not sparse, and we know that
324 * for sure, since we always call posix_fallocate()
325 * ourselves */
326
327 old_size =
328 le64toh(f->header->header_size) +
329 le64toh(f->header->arena_size);
330
331 new_size = PAGE_ALIGN(offset + size);
332 if (new_size < le64toh(f->header->header_size))
333 new_size = le64toh(f->header->header_size);
334
335 if (new_size <= old_size)
336 return 0;
337
338 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
339 return -E2BIG;
340
341 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342 struct statvfs svfs;
343
344 if (fstatvfs(f->fd, &svfs) >= 0) {
345 uint64_t available;
346
347 available = svfs.f_bfree * svfs.f_bsize;
348
349 if (available >= f->metrics.keep_free)
350 available -= f->metrics.keep_free;
351 else
352 available = 0;
353
354 if (new_size - old_size > available)
355 return -E2BIG;
356 }
357 }
358
359 /* Increase by larger blocks at once */
360 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
361 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
362 new_size = f->metrics.max_size;
363
364 /* Note that the glibc fallocate() fallback is very
365 inefficient, hence we try to minimize the allocation area
366 as we can. */
367 r = posix_fallocate(f->fd, old_size, new_size - old_size);
368 if (r != 0)
369 return -r;
370
371 if (fstat(f->fd, &f->last_stat) < 0)
372 return -errno;
373
374 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
375
376 return 0;
377 }
378
379 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
380 assert(f);
381 assert(ret);
382
383 if (size <= 0)
384 return -EINVAL;
385
386 /* Avoid SIGBUS on invalid accesses */
387 if (offset + size > (uint64_t) f->last_stat.st_size) {
388 /* Hmm, out of range? Let's refresh the fstat() data
389 * first, before we trust that check. */
390
391 if (fstat(f->fd, &f->last_stat) < 0 ||
392 offset + size > (uint64_t) f->last_stat.st_size)
393 return -EADDRNOTAVAIL;
394 }
395
396 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
397 }
398
399 static uint64_t minimum_header_size(Object *o) {
400
401 static const uint64_t table[] = {
402 [OBJECT_DATA] = sizeof(DataObject),
403 [OBJECT_FIELD] = sizeof(FieldObject),
404 [OBJECT_ENTRY] = sizeof(EntryObject),
405 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
406 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
407 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
408 [OBJECT_TAG] = sizeof(TagObject),
409 };
410
411 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
412 return sizeof(ObjectHeader);
413
414 return table[o->object.type];
415 }
416
417 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
418 int r;
419 void *t;
420 Object *o;
421 uint64_t s;
422
423 assert(f);
424 assert(ret);
425
426 /* Objects may only be located at multiple of 64 bit */
427 if (!VALID64(offset))
428 return -EFAULT;
429
430
431 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
432 if (r < 0)
433 return r;
434
435 o = (Object*) t;
436 s = le64toh(o->object.size);
437
438 if (s < sizeof(ObjectHeader))
439 return -EBADMSG;
440
441 if (o->object.type <= OBJECT_UNUSED)
442 return -EBADMSG;
443
444 if (s < minimum_header_size(o))
445 return -EBADMSG;
446
447 if (type > 0 && o->object.type != type)
448 return -EBADMSG;
449
450 if (s > sizeof(ObjectHeader)) {
451 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
452 if (r < 0)
453 return r;
454
455 o = (Object*) t;
456 }
457
458 *ret = o;
459 return 0;
460 }
461
462 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
463 uint64_t r;
464
465 assert(f);
466
467 r = le64toh(f->header->tail_entry_seqnum) + 1;
468
469 if (seqnum) {
470 /* If an external seqnum counter was passed, we update
471 * both the local and the external one, and set it to
472 * the maximum of both */
473
474 if (*seqnum + 1 > r)
475 r = *seqnum + 1;
476
477 *seqnum = r;
478 }
479
480 f->header->tail_entry_seqnum = htole64(r);
481
482 if (f->header->head_entry_seqnum == 0)
483 f->header->head_entry_seqnum = htole64(r);
484
485 return r;
486 }
487
488 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
489 int r;
490 uint64_t p;
491 Object *tail, *o;
492 void *t;
493
494 assert(f);
495 assert(type > 0 && type < _OBJECT_TYPE_MAX);
496 assert(size >= sizeof(ObjectHeader));
497 assert(offset);
498 assert(ret);
499
500 r = journal_file_set_online(f);
501 if (r < 0)
502 return r;
503
504 p = le64toh(f->header->tail_object_offset);
505 if (p == 0)
506 p = le64toh(f->header->header_size);
507 else {
508 r = journal_file_move_to_object(f, -1, p, &tail);
509 if (r < 0)
510 return r;
511
512 p += ALIGN64(le64toh(tail->object.size));
513 }
514
515 r = journal_file_allocate(f, p, size);
516 if (r < 0)
517 return r;
518
519 r = journal_file_move_to(f, type, false, p, size, &t);
520 if (r < 0)
521 return r;
522
523 o = (Object*) t;
524
525 zero(o->object);
526 o->object.type = type;
527 o->object.size = htole64(size);
528
529 f->header->tail_object_offset = htole64(p);
530 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
531
532 *ret = o;
533 *offset = p;
534
535 return 0;
536 }
537
538 static int journal_file_setup_data_hash_table(JournalFile *f) {
539 uint64_t s, p;
540 Object *o;
541 int r;
542
543 assert(f);
544
545 /* We estimate that we need 1 hash table entry per 768 of
546 journal file and we want to make sure we never get beyond
547 75% fill level. Calculate the hash table size for the
548 maximum file size based on these metrics. */
549
550 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
551 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
552 s = DEFAULT_DATA_HASH_TABLE_SIZE;
553
554 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
555
556 r = journal_file_append_object(f,
557 OBJECT_DATA_HASH_TABLE,
558 offsetof(Object, hash_table.items) + s,
559 &o, &p);
560 if (r < 0)
561 return r;
562
563 memzero(o->hash_table.items, s);
564
565 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
566 f->header->data_hash_table_size = htole64(s);
567
568 return 0;
569 }
570
571 static int journal_file_setup_field_hash_table(JournalFile *f) {
572 uint64_t s, p;
573 Object *o;
574 int r;
575
576 assert(f);
577
578 /* We use a fixed size hash table for the fields as this
579 * number should grow very slowly only */
580
581 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
582 r = journal_file_append_object(f,
583 OBJECT_FIELD_HASH_TABLE,
584 offsetof(Object, hash_table.items) + s,
585 &o, &p);
586 if (r < 0)
587 return r;
588
589 memzero(o->hash_table.items, s);
590
591 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
592 f->header->field_hash_table_size = htole64(s);
593
594 return 0;
595 }
596
597 static int journal_file_map_data_hash_table(JournalFile *f) {
598 uint64_t s, p;
599 void *t;
600 int r;
601
602 assert(f);
603
604 p = le64toh(f->header->data_hash_table_offset);
605 s = le64toh(f->header->data_hash_table_size);
606
607 r = journal_file_move_to(f,
608 OBJECT_DATA_HASH_TABLE,
609 true,
610 p, s,
611 &t);
612 if (r < 0)
613 return r;
614
615 f->data_hash_table = t;
616 return 0;
617 }
618
619 static int journal_file_map_field_hash_table(JournalFile *f) {
620 uint64_t s, p;
621 void *t;
622 int r;
623
624 assert(f);
625
626 p = le64toh(f->header->field_hash_table_offset);
627 s = le64toh(f->header->field_hash_table_size);
628
629 r = journal_file_move_to(f,
630 OBJECT_FIELD_HASH_TABLE,
631 true,
632 p, s,
633 &t);
634 if (r < 0)
635 return r;
636
637 f->field_hash_table = t;
638 return 0;
639 }
640
641 static int journal_file_link_field(
642 JournalFile *f,
643 Object *o,
644 uint64_t offset,
645 uint64_t hash) {
646
647 uint64_t p, h;
648 int r;
649
650 assert(f);
651 assert(o);
652 assert(offset > 0);
653
654 if (o->object.type != OBJECT_FIELD)
655 return -EINVAL;
656
657 /* This might alter the window we are looking at */
658
659 o->field.next_hash_offset = o->field.head_data_offset = 0;
660
661 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
662 p = le64toh(f->field_hash_table[h].tail_hash_offset);
663 if (p == 0)
664 f->field_hash_table[h].head_hash_offset = htole64(offset);
665 else {
666 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
667 if (r < 0)
668 return r;
669
670 o->field.next_hash_offset = htole64(offset);
671 }
672
673 f->field_hash_table[h].tail_hash_offset = htole64(offset);
674
675 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
676 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
677
678 return 0;
679 }
680
681 static int journal_file_link_data(
682 JournalFile *f,
683 Object *o,
684 uint64_t offset,
685 uint64_t hash) {
686
687 uint64_t p, h;
688 int r;
689
690 assert(f);
691 assert(o);
692 assert(offset > 0);
693
694 if (o->object.type != OBJECT_DATA)
695 return -EINVAL;
696
697 /* This might alter the window we are looking at */
698
699 o->data.next_hash_offset = o->data.next_field_offset = 0;
700 o->data.entry_offset = o->data.entry_array_offset = 0;
701 o->data.n_entries = 0;
702
703 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
704 p = le64toh(f->data_hash_table[h].tail_hash_offset);
705 if (p == 0)
706 /* Only entry in the hash table is easy */
707 f->data_hash_table[h].head_hash_offset = htole64(offset);
708 else {
709 /* Move back to the previous data object, to patch in
710 * pointer */
711
712 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
713 if (r < 0)
714 return r;
715
716 o->data.next_hash_offset = htole64(offset);
717 }
718
719 f->data_hash_table[h].tail_hash_offset = htole64(offset);
720
721 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
722 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
723
724 return 0;
725 }
726
727 int journal_file_find_field_object_with_hash(
728 JournalFile *f,
729 const void *field, uint64_t size, uint64_t hash,
730 Object **ret, uint64_t *offset) {
731
732 uint64_t p, osize, h;
733 int r;
734
735 assert(f);
736 assert(field && size > 0);
737
738 osize = offsetof(Object, field.payload) + size;
739
740 if (f->header->field_hash_table_size == 0)
741 return -EBADMSG;
742
743 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
744 p = le64toh(f->field_hash_table[h].head_hash_offset);
745
746 while (p > 0) {
747 Object *o;
748
749 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
750 if (r < 0)
751 return r;
752
753 if (le64toh(o->field.hash) == hash &&
754 le64toh(o->object.size) == osize &&
755 memcmp(o->field.payload, field, size) == 0) {
756
757 if (ret)
758 *ret = o;
759 if (offset)
760 *offset = p;
761
762 return 1;
763 }
764
765 p = le64toh(o->field.next_hash_offset);
766 }
767
768 return 0;
769 }
770
771 int journal_file_find_field_object(
772 JournalFile *f,
773 const void *field, uint64_t size,
774 Object **ret, uint64_t *offset) {
775
776 uint64_t hash;
777
778 assert(f);
779 assert(field && size > 0);
780
781 hash = hash64(field, size);
782
783 return journal_file_find_field_object_with_hash(f,
784 field, size, hash,
785 ret, offset);
786 }
787
788 int journal_file_find_data_object_with_hash(
789 JournalFile *f,
790 const void *data, uint64_t size, uint64_t hash,
791 Object **ret, uint64_t *offset) {
792
793 uint64_t p, osize, h;
794 int r;
795
796 assert(f);
797 assert(data || size == 0);
798
799 osize = offsetof(Object, data.payload) + size;
800
801 if (f->header->data_hash_table_size == 0)
802 return -EBADMSG;
803
804 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
805 p = le64toh(f->data_hash_table[h].head_hash_offset);
806
807 while (p > 0) {
808 Object *o;
809
810 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
811 if (r < 0)
812 return r;
813
814 if (le64toh(o->data.hash) != hash)
815 goto next;
816
817 if (o->object.flags & OBJECT_COMPRESSED) {
818 #ifdef HAVE_XZ
819 uint64_t l, rsize;
820
821 l = le64toh(o->object.size);
822 if (l <= offsetof(Object, data.payload))
823 return -EBADMSG;
824
825 l -= offsetof(Object, data.payload);
826
827 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
828 return -EBADMSG;
829
830 if (rsize == size &&
831 memcmp(f->compress_buffer, data, size) == 0) {
832
833 if (ret)
834 *ret = o;
835
836 if (offset)
837 *offset = p;
838
839 return 1;
840 }
841 #else
842 return -EPROTONOSUPPORT;
843 #endif
844
845 } else if (le64toh(o->object.size) == osize &&
846 memcmp(o->data.payload, data, size) == 0) {
847
848 if (ret)
849 *ret = o;
850
851 if (offset)
852 *offset = p;
853
854 return 1;
855 }
856
857 next:
858 p = le64toh(o->data.next_hash_offset);
859 }
860
861 return 0;
862 }
863
864 int journal_file_find_data_object(
865 JournalFile *f,
866 const void *data, uint64_t size,
867 Object **ret, uint64_t *offset) {
868
869 uint64_t hash;
870
871 assert(f);
872 assert(data || size == 0);
873
874 hash = hash64(data, size);
875
876 return journal_file_find_data_object_with_hash(f,
877 data, size, hash,
878 ret, offset);
879 }
880
881 static int journal_file_append_field(
882 JournalFile *f,
883 const void *field, uint64_t size,
884 Object **ret, uint64_t *offset) {
885
886 uint64_t hash, p;
887 uint64_t osize;
888 Object *o;
889 int r;
890
891 assert(f);
892 assert(field && size > 0);
893
894 hash = hash64(field, size);
895
896 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
897 if (r < 0)
898 return r;
899 else if (r > 0) {
900
901 if (ret)
902 *ret = o;
903
904 if (offset)
905 *offset = p;
906
907 return 0;
908 }
909
910 osize = offsetof(Object, field.payload) + size;
911 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
912 if (r < 0)
913 return r;
914
915 o->field.hash = htole64(hash);
916 memcpy(o->field.payload, field, size);
917
918 r = journal_file_link_field(f, o, p, hash);
919 if (r < 0)
920 return r;
921
922 /* The linking might have altered the window, so let's
923 * refresh our pointer */
924 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
925 if (r < 0)
926 return r;
927
928 #ifdef HAVE_GCRYPT
929 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
930 if (r < 0)
931 return r;
932 #endif
933
934 if (ret)
935 *ret = o;
936
937 if (offset)
938 *offset = p;
939
940 return 0;
941 }
942
943 static int journal_file_append_data(
944 JournalFile *f,
945 const void *data, uint64_t size,
946 Object **ret, uint64_t *offset) {
947
948 uint64_t hash, p;
949 uint64_t osize;
950 Object *o;
951 int r;
952 bool compressed = false;
953 const void *eq;
954
955 assert(f);
956 assert(data || size == 0);
957
958 hash = hash64(data, size);
959
960 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
961 if (r < 0)
962 return r;
963 else if (r > 0) {
964
965 if (ret)
966 *ret = o;
967
968 if (offset)
969 *offset = p;
970
971 return 0;
972 }
973
974 osize = offsetof(Object, data.payload) + size;
975 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
976 if (r < 0)
977 return r;
978
979 o->data.hash = htole64(hash);
980
981 #ifdef HAVE_XZ
982 if (f->compress &&
983 size >= COMPRESSION_SIZE_THRESHOLD) {
984 uint64_t rsize;
985
986 compressed = compress_blob(data, size, o->data.payload, &rsize);
987
988 if (compressed) {
989 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
990 o->object.flags |= OBJECT_COMPRESSED;
991
992 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
993 }
994 }
995 #endif
996
997 if (!compressed && size > 0)
998 memcpy(o->data.payload, data, size);
999
1000 r = journal_file_link_data(f, o, p, hash);
1001 if (r < 0)
1002 return r;
1003
1004 /* The linking might have altered the window, so let's
1005 * refresh our pointer */
1006 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1007 if (r < 0)
1008 return r;
1009
1010 if (!data)
1011 eq = NULL;
1012 else
1013 eq = memchr(data, '=', size);
1014 if (eq && eq > data) {
1015 Object *fo = NULL;
1016 uint64_t fp;
1017
1018 /* Create field object ... */
1019 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1020 if (r < 0)
1021 return r;
1022
1023 /* ... and link it in. */
1024 o->data.next_field_offset = fo->field.head_data_offset;
1025 fo->field.head_data_offset = le64toh(p);
1026 }
1027
1028 #ifdef HAVE_GCRYPT
1029 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1030 if (r < 0)
1031 return r;
1032 #endif
1033
1034 if (ret)
1035 *ret = o;
1036
1037 if (offset)
1038 *offset = p;
1039
1040 return 0;
1041 }
1042
1043 uint64_t journal_file_entry_n_items(Object *o) {
1044 assert(o);
1045
1046 if (o->object.type != OBJECT_ENTRY)
1047 return 0;
1048
1049 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1050 }
1051
1052 uint64_t journal_file_entry_array_n_items(Object *o) {
1053 assert(o);
1054
1055 if (o->object.type != OBJECT_ENTRY_ARRAY)
1056 return 0;
1057
1058 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1059 }
1060
1061 uint64_t journal_file_hash_table_n_items(Object *o) {
1062 assert(o);
1063
1064 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1065 o->object.type != OBJECT_FIELD_HASH_TABLE)
1066 return 0;
1067
1068 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1069 }
1070
1071 static int link_entry_into_array(JournalFile *f,
1072 le64_t *first,
1073 le64_t *idx,
1074 uint64_t p) {
1075 int r;
1076 uint64_t n = 0, ap = 0, q, i, a, hidx;
1077 Object *o;
1078
1079 assert(f);
1080 assert(first);
1081 assert(idx);
1082 assert(p > 0);
1083
1084 a = le64toh(*first);
1085 i = hidx = le64toh(*idx);
1086 while (a > 0) {
1087
1088 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1089 if (r < 0)
1090 return r;
1091
1092 n = journal_file_entry_array_n_items(o);
1093 if (i < n) {
1094 o->entry_array.items[i] = htole64(p);
1095 *idx = htole64(hidx + 1);
1096 return 0;
1097 }
1098
1099 i -= n;
1100 ap = a;
1101 a = le64toh(o->entry_array.next_entry_array_offset);
1102 }
1103
1104 if (hidx > n)
1105 n = (hidx+1) * 2;
1106 else
1107 n = n * 2;
1108
1109 if (n < 4)
1110 n = 4;
1111
1112 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1113 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1114 &o, &q);
1115 if (r < 0)
1116 return r;
1117
1118 #ifdef HAVE_GCRYPT
1119 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1120 if (r < 0)
1121 return r;
1122 #endif
1123
1124 o->entry_array.items[i] = htole64(p);
1125
1126 if (ap == 0)
1127 *first = htole64(q);
1128 else {
1129 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1130 if (r < 0)
1131 return r;
1132
1133 o->entry_array.next_entry_array_offset = htole64(q);
1134 }
1135
1136 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1137 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1138
1139 *idx = htole64(hidx + 1);
1140
1141 return 0;
1142 }
1143
1144 static int link_entry_into_array_plus_one(JournalFile *f,
1145 le64_t *extra,
1146 le64_t *first,
1147 le64_t *idx,
1148 uint64_t p) {
1149
1150 int r;
1151
1152 assert(f);
1153 assert(extra);
1154 assert(first);
1155 assert(idx);
1156 assert(p > 0);
1157
1158 if (*idx == 0)
1159 *extra = htole64(p);
1160 else {
1161 le64_t i;
1162
1163 i = htole64(le64toh(*idx) - 1);
1164 r = link_entry_into_array(f, first, &i, p);
1165 if (r < 0)
1166 return r;
1167 }
1168
1169 *idx = htole64(le64toh(*idx) + 1);
1170 return 0;
1171 }
1172
1173 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1174 uint64_t p;
1175 int r;
1176 assert(f);
1177 assert(o);
1178 assert(offset > 0);
1179
1180 p = le64toh(o->entry.items[i].object_offset);
1181 if (p == 0)
1182 return -EINVAL;
1183
1184 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1185 if (r < 0)
1186 return r;
1187
1188 return link_entry_into_array_plus_one(f,
1189 &o->data.entry_offset,
1190 &o->data.entry_array_offset,
1191 &o->data.n_entries,
1192 offset);
1193 }
1194
1195 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1196 uint64_t n, i;
1197 int r;
1198
1199 assert(f);
1200 assert(o);
1201 assert(offset > 0);
1202
1203 if (o->object.type != OBJECT_ENTRY)
1204 return -EINVAL;
1205
1206 __sync_synchronize();
1207
1208 /* Link up the entry itself */
1209 r = link_entry_into_array(f,
1210 &f->header->entry_array_offset,
1211 &f->header->n_entries,
1212 offset);
1213 if (r < 0)
1214 return r;
1215
1216 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1217
1218 if (f->header->head_entry_realtime == 0)
1219 f->header->head_entry_realtime = o->entry.realtime;
1220
1221 f->header->tail_entry_realtime = o->entry.realtime;
1222 f->header->tail_entry_monotonic = o->entry.monotonic;
1223
1224 f->tail_entry_monotonic_valid = true;
1225
1226 /* Link up the items */
1227 n = journal_file_entry_n_items(o);
1228 for (i = 0; i < n; i++) {
1229 r = journal_file_link_entry_item(f, o, offset, i);
1230 if (r < 0)
1231 return r;
1232 }
1233
1234 return 0;
1235 }
1236
1237 static int journal_file_append_entry_internal(
1238 JournalFile *f,
1239 const dual_timestamp *ts,
1240 uint64_t xor_hash,
1241 const EntryItem items[], unsigned n_items,
1242 uint64_t *seqnum,
1243 Object **ret, uint64_t *offset) {
1244 uint64_t np;
1245 uint64_t osize;
1246 Object *o;
1247 int r;
1248
1249 assert(f);
1250 assert(items || n_items == 0);
1251 assert(ts);
1252
1253 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1254
1255 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1256 if (r < 0)
1257 return r;
1258
1259 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1260 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1261 o->entry.realtime = htole64(ts->realtime);
1262 o->entry.monotonic = htole64(ts->monotonic);
1263 o->entry.xor_hash = htole64(xor_hash);
1264 o->entry.boot_id = f->header->boot_id;
1265
1266 #ifdef HAVE_GCRYPT
1267 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1268 if (r < 0)
1269 return r;
1270 #endif
1271
1272 r = journal_file_link_entry(f, o, np);
1273 if (r < 0)
1274 return r;
1275
1276 if (ret)
1277 *ret = o;
1278
1279 if (offset)
1280 *offset = np;
1281
1282 return 0;
1283 }
1284
1285 void journal_file_post_change(JournalFile *f) {
1286 assert(f);
1287
1288 /* inotify() does not receive IN_MODIFY events from file
1289 * accesses done via mmap(). After each access we hence
1290 * trigger IN_MODIFY by truncating the journal file to its
1291 * current size which triggers IN_MODIFY. */
1292
1293 __sync_synchronize();
1294
1295 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1296 log_error("Failed to truncate file to its own size: %m");
1297 }
1298
1299 static int entry_item_cmp(const void *_a, const void *_b) {
1300 const EntryItem *a = _a, *b = _b;
1301
1302 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1303 return -1;
1304 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1305 return 1;
1306 return 0;
1307 }
1308
1309 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1310 unsigned i;
1311 EntryItem *items;
1312 int r;
1313 uint64_t xor_hash = 0;
1314 struct dual_timestamp _ts;
1315
1316 assert(f);
1317 assert(iovec || n_iovec == 0);
1318
1319 if (!ts) {
1320 dual_timestamp_get(&_ts);
1321 ts = &_ts;
1322 }
1323
1324 if (f->tail_entry_monotonic_valid &&
1325 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1326 return -EINVAL;
1327
1328 #ifdef HAVE_GCRYPT
1329 r = journal_file_maybe_append_tag(f, ts->realtime);
1330 if (r < 0)
1331 return r;
1332 #endif
1333
1334 /* alloca() can't take 0, hence let's allocate at least one */
1335 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1336
1337 for (i = 0; i < n_iovec; i++) {
1338 uint64_t p;
1339 Object *o;
1340
1341 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1342 if (r < 0)
1343 return r;
1344
1345 xor_hash ^= le64toh(o->data.hash);
1346 items[i].object_offset = htole64(p);
1347 items[i].hash = o->data.hash;
1348 }
1349
1350 /* Order by the position on disk, in order to improve seek
1351 * times for rotating media. */
1352 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1353
1354 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1355
1356 journal_file_post_change(f);
1357
1358 return r;
1359 }
1360
1361 typedef struct ChainCacheItem {
1362 uint64_t first; /* the array at the beginning of the chain */
1363 uint64_t array; /* the cached array */
1364 uint64_t begin; /* the first item in the cached array */
1365 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1366 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1367 } ChainCacheItem;
1368
1369 static void chain_cache_put(
1370 Hashmap *h,
1371 ChainCacheItem *ci,
1372 uint64_t first,
1373 uint64_t array,
1374 uint64_t begin,
1375 uint64_t total,
1376 uint64_t last_index) {
1377
1378 if (!ci) {
1379 /* If the chain item to cache for this chain is the
1380 * first one it's not worth caching anything */
1381 if (array == first)
1382 return;
1383
1384 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1385 ci = hashmap_steal_first(h);
1386 else {
1387 ci = new(ChainCacheItem, 1);
1388 if (!ci)
1389 return;
1390 }
1391
1392 ci->first = first;
1393
1394 if (hashmap_put(h, &ci->first, ci) < 0) {
1395 free(ci);
1396 return;
1397 }
1398 } else
1399 assert(ci->first == first);
1400
1401 ci->array = array;
1402 ci->begin = begin;
1403 ci->total = total;
1404 ci->last_index = last_index;
1405 }
1406
1407 static int generic_array_get(
1408 JournalFile *f,
1409 uint64_t first,
1410 uint64_t i,
1411 Object **ret, uint64_t *offset) {
1412
1413 Object *o;
1414 uint64_t p = 0, a, t = 0;
1415 int r;
1416 ChainCacheItem *ci;
1417
1418 assert(f);
1419
1420 a = first;
1421
1422 /* Try the chain cache first */
1423 ci = hashmap_get(f->chain_cache, &first);
1424 if (ci && i > ci->total) {
1425 a = ci->array;
1426 i -= ci->total;
1427 t = ci->total;
1428 }
1429
1430 while (a > 0) {
1431 uint64_t k;
1432
1433 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1434 if (r < 0)
1435 return r;
1436
1437 k = journal_file_entry_array_n_items(o);
1438 if (i < k) {
1439 p = le64toh(o->entry_array.items[i]);
1440 goto found;
1441 }
1442
1443 i -= k;
1444 t += k;
1445 a = le64toh(o->entry_array.next_entry_array_offset);
1446 }
1447
1448 return 0;
1449
1450 found:
1451 /* Let's cache this item for the next invocation */
1452 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1453
1454 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1455 if (r < 0)
1456 return r;
1457
1458 if (ret)
1459 *ret = o;
1460
1461 if (offset)
1462 *offset = p;
1463
1464 return 1;
1465 }
1466
1467 static int generic_array_get_plus_one(
1468 JournalFile *f,
1469 uint64_t extra,
1470 uint64_t first,
1471 uint64_t i,
1472 Object **ret, uint64_t *offset) {
1473
1474 Object *o;
1475
1476 assert(f);
1477
1478 if (i == 0) {
1479 int r;
1480
1481 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1482 if (r < 0)
1483 return r;
1484
1485 if (ret)
1486 *ret = o;
1487
1488 if (offset)
1489 *offset = extra;
1490
1491 return 1;
1492 }
1493
1494 return generic_array_get(f, first, i-1, ret, offset);
1495 }
1496
1497 enum {
1498 TEST_FOUND,
1499 TEST_LEFT,
1500 TEST_RIGHT
1501 };
1502
1503 static int generic_array_bisect(
1504 JournalFile *f,
1505 uint64_t first,
1506 uint64_t n,
1507 uint64_t needle,
1508 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1509 direction_t direction,
1510 Object **ret,
1511 uint64_t *offset,
1512 uint64_t *idx) {
1513
1514 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1515 bool subtract_one = false;
1516 Object *o, *array = NULL;
1517 int r;
1518 ChainCacheItem *ci;
1519
1520 assert(f);
1521 assert(test_object);
1522
1523 /* Start with the first array in the chain */
1524 a = first;
1525
1526 ci = hashmap_get(f->chain_cache, &first);
1527 if (ci && n > ci->total) {
1528 /* Ah, we have iterated this bisection array chain
1529 * previously! Let's see if we can skip ahead in the
1530 * chain, as far as the last time. But we can't jump
1531 * backwards in the chain, so let's check that
1532 * first. */
1533
1534 r = test_object(f, ci->begin, needle);
1535 if (r < 0)
1536 return r;
1537
1538 if (r == TEST_LEFT) {
1539 /* OK, what we are looking for is right of the
1540 * begin of this EntryArray, so let's jump
1541 * straight to previously cached array in the
1542 * chain */
1543
1544 a = ci->array;
1545 n -= ci->total;
1546 t = ci->total;
1547 last_index = ci->last_index;
1548 }
1549 }
1550
1551 while (a > 0) {
1552 uint64_t left, right, k, lp;
1553
1554 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1555 if (r < 0)
1556 return r;
1557
1558 k = journal_file_entry_array_n_items(array);
1559 right = MIN(k, n);
1560 if (right <= 0)
1561 return 0;
1562
1563 i = right - 1;
1564 lp = p = le64toh(array->entry_array.items[i]);
1565 if (p <= 0)
1566 return -EBADMSG;
1567
1568 r = test_object(f, p, needle);
1569 if (r < 0)
1570 return r;
1571
1572 if (r == TEST_FOUND)
1573 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1574
1575 if (r == TEST_RIGHT) {
1576 left = 0;
1577 right -= 1;
1578
1579 if (last_index != (uint64_t) -1) {
1580 assert(last_index <= right);
1581
1582 /* If we cached the last index we
1583 * looked at, let's try to not to jump
1584 * too wildly around and see if we can
1585 * limit the range to look at early to
1586 * the immediate neighbors of the last
1587 * index we looked at. */
1588
1589 if (last_index > 0) {
1590 uint64_t x = last_index - 1;
1591
1592 p = le64toh(array->entry_array.items[x]);
1593 if (p <= 0)
1594 return -EBADMSG;
1595
1596 r = test_object(f, p, needle);
1597 if (r < 0)
1598 return r;
1599
1600 if (r == TEST_FOUND)
1601 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1602
1603 if (r == TEST_RIGHT)
1604 right = x;
1605 else
1606 left = x + 1;
1607 }
1608
1609 if (last_index < right) {
1610 uint64_t y = last_index + 1;
1611
1612 p = le64toh(array->entry_array.items[y]);
1613 if (p <= 0)
1614 return -EBADMSG;
1615
1616 r = test_object(f, p, needle);
1617 if (r < 0)
1618 return r;
1619
1620 if (r == TEST_FOUND)
1621 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1622
1623 if (r == TEST_RIGHT)
1624 right = y;
1625 else
1626 left = y + 1;
1627 }
1628 }
1629
1630 for (;;) {
1631 if (left == right) {
1632 if (direction == DIRECTION_UP)
1633 subtract_one = true;
1634
1635 i = left;
1636 goto found;
1637 }
1638
1639 assert(left < right);
1640 i = (left + right) / 2;
1641
1642 p = le64toh(array->entry_array.items[i]);
1643 if (p <= 0)
1644 return -EBADMSG;
1645
1646 r = test_object(f, p, needle);
1647 if (r < 0)
1648 return r;
1649
1650 if (r == TEST_FOUND)
1651 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1652
1653 if (r == TEST_RIGHT)
1654 right = i;
1655 else
1656 left = i + 1;
1657 }
1658 }
1659
1660 if (k > n) {
1661 if (direction == DIRECTION_UP) {
1662 i = n;
1663 subtract_one = true;
1664 goto found;
1665 }
1666
1667 return 0;
1668 }
1669
1670 last_p = lp;
1671
1672 n -= k;
1673 t += k;
1674 last_index = (uint64_t) -1;
1675 a = le64toh(array->entry_array.next_entry_array_offset);
1676 }
1677
1678 return 0;
1679
1680 found:
1681 if (subtract_one && t == 0 && i == 0)
1682 return 0;
1683
1684 /* Let's cache this item for the next invocation */
1685 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1686
1687 if (subtract_one && i == 0)
1688 p = last_p;
1689 else if (subtract_one)
1690 p = le64toh(array->entry_array.items[i-1]);
1691 else
1692 p = le64toh(array->entry_array.items[i]);
1693
1694 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1695 if (r < 0)
1696 return r;
1697
1698 if (ret)
1699 *ret = o;
1700
1701 if (offset)
1702 *offset = p;
1703
1704 if (idx)
1705 *idx = t + i + (subtract_one ? -1 : 0);
1706
1707 return 1;
1708 }
1709
1710
1711 static int generic_array_bisect_plus_one(
1712 JournalFile *f,
1713 uint64_t extra,
1714 uint64_t first,
1715 uint64_t n,
1716 uint64_t needle,
1717 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1718 direction_t direction,
1719 Object **ret,
1720 uint64_t *offset,
1721 uint64_t *idx) {
1722
1723 int r;
1724 bool step_back = false;
1725 Object *o;
1726
1727 assert(f);
1728 assert(test_object);
1729
1730 if (n <= 0)
1731 return 0;
1732
1733 /* This bisects the array in object 'first', but first checks
1734 * an extra */
1735 r = test_object(f, extra, needle);
1736 if (r < 0)
1737 return r;
1738
1739 if (r == TEST_FOUND)
1740 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1741
1742 /* if we are looking with DIRECTION_UP then we need to first
1743 see if in the actual array there is a matching entry, and
1744 return the last one of that. But if there isn't any we need
1745 to return this one. Hence remember this, and return it
1746 below. */
1747 if (r == TEST_LEFT)
1748 step_back = direction == DIRECTION_UP;
1749
1750 if (r == TEST_RIGHT) {
1751 if (direction == DIRECTION_DOWN)
1752 goto found;
1753 else
1754 return 0;
1755 }
1756
1757 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1758
1759 if (r == 0 && step_back)
1760 goto found;
1761
1762 if (r > 0 && idx)
1763 (*idx) ++;
1764
1765 return r;
1766
1767 found:
1768 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1769 if (r < 0)
1770 return r;
1771
1772 if (ret)
1773 *ret = o;
1774
1775 if (offset)
1776 *offset = extra;
1777
1778 if (idx)
1779 *idx = 0;
1780
1781 return 1;
1782 }
1783
1784 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1785 assert(f);
1786 assert(p > 0);
1787
1788 if (p == needle)
1789 return TEST_FOUND;
1790 else if (p < needle)
1791 return TEST_LEFT;
1792 else
1793 return TEST_RIGHT;
1794 }
1795
1796 int journal_file_move_to_entry_by_offset(
1797 JournalFile *f,
1798 uint64_t p,
1799 direction_t direction,
1800 Object **ret,
1801 uint64_t *offset) {
1802
1803 return generic_array_bisect(f,
1804 le64toh(f->header->entry_array_offset),
1805 le64toh(f->header->n_entries),
1806 p,
1807 test_object_offset,
1808 direction,
1809 ret, offset, NULL);
1810 }
1811
1812
1813 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1814 Object *o;
1815 int r;
1816
1817 assert(f);
1818 assert(p > 0);
1819
1820 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1821 if (r < 0)
1822 return r;
1823
1824 if (le64toh(o->entry.seqnum) == needle)
1825 return TEST_FOUND;
1826 else if (le64toh(o->entry.seqnum) < needle)
1827 return TEST_LEFT;
1828 else
1829 return TEST_RIGHT;
1830 }
1831
1832 int journal_file_move_to_entry_by_seqnum(
1833 JournalFile *f,
1834 uint64_t seqnum,
1835 direction_t direction,
1836 Object **ret,
1837 uint64_t *offset) {
1838
1839 return generic_array_bisect(f,
1840 le64toh(f->header->entry_array_offset),
1841 le64toh(f->header->n_entries),
1842 seqnum,
1843 test_object_seqnum,
1844 direction,
1845 ret, offset, NULL);
1846 }
1847
1848 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1849 Object *o;
1850 int r;
1851
1852 assert(f);
1853 assert(p > 0);
1854
1855 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1856 if (r < 0)
1857 return r;
1858
1859 if (le64toh(o->entry.realtime) == needle)
1860 return TEST_FOUND;
1861 else if (le64toh(o->entry.realtime) < needle)
1862 return TEST_LEFT;
1863 else
1864 return TEST_RIGHT;
1865 }
1866
1867 int journal_file_move_to_entry_by_realtime(
1868 JournalFile *f,
1869 uint64_t realtime,
1870 direction_t direction,
1871 Object **ret,
1872 uint64_t *offset) {
1873
1874 return generic_array_bisect(f,
1875 le64toh(f->header->entry_array_offset),
1876 le64toh(f->header->n_entries),
1877 realtime,
1878 test_object_realtime,
1879 direction,
1880 ret, offset, NULL);
1881 }
1882
1883 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1884 Object *o;
1885 int r;
1886
1887 assert(f);
1888 assert(p > 0);
1889
1890 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1891 if (r < 0)
1892 return r;
1893
1894 if (le64toh(o->entry.monotonic) == needle)
1895 return TEST_FOUND;
1896 else if (le64toh(o->entry.monotonic) < needle)
1897 return TEST_LEFT;
1898 else
1899 return TEST_RIGHT;
1900 }
1901
1902 static inline int find_data_object_by_boot_id(
1903 JournalFile *f,
1904 sd_id128_t boot_id,
1905 Object **o,
1906 uint64_t *b) {
1907 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1908
1909 sd_id128_to_string(boot_id, t + 9);
1910 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1911 }
1912
1913 int journal_file_move_to_entry_by_monotonic(
1914 JournalFile *f,
1915 sd_id128_t boot_id,
1916 uint64_t monotonic,
1917 direction_t direction,
1918 Object **ret,
1919 uint64_t *offset) {
1920
1921 Object *o;
1922 int r;
1923
1924 assert(f);
1925
1926 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1927 if (r < 0)
1928 return r;
1929 if (r == 0)
1930 return -ENOENT;
1931
1932 return generic_array_bisect_plus_one(f,
1933 le64toh(o->data.entry_offset),
1934 le64toh(o->data.entry_array_offset),
1935 le64toh(o->data.n_entries),
1936 monotonic,
1937 test_object_monotonic,
1938 direction,
1939 ret, offset, NULL);
1940 }
1941
1942 int journal_file_next_entry(
1943 JournalFile *f,
1944 Object *o, uint64_t p,
1945 direction_t direction,
1946 Object **ret, uint64_t *offset) {
1947
1948 uint64_t i, n, ofs;
1949 int r;
1950
1951 assert(f);
1952 assert(p > 0 || !o);
1953
1954 n = le64toh(f->header->n_entries);
1955 if (n <= 0)
1956 return 0;
1957
1958 if (!o)
1959 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1960 else {
1961 if (o->object.type != OBJECT_ENTRY)
1962 return -EINVAL;
1963
1964 r = generic_array_bisect(f,
1965 le64toh(f->header->entry_array_offset),
1966 le64toh(f->header->n_entries),
1967 p,
1968 test_object_offset,
1969 DIRECTION_DOWN,
1970 NULL, NULL,
1971 &i);
1972 if (r <= 0)
1973 return r;
1974
1975 if (direction == DIRECTION_DOWN) {
1976 if (i >= n - 1)
1977 return 0;
1978
1979 i++;
1980 } else {
1981 if (i <= 0)
1982 return 0;
1983
1984 i--;
1985 }
1986 }
1987
1988 /* And jump to it */
1989 r = generic_array_get(f,
1990 le64toh(f->header->entry_array_offset),
1991 i,
1992 ret, &ofs);
1993 if (r <= 0)
1994 return r;
1995
1996 if (p > 0 &&
1997 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1998 log_debug("%s: entry array corrupted at entry %"PRIu64,
1999 f->path, i);
2000 return -EBADMSG;
2001 }
2002
2003 if (offset)
2004 *offset = ofs;
2005
2006 return 1;
2007 }
2008
2009 int journal_file_skip_entry(
2010 JournalFile *f,
2011 Object *o, uint64_t p,
2012 int64_t skip,
2013 Object **ret, uint64_t *offset) {
2014
2015 uint64_t i, n;
2016 int r;
2017
2018 assert(f);
2019 assert(o);
2020 assert(p > 0);
2021
2022 if (o->object.type != OBJECT_ENTRY)
2023 return -EINVAL;
2024
2025 r = generic_array_bisect(f,
2026 le64toh(f->header->entry_array_offset),
2027 le64toh(f->header->n_entries),
2028 p,
2029 test_object_offset,
2030 DIRECTION_DOWN,
2031 NULL, NULL,
2032 &i);
2033 if (r <= 0)
2034 return r;
2035
2036 /* Calculate new index */
2037 if (skip < 0) {
2038 if ((uint64_t) -skip >= i)
2039 i = 0;
2040 else
2041 i = i - (uint64_t) -skip;
2042 } else
2043 i += (uint64_t) skip;
2044
2045 n = le64toh(f->header->n_entries);
2046 if (n <= 0)
2047 return -EBADMSG;
2048
2049 if (i >= n)
2050 i = n-1;
2051
2052 return generic_array_get(f,
2053 le64toh(f->header->entry_array_offset),
2054 i,
2055 ret, offset);
2056 }
2057
2058 int journal_file_next_entry_for_data(
2059 JournalFile *f,
2060 Object *o, uint64_t p,
2061 uint64_t data_offset,
2062 direction_t direction,
2063 Object **ret, uint64_t *offset) {
2064
2065 uint64_t n, i;
2066 int r;
2067 Object *d;
2068
2069 assert(f);
2070 assert(p > 0 || !o);
2071
2072 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2073 if (r < 0)
2074 return r;
2075
2076 n = le64toh(d->data.n_entries);
2077 if (n <= 0)
2078 return n;
2079
2080 if (!o)
2081 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2082 else {
2083 if (o->object.type != OBJECT_ENTRY)
2084 return -EINVAL;
2085
2086 r = generic_array_bisect_plus_one(f,
2087 le64toh(d->data.entry_offset),
2088 le64toh(d->data.entry_array_offset),
2089 le64toh(d->data.n_entries),
2090 p,
2091 test_object_offset,
2092 DIRECTION_DOWN,
2093 NULL, NULL,
2094 &i);
2095
2096 if (r <= 0)
2097 return r;
2098
2099 if (direction == DIRECTION_DOWN) {
2100 if (i >= n - 1)
2101 return 0;
2102
2103 i++;
2104 } else {
2105 if (i <= 0)
2106 return 0;
2107
2108 i--;
2109 }
2110
2111 }
2112
2113 return generic_array_get_plus_one(f,
2114 le64toh(d->data.entry_offset),
2115 le64toh(d->data.entry_array_offset),
2116 i,
2117 ret, offset);
2118 }
2119
2120 int journal_file_move_to_entry_by_offset_for_data(
2121 JournalFile *f,
2122 uint64_t data_offset,
2123 uint64_t p,
2124 direction_t direction,
2125 Object **ret, uint64_t *offset) {
2126
2127 int r;
2128 Object *d;
2129
2130 assert(f);
2131
2132 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2133 if (r < 0)
2134 return r;
2135
2136 return generic_array_bisect_plus_one(f,
2137 le64toh(d->data.entry_offset),
2138 le64toh(d->data.entry_array_offset),
2139 le64toh(d->data.n_entries),
2140 p,
2141 test_object_offset,
2142 direction,
2143 ret, offset, NULL);
2144 }
2145
2146 int journal_file_move_to_entry_by_monotonic_for_data(
2147 JournalFile *f,
2148 uint64_t data_offset,
2149 sd_id128_t boot_id,
2150 uint64_t monotonic,
2151 direction_t direction,
2152 Object **ret, uint64_t *offset) {
2153
2154 Object *o, *d;
2155 int r;
2156 uint64_t b, z;
2157
2158 assert(f);
2159
2160 /* First, seek by time */
2161 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2162 if (r < 0)
2163 return r;
2164 if (r == 0)
2165 return -ENOENT;
2166
2167 r = generic_array_bisect_plus_one(f,
2168 le64toh(o->data.entry_offset),
2169 le64toh(o->data.entry_array_offset),
2170 le64toh(o->data.n_entries),
2171 monotonic,
2172 test_object_monotonic,
2173 direction,
2174 NULL, &z, NULL);
2175 if (r <= 0)
2176 return r;
2177
2178 /* And now, continue seeking until we find an entry that
2179 * exists in both bisection arrays */
2180
2181 for (;;) {
2182 Object *qo;
2183 uint64_t p, q;
2184
2185 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2186 if (r < 0)
2187 return r;
2188
2189 r = generic_array_bisect_plus_one(f,
2190 le64toh(d->data.entry_offset),
2191 le64toh(d->data.entry_array_offset),
2192 le64toh(d->data.n_entries),
2193 z,
2194 test_object_offset,
2195 direction,
2196 NULL, &p, NULL);
2197 if (r <= 0)
2198 return r;
2199
2200 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2201 if (r < 0)
2202 return r;
2203
2204 r = generic_array_bisect_plus_one(f,
2205 le64toh(o->data.entry_offset),
2206 le64toh(o->data.entry_array_offset),
2207 le64toh(o->data.n_entries),
2208 p,
2209 test_object_offset,
2210 direction,
2211 &qo, &q, NULL);
2212
2213 if (r <= 0)
2214 return r;
2215
2216 if (p == q) {
2217 if (ret)
2218 *ret = qo;
2219 if (offset)
2220 *offset = q;
2221
2222 return 1;
2223 }
2224
2225 z = q;
2226 }
2227 }
2228
2229 int journal_file_move_to_entry_by_seqnum_for_data(
2230 JournalFile *f,
2231 uint64_t data_offset,
2232 uint64_t seqnum,
2233 direction_t direction,
2234 Object **ret, uint64_t *offset) {
2235
2236 Object *d;
2237 int r;
2238
2239 assert(f);
2240
2241 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2242 if (r < 0)
2243 return r;
2244
2245 return generic_array_bisect_plus_one(f,
2246 le64toh(d->data.entry_offset),
2247 le64toh(d->data.entry_array_offset),
2248 le64toh(d->data.n_entries),
2249 seqnum,
2250 test_object_seqnum,
2251 direction,
2252 ret, offset, NULL);
2253 }
2254
2255 int journal_file_move_to_entry_by_realtime_for_data(
2256 JournalFile *f,
2257 uint64_t data_offset,
2258 uint64_t realtime,
2259 direction_t direction,
2260 Object **ret, uint64_t *offset) {
2261
2262 Object *d;
2263 int r;
2264
2265 assert(f);
2266
2267 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2268 if (r < 0)
2269 return r;
2270
2271 return generic_array_bisect_plus_one(f,
2272 le64toh(d->data.entry_offset),
2273 le64toh(d->data.entry_array_offset),
2274 le64toh(d->data.n_entries),
2275 realtime,
2276 test_object_realtime,
2277 direction,
2278 ret, offset, NULL);
2279 }
2280
2281 void journal_file_dump(JournalFile *f) {
2282 Object *o;
2283 int r;
2284 uint64_t p;
2285
2286 assert(f);
2287
2288 journal_file_print_header(f);
2289
2290 p = le64toh(f->header->header_size);
2291 while (p != 0) {
2292 r = journal_file_move_to_object(f, -1, p, &o);
2293 if (r < 0)
2294 goto fail;
2295
2296 switch (o->object.type) {
2297
2298 case OBJECT_UNUSED:
2299 printf("Type: OBJECT_UNUSED\n");
2300 break;
2301
2302 case OBJECT_DATA:
2303 printf("Type: OBJECT_DATA\n");
2304 break;
2305
2306 case OBJECT_FIELD:
2307 printf("Type: OBJECT_FIELD\n");
2308 break;
2309
2310 case OBJECT_ENTRY:
2311 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2312 le64toh(o->entry.seqnum),
2313 le64toh(o->entry.monotonic),
2314 le64toh(o->entry.realtime));
2315 break;
2316
2317 case OBJECT_FIELD_HASH_TABLE:
2318 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2319 break;
2320
2321 case OBJECT_DATA_HASH_TABLE:
2322 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2323 break;
2324
2325 case OBJECT_ENTRY_ARRAY:
2326 printf("Type: OBJECT_ENTRY_ARRAY\n");
2327 break;
2328
2329 case OBJECT_TAG:
2330 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2331 le64toh(o->tag.seqnum),
2332 le64toh(o->tag.epoch));
2333 break;
2334
2335 default:
2336 printf("Type: unknown (%u)\n", o->object.type);
2337 break;
2338 }
2339
2340 if (o->object.flags & OBJECT_COMPRESSED)
2341 printf("Flags: COMPRESSED\n");
2342
2343 if (p == le64toh(f->header->tail_object_offset))
2344 p = 0;
2345 else
2346 p = p + ALIGN64(le64toh(o->object.size));
2347 }
2348
2349 return;
2350 fail:
2351 log_error("File corrupt");
2352 }
2353
2354 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2355 const char *x;
2356
2357 x = format_timestamp(buf, l, t);
2358 if (x)
2359 return x;
2360 return " --- ";
2361 }
2362
2363 void journal_file_print_header(JournalFile *f) {
2364 char a[33], b[33], c[33], d[33];
2365 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2366 struct stat st;
2367 char bytes[FORMAT_BYTES_MAX];
2368
2369 assert(f);
2370
2371 printf("File Path: %s\n"
2372 "File ID: %s\n"
2373 "Machine ID: %s\n"
2374 "Boot ID: %s\n"
2375 "Sequential Number ID: %s\n"
2376 "State: %s\n"
2377 "Compatible Flags:%s%s\n"
2378 "Incompatible Flags:%s%s\n"
2379 "Header size: %"PRIu64"\n"
2380 "Arena size: %"PRIu64"\n"
2381 "Data Hash Table Size: %"PRIu64"\n"
2382 "Field Hash Table Size: %"PRIu64"\n"
2383 "Rotate Suggested: %s\n"
2384 "Head Sequential Number: %"PRIu64"\n"
2385 "Tail Sequential Number: %"PRIu64"\n"
2386 "Head Realtime Timestamp: %s\n"
2387 "Tail Realtime Timestamp: %s\n"
2388 "Tail Monotonic Timestamp: %s\n"
2389 "Objects: %"PRIu64"\n"
2390 "Entry Objects: %"PRIu64"\n",
2391 f->path,
2392 sd_id128_to_string(f->header->file_id, a),
2393 sd_id128_to_string(f->header->machine_id, b),
2394 sd_id128_to_string(f->header->boot_id, c),
2395 sd_id128_to_string(f->header->seqnum_id, d),
2396 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2397 f->header->state == STATE_ONLINE ? "ONLINE" :
2398 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2399 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2400 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2401 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2402 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2403 le64toh(f->header->header_size),
2404 le64toh(f->header->arena_size),
2405 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2406 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2407 yes_no(journal_file_rotate_suggested(f, 0)),
2408 le64toh(f->header->head_entry_seqnum),
2409 le64toh(f->header->tail_entry_seqnum),
2410 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2411 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2412 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2413 le64toh(f->header->n_objects),
2414 le64toh(f->header->n_entries));
2415
2416 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2417 printf("Data Objects: %"PRIu64"\n"
2418 "Data Hash Table Fill: %.1f%%\n",
2419 le64toh(f->header->n_data),
2420 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2421
2422 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2423 printf("Field Objects: %"PRIu64"\n"
2424 "Field Hash Table Fill: %.1f%%\n",
2425 le64toh(f->header->n_fields),
2426 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2427
2428 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2429 printf("Tag Objects: %"PRIu64"\n",
2430 le64toh(f->header->n_tags));
2431 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2432 printf("Entry Array Objects: %"PRIu64"\n",
2433 le64toh(f->header->n_entry_arrays));
2434
2435 if (fstat(f->fd, &st) >= 0)
2436 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2437 }
2438
2439 int journal_file_open(
2440 const char *fname,
2441 int flags,
2442 mode_t mode,
2443 bool compress,
2444 bool seal,
2445 JournalMetrics *metrics,
2446 MMapCache *mmap_cache,
2447 JournalFile *template,
2448 JournalFile **ret) {
2449
2450 JournalFile *f;
2451 int r;
2452 bool newly_created = false;
2453
2454 assert(fname);
2455 assert(ret);
2456
2457 if ((flags & O_ACCMODE) != O_RDONLY &&
2458 (flags & O_ACCMODE) != O_RDWR)
2459 return -EINVAL;
2460
2461 if (!endswith(fname, ".journal") &&
2462 !endswith(fname, ".journal~"))
2463 return -EINVAL;
2464
2465 f = new0(JournalFile, 1);
2466 if (!f)
2467 return -ENOMEM;
2468
2469 f->fd = -1;
2470 f->mode = mode;
2471
2472 f->flags = flags;
2473 f->prot = prot_from_flags(flags);
2474 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2475 #ifdef HAVE_XZ
2476 f->compress = compress;
2477 #endif
2478 #ifdef HAVE_GCRYPT
2479 f->seal = seal;
2480 #endif
2481
2482 if (mmap_cache)
2483 f->mmap = mmap_cache_ref(mmap_cache);
2484 else {
2485 f->mmap = mmap_cache_new();
2486 if (!f->mmap) {
2487 r = -ENOMEM;
2488 goto fail;
2489 }
2490 }
2491
2492 f->path = strdup(fname);
2493 if (!f->path) {
2494 r = -ENOMEM;
2495 goto fail;
2496 }
2497
2498 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2499 if (!f->chain_cache) {
2500 r = -ENOMEM;
2501 goto fail;
2502 }
2503
2504 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2505 if (f->fd < 0) {
2506 r = -errno;
2507 goto fail;
2508 }
2509
2510 if (fstat(f->fd, &f->last_stat) < 0) {
2511 r = -errno;
2512 goto fail;
2513 }
2514
2515 if (f->last_stat.st_size == 0 && f->writable) {
2516 #ifdef HAVE_XATTR
2517 uint64_t crtime;
2518
2519 /* Let's attach the creation time to the journal file,
2520 * so that the vacuuming code knows the age of this
2521 * file even if the file might end up corrupted one
2522 * day... Ideally we'd just use the creation time many
2523 * file systems maintain for each file, but there is
2524 * currently no usable API to query this, hence let's
2525 * emulate this via extended attributes. If extended
2526 * attributes are not supported we'll just skip this,
2527 * and rely solely on mtime/atime/ctime of the file.*/
2528
2529 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2530 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2531 #endif
2532
2533 #ifdef HAVE_GCRYPT
2534 /* Try to load the FSPRG state, and if we can't, then
2535 * just don't do sealing */
2536 if (f->seal) {
2537 r = journal_file_fss_load(f);
2538 if (r < 0)
2539 f->seal = false;
2540 }
2541 #endif
2542
2543 r = journal_file_init_header(f, template);
2544 if (r < 0)
2545 goto fail;
2546
2547 if (fstat(f->fd, &f->last_stat) < 0) {
2548 r = -errno;
2549 goto fail;
2550 }
2551
2552 newly_created = true;
2553 }
2554
2555 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2556 r = -EIO;
2557 goto fail;
2558 }
2559
2560 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2561 if (f->header == MAP_FAILED) {
2562 f->header = NULL;
2563 r = -errno;
2564 goto fail;
2565 }
2566
2567 if (!newly_created) {
2568 r = journal_file_verify_header(f);
2569 if (r < 0)
2570 goto fail;
2571 }
2572
2573 #ifdef HAVE_GCRYPT
2574 if (!newly_created && f->writable) {
2575 r = journal_file_fss_load(f);
2576 if (r < 0)
2577 goto fail;
2578 }
2579 #endif
2580
2581 if (f->writable) {
2582 if (metrics) {
2583 journal_default_metrics(metrics, f->fd);
2584 f->metrics = *metrics;
2585 } else if (template)
2586 f->metrics = template->metrics;
2587
2588 r = journal_file_refresh_header(f);
2589 if (r < 0)
2590 goto fail;
2591 }
2592
2593 #ifdef HAVE_GCRYPT
2594 r = journal_file_hmac_setup(f);
2595 if (r < 0)
2596 goto fail;
2597 #endif
2598
2599 if (newly_created) {
2600 r = journal_file_setup_field_hash_table(f);
2601 if (r < 0)
2602 goto fail;
2603
2604 r = journal_file_setup_data_hash_table(f);
2605 if (r < 0)
2606 goto fail;
2607
2608 #ifdef HAVE_GCRYPT
2609 r = journal_file_append_first_tag(f);
2610 if (r < 0)
2611 goto fail;
2612 #endif
2613 }
2614
2615 r = journal_file_map_field_hash_table(f);
2616 if (r < 0)
2617 goto fail;
2618
2619 r = journal_file_map_data_hash_table(f);
2620 if (r < 0)
2621 goto fail;
2622
2623 *ret = f;
2624 return 0;
2625
2626 fail:
2627 journal_file_close(f);
2628
2629 return r;
2630 }
2631
2632 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2633 _cleanup_free_ char *p = NULL;
2634 size_t l;
2635 JournalFile *old_file, *new_file = NULL;
2636 int r;
2637
2638 assert(f);
2639 assert(*f);
2640
2641 old_file = *f;
2642
2643 if (!old_file->writable)
2644 return -EINVAL;
2645
2646 if (!endswith(old_file->path, ".journal"))
2647 return -EINVAL;
2648
2649 l = strlen(old_file->path);
2650 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2651 (int) l - 8, old_file->path,
2652 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2653 le64toh((*f)->header->head_entry_seqnum),
2654 le64toh((*f)->header->head_entry_realtime));
2655 if (r < 0)
2656 return -ENOMEM;
2657
2658 r = rename(old_file->path, p);
2659 if (r < 0)
2660 return -errno;
2661
2662 old_file->header->state = STATE_ARCHIVED;
2663
2664 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2665 journal_file_close(old_file);
2666
2667 *f = new_file;
2668 return r;
2669 }
2670
2671 int journal_file_open_reliably(
2672 const char *fname,
2673 int flags,
2674 mode_t mode,
2675 bool compress,
2676 bool seal,
2677 JournalMetrics *metrics,
2678 MMapCache *mmap_cache,
2679 JournalFile *template,
2680 JournalFile **ret) {
2681
2682 int r;
2683 size_t l;
2684 _cleanup_free_ char *p = NULL;
2685
2686 r = journal_file_open(fname, flags, mode, compress, seal,
2687 metrics, mmap_cache, template, ret);
2688 if (r != -EBADMSG && /* corrupted */
2689 r != -ENODATA && /* truncated */
2690 r != -EHOSTDOWN && /* other machine */
2691 r != -EPROTONOSUPPORT && /* incompatible feature */
2692 r != -EBUSY && /* unclean shutdown */
2693 r != -ESHUTDOWN /* already archived */)
2694 return r;
2695
2696 if ((flags & O_ACCMODE) == O_RDONLY)
2697 return r;
2698
2699 if (!(flags & O_CREAT))
2700 return r;
2701
2702 if (!endswith(fname, ".journal"))
2703 return r;
2704
2705 /* The file is corrupted. Rotate it away and try it again (but only once) */
2706
2707 l = strlen(fname);
2708 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2709 (int) l - 8, fname,
2710 (unsigned long long) now(CLOCK_REALTIME),
2711 random_u64()) < 0)
2712 return -ENOMEM;
2713
2714 r = rename(fname, p);
2715 if (r < 0)
2716 return -errno;
2717
2718 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2719
2720 return journal_file_open(fname, flags, mode, compress, seal,
2721 metrics, mmap_cache, template, ret);
2722 }
2723
2724 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2725 uint64_t i, n;
2726 uint64_t q, xor_hash = 0;
2727 int r;
2728 EntryItem *items;
2729 dual_timestamp ts;
2730
2731 assert(from);
2732 assert(to);
2733 assert(o);
2734 assert(p);
2735
2736 if (!to->writable)
2737 return -EPERM;
2738
2739 ts.monotonic = le64toh(o->entry.monotonic);
2740 ts.realtime = le64toh(o->entry.realtime);
2741
2742 n = journal_file_entry_n_items(o);
2743 /* alloca() can't take 0, hence let's allocate at least one */
2744 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2745
2746 for (i = 0; i < n; i++) {
2747 uint64_t l, h;
2748 le64_t le_hash;
2749 size_t t;
2750 void *data;
2751 Object *u;
2752
2753 q = le64toh(o->entry.items[i].object_offset);
2754 le_hash = o->entry.items[i].hash;
2755
2756 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2757 if (r < 0)
2758 return r;
2759
2760 if (le_hash != o->data.hash)
2761 return -EBADMSG;
2762
2763 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2764 t = (size_t) l;
2765
2766 /* We hit the limit on 32bit machines */
2767 if ((uint64_t) t != l)
2768 return -E2BIG;
2769
2770 if (o->object.flags & OBJECT_COMPRESSED) {
2771 #ifdef HAVE_XZ
2772 uint64_t rsize;
2773
2774 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2775 return -EBADMSG;
2776
2777 data = from->compress_buffer;
2778 l = rsize;
2779 #else
2780 return -EPROTONOSUPPORT;
2781 #endif
2782 } else
2783 data = o->data.payload;
2784
2785 r = journal_file_append_data(to, data, l, &u, &h);
2786 if (r < 0)
2787 return r;
2788
2789 xor_hash ^= le64toh(u->data.hash);
2790 items[i].object_offset = htole64(h);
2791 items[i].hash = u->data.hash;
2792
2793 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2794 if (r < 0)
2795 return r;
2796 }
2797
2798 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2799 }
2800
2801 void journal_default_metrics(JournalMetrics *m, int fd) {
2802 uint64_t fs_size = 0;
2803 struct statvfs ss;
2804 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2805
2806 assert(m);
2807 assert(fd >= 0);
2808
2809 if (fstatvfs(fd, &ss) >= 0)
2810 fs_size = ss.f_frsize * ss.f_blocks;
2811
2812 if (m->max_use == (uint64_t) -1) {
2813
2814 if (fs_size > 0) {
2815 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2816
2817 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2818 m->max_use = DEFAULT_MAX_USE_UPPER;
2819
2820 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2821 m->max_use = DEFAULT_MAX_USE_LOWER;
2822 } else
2823 m->max_use = DEFAULT_MAX_USE_LOWER;
2824 } else {
2825 m->max_use = PAGE_ALIGN(m->max_use);
2826
2827 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2828 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2829 }
2830
2831 if (m->max_size == (uint64_t) -1) {
2832 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2833
2834 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2835 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2836 } else
2837 m->max_size = PAGE_ALIGN(m->max_size);
2838
2839 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2840 m->max_size = JOURNAL_FILE_SIZE_MIN;
2841
2842 if (m->max_size*2 > m->max_use)
2843 m->max_use = m->max_size*2;
2844
2845 if (m->min_size == (uint64_t) -1)
2846 m->min_size = JOURNAL_FILE_SIZE_MIN;
2847 else {
2848 m->min_size = PAGE_ALIGN(m->min_size);
2849
2850 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2851 m->min_size = JOURNAL_FILE_SIZE_MIN;
2852
2853 if (m->min_size > m->max_size)
2854 m->max_size = m->min_size;
2855 }
2856
2857 if (m->keep_free == (uint64_t) -1) {
2858
2859 if (fs_size > 0) {
2860 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2861
2862 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2863 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2864
2865 } else
2866 m->keep_free = DEFAULT_KEEP_FREE;
2867 }
2868
2869 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2870 format_bytes(a, sizeof(a), m->max_use),
2871 format_bytes(b, sizeof(b), m->max_size),
2872 format_bytes(c, sizeof(c), m->min_size),
2873 format_bytes(d, sizeof(d), m->keep_free));
2874 }
2875
2876 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2877 assert(f);
2878 assert(from || to);
2879
2880 if (from) {
2881 if (f->header->head_entry_realtime == 0)
2882 return -ENOENT;
2883
2884 *from = le64toh(f->header->head_entry_realtime);
2885 }
2886
2887 if (to) {
2888 if (f->header->tail_entry_realtime == 0)
2889 return -ENOENT;
2890
2891 *to = le64toh(f->header->tail_entry_realtime);
2892 }
2893
2894 return 1;
2895 }
2896
2897 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2898 Object *o;
2899 uint64_t p;
2900 int r;
2901
2902 assert(f);
2903 assert(from || to);
2904
2905 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2906 if (r <= 0)
2907 return r;
2908
2909 if (le64toh(o->data.n_entries) <= 0)
2910 return 0;
2911
2912 if (from) {
2913 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2914 if (r < 0)
2915 return r;
2916
2917 *from = le64toh(o->entry.monotonic);
2918 }
2919
2920 if (to) {
2921 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2922 if (r < 0)
2923 return r;
2924
2925 r = generic_array_get_plus_one(f,
2926 le64toh(o->data.entry_offset),
2927 le64toh(o->data.entry_array_offset),
2928 le64toh(o->data.n_entries)-1,
2929 &o, NULL);
2930 if (r <= 0)
2931 return r;
2932
2933 *to = le64toh(o->entry.monotonic);
2934 }
2935
2936 return 1;
2937 }
2938
2939 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2940 assert(f);
2941
2942 /* If we gained new header fields we gained new features,
2943 * hence suggest a rotation */
2944 if (le64toh(f->header->header_size) < sizeof(Header)) {
2945 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2946 return true;
2947 }
2948
2949 /* Let's check if the hash tables grew over a certain fill
2950 * level (75%, borrowing this value from Java's hash table
2951 * implementation), and if so suggest a rotation. To calculate
2952 * the fill level we need the n_data field, which only exists
2953 * in newer versions. */
2954
2955 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2956 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2957 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2958 f->path,
2959 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2960 le64toh(f->header->n_data),
2961 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2962 (unsigned long long) f->last_stat.st_size,
2963 f->last_stat.st_size / le64toh(f->header->n_data));
2964 return true;
2965 }
2966
2967 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2968 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2969 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2970 f->path,
2971 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2972 le64toh(f->header->n_fields),
2973 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2974 return true;
2975 }
2976
2977 /* Are the data objects properly indexed by field objects? */
2978 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2979 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2980 le64toh(f->header->n_data) > 0 &&
2981 le64toh(f->header->n_fields) == 0)
2982 return true;
2983
2984 if (max_file_usec > 0) {
2985 usec_t t, h;
2986
2987 h = le64toh(f->header->head_entry_realtime);
2988 t = now(CLOCK_REALTIME);
2989
2990 if (h > 0 && t > h + max_file_usec)
2991 return true;
2992 }
2993
2994 return false;
2995 }