]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
journal/compress: return early in uncompress_startswith
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <sys/xattr.h>
30
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
34 #include "lookup3.h"
35 #include "compress.h"
36 #include "fsprg.h"
37
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
40
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
45
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53
54 /* This is the upper bound if we deduce the keep_free value from the
55 * file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
58 /* This is the keep_free value when we can't determine the system
59 * size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
61
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
64
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
67
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
70
71 static int journal_file_set_online(JournalFile *f) {
72 assert(f);
73
74 if (!f->writable)
75 return -EPERM;
76
77 if (!(f->fd >= 0 && f->header))
78 return -EINVAL;
79
80 switch(f->header->state) {
81 case STATE_ONLINE:
82 return 0;
83
84 case STATE_OFFLINE:
85 f->header->state = STATE_ONLINE;
86 fsync(f->fd);
87 return 0;
88
89 default:
90 return -EINVAL;
91 }
92 }
93
94 int journal_file_set_offline(JournalFile *f) {
95 assert(f);
96
97 if (!f->writable)
98 return -EPERM;
99
100 if (!(f->fd >= 0 && f->header))
101 return -EINVAL;
102
103 if (f->header->state != STATE_ONLINE)
104 return 0;
105
106 fsync(f->fd);
107
108 f->header->state = STATE_OFFLINE;
109
110 fsync(f->fd);
111
112 return 0;
113 }
114
115 void journal_file_close(JournalFile *f) {
116 assert(f);
117
118 #ifdef HAVE_GCRYPT
119 /* Write the final tag */
120 if (f->seal && f->writable)
121 journal_file_append_tag(f);
122 #endif
123
124 /* Sync everything to disk, before we mark the file offline */
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
127
128 journal_file_set_offline(f);
129
130 if (f->header)
131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
132
133 safe_close(f->fd);
134 free(f->path);
135
136 if (f->mmap)
137 mmap_cache_unref(f->mmap);
138
139 hashmap_free_free(f->chain_cache);
140
141 #ifdef HAVE_XZ
142 free(f->compress_buffer);
143 #endif
144
145 #ifdef HAVE_GCRYPT
146 if (f->fss_file)
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
150
151 free(f->fsprg_seed);
152
153 if (f->hmac)
154 gcry_md_close(f->hmac);
155 #endif
156
157 free(f);
158 }
159
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
161 Header h;
162 ssize_t k;
163 int r;
164
165 assert(f);
166
167 zero(h);
168 memcpy(h.signature, HEADER_SIGNATURE, 8);
169 h.header_size = htole64(ALIGN64(sizeof(h)));
170
171 h.incompatible_flags =
172 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
173
174 h.compatible_flags =
175 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
176
177 r = sd_id128_randomize(&h.file_id);
178 if (r < 0)
179 return r;
180
181 if (template) {
182 h.seqnum_id = template->header->seqnum_id;
183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
184 } else
185 h.seqnum_id = h.file_id;
186
187 k = pwrite(f->fd, &h, sizeof(h), 0);
188 if (k < 0)
189 return -errno;
190
191 if (k != sizeof(h))
192 return -EIO;
193
194 return 0;
195 }
196
197 static int journal_file_refresh_header(JournalFile *f) {
198 int r;
199 sd_id128_t boot_id;
200
201 assert(f);
202
203 r = sd_id128_get_machine(&f->header->machine_id);
204 if (r < 0)
205 return r;
206
207 r = sd_id128_get_boot(&boot_id);
208 if (r < 0)
209 return r;
210
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
213
214 f->header->boot_id = boot_id;
215
216 journal_file_set_online(f);
217
218 /* Sync the online state to disk */
219 fsync(f->fd);
220
221 return 0;
222 }
223
224 static int journal_file_verify_header(JournalFile *f) {
225 assert(f);
226
227 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
228 return -EBADMSG;
229
230 /* In both read and write mode we refuse to open files with
231 * incompatible flags we don't know */
232 #ifdef HAVE_XZ
233 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
234 return -EPROTONOSUPPORT;
235 #else
236 if (f->header->incompatible_flags != 0)
237 return -EPROTONOSUPPORT;
238 #endif
239
240 /* When open for writing we refuse to open files with
241 * compatible flags, too */
242 if (f->writable) {
243 #ifdef HAVE_GCRYPT
244 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
245 return -EPROTONOSUPPORT;
246 #else
247 if (f->header->compatible_flags != 0)
248 return -EPROTONOSUPPORT;
249 #endif
250 }
251
252 if (f->header->state >= _STATE_MAX)
253 return -EBADMSG;
254
255 /* The first addition was n_data, so check that we are at least this large */
256 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
257 return -EBADMSG;
258
259 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
260 return -EBADMSG;
261
262 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
263 return -ENODATA;
264
265 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
266 return -ENODATA;
267
268 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
269 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
270 !VALID64(le64toh(f->header->tail_object_offset)) ||
271 !VALID64(le64toh(f->header->entry_array_offset)))
272 return -ENODATA;
273
274 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
275 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
276 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
277 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
278 return -ENODATA;
279
280 if (f->writable) {
281 uint8_t state;
282 sd_id128_t machine_id;
283 int r;
284
285 r = sd_id128_get_machine(&machine_id);
286 if (r < 0)
287 return r;
288
289 if (!sd_id128_equal(machine_id, f->header->machine_id))
290 return -EHOSTDOWN;
291
292 state = f->header->state;
293
294 if (state == STATE_ONLINE) {
295 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
296 return -EBUSY;
297 } else if (state == STATE_ARCHIVED)
298 return -ESHUTDOWN;
299 else if (state != STATE_OFFLINE) {
300 log_debug("Journal file %s has unknown state %u.", f->path, state);
301 return -EBUSY;
302 }
303 }
304
305 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
306
307 f->seal = JOURNAL_HEADER_SEALED(f->header);
308
309 return 0;
310 }
311
312 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
313 uint64_t old_size, new_size;
314 int r;
315
316 assert(f);
317
318 /* We assume that this file is not sparse, and we know that
319 * for sure, since we always call posix_fallocate()
320 * ourselves */
321
322 old_size =
323 le64toh(f->header->header_size) +
324 le64toh(f->header->arena_size);
325
326 new_size = PAGE_ALIGN(offset + size);
327 if (new_size < le64toh(f->header->header_size))
328 new_size = le64toh(f->header->header_size);
329
330 if (new_size <= old_size)
331 return 0;
332
333 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
334 return -E2BIG;
335
336 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
337 struct statvfs svfs;
338
339 if (fstatvfs(f->fd, &svfs) >= 0) {
340 uint64_t available;
341
342 available = svfs.f_bfree * svfs.f_bsize;
343
344 if (available >= f->metrics.keep_free)
345 available -= f->metrics.keep_free;
346 else
347 available = 0;
348
349 if (new_size - old_size > available)
350 return -E2BIG;
351 }
352 }
353
354 /* Increase by larger blocks at once */
355 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
356 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
357 new_size = f->metrics.max_size;
358
359 /* Note that the glibc fallocate() fallback is very
360 inefficient, hence we try to minimize the allocation area
361 as we can. */
362 r = posix_fallocate(f->fd, old_size, new_size - old_size);
363 if (r != 0)
364 return -r;
365
366 if (fstat(f->fd, &f->last_stat) < 0)
367 return -errno;
368
369 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
370
371 return 0;
372 }
373
374 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
375 assert(f);
376 assert(ret);
377
378 if (size <= 0)
379 return -EINVAL;
380
381 /* Avoid SIGBUS on invalid accesses */
382 if (offset + size > (uint64_t) f->last_stat.st_size) {
383 /* Hmm, out of range? Let's refresh the fstat() data
384 * first, before we trust that check. */
385
386 if (fstat(f->fd, &f->last_stat) < 0 ||
387 offset + size > (uint64_t) f->last_stat.st_size)
388 return -EADDRNOTAVAIL;
389 }
390
391 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
392 }
393
394 static uint64_t minimum_header_size(Object *o) {
395
396 static const uint64_t table[] = {
397 [OBJECT_DATA] = sizeof(DataObject),
398 [OBJECT_FIELD] = sizeof(FieldObject),
399 [OBJECT_ENTRY] = sizeof(EntryObject),
400 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
401 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
402 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
403 [OBJECT_TAG] = sizeof(TagObject),
404 };
405
406 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
407 return sizeof(ObjectHeader);
408
409 return table[o->object.type];
410 }
411
412 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
413 int r;
414 void *t;
415 Object *o;
416 uint64_t s;
417
418 assert(f);
419 assert(ret);
420
421 /* Objects may only be located at multiple of 64 bit */
422 if (!VALID64(offset))
423 return -EFAULT;
424
425
426 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
427 if (r < 0)
428 return r;
429
430 o = (Object*) t;
431 s = le64toh(o->object.size);
432
433 if (s < sizeof(ObjectHeader))
434 return -EBADMSG;
435
436 if (o->object.type <= OBJECT_UNUSED)
437 return -EBADMSG;
438
439 if (s < minimum_header_size(o))
440 return -EBADMSG;
441
442 if (type > 0 && o->object.type != type)
443 return -EBADMSG;
444
445 if (s > sizeof(ObjectHeader)) {
446 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
447 if (r < 0)
448 return r;
449
450 o = (Object*) t;
451 }
452
453 *ret = o;
454 return 0;
455 }
456
457 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
458 uint64_t r;
459
460 assert(f);
461
462 r = le64toh(f->header->tail_entry_seqnum) + 1;
463
464 if (seqnum) {
465 /* If an external seqnum counter was passed, we update
466 * both the local and the external one, and set it to
467 * the maximum of both */
468
469 if (*seqnum + 1 > r)
470 r = *seqnum + 1;
471
472 *seqnum = r;
473 }
474
475 f->header->tail_entry_seqnum = htole64(r);
476
477 if (f->header->head_entry_seqnum == 0)
478 f->header->head_entry_seqnum = htole64(r);
479
480 return r;
481 }
482
483 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
484 int r;
485 uint64_t p;
486 Object *tail, *o;
487 void *t;
488
489 assert(f);
490 assert(type > 0 && type < _OBJECT_TYPE_MAX);
491 assert(size >= sizeof(ObjectHeader));
492 assert(offset);
493 assert(ret);
494
495 r = journal_file_set_online(f);
496 if (r < 0)
497 return r;
498
499 p = le64toh(f->header->tail_object_offset);
500 if (p == 0)
501 p = le64toh(f->header->header_size);
502 else {
503 r = journal_file_move_to_object(f, -1, p, &tail);
504 if (r < 0)
505 return r;
506
507 p += ALIGN64(le64toh(tail->object.size));
508 }
509
510 r = journal_file_allocate(f, p, size);
511 if (r < 0)
512 return r;
513
514 r = journal_file_move_to(f, type, false, p, size, &t);
515 if (r < 0)
516 return r;
517
518 o = (Object*) t;
519
520 zero(o->object);
521 o->object.type = type;
522 o->object.size = htole64(size);
523
524 f->header->tail_object_offset = htole64(p);
525 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
526
527 *ret = o;
528 *offset = p;
529
530 return 0;
531 }
532
533 static int journal_file_setup_data_hash_table(JournalFile *f) {
534 uint64_t s, p;
535 Object *o;
536 int r;
537
538 assert(f);
539
540 /* We estimate that we need 1 hash table entry per 768 of
541 journal file and we want to make sure we never get beyond
542 75% fill level. Calculate the hash table size for the
543 maximum file size based on these metrics. */
544
545 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
546 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
547 s = DEFAULT_DATA_HASH_TABLE_SIZE;
548
549 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
550
551 r = journal_file_append_object(f,
552 OBJECT_DATA_HASH_TABLE,
553 offsetof(Object, hash_table.items) + s,
554 &o, &p);
555 if (r < 0)
556 return r;
557
558 memzero(o->hash_table.items, s);
559
560 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
561 f->header->data_hash_table_size = htole64(s);
562
563 return 0;
564 }
565
566 static int journal_file_setup_field_hash_table(JournalFile *f) {
567 uint64_t s, p;
568 Object *o;
569 int r;
570
571 assert(f);
572
573 /* We use a fixed size hash table for the fields as this
574 * number should grow very slowly only */
575
576 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
577 r = journal_file_append_object(f,
578 OBJECT_FIELD_HASH_TABLE,
579 offsetof(Object, hash_table.items) + s,
580 &o, &p);
581 if (r < 0)
582 return r;
583
584 memzero(o->hash_table.items, s);
585
586 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
587 f->header->field_hash_table_size = htole64(s);
588
589 return 0;
590 }
591
592 static int journal_file_map_data_hash_table(JournalFile *f) {
593 uint64_t s, p;
594 void *t;
595 int r;
596
597 assert(f);
598
599 p = le64toh(f->header->data_hash_table_offset);
600 s = le64toh(f->header->data_hash_table_size);
601
602 r = journal_file_move_to(f,
603 OBJECT_DATA_HASH_TABLE,
604 true,
605 p, s,
606 &t);
607 if (r < 0)
608 return r;
609
610 f->data_hash_table = t;
611 return 0;
612 }
613
614 static int journal_file_map_field_hash_table(JournalFile *f) {
615 uint64_t s, p;
616 void *t;
617 int r;
618
619 assert(f);
620
621 p = le64toh(f->header->field_hash_table_offset);
622 s = le64toh(f->header->field_hash_table_size);
623
624 r = journal_file_move_to(f,
625 OBJECT_FIELD_HASH_TABLE,
626 true,
627 p, s,
628 &t);
629 if (r < 0)
630 return r;
631
632 f->field_hash_table = t;
633 return 0;
634 }
635
636 static int journal_file_link_field(
637 JournalFile *f,
638 Object *o,
639 uint64_t offset,
640 uint64_t hash) {
641
642 uint64_t p, h;
643 int r;
644
645 assert(f);
646 assert(o);
647 assert(offset > 0);
648
649 if (o->object.type != OBJECT_FIELD)
650 return -EINVAL;
651
652 /* This might alter the window we are looking at */
653
654 o->field.next_hash_offset = o->field.head_data_offset = 0;
655
656 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
657 p = le64toh(f->field_hash_table[h].tail_hash_offset);
658 if (p == 0)
659 f->field_hash_table[h].head_hash_offset = htole64(offset);
660 else {
661 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
662 if (r < 0)
663 return r;
664
665 o->field.next_hash_offset = htole64(offset);
666 }
667
668 f->field_hash_table[h].tail_hash_offset = htole64(offset);
669
670 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
671 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
672
673 return 0;
674 }
675
676 static int journal_file_link_data(
677 JournalFile *f,
678 Object *o,
679 uint64_t offset,
680 uint64_t hash) {
681
682 uint64_t p, h;
683 int r;
684
685 assert(f);
686 assert(o);
687 assert(offset > 0);
688
689 if (o->object.type != OBJECT_DATA)
690 return -EINVAL;
691
692 /* This might alter the window we are looking at */
693
694 o->data.next_hash_offset = o->data.next_field_offset = 0;
695 o->data.entry_offset = o->data.entry_array_offset = 0;
696 o->data.n_entries = 0;
697
698 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
699 p = le64toh(f->data_hash_table[h].tail_hash_offset);
700 if (p == 0)
701 /* Only entry in the hash table is easy */
702 f->data_hash_table[h].head_hash_offset = htole64(offset);
703 else {
704 /* Move back to the previous data object, to patch in
705 * pointer */
706
707 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
708 if (r < 0)
709 return r;
710
711 o->data.next_hash_offset = htole64(offset);
712 }
713
714 f->data_hash_table[h].tail_hash_offset = htole64(offset);
715
716 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
717 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
718
719 return 0;
720 }
721
722 int journal_file_find_field_object_with_hash(
723 JournalFile *f,
724 const void *field, uint64_t size, uint64_t hash,
725 Object **ret, uint64_t *offset) {
726
727 uint64_t p, osize, h;
728 int r;
729
730 assert(f);
731 assert(field && size > 0);
732
733 osize = offsetof(Object, field.payload) + size;
734
735 if (f->header->field_hash_table_size == 0)
736 return -EBADMSG;
737
738 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
739 p = le64toh(f->field_hash_table[h].head_hash_offset);
740
741 while (p > 0) {
742 Object *o;
743
744 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
745 if (r < 0)
746 return r;
747
748 if (le64toh(o->field.hash) == hash &&
749 le64toh(o->object.size) == osize &&
750 memcmp(o->field.payload, field, size) == 0) {
751
752 if (ret)
753 *ret = o;
754 if (offset)
755 *offset = p;
756
757 return 1;
758 }
759
760 p = le64toh(o->field.next_hash_offset);
761 }
762
763 return 0;
764 }
765
766 int journal_file_find_field_object(
767 JournalFile *f,
768 const void *field, uint64_t size,
769 Object **ret, uint64_t *offset) {
770
771 uint64_t hash;
772
773 assert(f);
774 assert(field && size > 0);
775
776 hash = hash64(field, size);
777
778 return journal_file_find_field_object_with_hash(f,
779 field, size, hash,
780 ret, offset);
781 }
782
783 int journal_file_find_data_object_with_hash(
784 JournalFile *f,
785 const void *data, uint64_t size, uint64_t hash,
786 Object **ret, uint64_t *offset) {
787
788 uint64_t p, osize, h;
789 int r;
790
791 assert(f);
792 assert(data || size == 0);
793
794 osize = offsetof(Object, data.payload) + size;
795
796 if (f->header->data_hash_table_size == 0)
797 return -EBADMSG;
798
799 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
800 p = le64toh(f->data_hash_table[h].head_hash_offset);
801
802 while (p > 0) {
803 Object *o;
804
805 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
806 if (r < 0)
807 return r;
808
809 if (le64toh(o->data.hash) != hash)
810 goto next;
811
812 if (o->object.flags & OBJECT_COMPRESSED) {
813 #ifdef HAVE_XZ
814 uint64_t l, rsize;
815
816 l = le64toh(o->object.size);
817 if (l <= offsetof(Object, data.payload))
818 return -EBADMSG;
819
820 l -= offsetof(Object, data.payload);
821
822 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
823 return -EBADMSG;
824
825 if (rsize == size &&
826 memcmp(f->compress_buffer, data, size) == 0) {
827
828 if (ret)
829 *ret = o;
830
831 if (offset)
832 *offset = p;
833
834 return 1;
835 }
836 #else
837 return -EPROTONOSUPPORT;
838 #endif
839
840 } else if (le64toh(o->object.size) == osize &&
841 memcmp(o->data.payload, data, size) == 0) {
842
843 if (ret)
844 *ret = o;
845
846 if (offset)
847 *offset = p;
848
849 return 1;
850 }
851
852 next:
853 p = le64toh(o->data.next_hash_offset);
854 }
855
856 return 0;
857 }
858
859 int journal_file_find_data_object(
860 JournalFile *f,
861 const void *data, uint64_t size,
862 Object **ret, uint64_t *offset) {
863
864 uint64_t hash;
865
866 assert(f);
867 assert(data || size == 0);
868
869 hash = hash64(data, size);
870
871 return journal_file_find_data_object_with_hash(f,
872 data, size, hash,
873 ret, offset);
874 }
875
876 static int journal_file_append_field(
877 JournalFile *f,
878 const void *field, uint64_t size,
879 Object **ret, uint64_t *offset) {
880
881 uint64_t hash, p;
882 uint64_t osize;
883 Object *o;
884 int r;
885
886 assert(f);
887 assert(field && size > 0);
888
889 hash = hash64(field, size);
890
891 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
892 if (r < 0)
893 return r;
894 else if (r > 0) {
895
896 if (ret)
897 *ret = o;
898
899 if (offset)
900 *offset = p;
901
902 return 0;
903 }
904
905 osize = offsetof(Object, field.payload) + size;
906 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
907 if (r < 0)
908 return r;
909
910 o->field.hash = htole64(hash);
911 memcpy(o->field.payload, field, size);
912
913 r = journal_file_link_field(f, o, p, hash);
914 if (r < 0)
915 return r;
916
917 /* The linking might have altered the window, so let's
918 * refresh our pointer */
919 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
920 if (r < 0)
921 return r;
922
923 #ifdef HAVE_GCRYPT
924 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
925 if (r < 0)
926 return r;
927 #endif
928
929 if (ret)
930 *ret = o;
931
932 if (offset)
933 *offset = p;
934
935 return 0;
936 }
937
938 static int journal_file_append_data(
939 JournalFile *f,
940 const void *data, uint64_t size,
941 Object **ret, uint64_t *offset) {
942
943 uint64_t hash, p;
944 uint64_t osize;
945 Object *o;
946 int r;
947 bool compressed = false;
948 const void *eq;
949
950 assert(f);
951 assert(data || size == 0);
952
953 hash = hash64(data, size);
954
955 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
956 if (r < 0)
957 return r;
958 else if (r > 0) {
959
960 if (ret)
961 *ret = o;
962
963 if (offset)
964 *offset = p;
965
966 return 0;
967 }
968
969 osize = offsetof(Object, data.payload) + size;
970 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
971 if (r < 0)
972 return r;
973
974 o->data.hash = htole64(hash);
975
976 #ifdef HAVE_XZ
977 if (f->compress &&
978 size >= COMPRESSION_SIZE_THRESHOLD) {
979 uint64_t rsize;
980
981 compressed = compress_blob(data, size, o->data.payload, &rsize);
982
983 if (compressed) {
984 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
985 o->object.flags |= OBJECT_COMPRESSED;
986
987 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
988 }
989 }
990 #endif
991
992 if (!compressed && size > 0)
993 memcpy(o->data.payload, data, size);
994
995 r = journal_file_link_data(f, o, p, hash);
996 if (r < 0)
997 return r;
998
999 /* The linking might have altered the window, so let's
1000 * refresh our pointer */
1001 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1002 if (r < 0)
1003 return r;
1004
1005 if (!data)
1006 eq = NULL;
1007 else
1008 eq = memchr(data, '=', size);
1009 if (eq && eq > data) {
1010 Object *fo = NULL;
1011 uint64_t fp;
1012
1013 /* Create field object ... */
1014 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1015 if (r < 0)
1016 return r;
1017
1018 /* ... and link it in. */
1019 o->data.next_field_offset = fo->field.head_data_offset;
1020 fo->field.head_data_offset = le64toh(p);
1021 }
1022
1023 #ifdef HAVE_GCRYPT
1024 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1025 if (r < 0)
1026 return r;
1027 #endif
1028
1029 if (ret)
1030 *ret = o;
1031
1032 if (offset)
1033 *offset = p;
1034
1035 return 0;
1036 }
1037
1038 uint64_t journal_file_entry_n_items(Object *o) {
1039 assert(o);
1040
1041 if (o->object.type != OBJECT_ENTRY)
1042 return 0;
1043
1044 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1045 }
1046
1047 uint64_t journal_file_entry_array_n_items(Object *o) {
1048 assert(o);
1049
1050 if (o->object.type != OBJECT_ENTRY_ARRAY)
1051 return 0;
1052
1053 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1054 }
1055
1056 uint64_t journal_file_hash_table_n_items(Object *o) {
1057 assert(o);
1058
1059 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1060 o->object.type != OBJECT_FIELD_HASH_TABLE)
1061 return 0;
1062
1063 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1064 }
1065
1066 static int link_entry_into_array(JournalFile *f,
1067 le64_t *first,
1068 le64_t *idx,
1069 uint64_t p) {
1070 int r;
1071 uint64_t n = 0, ap = 0, q, i, a, hidx;
1072 Object *o;
1073
1074 assert(f);
1075 assert(first);
1076 assert(idx);
1077 assert(p > 0);
1078
1079 a = le64toh(*first);
1080 i = hidx = le64toh(*idx);
1081 while (a > 0) {
1082
1083 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1084 if (r < 0)
1085 return r;
1086
1087 n = journal_file_entry_array_n_items(o);
1088 if (i < n) {
1089 o->entry_array.items[i] = htole64(p);
1090 *idx = htole64(hidx + 1);
1091 return 0;
1092 }
1093
1094 i -= n;
1095 ap = a;
1096 a = le64toh(o->entry_array.next_entry_array_offset);
1097 }
1098
1099 if (hidx > n)
1100 n = (hidx+1) * 2;
1101 else
1102 n = n * 2;
1103
1104 if (n < 4)
1105 n = 4;
1106
1107 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1108 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1109 &o, &q);
1110 if (r < 0)
1111 return r;
1112
1113 #ifdef HAVE_GCRYPT
1114 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1115 if (r < 0)
1116 return r;
1117 #endif
1118
1119 o->entry_array.items[i] = htole64(p);
1120
1121 if (ap == 0)
1122 *first = htole64(q);
1123 else {
1124 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1125 if (r < 0)
1126 return r;
1127
1128 o->entry_array.next_entry_array_offset = htole64(q);
1129 }
1130
1131 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1132 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1133
1134 *idx = htole64(hidx + 1);
1135
1136 return 0;
1137 }
1138
1139 static int link_entry_into_array_plus_one(JournalFile *f,
1140 le64_t *extra,
1141 le64_t *first,
1142 le64_t *idx,
1143 uint64_t p) {
1144
1145 int r;
1146
1147 assert(f);
1148 assert(extra);
1149 assert(first);
1150 assert(idx);
1151 assert(p > 0);
1152
1153 if (*idx == 0)
1154 *extra = htole64(p);
1155 else {
1156 le64_t i;
1157
1158 i = htole64(le64toh(*idx) - 1);
1159 r = link_entry_into_array(f, first, &i, p);
1160 if (r < 0)
1161 return r;
1162 }
1163
1164 *idx = htole64(le64toh(*idx) + 1);
1165 return 0;
1166 }
1167
1168 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1169 uint64_t p;
1170 int r;
1171 assert(f);
1172 assert(o);
1173 assert(offset > 0);
1174
1175 p = le64toh(o->entry.items[i].object_offset);
1176 if (p == 0)
1177 return -EINVAL;
1178
1179 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1180 if (r < 0)
1181 return r;
1182
1183 return link_entry_into_array_plus_one(f,
1184 &o->data.entry_offset,
1185 &o->data.entry_array_offset,
1186 &o->data.n_entries,
1187 offset);
1188 }
1189
1190 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1191 uint64_t n, i;
1192 int r;
1193
1194 assert(f);
1195 assert(o);
1196 assert(offset > 0);
1197
1198 if (o->object.type != OBJECT_ENTRY)
1199 return -EINVAL;
1200
1201 __sync_synchronize();
1202
1203 /* Link up the entry itself */
1204 r = link_entry_into_array(f,
1205 &f->header->entry_array_offset,
1206 &f->header->n_entries,
1207 offset);
1208 if (r < 0)
1209 return r;
1210
1211 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1212
1213 if (f->header->head_entry_realtime == 0)
1214 f->header->head_entry_realtime = o->entry.realtime;
1215
1216 f->header->tail_entry_realtime = o->entry.realtime;
1217 f->header->tail_entry_monotonic = o->entry.monotonic;
1218
1219 f->tail_entry_monotonic_valid = true;
1220
1221 /* Link up the items */
1222 n = journal_file_entry_n_items(o);
1223 for (i = 0; i < n; i++) {
1224 r = journal_file_link_entry_item(f, o, offset, i);
1225 if (r < 0)
1226 return r;
1227 }
1228
1229 return 0;
1230 }
1231
1232 static int journal_file_append_entry_internal(
1233 JournalFile *f,
1234 const dual_timestamp *ts,
1235 uint64_t xor_hash,
1236 const EntryItem items[], unsigned n_items,
1237 uint64_t *seqnum,
1238 Object **ret, uint64_t *offset) {
1239 uint64_t np;
1240 uint64_t osize;
1241 Object *o;
1242 int r;
1243
1244 assert(f);
1245 assert(items || n_items == 0);
1246 assert(ts);
1247
1248 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1249
1250 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1251 if (r < 0)
1252 return r;
1253
1254 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1255 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1256 o->entry.realtime = htole64(ts->realtime);
1257 o->entry.monotonic = htole64(ts->monotonic);
1258 o->entry.xor_hash = htole64(xor_hash);
1259 o->entry.boot_id = f->header->boot_id;
1260
1261 #ifdef HAVE_GCRYPT
1262 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1263 if (r < 0)
1264 return r;
1265 #endif
1266
1267 r = journal_file_link_entry(f, o, np);
1268 if (r < 0)
1269 return r;
1270
1271 if (ret)
1272 *ret = o;
1273
1274 if (offset)
1275 *offset = np;
1276
1277 return 0;
1278 }
1279
1280 void journal_file_post_change(JournalFile *f) {
1281 assert(f);
1282
1283 /* inotify() does not receive IN_MODIFY events from file
1284 * accesses done via mmap(). After each access we hence
1285 * trigger IN_MODIFY by truncating the journal file to its
1286 * current size which triggers IN_MODIFY. */
1287
1288 __sync_synchronize();
1289
1290 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1291 log_error("Failed to truncate file to its own size: %m");
1292 }
1293
1294 static int entry_item_cmp(const void *_a, const void *_b) {
1295 const EntryItem *a = _a, *b = _b;
1296
1297 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1298 return -1;
1299 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1300 return 1;
1301 return 0;
1302 }
1303
1304 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1305 unsigned i;
1306 EntryItem *items;
1307 int r;
1308 uint64_t xor_hash = 0;
1309 struct dual_timestamp _ts;
1310
1311 assert(f);
1312 assert(iovec || n_iovec == 0);
1313
1314 if (!ts) {
1315 dual_timestamp_get(&_ts);
1316 ts = &_ts;
1317 }
1318
1319 if (f->tail_entry_monotonic_valid &&
1320 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1321 return -EINVAL;
1322
1323 #ifdef HAVE_GCRYPT
1324 r = journal_file_maybe_append_tag(f, ts->realtime);
1325 if (r < 0)
1326 return r;
1327 #endif
1328
1329 /* alloca() can't take 0, hence let's allocate at least one */
1330 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1331
1332 for (i = 0; i < n_iovec; i++) {
1333 uint64_t p;
1334 Object *o;
1335
1336 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1337 if (r < 0)
1338 return r;
1339
1340 xor_hash ^= le64toh(o->data.hash);
1341 items[i].object_offset = htole64(p);
1342 items[i].hash = o->data.hash;
1343 }
1344
1345 /* Order by the position on disk, in order to improve seek
1346 * times for rotating media. */
1347 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1348
1349 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1350
1351 journal_file_post_change(f);
1352
1353 return r;
1354 }
1355
1356 typedef struct ChainCacheItem {
1357 uint64_t first; /* the array at the beginning of the chain */
1358 uint64_t array; /* the cached array */
1359 uint64_t begin; /* the first item in the cached array */
1360 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1361 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1362 } ChainCacheItem;
1363
1364 static void chain_cache_put(
1365 Hashmap *h,
1366 ChainCacheItem *ci,
1367 uint64_t first,
1368 uint64_t array,
1369 uint64_t begin,
1370 uint64_t total,
1371 uint64_t last_index) {
1372
1373 if (!ci) {
1374 /* If the chain item to cache for this chain is the
1375 * first one it's not worth caching anything */
1376 if (array == first)
1377 return;
1378
1379 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1380 ci = hashmap_steal_first(h);
1381 else {
1382 ci = new(ChainCacheItem, 1);
1383 if (!ci)
1384 return;
1385 }
1386
1387 ci->first = first;
1388
1389 if (hashmap_put(h, &ci->first, ci) < 0) {
1390 free(ci);
1391 return;
1392 }
1393 } else
1394 assert(ci->first == first);
1395
1396 ci->array = array;
1397 ci->begin = begin;
1398 ci->total = total;
1399 ci->last_index = last_index;
1400 }
1401
1402 static int generic_array_get(
1403 JournalFile *f,
1404 uint64_t first,
1405 uint64_t i,
1406 Object **ret, uint64_t *offset) {
1407
1408 Object *o;
1409 uint64_t p = 0, a, t = 0;
1410 int r;
1411 ChainCacheItem *ci;
1412
1413 assert(f);
1414
1415 a = first;
1416
1417 /* Try the chain cache first */
1418 ci = hashmap_get(f->chain_cache, &first);
1419 if (ci && i > ci->total) {
1420 a = ci->array;
1421 i -= ci->total;
1422 t = ci->total;
1423 }
1424
1425 while (a > 0) {
1426 uint64_t k;
1427
1428 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1429 if (r < 0)
1430 return r;
1431
1432 k = journal_file_entry_array_n_items(o);
1433 if (i < k) {
1434 p = le64toh(o->entry_array.items[i]);
1435 goto found;
1436 }
1437
1438 i -= k;
1439 t += k;
1440 a = le64toh(o->entry_array.next_entry_array_offset);
1441 }
1442
1443 return 0;
1444
1445 found:
1446 /* Let's cache this item for the next invocation */
1447 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1448
1449 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1450 if (r < 0)
1451 return r;
1452
1453 if (ret)
1454 *ret = o;
1455
1456 if (offset)
1457 *offset = p;
1458
1459 return 1;
1460 }
1461
1462 static int generic_array_get_plus_one(
1463 JournalFile *f,
1464 uint64_t extra,
1465 uint64_t first,
1466 uint64_t i,
1467 Object **ret, uint64_t *offset) {
1468
1469 Object *o;
1470
1471 assert(f);
1472
1473 if (i == 0) {
1474 int r;
1475
1476 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1477 if (r < 0)
1478 return r;
1479
1480 if (ret)
1481 *ret = o;
1482
1483 if (offset)
1484 *offset = extra;
1485
1486 return 1;
1487 }
1488
1489 return generic_array_get(f, first, i-1, ret, offset);
1490 }
1491
1492 enum {
1493 TEST_FOUND,
1494 TEST_LEFT,
1495 TEST_RIGHT
1496 };
1497
1498 static int generic_array_bisect(
1499 JournalFile *f,
1500 uint64_t first,
1501 uint64_t n,
1502 uint64_t needle,
1503 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1504 direction_t direction,
1505 Object **ret,
1506 uint64_t *offset,
1507 uint64_t *idx) {
1508
1509 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1510 bool subtract_one = false;
1511 Object *o, *array = NULL;
1512 int r;
1513 ChainCacheItem *ci;
1514
1515 assert(f);
1516 assert(test_object);
1517
1518 /* Start with the first array in the chain */
1519 a = first;
1520
1521 ci = hashmap_get(f->chain_cache, &first);
1522 if (ci && n > ci->total) {
1523 /* Ah, we have iterated this bisection array chain
1524 * previously! Let's see if we can skip ahead in the
1525 * chain, as far as the last time. But we can't jump
1526 * backwards in the chain, so let's check that
1527 * first. */
1528
1529 r = test_object(f, ci->begin, needle);
1530 if (r < 0)
1531 return r;
1532
1533 if (r == TEST_LEFT) {
1534 /* OK, what we are looking for is right of the
1535 * begin of this EntryArray, so let's jump
1536 * straight to previously cached array in the
1537 * chain */
1538
1539 a = ci->array;
1540 n -= ci->total;
1541 t = ci->total;
1542 last_index = ci->last_index;
1543 }
1544 }
1545
1546 while (a > 0) {
1547 uint64_t left, right, k, lp;
1548
1549 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1550 if (r < 0)
1551 return r;
1552
1553 k = journal_file_entry_array_n_items(array);
1554 right = MIN(k, n);
1555 if (right <= 0)
1556 return 0;
1557
1558 i = right - 1;
1559 lp = p = le64toh(array->entry_array.items[i]);
1560 if (p <= 0)
1561 return -EBADMSG;
1562
1563 r = test_object(f, p, needle);
1564 if (r < 0)
1565 return r;
1566
1567 if (r == TEST_FOUND)
1568 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1569
1570 if (r == TEST_RIGHT) {
1571 left = 0;
1572 right -= 1;
1573
1574 if (last_index != (uint64_t) -1) {
1575 assert(last_index <= right);
1576
1577 /* If we cached the last index we
1578 * looked at, let's try to not to jump
1579 * too wildly around and see if we can
1580 * limit the range to look at early to
1581 * the immediate neighbors of the last
1582 * index we looked at. */
1583
1584 if (last_index > 0) {
1585 uint64_t x = last_index - 1;
1586
1587 p = le64toh(array->entry_array.items[x]);
1588 if (p <= 0)
1589 return -EBADMSG;
1590
1591 r = test_object(f, p, needle);
1592 if (r < 0)
1593 return r;
1594
1595 if (r == TEST_FOUND)
1596 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1597
1598 if (r == TEST_RIGHT)
1599 right = x;
1600 else
1601 left = x + 1;
1602 }
1603
1604 if (last_index < right) {
1605 uint64_t y = last_index + 1;
1606
1607 p = le64toh(array->entry_array.items[y]);
1608 if (p <= 0)
1609 return -EBADMSG;
1610
1611 r = test_object(f, p, needle);
1612 if (r < 0)
1613 return r;
1614
1615 if (r == TEST_FOUND)
1616 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1617
1618 if (r == TEST_RIGHT)
1619 right = y;
1620 else
1621 left = y + 1;
1622 }
1623 }
1624
1625 for (;;) {
1626 if (left == right) {
1627 if (direction == DIRECTION_UP)
1628 subtract_one = true;
1629
1630 i = left;
1631 goto found;
1632 }
1633
1634 assert(left < right);
1635 i = (left + right) / 2;
1636
1637 p = le64toh(array->entry_array.items[i]);
1638 if (p <= 0)
1639 return -EBADMSG;
1640
1641 r = test_object(f, p, needle);
1642 if (r < 0)
1643 return r;
1644
1645 if (r == TEST_FOUND)
1646 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1647
1648 if (r == TEST_RIGHT)
1649 right = i;
1650 else
1651 left = i + 1;
1652 }
1653 }
1654
1655 if (k > n) {
1656 if (direction == DIRECTION_UP) {
1657 i = n;
1658 subtract_one = true;
1659 goto found;
1660 }
1661
1662 return 0;
1663 }
1664
1665 last_p = lp;
1666
1667 n -= k;
1668 t += k;
1669 last_index = (uint64_t) -1;
1670 a = le64toh(array->entry_array.next_entry_array_offset);
1671 }
1672
1673 return 0;
1674
1675 found:
1676 if (subtract_one && t == 0 && i == 0)
1677 return 0;
1678
1679 /* Let's cache this item for the next invocation */
1680 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1681
1682 if (subtract_one && i == 0)
1683 p = last_p;
1684 else if (subtract_one)
1685 p = le64toh(array->entry_array.items[i-1]);
1686 else
1687 p = le64toh(array->entry_array.items[i]);
1688
1689 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1690 if (r < 0)
1691 return r;
1692
1693 if (ret)
1694 *ret = o;
1695
1696 if (offset)
1697 *offset = p;
1698
1699 if (idx)
1700 *idx = t + i + (subtract_one ? -1 : 0);
1701
1702 return 1;
1703 }
1704
1705
1706 static int generic_array_bisect_plus_one(
1707 JournalFile *f,
1708 uint64_t extra,
1709 uint64_t first,
1710 uint64_t n,
1711 uint64_t needle,
1712 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1713 direction_t direction,
1714 Object **ret,
1715 uint64_t *offset,
1716 uint64_t *idx) {
1717
1718 int r;
1719 bool step_back = false;
1720 Object *o;
1721
1722 assert(f);
1723 assert(test_object);
1724
1725 if (n <= 0)
1726 return 0;
1727
1728 /* This bisects the array in object 'first', but first checks
1729 * an extra */
1730 r = test_object(f, extra, needle);
1731 if (r < 0)
1732 return r;
1733
1734 if (r == TEST_FOUND)
1735 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1736
1737 /* if we are looking with DIRECTION_UP then we need to first
1738 see if in the actual array there is a matching entry, and
1739 return the last one of that. But if there isn't any we need
1740 to return this one. Hence remember this, and return it
1741 below. */
1742 if (r == TEST_LEFT)
1743 step_back = direction == DIRECTION_UP;
1744
1745 if (r == TEST_RIGHT) {
1746 if (direction == DIRECTION_DOWN)
1747 goto found;
1748 else
1749 return 0;
1750 }
1751
1752 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1753
1754 if (r == 0 && step_back)
1755 goto found;
1756
1757 if (r > 0 && idx)
1758 (*idx) ++;
1759
1760 return r;
1761
1762 found:
1763 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1764 if (r < 0)
1765 return r;
1766
1767 if (ret)
1768 *ret = o;
1769
1770 if (offset)
1771 *offset = extra;
1772
1773 if (idx)
1774 *idx = 0;
1775
1776 return 1;
1777 }
1778
1779 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1780 assert(f);
1781 assert(p > 0);
1782
1783 if (p == needle)
1784 return TEST_FOUND;
1785 else if (p < needle)
1786 return TEST_LEFT;
1787 else
1788 return TEST_RIGHT;
1789 }
1790
1791 int journal_file_move_to_entry_by_offset(
1792 JournalFile *f,
1793 uint64_t p,
1794 direction_t direction,
1795 Object **ret,
1796 uint64_t *offset) {
1797
1798 return generic_array_bisect(f,
1799 le64toh(f->header->entry_array_offset),
1800 le64toh(f->header->n_entries),
1801 p,
1802 test_object_offset,
1803 direction,
1804 ret, offset, NULL);
1805 }
1806
1807
1808 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1809 Object *o;
1810 int r;
1811
1812 assert(f);
1813 assert(p > 0);
1814
1815 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1816 if (r < 0)
1817 return r;
1818
1819 if (le64toh(o->entry.seqnum) == needle)
1820 return TEST_FOUND;
1821 else if (le64toh(o->entry.seqnum) < needle)
1822 return TEST_LEFT;
1823 else
1824 return TEST_RIGHT;
1825 }
1826
1827 int journal_file_move_to_entry_by_seqnum(
1828 JournalFile *f,
1829 uint64_t seqnum,
1830 direction_t direction,
1831 Object **ret,
1832 uint64_t *offset) {
1833
1834 return generic_array_bisect(f,
1835 le64toh(f->header->entry_array_offset),
1836 le64toh(f->header->n_entries),
1837 seqnum,
1838 test_object_seqnum,
1839 direction,
1840 ret, offset, NULL);
1841 }
1842
1843 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1844 Object *o;
1845 int r;
1846
1847 assert(f);
1848 assert(p > 0);
1849
1850 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1851 if (r < 0)
1852 return r;
1853
1854 if (le64toh(o->entry.realtime) == needle)
1855 return TEST_FOUND;
1856 else if (le64toh(o->entry.realtime) < needle)
1857 return TEST_LEFT;
1858 else
1859 return TEST_RIGHT;
1860 }
1861
1862 int journal_file_move_to_entry_by_realtime(
1863 JournalFile *f,
1864 uint64_t realtime,
1865 direction_t direction,
1866 Object **ret,
1867 uint64_t *offset) {
1868
1869 return generic_array_bisect(f,
1870 le64toh(f->header->entry_array_offset),
1871 le64toh(f->header->n_entries),
1872 realtime,
1873 test_object_realtime,
1874 direction,
1875 ret, offset, NULL);
1876 }
1877
1878 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1879 Object *o;
1880 int r;
1881
1882 assert(f);
1883 assert(p > 0);
1884
1885 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1886 if (r < 0)
1887 return r;
1888
1889 if (le64toh(o->entry.monotonic) == needle)
1890 return TEST_FOUND;
1891 else if (le64toh(o->entry.monotonic) < needle)
1892 return TEST_LEFT;
1893 else
1894 return TEST_RIGHT;
1895 }
1896
1897 static inline int find_data_object_by_boot_id(
1898 JournalFile *f,
1899 sd_id128_t boot_id,
1900 Object **o,
1901 uint64_t *b) {
1902 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1903
1904 sd_id128_to_string(boot_id, t + 9);
1905 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1906 }
1907
1908 int journal_file_move_to_entry_by_monotonic(
1909 JournalFile *f,
1910 sd_id128_t boot_id,
1911 uint64_t monotonic,
1912 direction_t direction,
1913 Object **ret,
1914 uint64_t *offset) {
1915
1916 Object *o;
1917 int r;
1918
1919 assert(f);
1920
1921 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1922 if (r < 0)
1923 return r;
1924 if (r == 0)
1925 return -ENOENT;
1926
1927 return generic_array_bisect_plus_one(f,
1928 le64toh(o->data.entry_offset),
1929 le64toh(o->data.entry_array_offset),
1930 le64toh(o->data.n_entries),
1931 monotonic,
1932 test_object_monotonic,
1933 direction,
1934 ret, offset, NULL);
1935 }
1936
1937 int journal_file_next_entry(
1938 JournalFile *f,
1939 Object *o, uint64_t p,
1940 direction_t direction,
1941 Object **ret, uint64_t *offset) {
1942
1943 uint64_t i, n, ofs;
1944 int r;
1945
1946 assert(f);
1947 assert(p > 0 || !o);
1948
1949 n = le64toh(f->header->n_entries);
1950 if (n <= 0)
1951 return 0;
1952
1953 if (!o)
1954 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1955 else {
1956 if (o->object.type != OBJECT_ENTRY)
1957 return -EINVAL;
1958
1959 r = generic_array_bisect(f,
1960 le64toh(f->header->entry_array_offset),
1961 le64toh(f->header->n_entries),
1962 p,
1963 test_object_offset,
1964 DIRECTION_DOWN,
1965 NULL, NULL,
1966 &i);
1967 if (r <= 0)
1968 return r;
1969
1970 if (direction == DIRECTION_DOWN) {
1971 if (i >= n - 1)
1972 return 0;
1973
1974 i++;
1975 } else {
1976 if (i <= 0)
1977 return 0;
1978
1979 i--;
1980 }
1981 }
1982
1983 /* And jump to it */
1984 r = generic_array_get(f,
1985 le64toh(f->header->entry_array_offset),
1986 i,
1987 ret, &ofs);
1988 if (r <= 0)
1989 return r;
1990
1991 if (p > 0 &&
1992 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1993 log_debug("%s: entry array corrupted at entry %"PRIu64,
1994 f->path, i);
1995 return -EBADMSG;
1996 }
1997
1998 if (offset)
1999 *offset = ofs;
2000
2001 return 1;
2002 }
2003
2004 int journal_file_skip_entry(
2005 JournalFile *f,
2006 Object *o, uint64_t p,
2007 int64_t skip,
2008 Object **ret, uint64_t *offset) {
2009
2010 uint64_t i, n;
2011 int r;
2012
2013 assert(f);
2014 assert(o);
2015 assert(p > 0);
2016
2017 if (o->object.type != OBJECT_ENTRY)
2018 return -EINVAL;
2019
2020 r = generic_array_bisect(f,
2021 le64toh(f->header->entry_array_offset),
2022 le64toh(f->header->n_entries),
2023 p,
2024 test_object_offset,
2025 DIRECTION_DOWN,
2026 NULL, NULL,
2027 &i);
2028 if (r <= 0)
2029 return r;
2030
2031 /* Calculate new index */
2032 if (skip < 0) {
2033 if ((uint64_t) -skip >= i)
2034 i = 0;
2035 else
2036 i = i - (uint64_t) -skip;
2037 } else
2038 i += (uint64_t) skip;
2039
2040 n = le64toh(f->header->n_entries);
2041 if (n <= 0)
2042 return -EBADMSG;
2043
2044 if (i >= n)
2045 i = n-1;
2046
2047 return generic_array_get(f,
2048 le64toh(f->header->entry_array_offset),
2049 i,
2050 ret, offset);
2051 }
2052
2053 int journal_file_next_entry_for_data(
2054 JournalFile *f,
2055 Object *o, uint64_t p,
2056 uint64_t data_offset,
2057 direction_t direction,
2058 Object **ret, uint64_t *offset) {
2059
2060 uint64_t n, i;
2061 int r;
2062 Object *d;
2063
2064 assert(f);
2065 assert(p > 0 || !o);
2066
2067 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2068 if (r < 0)
2069 return r;
2070
2071 n = le64toh(d->data.n_entries);
2072 if (n <= 0)
2073 return n;
2074
2075 if (!o)
2076 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2077 else {
2078 if (o->object.type != OBJECT_ENTRY)
2079 return -EINVAL;
2080
2081 r = generic_array_bisect_plus_one(f,
2082 le64toh(d->data.entry_offset),
2083 le64toh(d->data.entry_array_offset),
2084 le64toh(d->data.n_entries),
2085 p,
2086 test_object_offset,
2087 DIRECTION_DOWN,
2088 NULL, NULL,
2089 &i);
2090
2091 if (r <= 0)
2092 return r;
2093
2094 if (direction == DIRECTION_DOWN) {
2095 if (i >= n - 1)
2096 return 0;
2097
2098 i++;
2099 } else {
2100 if (i <= 0)
2101 return 0;
2102
2103 i--;
2104 }
2105
2106 }
2107
2108 return generic_array_get_plus_one(f,
2109 le64toh(d->data.entry_offset),
2110 le64toh(d->data.entry_array_offset),
2111 i,
2112 ret, offset);
2113 }
2114
2115 int journal_file_move_to_entry_by_offset_for_data(
2116 JournalFile *f,
2117 uint64_t data_offset,
2118 uint64_t p,
2119 direction_t direction,
2120 Object **ret, uint64_t *offset) {
2121
2122 int r;
2123 Object *d;
2124
2125 assert(f);
2126
2127 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2128 if (r < 0)
2129 return r;
2130
2131 return generic_array_bisect_plus_one(f,
2132 le64toh(d->data.entry_offset),
2133 le64toh(d->data.entry_array_offset),
2134 le64toh(d->data.n_entries),
2135 p,
2136 test_object_offset,
2137 direction,
2138 ret, offset, NULL);
2139 }
2140
2141 int journal_file_move_to_entry_by_monotonic_for_data(
2142 JournalFile *f,
2143 uint64_t data_offset,
2144 sd_id128_t boot_id,
2145 uint64_t monotonic,
2146 direction_t direction,
2147 Object **ret, uint64_t *offset) {
2148
2149 Object *o, *d;
2150 int r;
2151 uint64_t b, z;
2152
2153 assert(f);
2154
2155 /* First, seek by time */
2156 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2157 if (r < 0)
2158 return r;
2159 if (r == 0)
2160 return -ENOENT;
2161
2162 r = generic_array_bisect_plus_one(f,
2163 le64toh(o->data.entry_offset),
2164 le64toh(o->data.entry_array_offset),
2165 le64toh(o->data.n_entries),
2166 monotonic,
2167 test_object_monotonic,
2168 direction,
2169 NULL, &z, NULL);
2170 if (r <= 0)
2171 return r;
2172
2173 /* And now, continue seeking until we find an entry that
2174 * exists in both bisection arrays */
2175
2176 for (;;) {
2177 Object *qo;
2178 uint64_t p, q;
2179
2180 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2181 if (r < 0)
2182 return r;
2183
2184 r = generic_array_bisect_plus_one(f,
2185 le64toh(d->data.entry_offset),
2186 le64toh(d->data.entry_array_offset),
2187 le64toh(d->data.n_entries),
2188 z,
2189 test_object_offset,
2190 direction,
2191 NULL, &p, NULL);
2192 if (r <= 0)
2193 return r;
2194
2195 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2196 if (r < 0)
2197 return r;
2198
2199 r = generic_array_bisect_plus_one(f,
2200 le64toh(o->data.entry_offset),
2201 le64toh(o->data.entry_array_offset),
2202 le64toh(o->data.n_entries),
2203 p,
2204 test_object_offset,
2205 direction,
2206 &qo, &q, NULL);
2207
2208 if (r <= 0)
2209 return r;
2210
2211 if (p == q) {
2212 if (ret)
2213 *ret = qo;
2214 if (offset)
2215 *offset = q;
2216
2217 return 1;
2218 }
2219
2220 z = q;
2221 }
2222 }
2223
2224 int journal_file_move_to_entry_by_seqnum_for_data(
2225 JournalFile *f,
2226 uint64_t data_offset,
2227 uint64_t seqnum,
2228 direction_t direction,
2229 Object **ret, uint64_t *offset) {
2230
2231 Object *d;
2232 int r;
2233
2234 assert(f);
2235
2236 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2237 if (r < 0)
2238 return r;
2239
2240 return generic_array_bisect_plus_one(f,
2241 le64toh(d->data.entry_offset),
2242 le64toh(d->data.entry_array_offset),
2243 le64toh(d->data.n_entries),
2244 seqnum,
2245 test_object_seqnum,
2246 direction,
2247 ret, offset, NULL);
2248 }
2249
2250 int journal_file_move_to_entry_by_realtime_for_data(
2251 JournalFile *f,
2252 uint64_t data_offset,
2253 uint64_t realtime,
2254 direction_t direction,
2255 Object **ret, uint64_t *offset) {
2256
2257 Object *d;
2258 int r;
2259
2260 assert(f);
2261
2262 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2263 if (r < 0)
2264 return r;
2265
2266 return generic_array_bisect_plus_one(f,
2267 le64toh(d->data.entry_offset),
2268 le64toh(d->data.entry_array_offset),
2269 le64toh(d->data.n_entries),
2270 realtime,
2271 test_object_realtime,
2272 direction,
2273 ret, offset, NULL);
2274 }
2275
2276 void journal_file_dump(JournalFile *f) {
2277 Object *o;
2278 int r;
2279 uint64_t p;
2280
2281 assert(f);
2282
2283 journal_file_print_header(f);
2284
2285 p = le64toh(f->header->header_size);
2286 while (p != 0) {
2287 r = journal_file_move_to_object(f, -1, p, &o);
2288 if (r < 0)
2289 goto fail;
2290
2291 switch (o->object.type) {
2292
2293 case OBJECT_UNUSED:
2294 printf("Type: OBJECT_UNUSED\n");
2295 break;
2296
2297 case OBJECT_DATA:
2298 printf("Type: OBJECT_DATA\n");
2299 break;
2300
2301 case OBJECT_FIELD:
2302 printf("Type: OBJECT_FIELD\n");
2303 break;
2304
2305 case OBJECT_ENTRY:
2306 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2307 le64toh(o->entry.seqnum),
2308 le64toh(o->entry.monotonic),
2309 le64toh(o->entry.realtime));
2310 break;
2311
2312 case OBJECT_FIELD_HASH_TABLE:
2313 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2314 break;
2315
2316 case OBJECT_DATA_HASH_TABLE:
2317 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2318 break;
2319
2320 case OBJECT_ENTRY_ARRAY:
2321 printf("Type: OBJECT_ENTRY_ARRAY\n");
2322 break;
2323
2324 case OBJECT_TAG:
2325 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2326 le64toh(o->tag.seqnum),
2327 le64toh(o->tag.epoch));
2328 break;
2329
2330 default:
2331 printf("Type: unknown (%u)\n", o->object.type);
2332 break;
2333 }
2334
2335 if (o->object.flags & OBJECT_COMPRESSED)
2336 printf("Flags: COMPRESSED\n");
2337
2338 if (p == le64toh(f->header->tail_object_offset))
2339 p = 0;
2340 else
2341 p = p + ALIGN64(le64toh(o->object.size));
2342 }
2343
2344 return;
2345 fail:
2346 log_error("File corrupt");
2347 }
2348
2349 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2350 const char *x;
2351
2352 x = format_timestamp(buf, l, t);
2353 if (x)
2354 return x;
2355 return " --- ";
2356 }
2357
2358 void journal_file_print_header(JournalFile *f) {
2359 char a[33], b[33], c[33], d[33];
2360 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2361 struct stat st;
2362 char bytes[FORMAT_BYTES_MAX];
2363
2364 assert(f);
2365
2366 printf("File Path: %s\n"
2367 "File ID: %s\n"
2368 "Machine ID: %s\n"
2369 "Boot ID: %s\n"
2370 "Sequential Number ID: %s\n"
2371 "State: %s\n"
2372 "Compatible Flags:%s%s\n"
2373 "Incompatible Flags:%s%s\n"
2374 "Header size: %"PRIu64"\n"
2375 "Arena size: %"PRIu64"\n"
2376 "Data Hash Table Size: %"PRIu64"\n"
2377 "Field Hash Table Size: %"PRIu64"\n"
2378 "Rotate Suggested: %s\n"
2379 "Head Sequential Number: %"PRIu64"\n"
2380 "Tail Sequential Number: %"PRIu64"\n"
2381 "Head Realtime Timestamp: %s\n"
2382 "Tail Realtime Timestamp: %s\n"
2383 "Tail Monotonic Timestamp: %s\n"
2384 "Objects: %"PRIu64"\n"
2385 "Entry Objects: %"PRIu64"\n",
2386 f->path,
2387 sd_id128_to_string(f->header->file_id, a),
2388 sd_id128_to_string(f->header->machine_id, b),
2389 sd_id128_to_string(f->header->boot_id, c),
2390 sd_id128_to_string(f->header->seqnum_id, d),
2391 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2392 f->header->state == STATE_ONLINE ? "ONLINE" :
2393 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2394 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2395 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2396 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2397 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2398 le64toh(f->header->header_size),
2399 le64toh(f->header->arena_size),
2400 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2401 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2402 yes_no(journal_file_rotate_suggested(f, 0)),
2403 le64toh(f->header->head_entry_seqnum),
2404 le64toh(f->header->tail_entry_seqnum),
2405 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2406 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2407 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2408 le64toh(f->header->n_objects),
2409 le64toh(f->header->n_entries));
2410
2411 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2412 printf("Data Objects: %"PRIu64"\n"
2413 "Data Hash Table Fill: %.1f%%\n",
2414 le64toh(f->header->n_data),
2415 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2416
2417 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2418 printf("Field Objects: %"PRIu64"\n"
2419 "Field Hash Table Fill: %.1f%%\n",
2420 le64toh(f->header->n_fields),
2421 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2422
2423 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2424 printf("Tag Objects: %"PRIu64"\n",
2425 le64toh(f->header->n_tags));
2426 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2427 printf("Entry Array Objects: %"PRIu64"\n",
2428 le64toh(f->header->n_entry_arrays));
2429
2430 if (fstat(f->fd, &st) >= 0)
2431 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2432 }
2433
2434 int journal_file_open(
2435 const char *fname,
2436 int flags,
2437 mode_t mode,
2438 bool compress,
2439 bool seal,
2440 JournalMetrics *metrics,
2441 MMapCache *mmap_cache,
2442 JournalFile *template,
2443 JournalFile **ret) {
2444
2445 JournalFile *f;
2446 int r;
2447 bool newly_created = false;
2448
2449 assert(fname);
2450 assert(ret);
2451
2452 if ((flags & O_ACCMODE) != O_RDONLY &&
2453 (flags & O_ACCMODE) != O_RDWR)
2454 return -EINVAL;
2455
2456 if (!endswith(fname, ".journal") &&
2457 !endswith(fname, ".journal~"))
2458 return -EINVAL;
2459
2460 f = new0(JournalFile, 1);
2461 if (!f)
2462 return -ENOMEM;
2463
2464 f->fd = -1;
2465 f->mode = mode;
2466
2467 f->flags = flags;
2468 f->prot = prot_from_flags(flags);
2469 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2470 #ifdef HAVE_XZ
2471 f->compress = compress;
2472 #endif
2473 #ifdef HAVE_GCRYPT
2474 f->seal = seal;
2475 #endif
2476
2477 if (mmap_cache)
2478 f->mmap = mmap_cache_ref(mmap_cache);
2479 else {
2480 f->mmap = mmap_cache_new();
2481 if (!f->mmap) {
2482 r = -ENOMEM;
2483 goto fail;
2484 }
2485 }
2486
2487 f->path = strdup(fname);
2488 if (!f->path) {
2489 r = -ENOMEM;
2490 goto fail;
2491 }
2492
2493 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2494 if (!f->chain_cache) {
2495 r = -ENOMEM;
2496 goto fail;
2497 }
2498
2499 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2500 if (f->fd < 0) {
2501 r = -errno;
2502 goto fail;
2503 }
2504
2505 if (fstat(f->fd, &f->last_stat) < 0) {
2506 r = -errno;
2507 goto fail;
2508 }
2509
2510 if (f->last_stat.st_size == 0 && f->writable) {
2511 uint64_t crtime;
2512
2513 /* Let's attach the creation time to the journal file,
2514 * so that the vacuuming code knows the age of this
2515 * file even if the file might end up corrupted one
2516 * day... Ideally we'd just use the creation time many
2517 * file systems maintain for each file, but there is
2518 * currently no usable API to query this, hence let's
2519 * emulate this via extended attributes. If extended
2520 * attributes are not supported we'll just skip this,
2521 * and rely solely on mtime/atime/ctime of the file.*/
2522
2523 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2524 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2525
2526 #ifdef HAVE_GCRYPT
2527 /* Try to load the FSPRG state, and if we can't, then
2528 * just don't do sealing */
2529 if (f->seal) {
2530 r = journal_file_fss_load(f);
2531 if (r < 0)
2532 f->seal = false;
2533 }
2534 #endif
2535
2536 r = journal_file_init_header(f, template);
2537 if (r < 0)
2538 goto fail;
2539
2540 if (fstat(f->fd, &f->last_stat) < 0) {
2541 r = -errno;
2542 goto fail;
2543 }
2544
2545 newly_created = true;
2546 }
2547
2548 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2549 r = -EIO;
2550 goto fail;
2551 }
2552
2553 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2554 if (f->header == MAP_FAILED) {
2555 f->header = NULL;
2556 r = -errno;
2557 goto fail;
2558 }
2559
2560 if (!newly_created) {
2561 r = journal_file_verify_header(f);
2562 if (r < 0)
2563 goto fail;
2564 }
2565
2566 #ifdef HAVE_GCRYPT
2567 if (!newly_created && f->writable) {
2568 r = journal_file_fss_load(f);
2569 if (r < 0)
2570 goto fail;
2571 }
2572 #endif
2573
2574 if (f->writable) {
2575 if (metrics) {
2576 journal_default_metrics(metrics, f->fd);
2577 f->metrics = *metrics;
2578 } else if (template)
2579 f->metrics = template->metrics;
2580
2581 r = journal_file_refresh_header(f);
2582 if (r < 0)
2583 goto fail;
2584 }
2585
2586 #ifdef HAVE_GCRYPT
2587 r = journal_file_hmac_setup(f);
2588 if (r < 0)
2589 goto fail;
2590 #endif
2591
2592 if (newly_created) {
2593 r = journal_file_setup_field_hash_table(f);
2594 if (r < 0)
2595 goto fail;
2596
2597 r = journal_file_setup_data_hash_table(f);
2598 if (r < 0)
2599 goto fail;
2600
2601 #ifdef HAVE_GCRYPT
2602 r = journal_file_append_first_tag(f);
2603 if (r < 0)
2604 goto fail;
2605 #endif
2606 }
2607
2608 r = journal_file_map_field_hash_table(f);
2609 if (r < 0)
2610 goto fail;
2611
2612 r = journal_file_map_data_hash_table(f);
2613 if (r < 0)
2614 goto fail;
2615
2616 *ret = f;
2617 return 0;
2618
2619 fail:
2620 journal_file_close(f);
2621
2622 return r;
2623 }
2624
2625 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2626 _cleanup_free_ char *p = NULL;
2627 size_t l;
2628 JournalFile *old_file, *new_file = NULL;
2629 int r;
2630
2631 assert(f);
2632 assert(*f);
2633
2634 old_file = *f;
2635
2636 if (!old_file->writable)
2637 return -EINVAL;
2638
2639 if (!endswith(old_file->path, ".journal"))
2640 return -EINVAL;
2641
2642 l = strlen(old_file->path);
2643 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2644 (int) l - 8, old_file->path,
2645 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2646 le64toh((*f)->header->head_entry_seqnum),
2647 le64toh((*f)->header->head_entry_realtime));
2648 if (r < 0)
2649 return -ENOMEM;
2650
2651 r = rename(old_file->path, p);
2652 if (r < 0)
2653 return -errno;
2654
2655 old_file->header->state = STATE_ARCHIVED;
2656
2657 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2658 journal_file_close(old_file);
2659
2660 *f = new_file;
2661 return r;
2662 }
2663
2664 int journal_file_open_reliably(
2665 const char *fname,
2666 int flags,
2667 mode_t mode,
2668 bool compress,
2669 bool seal,
2670 JournalMetrics *metrics,
2671 MMapCache *mmap_cache,
2672 JournalFile *template,
2673 JournalFile **ret) {
2674
2675 int r;
2676 size_t l;
2677 _cleanup_free_ char *p = NULL;
2678
2679 r = journal_file_open(fname, flags, mode, compress, seal,
2680 metrics, mmap_cache, template, ret);
2681 if (r != -EBADMSG && /* corrupted */
2682 r != -ENODATA && /* truncated */
2683 r != -EHOSTDOWN && /* other machine */
2684 r != -EPROTONOSUPPORT && /* incompatible feature */
2685 r != -EBUSY && /* unclean shutdown */
2686 r != -ESHUTDOWN /* already archived */)
2687 return r;
2688
2689 if ((flags & O_ACCMODE) == O_RDONLY)
2690 return r;
2691
2692 if (!(flags & O_CREAT))
2693 return r;
2694
2695 if (!endswith(fname, ".journal"))
2696 return r;
2697
2698 /* The file is corrupted. Rotate it away and try it again (but only once) */
2699
2700 l = strlen(fname);
2701 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2702 (int) l - 8, fname,
2703 (unsigned long long) now(CLOCK_REALTIME),
2704 random_u64()) < 0)
2705 return -ENOMEM;
2706
2707 r = rename(fname, p);
2708 if (r < 0)
2709 return -errno;
2710
2711 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2712
2713 return journal_file_open(fname, flags, mode, compress, seal,
2714 metrics, mmap_cache, template, ret);
2715 }
2716
2717 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2718 uint64_t i, n;
2719 uint64_t q, xor_hash = 0;
2720 int r;
2721 EntryItem *items;
2722 dual_timestamp ts;
2723
2724 assert(from);
2725 assert(to);
2726 assert(o);
2727 assert(p);
2728
2729 if (!to->writable)
2730 return -EPERM;
2731
2732 ts.monotonic = le64toh(o->entry.monotonic);
2733 ts.realtime = le64toh(o->entry.realtime);
2734
2735 n = journal_file_entry_n_items(o);
2736 /* alloca() can't take 0, hence let's allocate at least one */
2737 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2738
2739 for (i = 0; i < n; i++) {
2740 uint64_t l, h;
2741 le64_t le_hash;
2742 size_t t;
2743 void *data;
2744 Object *u;
2745
2746 q = le64toh(o->entry.items[i].object_offset);
2747 le_hash = o->entry.items[i].hash;
2748
2749 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2750 if (r < 0)
2751 return r;
2752
2753 if (le_hash != o->data.hash)
2754 return -EBADMSG;
2755
2756 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2757 t = (size_t) l;
2758
2759 /* We hit the limit on 32bit machines */
2760 if ((uint64_t) t != l)
2761 return -E2BIG;
2762
2763 if (o->object.flags & OBJECT_COMPRESSED) {
2764 #ifdef HAVE_XZ
2765 uint64_t rsize;
2766
2767 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2768 return -EBADMSG;
2769
2770 data = from->compress_buffer;
2771 l = rsize;
2772 #else
2773 return -EPROTONOSUPPORT;
2774 #endif
2775 } else
2776 data = o->data.payload;
2777
2778 r = journal_file_append_data(to, data, l, &u, &h);
2779 if (r < 0)
2780 return r;
2781
2782 xor_hash ^= le64toh(u->data.hash);
2783 items[i].object_offset = htole64(h);
2784 items[i].hash = u->data.hash;
2785
2786 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2787 if (r < 0)
2788 return r;
2789 }
2790
2791 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2792 }
2793
2794 void journal_default_metrics(JournalMetrics *m, int fd) {
2795 uint64_t fs_size = 0;
2796 struct statvfs ss;
2797 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2798
2799 assert(m);
2800 assert(fd >= 0);
2801
2802 if (fstatvfs(fd, &ss) >= 0)
2803 fs_size = ss.f_frsize * ss.f_blocks;
2804
2805 if (m->max_use == (uint64_t) -1) {
2806
2807 if (fs_size > 0) {
2808 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2809
2810 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2811 m->max_use = DEFAULT_MAX_USE_UPPER;
2812
2813 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2814 m->max_use = DEFAULT_MAX_USE_LOWER;
2815 } else
2816 m->max_use = DEFAULT_MAX_USE_LOWER;
2817 } else {
2818 m->max_use = PAGE_ALIGN(m->max_use);
2819
2820 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2821 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2822 }
2823
2824 if (m->max_size == (uint64_t) -1) {
2825 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2826
2827 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2828 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2829 } else
2830 m->max_size = PAGE_ALIGN(m->max_size);
2831
2832 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2833 m->max_size = JOURNAL_FILE_SIZE_MIN;
2834
2835 if (m->max_size*2 > m->max_use)
2836 m->max_use = m->max_size*2;
2837
2838 if (m->min_size == (uint64_t) -1)
2839 m->min_size = JOURNAL_FILE_SIZE_MIN;
2840 else {
2841 m->min_size = PAGE_ALIGN(m->min_size);
2842
2843 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2844 m->min_size = JOURNAL_FILE_SIZE_MIN;
2845
2846 if (m->min_size > m->max_size)
2847 m->max_size = m->min_size;
2848 }
2849
2850 if (m->keep_free == (uint64_t) -1) {
2851
2852 if (fs_size > 0) {
2853 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2854
2855 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2856 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2857
2858 } else
2859 m->keep_free = DEFAULT_KEEP_FREE;
2860 }
2861
2862 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2863 format_bytes(a, sizeof(a), m->max_use),
2864 format_bytes(b, sizeof(b), m->max_size),
2865 format_bytes(c, sizeof(c), m->min_size),
2866 format_bytes(d, sizeof(d), m->keep_free));
2867 }
2868
2869 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2870 assert(f);
2871 assert(from || to);
2872
2873 if (from) {
2874 if (f->header->head_entry_realtime == 0)
2875 return -ENOENT;
2876
2877 *from = le64toh(f->header->head_entry_realtime);
2878 }
2879
2880 if (to) {
2881 if (f->header->tail_entry_realtime == 0)
2882 return -ENOENT;
2883
2884 *to = le64toh(f->header->tail_entry_realtime);
2885 }
2886
2887 return 1;
2888 }
2889
2890 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2891 Object *o;
2892 uint64_t p;
2893 int r;
2894
2895 assert(f);
2896 assert(from || to);
2897
2898 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2899 if (r <= 0)
2900 return r;
2901
2902 if (le64toh(o->data.n_entries) <= 0)
2903 return 0;
2904
2905 if (from) {
2906 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2907 if (r < 0)
2908 return r;
2909
2910 *from = le64toh(o->entry.monotonic);
2911 }
2912
2913 if (to) {
2914 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2915 if (r < 0)
2916 return r;
2917
2918 r = generic_array_get_plus_one(f,
2919 le64toh(o->data.entry_offset),
2920 le64toh(o->data.entry_array_offset),
2921 le64toh(o->data.n_entries)-1,
2922 &o, NULL);
2923 if (r <= 0)
2924 return r;
2925
2926 *to = le64toh(o->entry.monotonic);
2927 }
2928
2929 return 1;
2930 }
2931
2932 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2933 assert(f);
2934
2935 /* If we gained new header fields we gained new features,
2936 * hence suggest a rotation */
2937 if (le64toh(f->header->header_size) < sizeof(Header)) {
2938 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2939 return true;
2940 }
2941
2942 /* Let's check if the hash tables grew over a certain fill
2943 * level (75%, borrowing this value from Java's hash table
2944 * implementation), and if so suggest a rotation. To calculate
2945 * the fill level we need the n_data field, which only exists
2946 * in newer versions. */
2947
2948 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2949 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2950 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2951 f->path,
2952 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2953 le64toh(f->header->n_data),
2954 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2955 (unsigned long long) f->last_stat.st_size,
2956 f->last_stat.st_size / le64toh(f->header->n_data));
2957 return true;
2958 }
2959
2960 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2961 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2962 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2963 f->path,
2964 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2965 le64toh(f->header->n_fields),
2966 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2967 return true;
2968 }
2969
2970 /* Are the data objects properly indexed by field objects? */
2971 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2972 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2973 le64toh(f->header->n_data) > 0 &&
2974 le64toh(f->header->n_fields) == 0)
2975 return true;
2976
2977 if (max_file_usec > 0) {
2978 usec_t t, h;
2979
2980 h = le64toh(f->header->head_entry_realtime);
2981 t = now(CLOCK_REALTIME);
2982
2983 if (h > 0 && t > h + max_file_usec)
2984 return true;
2985 }
2986
2987 return false;
2988 }