]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
install: make InstallContext::{will_install,have_installed} OrderedHashmaps
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <sys/xattr.h>
30
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
34 #include "lookup3.h"
35 #include "compress.h"
36 #include "fsprg.h"
37
/* Default sizes, in bytes, of the data and field hash tables (expressed
 * as a number of HashItem slots). The data table size is only a lower
 * bound, see journal_file_setup_data_hash_table(). */
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
40
/* Data payloads of at least this many bytes are candidates for
 * compression when appending data objects. */
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
45
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53
54 /* This is the upper bound if we deduce the keep_free value from the
55 * file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
58 /* This is the keep_free value when we can't determine the file system
59 * size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
61
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
64
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
67
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
70
71 static int journal_file_set_online(JournalFile *f) {
72 assert(f);
73
74 if (!f->writable)
75 return -EPERM;
76
77 if (!(f->fd >= 0 && f->header))
78 return -EINVAL;
79
80 switch(f->header->state) {
81 case STATE_ONLINE:
82 return 0;
83
84 case STATE_OFFLINE:
85 f->header->state = STATE_ONLINE;
86 fsync(f->fd);
87 return 0;
88
89 default:
90 return -EINVAL;
91 }
92 }
93
94 int journal_file_set_offline(JournalFile *f) {
95 assert(f);
96
97 if (!f->writable)
98 return -EPERM;
99
100 if (!(f->fd >= 0 && f->header))
101 return -EINVAL;
102
103 if (f->header->state != STATE_ONLINE)
104 return 0;
105
106 fsync(f->fd);
107
108 f->header->state = STATE_OFFLINE;
109
110 fsync(f->fd);
111
112 return 0;
113 }
114
115 void journal_file_close(JournalFile *f) {
116 assert(f);
117
118 #ifdef HAVE_GCRYPT
119 /* Write the final tag */
120 if (f->seal && f->writable)
121 journal_file_append_tag(f);
122 #endif
123
124 /* Sync everything to disk, before we mark the file offline */
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
127
128 journal_file_set_offline(f);
129
130 if (f->header)
131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
132
133 safe_close(f->fd);
134 free(f->path);
135
136 if (f->mmap)
137 mmap_cache_unref(f->mmap);
138
139 hashmap_free_free(f->chain_cache);
140
141 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
142 free(f->compress_buffer);
143 #endif
144
145 #ifdef HAVE_GCRYPT
146 if (f->fss_file)
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
150
151 free(f->fsprg_seed);
152
153 if (f->hmac)
154 gcry_md_close(f->hmac);
155 #endif
156
157 free(f);
158 }
159
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
161 Header h = {};
162 ssize_t k;
163 int r;
164
165 assert(f);
166
167 memcpy(h.signature, HEADER_SIGNATURE, 8);
168 h.header_size = htole64(ALIGN64(sizeof(h)));
169
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
173
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
176
177 r = sd_id128_randomize(&h.file_id);
178 if (r < 0)
179 return r;
180
181 if (template) {
182 h.seqnum_id = template->header->seqnum_id;
183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
184 } else
185 h.seqnum_id = h.file_id;
186
187 k = pwrite(f->fd, &h, sizeof(h), 0);
188 if (k < 0)
189 return -errno;
190
191 if (k != sizeof(h))
192 return -EIO;
193
194 return 0;
195 }
196
197 static int journal_file_refresh_header(JournalFile *f) {
198 int r;
199 sd_id128_t boot_id;
200
201 assert(f);
202
203 r = sd_id128_get_machine(&f->header->machine_id);
204 if (r < 0)
205 return r;
206
207 r = sd_id128_get_boot(&boot_id);
208 if (r < 0)
209 return r;
210
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
213
214 f->header->boot_id = boot_id;
215
216 journal_file_set_online(f);
217
218 /* Sync the online state to disk */
219 fsync(f->fd);
220
221 return 0;
222 }
223
224 static int journal_file_verify_header(JournalFile *f) {
225 uint32_t flags;
226
227 assert(f);
228
229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
230 return -EBADMSG;
231
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
240 if (flags)
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
243 return -EPROTONOSUPPORT;
244 }
245
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
254 if (flags)
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
258 }
259
260 if (f->header->state >= _STATE_MAX)
261 return -EBADMSG;
262
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
265 return -EBADMSG;
266
267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
268 return -EBADMSG;
269
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
271 return -ENODATA;
272
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
274 return -ENODATA;
275
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
280 return -ENODATA;
281
282 if (f->writable) {
283 uint8_t state;
284 sd_id128_t machine_id;
285 int r;
286
287 r = sd_id128_get_machine(&machine_id);
288 if (r < 0)
289 return r;
290
291 if (!sd_id128_equal(machine_id, f->header->machine_id))
292 return -EHOSTDOWN;
293
294 state = f->header->state;
295
296 if (state == STATE_ONLINE) {
297 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
298 return -EBUSY;
299 } else if (state == STATE_ARCHIVED)
300 return -ESHUTDOWN;
301 else if (state != STATE_OFFLINE) {
302 log_debug("Journal file %s has unknown state %u.", f->path, state);
303 return -EBUSY;
304 }
305 }
306
307 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
309
310 f->seal = JOURNAL_HEADER_SEALED(f->header);
311
312 return 0;
313 }
314
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316 uint64_t old_size, new_size;
317 int r;
318
319 assert(f);
320
321 /* We assume that this file is not sparse, and we know that
322 * for sure, since we always call posix_fallocate()
323 * ourselves */
324
325 old_size =
326 le64toh(f->header->header_size) +
327 le64toh(f->header->arena_size);
328
329 new_size = PAGE_ALIGN(offset + size);
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
332
333 if (new_size <= old_size)
334 return 0;
335
336 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
337 return -E2BIG;
338
339 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
340 struct statvfs svfs;
341
342 if (fstatvfs(f->fd, &svfs) >= 0) {
343 uint64_t available;
344
345 available = svfs.f_bfree * svfs.f_bsize;
346
347 if (available >= f->metrics.keep_free)
348 available -= f->metrics.keep_free;
349 else
350 available = 0;
351
352 if (new_size - old_size > available)
353 return -E2BIG;
354 }
355 }
356
357 /* Increase by larger blocks at once */
358 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360 new_size = f->metrics.max_size;
361
362 /* Note that the glibc fallocate() fallback is very
363 inefficient, hence we try to minimize the allocation area
364 as we can. */
365 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 if (r != 0)
367 return -r;
368
369 if (fstat(f->fd, &f->last_stat) < 0)
370 return -errno;
371
372 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
373
374 return 0;
375 }
376
377 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
378 assert(f);
379 assert(ret);
380
381 if (size <= 0)
382 return -EINVAL;
383
384 /* Avoid SIGBUS on invalid accesses */
385 if (offset + size > (uint64_t) f->last_stat.st_size) {
386 /* Hmm, out of range? Let's refresh the fstat() data
387 * first, before we trust that check. */
388
389 if (fstat(f->fd, &f->last_stat) < 0 ||
390 offset + size > (uint64_t) f->last_stat.st_size)
391 return -EADDRNOTAVAIL;
392 }
393
394 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret, NULL);
395 }
396
397 static uint64_t minimum_header_size(Object *o) {
398
399 static const uint64_t table[] = {
400 [OBJECT_DATA] = sizeof(DataObject),
401 [OBJECT_FIELD] = sizeof(FieldObject),
402 [OBJECT_ENTRY] = sizeof(EntryObject),
403 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
404 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
405 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
406 [OBJECT_TAG] = sizeof(TagObject),
407 };
408
409 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
410 return sizeof(ObjectHeader);
411
412 return table[o->object.type];
413 }
414
415 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
416 int r;
417 void *t;
418 Object *o;
419 uint64_t s;
420
421 assert(f);
422 assert(ret);
423
424 /* Objects may only be located at multiple of 64 bit */
425 if (!VALID64(offset))
426 return -EFAULT;
427
428 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
429 if (r < 0)
430 return r;
431
432 o = (Object*) t;
433 s = le64toh(o->object.size);
434
435 if (s < sizeof(ObjectHeader))
436 return -EBADMSG;
437
438 if (o->object.type <= OBJECT_UNUSED)
439 return -EBADMSG;
440
441 if (s < minimum_header_size(o))
442 return -EBADMSG;
443
444 if (type > 0 && o->object.type != type)
445 return -EBADMSG;
446
447 if (s > sizeof(ObjectHeader)) {
448 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
449 if (r < 0)
450 return r;
451
452 o = (Object*) t;
453 }
454
455 *ret = o;
456 return 0;
457 }
458
459 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
460 uint64_t r;
461
462 assert(f);
463
464 r = le64toh(f->header->tail_entry_seqnum) + 1;
465
466 if (seqnum) {
467 /* If an external seqnum counter was passed, we update
468 * both the local and the external one, and set it to
469 * the maximum of both */
470
471 if (*seqnum + 1 > r)
472 r = *seqnum + 1;
473
474 *seqnum = r;
475 }
476
477 f->header->tail_entry_seqnum = htole64(r);
478
479 if (f->header->head_entry_seqnum == 0)
480 f->header->head_entry_seqnum = htole64(r);
481
482 return r;
483 }
484
485 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
486 int r;
487 uint64_t p;
488 Object *tail, *o;
489 void *t;
490
491 assert(f);
492 assert(type > 0 && type < _OBJECT_TYPE_MAX);
493 assert(size >= sizeof(ObjectHeader));
494 assert(offset);
495 assert(ret);
496
497 r = journal_file_set_online(f);
498 if (r < 0)
499 return r;
500
501 p = le64toh(f->header->tail_object_offset);
502 if (p == 0)
503 p = le64toh(f->header->header_size);
504 else {
505 r = journal_file_move_to_object(f, -1, p, &tail);
506 if (r < 0)
507 return r;
508
509 p += ALIGN64(le64toh(tail->object.size));
510 }
511
512 r = journal_file_allocate(f, p, size);
513 if (r < 0)
514 return r;
515
516 r = journal_file_move_to(f, type, false, p, size, &t);
517 if (r < 0)
518 return r;
519
520 o = (Object*) t;
521
522 zero(o->object);
523 o->object.type = type;
524 o->object.size = htole64(size);
525
526 f->header->tail_object_offset = htole64(p);
527 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
528
529 *ret = o;
530 *offset = p;
531
532 return 0;
533 }
534
535 static int journal_file_setup_data_hash_table(JournalFile *f) {
536 uint64_t s, p;
537 Object *o;
538 int r;
539
540 assert(f);
541
542 /* We estimate that we need 1 hash table entry per 768 of
543 journal file and we want to make sure we never get beyond
544 75% fill level. Calculate the hash table size for the
545 maximum file size based on these metrics. */
546
547 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
548 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
549 s = DEFAULT_DATA_HASH_TABLE_SIZE;
550
551 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
552
553 r = journal_file_append_object(f,
554 OBJECT_DATA_HASH_TABLE,
555 offsetof(Object, hash_table.items) + s,
556 &o, &p);
557 if (r < 0)
558 return r;
559
560 memzero(o->hash_table.items, s);
561
562 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
563 f->header->data_hash_table_size = htole64(s);
564
565 return 0;
566 }
567
568 static int journal_file_setup_field_hash_table(JournalFile *f) {
569 uint64_t s, p;
570 Object *o;
571 int r;
572
573 assert(f);
574
575 /* We use a fixed size hash table for the fields as this
576 * number should grow very slowly only */
577
578 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
579 r = journal_file_append_object(f,
580 OBJECT_FIELD_HASH_TABLE,
581 offsetof(Object, hash_table.items) + s,
582 &o, &p);
583 if (r < 0)
584 return r;
585
586 memzero(o->hash_table.items, s);
587
588 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
589 f->header->field_hash_table_size = htole64(s);
590
591 return 0;
592 }
593
594 static int journal_file_map_data_hash_table(JournalFile *f) {
595 uint64_t s, p;
596 void *t;
597 int r;
598
599 assert(f);
600
601 p = le64toh(f->header->data_hash_table_offset);
602 s = le64toh(f->header->data_hash_table_size);
603
604 r = journal_file_move_to(f,
605 OBJECT_DATA_HASH_TABLE,
606 true,
607 p, s,
608 &t);
609 if (r < 0)
610 return r;
611
612 f->data_hash_table = t;
613 return 0;
614 }
615
616 static int journal_file_map_field_hash_table(JournalFile *f) {
617 uint64_t s, p;
618 void *t;
619 int r;
620
621 assert(f);
622
623 p = le64toh(f->header->field_hash_table_offset);
624 s = le64toh(f->header->field_hash_table_size);
625
626 r = journal_file_move_to(f,
627 OBJECT_FIELD_HASH_TABLE,
628 true,
629 p, s,
630 &t);
631 if (r < 0)
632 return r;
633
634 f->field_hash_table = t;
635 return 0;
636 }
637
638 static int journal_file_link_field(
639 JournalFile *f,
640 Object *o,
641 uint64_t offset,
642 uint64_t hash) {
643
644 uint64_t p, h;
645 int r;
646
647 assert(f);
648 assert(o);
649 assert(offset > 0);
650
651 if (o->object.type != OBJECT_FIELD)
652 return -EINVAL;
653
654 /* This might alter the window we are looking at */
655
656 o->field.next_hash_offset = o->field.head_data_offset = 0;
657
658 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
659 p = le64toh(f->field_hash_table[h].tail_hash_offset);
660 if (p == 0)
661 f->field_hash_table[h].head_hash_offset = htole64(offset);
662 else {
663 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
664 if (r < 0)
665 return r;
666
667 o->field.next_hash_offset = htole64(offset);
668 }
669
670 f->field_hash_table[h].tail_hash_offset = htole64(offset);
671
672 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
673 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
674
675 return 0;
676 }
677
678 static int journal_file_link_data(
679 JournalFile *f,
680 Object *o,
681 uint64_t offset,
682 uint64_t hash) {
683
684 uint64_t p, h;
685 int r;
686
687 assert(f);
688 assert(o);
689 assert(offset > 0);
690
691 if (o->object.type != OBJECT_DATA)
692 return -EINVAL;
693
694 /* This might alter the window we are looking at */
695
696 o->data.next_hash_offset = o->data.next_field_offset = 0;
697 o->data.entry_offset = o->data.entry_array_offset = 0;
698 o->data.n_entries = 0;
699
700 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
701 p = le64toh(f->data_hash_table[h].tail_hash_offset);
702 if (p == 0)
703 /* Only entry in the hash table is easy */
704 f->data_hash_table[h].head_hash_offset = htole64(offset);
705 else {
706 /* Move back to the previous data object, to patch in
707 * pointer */
708
709 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
710 if (r < 0)
711 return r;
712
713 o->data.next_hash_offset = htole64(offset);
714 }
715
716 f->data_hash_table[h].tail_hash_offset = htole64(offset);
717
718 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
719 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
720
721 return 0;
722 }
723
724 int journal_file_find_field_object_with_hash(
725 JournalFile *f,
726 const void *field, uint64_t size, uint64_t hash,
727 Object **ret, uint64_t *offset) {
728
729 uint64_t p, osize, h;
730 int r;
731
732 assert(f);
733 assert(field && size > 0);
734
735 osize = offsetof(Object, field.payload) + size;
736
737 if (f->header->field_hash_table_size == 0)
738 return -EBADMSG;
739
740 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
741 p = le64toh(f->field_hash_table[h].head_hash_offset);
742
743 while (p > 0) {
744 Object *o;
745
746 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
747 if (r < 0)
748 return r;
749
750 if (le64toh(o->field.hash) == hash &&
751 le64toh(o->object.size) == osize &&
752 memcmp(o->field.payload, field, size) == 0) {
753
754 if (ret)
755 *ret = o;
756 if (offset)
757 *offset = p;
758
759 return 1;
760 }
761
762 p = le64toh(o->field.next_hash_offset);
763 }
764
765 return 0;
766 }
767
768 int journal_file_find_field_object(
769 JournalFile *f,
770 const void *field, uint64_t size,
771 Object **ret, uint64_t *offset) {
772
773 uint64_t hash;
774
775 assert(f);
776 assert(field && size > 0);
777
778 hash = hash64(field, size);
779
780 return journal_file_find_field_object_with_hash(f,
781 field, size, hash,
782 ret, offset);
783 }
784
785 int journal_file_find_data_object_with_hash(
786 JournalFile *f,
787 const void *data, uint64_t size, uint64_t hash,
788 Object **ret, uint64_t *offset) {
789
790 uint64_t p, osize, h;
791 int r;
792
793 assert(f);
794 assert(data || size == 0);
795
796 osize = offsetof(Object, data.payload) + size;
797
798 if (f->header->data_hash_table_size == 0)
799 return -EBADMSG;
800
801 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
802 p = le64toh(f->data_hash_table[h].head_hash_offset);
803
804 while (p > 0) {
805 Object *o;
806
807 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
808 if (r < 0)
809 return r;
810
811 if (le64toh(o->data.hash) != hash)
812 goto next;
813
814 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
815 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
816 uint64_t l;
817 size_t rsize;
818
819 l = le64toh(o->object.size);
820 if (l <= offsetof(Object, data.payload))
821 return -EBADMSG;
822
823 l -= offsetof(Object, data.payload);
824
825 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
826 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
827 if (r < 0)
828 return r;
829
830 if (rsize == size &&
831 memcmp(f->compress_buffer, data, size) == 0) {
832
833 if (ret)
834 *ret = o;
835
836 if (offset)
837 *offset = p;
838
839 return 1;
840 }
841 #else
842 return -EPROTONOSUPPORT;
843 #endif
844 } else if (le64toh(o->object.size) == osize &&
845 memcmp(o->data.payload, data, size) == 0) {
846
847 if (ret)
848 *ret = o;
849
850 if (offset)
851 *offset = p;
852
853 return 1;
854 }
855
856 next:
857 p = le64toh(o->data.next_hash_offset);
858 }
859
860 return 0;
861 }
862
863 int journal_file_find_data_object(
864 JournalFile *f,
865 const void *data, uint64_t size,
866 Object **ret, uint64_t *offset) {
867
868 uint64_t hash;
869
870 assert(f);
871 assert(data || size == 0);
872
873 hash = hash64(data, size);
874
875 return journal_file_find_data_object_with_hash(f,
876 data, size, hash,
877 ret, offset);
878 }
879
880 static int journal_file_append_field(
881 JournalFile *f,
882 const void *field, uint64_t size,
883 Object **ret, uint64_t *offset) {
884
885 uint64_t hash, p;
886 uint64_t osize;
887 Object *o;
888 int r;
889
890 assert(f);
891 assert(field && size > 0);
892
893 hash = hash64(field, size);
894
895 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
896 if (r < 0)
897 return r;
898 else if (r > 0) {
899
900 if (ret)
901 *ret = o;
902
903 if (offset)
904 *offset = p;
905
906 return 0;
907 }
908
909 osize = offsetof(Object, field.payload) + size;
910 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
911 if (r < 0)
912 return r;
913
914 o->field.hash = htole64(hash);
915 memcpy(o->field.payload, field, size);
916
917 r = journal_file_link_field(f, o, p, hash);
918 if (r < 0)
919 return r;
920
921 /* The linking might have altered the window, so let's
922 * refresh our pointer */
923 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
924 if (r < 0)
925 return r;
926
927 #ifdef HAVE_GCRYPT
928 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
929 if (r < 0)
930 return r;
931 #endif
932
933 if (ret)
934 *ret = o;
935
936 if (offset)
937 *offset = p;
938
939 return 0;
940 }
941
942 static int journal_file_append_data(
943 JournalFile *f,
944 const void *data, uint64_t size,
945 Object **ret, uint64_t *offset) {
946
947 uint64_t hash, p;
948 uint64_t osize;
949 Object *o;
950 int r, compression = 0;
951 const void *eq;
952
953 assert(f);
954 assert(data || size == 0);
955
956 hash = hash64(data, size);
957
958 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
959 if (r < 0)
960 return r;
961 else if (r > 0) {
962
963 if (ret)
964 *ret = o;
965
966 if (offset)
967 *offset = p;
968
969 return 0;
970 }
971
972 osize = offsetof(Object, data.payload) + size;
973 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
974 if (r < 0)
975 return r;
976
977 o->data.hash = htole64(hash);
978
979 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
980 if (f->compress_xz &&
981 size >= COMPRESSION_SIZE_THRESHOLD) {
982 size_t rsize;
983
984 compression = compress_blob(data, size, o->data.payload, &rsize);
985
986 if (compression) {
987 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
988 o->object.flags |= compression;
989
990 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
991 size, rsize, object_compressed_to_string(compression));
992 }
993 }
994 #endif
995
996 if (!compression && size > 0)
997 memcpy(o->data.payload, data, size);
998
999 r = journal_file_link_data(f, o, p, hash);
1000 if (r < 0)
1001 return r;
1002
1003 /* The linking might have altered the window, so let's
1004 * refresh our pointer */
1005 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1006 if (r < 0)
1007 return r;
1008
1009 if (!data)
1010 eq = NULL;
1011 else
1012 eq = memchr(data, '=', size);
1013 if (eq && eq > data) {
1014 Object *fo = NULL;
1015 uint64_t fp;
1016
1017 /* Create field object ... */
1018 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1019 if (r < 0)
1020 return r;
1021
1022 /* ... and link it in. */
1023 o->data.next_field_offset = fo->field.head_data_offset;
1024 fo->field.head_data_offset = le64toh(p);
1025 }
1026
1027 #ifdef HAVE_GCRYPT
1028 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1029 if (r < 0)
1030 return r;
1031 #endif
1032
1033 if (ret)
1034 *ret = o;
1035
1036 if (offset)
1037 *offset = p;
1038
1039 return 0;
1040 }
1041
1042 uint64_t journal_file_entry_n_items(Object *o) {
1043 assert(o);
1044
1045 if (o->object.type != OBJECT_ENTRY)
1046 return 0;
1047
1048 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1049 }
1050
1051 uint64_t journal_file_entry_array_n_items(Object *o) {
1052 assert(o);
1053
1054 if (o->object.type != OBJECT_ENTRY_ARRAY)
1055 return 0;
1056
1057 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1058 }
1059
1060 uint64_t journal_file_hash_table_n_items(Object *o) {
1061 assert(o);
1062
1063 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1064 o->object.type != OBJECT_FIELD_HASH_TABLE)
1065 return 0;
1066
1067 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1068 }
1069
1070 static int link_entry_into_array(JournalFile *f,
1071 le64_t *first,
1072 le64_t *idx,
1073 uint64_t p) {
1074 int r;
1075 uint64_t n = 0, ap = 0, q, i, a, hidx;
1076 Object *o;
1077
1078 assert(f);
1079 assert(first);
1080 assert(idx);
1081 assert(p > 0);
1082
1083 a = le64toh(*first);
1084 i = hidx = le64toh(*idx);
1085 while (a > 0) {
1086
1087 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1088 if (r < 0)
1089 return r;
1090
1091 n = journal_file_entry_array_n_items(o);
1092 if (i < n) {
1093 o->entry_array.items[i] = htole64(p);
1094 *idx = htole64(hidx + 1);
1095 return 0;
1096 }
1097
1098 i -= n;
1099 ap = a;
1100 a = le64toh(o->entry_array.next_entry_array_offset);
1101 }
1102
1103 if (hidx > n)
1104 n = (hidx+1) * 2;
1105 else
1106 n = n * 2;
1107
1108 if (n < 4)
1109 n = 4;
1110
1111 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1112 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1113 &o, &q);
1114 if (r < 0)
1115 return r;
1116
1117 #ifdef HAVE_GCRYPT
1118 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1119 if (r < 0)
1120 return r;
1121 #endif
1122
1123 o->entry_array.items[i] = htole64(p);
1124
1125 if (ap == 0)
1126 *first = htole64(q);
1127 else {
1128 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1129 if (r < 0)
1130 return r;
1131
1132 o->entry_array.next_entry_array_offset = htole64(q);
1133 }
1134
1135 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1136 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1137
1138 *idx = htole64(hidx + 1);
1139
1140 return 0;
1141 }
1142
1143 static int link_entry_into_array_plus_one(JournalFile *f,
1144 le64_t *extra,
1145 le64_t *first,
1146 le64_t *idx,
1147 uint64_t p) {
1148
1149 int r;
1150
1151 assert(f);
1152 assert(extra);
1153 assert(first);
1154 assert(idx);
1155 assert(p > 0);
1156
1157 if (*idx == 0)
1158 *extra = htole64(p);
1159 else {
1160 le64_t i;
1161
1162 i = htole64(le64toh(*idx) - 1);
1163 r = link_entry_into_array(f, first, &i, p);
1164 if (r < 0)
1165 return r;
1166 }
1167
1168 *idx = htole64(le64toh(*idx) + 1);
1169 return 0;
1170 }
1171
1172 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1173 uint64_t p;
1174 int r;
1175 assert(f);
1176 assert(o);
1177 assert(offset > 0);
1178
1179 p = le64toh(o->entry.items[i].object_offset);
1180 if (p == 0)
1181 return -EINVAL;
1182
1183 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1184 if (r < 0)
1185 return r;
1186
1187 return link_entry_into_array_plus_one(f,
1188 &o->data.entry_offset,
1189 &o->data.entry_array_offset,
1190 &o->data.n_entries,
1191 offset);
1192 }
1193
1194 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1195 uint64_t n, i;
1196 int r;
1197
1198 assert(f);
1199 assert(o);
1200 assert(offset > 0);
1201
1202 if (o->object.type != OBJECT_ENTRY)
1203 return -EINVAL;
1204
1205 __sync_synchronize();
1206
1207 /* Link up the entry itself */
1208 r = link_entry_into_array(f,
1209 &f->header->entry_array_offset,
1210 &f->header->n_entries,
1211 offset);
1212 if (r < 0)
1213 return r;
1214
1215 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1216
1217 if (f->header->head_entry_realtime == 0)
1218 f->header->head_entry_realtime = o->entry.realtime;
1219
1220 f->header->tail_entry_realtime = o->entry.realtime;
1221 f->header->tail_entry_monotonic = o->entry.monotonic;
1222
1223 f->tail_entry_monotonic_valid = true;
1224
1225 /* Link up the items */
1226 n = journal_file_entry_n_items(o);
1227 for (i = 0; i < n; i++) {
1228 r = journal_file_link_entry_item(f, o, offset, i);
1229 if (r < 0)
1230 return r;
1231 }
1232
1233 return 0;
1234 }
1235
1236 static int journal_file_append_entry_internal(
1237 JournalFile *f,
1238 const dual_timestamp *ts,
1239 uint64_t xor_hash,
1240 const EntryItem items[], unsigned n_items,
1241 uint64_t *seqnum,
1242 Object **ret, uint64_t *offset) {
1243 uint64_t np;
1244 uint64_t osize;
1245 Object *o;
1246 int r;
1247
1248 assert(f);
1249 assert(items || n_items == 0);
1250 assert(ts);
1251
1252 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1253
1254 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1255 if (r < 0)
1256 return r;
1257
1258 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1259 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1260 o->entry.realtime = htole64(ts->realtime);
1261 o->entry.monotonic = htole64(ts->monotonic);
1262 o->entry.xor_hash = htole64(xor_hash);
1263 o->entry.boot_id = f->header->boot_id;
1264
1265 #ifdef HAVE_GCRYPT
1266 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1267 if (r < 0)
1268 return r;
1269 #endif
1270
1271 r = journal_file_link_entry(f, o, np);
1272 if (r < 0)
1273 return r;
1274
1275 if (ret)
1276 *ret = o;
1277
1278 if (offset)
1279 *offset = np;
1280
1281 return 0;
1282 }
1283
1284 void journal_file_post_change(JournalFile *f) {
1285 assert(f);
1286
1287 /* inotify() does not receive IN_MODIFY events from file
1288 * accesses done via mmap(). After each access we hence
1289 * trigger IN_MODIFY by truncating the journal file to its
1290 * current size which triggers IN_MODIFY. */
1291
1292 __sync_synchronize();
1293
1294 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1295 log_error("Failed to truncate file to its own size: %m");
1296 }
1297
1298 static int entry_item_cmp(const void *_a, const void *_b) {
1299 const EntryItem *a = _a, *b = _b;
1300
1301 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1302 return -1;
1303 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1304 return 1;
1305 return 0;
1306 }
1307
1308 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1309 unsigned i;
1310 EntryItem *items;
1311 int r;
1312 uint64_t xor_hash = 0;
1313 struct dual_timestamp _ts;
1314
1315 assert(f);
1316 assert(iovec || n_iovec == 0);
1317
1318 if (!ts) {
1319 dual_timestamp_get(&_ts);
1320 ts = &_ts;
1321 }
1322
1323 if (f->tail_entry_monotonic_valid &&
1324 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1325 return -EINVAL;
1326
1327 #ifdef HAVE_GCRYPT
1328 r = journal_file_maybe_append_tag(f, ts->realtime);
1329 if (r < 0)
1330 return r;
1331 #endif
1332
1333 /* alloca() can't take 0, hence let's allocate at least one */
1334 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1335
1336 for (i = 0; i < n_iovec; i++) {
1337 uint64_t p;
1338 Object *o;
1339
1340 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1341 if (r < 0)
1342 return r;
1343
1344 xor_hash ^= le64toh(o->data.hash);
1345 items[i].object_offset = htole64(p);
1346 items[i].hash = o->data.hash;
1347 }
1348
1349 /* Order by the position on disk, in order to improve seek
1350 * times for rotating media. */
1351 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1352
1353 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1354
1355 journal_file_post_change(f);
1356
1357 return r;
1358 }
1359
/* A remembered position within a chain of entry array objects, used
 * to avoid walking the chain from its start again. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1367
1368 static void chain_cache_put(
1369 Hashmap *h,
1370 ChainCacheItem *ci,
1371 uint64_t first,
1372 uint64_t array,
1373 uint64_t begin,
1374 uint64_t total,
1375 uint64_t last_index) {
1376
1377 if (!ci) {
1378 /* If the chain item to cache for this chain is the
1379 * first one it's not worth caching anything */
1380 if (array == first)
1381 return;
1382
1383 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1384 ci = hashmap_steal_first(h);
1385 else {
1386 ci = new(ChainCacheItem, 1);
1387 if (!ci)
1388 return;
1389 }
1390
1391 ci->first = first;
1392
1393 if (hashmap_put(h, &ci->first, ci) < 0) {
1394 free(ci);
1395 return;
1396 }
1397 } else
1398 assert(ci->first == first);
1399
1400 ci->array = array;
1401 ci->begin = begin;
1402 ci->total = total;
1403 ci->last_index = last_index;
1404 }
1405
1406 static int generic_array_get(
1407 JournalFile *f,
1408 uint64_t first,
1409 uint64_t i,
1410 Object **ret, uint64_t *offset) {
1411
1412 Object *o;
1413 uint64_t p = 0, a, t = 0;
1414 int r;
1415 ChainCacheItem *ci;
1416
1417 assert(f);
1418
1419 a = first;
1420
1421 /* Try the chain cache first */
1422 ci = hashmap_get(f->chain_cache, &first);
1423 if (ci && i > ci->total) {
1424 a = ci->array;
1425 i -= ci->total;
1426 t = ci->total;
1427 }
1428
1429 while (a > 0) {
1430 uint64_t k;
1431
1432 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1433 if (r < 0)
1434 return r;
1435
1436 k = journal_file_entry_array_n_items(o);
1437 if (i < k) {
1438 p = le64toh(o->entry_array.items[i]);
1439 goto found;
1440 }
1441
1442 i -= k;
1443 t += k;
1444 a = le64toh(o->entry_array.next_entry_array_offset);
1445 }
1446
1447 return 0;
1448
1449 found:
1450 /* Let's cache this item for the next invocation */
1451 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1452
1453 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1454 if (r < 0)
1455 return r;
1456
1457 if (ret)
1458 *ret = o;
1459
1460 if (offset)
1461 *offset = p;
1462
1463 return 1;
1464 }
1465
1466 static int generic_array_get_plus_one(
1467 JournalFile *f,
1468 uint64_t extra,
1469 uint64_t first,
1470 uint64_t i,
1471 Object **ret, uint64_t *offset) {
1472
1473 Object *o;
1474
1475 assert(f);
1476
1477 if (i == 0) {
1478 int r;
1479
1480 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1481 if (r < 0)
1482 return r;
1483
1484 if (ret)
1485 *ret = o;
1486
1487 if (offset)
1488 *offset = extra;
1489
1490 return 1;
1491 }
1492
1493 return generic_array_get(f, first, i-1, ret, offset);
1494 }
1495
/* Results of the test_object callbacks used when bisecting */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object compares lower than the needle */
        TEST_RIGHT = 2  /* the tested object compares higher than the needle */
};
1501
1502 static int generic_array_bisect(
1503 JournalFile *f,
1504 uint64_t first,
1505 uint64_t n,
1506 uint64_t needle,
1507 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1508 direction_t direction,
1509 Object **ret,
1510 uint64_t *offset,
1511 uint64_t *idx) {
1512
1513 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1514 bool subtract_one = false;
1515 Object *o, *array = NULL;
1516 int r;
1517 ChainCacheItem *ci;
1518
1519 assert(f);
1520 assert(test_object);
1521
1522 /* Start with the first array in the chain */
1523 a = first;
1524
1525 ci = hashmap_get(f->chain_cache, &first);
1526 if (ci && n > ci->total) {
1527 /* Ah, we have iterated this bisection array chain
1528 * previously! Let's see if we can skip ahead in the
1529 * chain, as far as the last time. But we can't jump
1530 * backwards in the chain, so let's check that
1531 * first. */
1532
1533 r = test_object(f, ci->begin, needle);
1534 if (r < 0)
1535 return r;
1536
1537 if (r == TEST_LEFT) {
1538 /* OK, what we are looking for is right of the
1539 * begin of this EntryArray, so let's jump
1540 * straight to previously cached array in the
1541 * chain */
1542
1543 a = ci->array;
1544 n -= ci->total;
1545 t = ci->total;
1546 last_index = ci->last_index;
1547 }
1548 }
1549
1550 while (a > 0) {
1551 uint64_t left, right, k, lp;
1552
1553 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1554 if (r < 0)
1555 return r;
1556
1557 k = journal_file_entry_array_n_items(array);
1558 right = MIN(k, n);
1559 if (right <= 0)
1560 return 0;
1561
1562 i = right - 1;
1563 lp = p = le64toh(array->entry_array.items[i]);
1564 if (p <= 0)
1565 return -EBADMSG;
1566
1567 r = test_object(f, p, needle);
1568 if (r < 0)
1569 return r;
1570
1571 if (r == TEST_FOUND)
1572 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1573
1574 if (r == TEST_RIGHT) {
1575 left = 0;
1576 right -= 1;
1577
1578 if (last_index != (uint64_t) -1) {
1579 assert(last_index <= right);
1580
1581 /* If we cached the last index we
1582 * looked at, let's try to not to jump
1583 * too wildly around and see if we can
1584 * limit the range to look at early to
1585 * the immediate neighbors of the last
1586 * index we looked at. */
1587
1588 if (last_index > 0) {
1589 uint64_t x = last_index - 1;
1590
1591 p = le64toh(array->entry_array.items[x]);
1592 if (p <= 0)
1593 return -EBADMSG;
1594
1595 r = test_object(f, p, needle);
1596 if (r < 0)
1597 return r;
1598
1599 if (r == TEST_FOUND)
1600 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1601
1602 if (r == TEST_RIGHT)
1603 right = x;
1604 else
1605 left = x + 1;
1606 }
1607
1608 if (last_index < right) {
1609 uint64_t y = last_index + 1;
1610
1611 p = le64toh(array->entry_array.items[y]);
1612 if (p <= 0)
1613 return -EBADMSG;
1614
1615 r = test_object(f, p, needle);
1616 if (r < 0)
1617 return r;
1618
1619 if (r == TEST_FOUND)
1620 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1621
1622 if (r == TEST_RIGHT)
1623 right = y;
1624 else
1625 left = y + 1;
1626 }
1627 }
1628
1629 for (;;) {
1630 if (left == right) {
1631 if (direction == DIRECTION_UP)
1632 subtract_one = true;
1633
1634 i = left;
1635 goto found;
1636 }
1637
1638 assert(left < right);
1639 i = (left + right) / 2;
1640
1641 p = le64toh(array->entry_array.items[i]);
1642 if (p <= 0)
1643 return -EBADMSG;
1644
1645 r = test_object(f, p, needle);
1646 if (r < 0)
1647 return r;
1648
1649 if (r == TEST_FOUND)
1650 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1651
1652 if (r == TEST_RIGHT)
1653 right = i;
1654 else
1655 left = i + 1;
1656 }
1657 }
1658
1659 if (k > n) {
1660 if (direction == DIRECTION_UP) {
1661 i = n;
1662 subtract_one = true;
1663 goto found;
1664 }
1665
1666 return 0;
1667 }
1668
1669 last_p = lp;
1670
1671 n -= k;
1672 t += k;
1673 last_index = (uint64_t) -1;
1674 a = le64toh(array->entry_array.next_entry_array_offset);
1675 }
1676
1677 return 0;
1678
1679 found:
1680 if (subtract_one && t == 0 && i == 0)
1681 return 0;
1682
1683 /* Let's cache this item for the next invocation */
1684 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1685
1686 if (subtract_one && i == 0)
1687 p = last_p;
1688 else if (subtract_one)
1689 p = le64toh(array->entry_array.items[i-1]);
1690 else
1691 p = le64toh(array->entry_array.items[i]);
1692
1693 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1694 if (r < 0)
1695 return r;
1696
1697 if (ret)
1698 *ret = o;
1699
1700 if (offset)
1701 *offset = p;
1702
1703 if (idx)
1704 *idx = t + i + (subtract_one ? -1 : 0);
1705
1706 return 1;
1707 }
1708
1709
1710 static int generic_array_bisect_plus_one(
1711 JournalFile *f,
1712 uint64_t extra,
1713 uint64_t first,
1714 uint64_t n,
1715 uint64_t needle,
1716 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1717 direction_t direction,
1718 Object **ret,
1719 uint64_t *offset,
1720 uint64_t *idx) {
1721
1722 int r;
1723 bool step_back = false;
1724 Object *o;
1725
1726 assert(f);
1727 assert(test_object);
1728
1729 if (n <= 0)
1730 return 0;
1731
1732 /* This bisects the array in object 'first', but first checks
1733 * an extra */
1734 r = test_object(f, extra, needle);
1735 if (r < 0)
1736 return r;
1737
1738 if (r == TEST_FOUND)
1739 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1740
1741 /* if we are looking with DIRECTION_UP then we need to first
1742 see if in the actual array there is a matching entry, and
1743 return the last one of that. But if there isn't any we need
1744 to return this one. Hence remember this, and return it
1745 below. */
1746 if (r == TEST_LEFT)
1747 step_back = direction == DIRECTION_UP;
1748
1749 if (r == TEST_RIGHT) {
1750 if (direction == DIRECTION_DOWN)
1751 goto found;
1752 else
1753 return 0;
1754 }
1755
1756 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1757
1758 if (r == 0 && step_back)
1759 goto found;
1760
1761 if (r > 0 && idx)
1762 (*idx) ++;
1763
1764 return r;
1765
1766 found:
1767 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1768 if (r < 0)
1769 return r;
1770
1771 if (ret)
1772 *ret = o;
1773
1774 if (offset)
1775 *offset = extra;
1776
1777 if (idx)
1778 *idx = 0;
1779
1780 return 1;
1781 }
1782
1783 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1784 assert(f);
1785 assert(p > 0);
1786
1787 if (p == needle)
1788 return TEST_FOUND;
1789 else if (p < needle)
1790 return TEST_LEFT;
1791 else
1792 return TEST_RIGHT;
1793 }
1794
1795 int journal_file_move_to_entry_by_offset(
1796 JournalFile *f,
1797 uint64_t p,
1798 direction_t direction,
1799 Object **ret,
1800 uint64_t *offset) {
1801
1802 return generic_array_bisect(f,
1803 le64toh(f->header->entry_array_offset),
1804 le64toh(f->header->n_entries),
1805 p,
1806 test_object_offset,
1807 direction,
1808 ret, offset, NULL);
1809 }
1810
1811
1812 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1813 Object *o;
1814 int r;
1815
1816 assert(f);
1817 assert(p > 0);
1818
1819 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1820 if (r < 0)
1821 return r;
1822
1823 if (le64toh(o->entry.seqnum) == needle)
1824 return TEST_FOUND;
1825 else if (le64toh(o->entry.seqnum) < needle)
1826 return TEST_LEFT;
1827 else
1828 return TEST_RIGHT;
1829 }
1830
1831 int journal_file_move_to_entry_by_seqnum(
1832 JournalFile *f,
1833 uint64_t seqnum,
1834 direction_t direction,
1835 Object **ret,
1836 uint64_t *offset) {
1837
1838 return generic_array_bisect(f,
1839 le64toh(f->header->entry_array_offset),
1840 le64toh(f->header->n_entries),
1841 seqnum,
1842 test_object_seqnum,
1843 direction,
1844 ret, offset, NULL);
1845 }
1846
1847 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1848 Object *o;
1849 int r;
1850
1851 assert(f);
1852 assert(p > 0);
1853
1854 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1855 if (r < 0)
1856 return r;
1857
1858 if (le64toh(o->entry.realtime) == needle)
1859 return TEST_FOUND;
1860 else if (le64toh(o->entry.realtime) < needle)
1861 return TEST_LEFT;
1862 else
1863 return TEST_RIGHT;
1864 }
1865
1866 int journal_file_move_to_entry_by_realtime(
1867 JournalFile *f,
1868 uint64_t realtime,
1869 direction_t direction,
1870 Object **ret,
1871 uint64_t *offset) {
1872
1873 return generic_array_bisect(f,
1874 le64toh(f->header->entry_array_offset),
1875 le64toh(f->header->n_entries),
1876 realtime,
1877 test_object_realtime,
1878 direction,
1879 ret, offset, NULL);
1880 }
1881
1882 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1883 Object *o;
1884 int r;
1885
1886 assert(f);
1887 assert(p > 0);
1888
1889 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1890 if (r < 0)
1891 return r;
1892
1893 if (le64toh(o->entry.monotonic) == needle)
1894 return TEST_FOUND;
1895 else if (le64toh(o->entry.monotonic) < needle)
1896 return TEST_LEFT;
1897 else
1898 return TEST_RIGHT;
1899 }
1900
1901 static inline int find_data_object_by_boot_id(
1902 JournalFile *f,
1903 sd_id128_t boot_id,
1904 Object **o,
1905 uint64_t *b) {
1906 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1907
1908 sd_id128_to_string(boot_id, t + 9);
1909 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1910 }
1911
1912 int journal_file_move_to_entry_by_monotonic(
1913 JournalFile *f,
1914 sd_id128_t boot_id,
1915 uint64_t monotonic,
1916 direction_t direction,
1917 Object **ret,
1918 uint64_t *offset) {
1919
1920 Object *o;
1921 int r;
1922
1923 assert(f);
1924
1925 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1926 if (r < 0)
1927 return r;
1928 if (r == 0)
1929 return -ENOENT;
1930
1931 return generic_array_bisect_plus_one(f,
1932 le64toh(o->data.entry_offset),
1933 le64toh(o->data.entry_array_offset),
1934 le64toh(o->data.n_entries),
1935 monotonic,
1936 test_object_monotonic,
1937 direction,
1938 ret, offset, NULL);
1939 }
1940
1941 int journal_file_next_entry(
1942 JournalFile *f,
1943 Object *o, uint64_t p,
1944 direction_t direction,
1945 Object **ret, uint64_t *offset) {
1946
1947 uint64_t i, n, ofs;
1948 int r;
1949
1950 assert(f);
1951 assert(p > 0 || !o);
1952
1953 n = le64toh(f->header->n_entries);
1954 if (n <= 0)
1955 return 0;
1956
1957 if (!o)
1958 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1959 else {
1960 if (o->object.type != OBJECT_ENTRY)
1961 return -EINVAL;
1962
1963 r = generic_array_bisect(f,
1964 le64toh(f->header->entry_array_offset),
1965 le64toh(f->header->n_entries),
1966 p,
1967 test_object_offset,
1968 DIRECTION_DOWN,
1969 NULL, NULL,
1970 &i);
1971 if (r <= 0)
1972 return r;
1973
1974 if (direction == DIRECTION_DOWN) {
1975 if (i >= n - 1)
1976 return 0;
1977
1978 i++;
1979 } else {
1980 if (i <= 0)
1981 return 0;
1982
1983 i--;
1984 }
1985 }
1986
1987 /* And jump to it */
1988 r = generic_array_get(f,
1989 le64toh(f->header->entry_array_offset),
1990 i,
1991 ret, &ofs);
1992 if (r <= 0)
1993 return r;
1994
1995 if (p > 0 &&
1996 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1997 log_debug("%s: entry array corrupted at entry %"PRIu64,
1998 f->path, i);
1999 return -EBADMSG;
2000 }
2001
2002 if (offset)
2003 *offset = ofs;
2004
2005 return 1;
2006 }
2007
2008 int journal_file_skip_entry(
2009 JournalFile *f,
2010 Object *o, uint64_t p,
2011 int64_t skip,
2012 Object **ret, uint64_t *offset) {
2013
2014 uint64_t i, n;
2015 int r;
2016
2017 assert(f);
2018 assert(o);
2019 assert(p > 0);
2020
2021 if (o->object.type != OBJECT_ENTRY)
2022 return -EINVAL;
2023
2024 r = generic_array_bisect(f,
2025 le64toh(f->header->entry_array_offset),
2026 le64toh(f->header->n_entries),
2027 p,
2028 test_object_offset,
2029 DIRECTION_DOWN,
2030 NULL, NULL,
2031 &i);
2032 if (r <= 0)
2033 return r;
2034
2035 /* Calculate new index */
2036 if (skip < 0) {
2037 if ((uint64_t) -skip >= i)
2038 i = 0;
2039 else
2040 i = i - (uint64_t) -skip;
2041 } else
2042 i += (uint64_t) skip;
2043
2044 n = le64toh(f->header->n_entries);
2045 if (n <= 0)
2046 return -EBADMSG;
2047
2048 if (i >= n)
2049 i = n-1;
2050
2051 return generic_array_get(f,
2052 le64toh(f->header->entry_array_offset),
2053 i,
2054 ret, offset);
2055 }
2056
2057 int journal_file_next_entry_for_data(
2058 JournalFile *f,
2059 Object *o, uint64_t p,
2060 uint64_t data_offset,
2061 direction_t direction,
2062 Object **ret, uint64_t *offset) {
2063
2064 uint64_t n, i;
2065 int r;
2066 Object *d;
2067
2068 assert(f);
2069 assert(p > 0 || !o);
2070
2071 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2072 if (r < 0)
2073 return r;
2074
2075 n = le64toh(d->data.n_entries);
2076 if (n <= 0)
2077 return n;
2078
2079 if (!o)
2080 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2081 else {
2082 if (o->object.type != OBJECT_ENTRY)
2083 return -EINVAL;
2084
2085 r = generic_array_bisect_plus_one(f,
2086 le64toh(d->data.entry_offset),
2087 le64toh(d->data.entry_array_offset),
2088 le64toh(d->data.n_entries),
2089 p,
2090 test_object_offset,
2091 DIRECTION_DOWN,
2092 NULL, NULL,
2093 &i);
2094
2095 if (r <= 0)
2096 return r;
2097
2098 if (direction == DIRECTION_DOWN) {
2099 if (i >= n - 1)
2100 return 0;
2101
2102 i++;
2103 } else {
2104 if (i <= 0)
2105 return 0;
2106
2107 i--;
2108 }
2109
2110 }
2111
2112 return generic_array_get_plus_one(f,
2113 le64toh(d->data.entry_offset),
2114 le64toh(d->data.entry_array_offset),
2115 i,
2116 ret, offset);
2117 }
2118
2119 int journal_file_move_to_entry_by_offset_for_data(
2120 JournalFile *f,
2121 uint64_t data_offset,
2122 uint64_t p,
2123 direction_t direction,
2124 Object **ret, uint64_t *offset) {
2125
2126 int r;
2127 Object *d;
2128
2129 assert(f);
2130
2131 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2132 if (r < 0)
2133 return r;
2134
2135 return generic_array_bisect_plus_one(f,
2136 le64toh(d->data.entry_offset),
2137 le64toh(d->data.entry_array_offset),
2138 le64toh(d->data.n_entries),
2139 p,
2140 test_object_offset,
2141 direction,
2142 ret, offset, NULL);
2143 }
2144
2145 int journal_file_move_to_entry_by_monotonic_for_data(
2146 JournalFile *f,
2147 uint64_t data_offset,
2148 sd_id128_t boot_id,
2149 uint64_t monotonic,
2150 direction_t direction,
2151 Object **ret, uint64_t *offset) {
2152
2153 Object *o, *d;
2154 int r;
2155 uint64_t b, z;
2156
2157 assert(f);
2158
2159 /* First, seek by time */
2160 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2161 if (r < 0)
2162 return r;
2163 if (r == 0)
2164 return -ENOENT;
2165
2166 r = generic_array_bisect_plus_one(f,
2167 le64toh(o->data.entry_offset),
2168 le64toh(o->data.entry_array_offset),
2169 le64toh(o->data.n_entries),
2170 monotonic,
2171 test_object_monotonic,
2172 direction,
2173 NULL, &z, NULL);
2174 if (r <= 0)
2175 return r;
2176
2177 /* And now, continue seeking until we find an entry that
2178 * exists in both bisection arrays */
2179
2180 for (;;) {
2181 Object *qo;
2182 uint64_t p, q;
2183
2184 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2185 if (r < 0)
2186 return r;
2187
2188 r = generic_array_bisect_plus_one(f,
2189 le64toh(d->data.entry_offset),
2190 le64toh(d->data.entry_array_offset),
2191 le64toh(d->data.n_entries),
2192 z,
2193 test_object_offset,
2194 direction,
2195 NULL, &p, NULL);
2196 if (r <= 0)
2197 return r;
2198
2199 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2200 if (r < 0)
2201 return r;
2202
2203 r = generic_array_bisect_plus_one(f,
2204 le64toh(o->data.entry_offset),
2205 le64toh(o->data.entry_array_offset),
2206 le64toh(o->data.n_entries),
2207 p,
2208 test_object_offset,
2209 direction,
2210 &qo, &q, NULL);
2211
2212 if (r <= 0)
2213 return r;
2214
2215 if (p == q) {
2216 if (ret)
2217 *ret = qo;
2218 if (offset)
2219 *offset = q;
2220
2221 return 1;
2222 }
2223
2224 z = q;
2225 }
2226 }
2227
2228 int journal_file_move_to_entry_by_seqnum_for_data(
2229 JournalFile *f,
2230 uint64_t data_offset,
2231 uint64_t seqnum,
2232 direction_t direction,
2233 Object **ret, uint64_t *offset) {
2234
2235 Object *d;
2236 int r;
2237
2238 assert(f);
2239
2240 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2241 if (r < 0)
2242 return r;
2243
2244 return generic_array_bisect_plus_one(f,
2245 le64toh(d->data.entry_offset),
2246 le64toh(d->data.entry_array_offset),
2247 le64toh(d->data.n_entries),
2248 seqnum,
2249 test_object_seqnum,
2250 direction,
2251 ret, offset, NULL);
2252 }
2253
2254 int journal_file_move_to_entry_by_realtime_for_data(
2255 JournalFile *f,
2256 uint64_t data_offset,
2257 uint64_t realtime,
2258 direction_t direction,
2259 Object **ret, uint64_t *offset) {
2260
2261 Object *d;
2262 int r;
2263
2264 assert(f);
2265
2266 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2267 if (r < 0)
2268 return r;
2269
2270 return generic_array_bisect_plus_one(f,
2271 le64toh(d->data.entry_offset),
2272 le64toh(d->data.entry_array_offset),
2273 le64toh(d->data.n_entries),
2274 realtime,
2275 test_object_realtime,
2276 direction,
2277 ret, offset, NULL);
2278 }
2279
2280 void journal_file_dump(JournalFile *f) {
2281 Object *o;
2282 int r;
2283 uint64_t p;
2284
2285 assert(f);
2286
2287 journal_file_print_header(f);
2288
2289 p = le64toh(f->header->header_size);
2290 while (p != 0) {
2291 r = journal_file_move_to_object(f, -1, p, &o);
2292 if (r < 0)
2293 goto fail;
2294
2295 switch (o->object.type) {
2296
2297 case OBJECT_UNUSED:
2298 printf("Type: OBJECT_UNUSED\n");
2299 break;
2300
2301 case OBJECT_DATA:
2302 printf("Type: OBJECT_DATA\n");
2303 break;
2304
2305 case OBJECT_FIELD:
2306 printf("Type: OBJECT_FIELD\n");
2307 break;
2308
2309 case OBJECT_ENTRY:
2310 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2311 le64toh(o->entry.seqnum),
2312 le64toh(o->entry.monotonic),
2313 le64toh(o->entry.realtime));
2314 break;
2315
2316 case OBJECT_FIELD_HASH_TABLE:
2317 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2318 break;
2319
2320 case OBJECT_DATA_HASH_TABLE:
2321 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2322 break;
2323
2324 case OBJECT_ENTRY_ARRAY:
2325 printf("Type: OBJECT_ENTRY_ARRAY\n");
2326 break;
2327
2328 case OBJECT_TAG:
2329 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2330 le64toh(o->tag.seqnum),
2331 le64toh(o->tag.epoch));
2332 break;
2333
2334 default:
2335 printf("Type: unknown (%u)\n", o->object.type);
2336 break;
2337 }
2338
2339 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2340 printf("Flags: %s\n",
2341 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2342
2343 if (p == le64toh(f->header->tail_object_offset))
2344 p = 0;
2345 else
2346 p = p + ALIGN64(le64toh(o->object.size));
2347 }
2348
2349 return;
2350 fail:
2351 log_error("File corrupt");
2352 }
2353
2354 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2355 const char *x;
2356
2357 x = format_timestamp(buf, l, t);
2358 if (x)
2359 return x;
2360 return " --- ";
2361 }
2362
2363 void journal_file_print_header(JournalFile *f) {
2364 char a[33], b[33], c[33], d[33];
2365 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2366 struct stat st;
2367 char bytes[FORMAT_BYTES_MAX];
2368
2369 assert(f);
2370
2371 printf("File Path: %s\n"
2372 "File ID: %s\n"
2373 "Machine ID: %s\n"
2374 "Boot ID: %s\n"
2375 "Sequential Number ID: %s\n"
2376 "State: %s\n"
2377 "Compatible Flags:%s%s\n"
2378 "Incompatible Flags:%s%s%s\n"
2379 "Header size: %"PRIu64"\n"
2380 "Arena size: %"PRIu64"\n"
2381 "Data Hash Table Size: %"PRIu64"\n"
2382 "Field Hash Table Size: %"PRIu64"\n"
2383 "Rotate Suggested: %s\n"
2384 "Head Sequential Number: %"PRIu64"\n"
2385 "Tail Sequential Number: %"PRIu64"\n"
2386 "Head Realtime Timestamp: %s\n"
2387 "Tail Realtime Timestamp: %s\n"
2388 "Tail Monotonic Timestamp: %s\n"
2389 "Objects: %"PRIu64"\n"
2390 "Entry Objects: %"PRIu64"\n",
2391 f->path,
2392 sd_id128_to_string(f->header->file_id, a),
2393 sd_id128_to_string(f->header->machine_id, b),
2394 sd_id128_to_string(f->header->boot_id, c),
2395 sd_id128_to_string(f->header->seqnum_id, d),
2396 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2397 f->header->state == STATE_ONLINE ? "ONLINE" :
2398 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2399 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2400 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2401 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2402 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2403 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2404 le64toh(f->header->header_size),
2405 le64toh(f->header->arena_size),
2406 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2407 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2408 yes_no(journal_file_rotate_suggested(f, 0)),
2409 le64toh(f->header->head_entry_seqnum),
2410 le64toh(f->header->tail_entry_seqnum),
2411 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2412 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2413 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2414 le64toh(f->header->n_objects),
2415 le64toh(f->header->n_entries));
2416
2417 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2418 printf("Data Objects: %"PRIu64"\n"
2419 "Data Hash Table Fill: %.1f%%\n",
2420 le64toh(f->header->n_data),
2421 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2422
2423 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2424 printf("Field Objects: %"PRIu64"\n"
2425 "Field Hash Table Fill: %.1f%%\n",
2426 le64toh(f->header->n_fields),
2427 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2428
2429 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2430 printf("Tag Objects: %"PRIu64"\n",
2431 le64toh(f->header->n_tags));
2432 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2433 printf("Entry Array Objects: %"PRIu64"\n",
2434 le64toh(f->header->n_entry_arrays));
2435
2436 if (fstat(f->fd, &st) >= 0)
2437 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2438 }
2439
2440 int journal_file_open(
2441 const char *fname,
2442 int flags,
2443 mode_t mode,
2444 bool compress,
2445 bool seal,
2446 JournalMetrics *metrics,
2447 MMapCache *mmap_cache,
2448 JournalFile *template,
2449 JournalFile **ret) {
2450
2451 JournalFile *f;
2452 int r;
2453 bool newly_created = false;
2454
2455 assert(fname);
2456 assert(ret);
2457
2458 if ((flags & O_ACCMODE) != O_RDONLY &&
2459 (flags & O_ACCMODE) != O_RDWR)
2460 return -EINVAL;
2461
2462 if (!endswith(fname, ".journal") &&
2463 !endswith(fname, ".journal~"))
2464 return -EINVAL;
2465
2466 f = new0(JournalFile, 1);
2467 if (!f)
2468 return -ENOMEM;
2469
2470 f->fd = -1;
2471 f->mode = mode;
2472
2473 f->flags = flags;
2474 f->prot = prot_from_flags(flags);
2475 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2476 #if defined(HAVE_LZ4)
2477 f->compress_lz4 = compress;
2478 #elif defined(HAVE_XZ)
2479 f->compress_xz = compress;
2480 #endif
2481 #ifdef HAVE_GCRYPT
2482 f->seal = seal;
2483 #endif
2484
2485 if (mmap_cache)
2486 f->mmap = mmap_cache_ref(mmap_cache);
2487 else {
2488 f->mmap = mmap_cache_new();
2489 if (!f->mmap) {
2490 r = -ENOMEM;
2491 goto fail;
2492 }
2493 }
2494
2495 f->path = strdup(fname);
2496 if (!f->path) {
2497 r = -ENOMEM;
2498 goto fail;
2499 }
2500
2501 f->chain_cache = hashmap_new(&uint64_hash_ops);
2502 if (!f->chain_cache) {
2503 r = -ENOMEM;
2504 goto fail;
2505 }
2506
2507 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2508 if (f->fd < 0) {
2509 r = -errno;
2510 goto fail;
2511 }
2512
2513 if (fstat(f->fd, &f->last_stat) < 0) {
2514 r = -errno;
2515 goto fail;
2516 }
2517
2518 if (f->last_stat.st_size == 0 && f->writable) {
2519 uint64_t crtime;
2520
2521 /* Let's attach the creation time to the journal file,
2522 * so that the vacuuming code knows the age of this
2523 * file even if the file might end up corrupted one
2524 * day... Ideally we'd just use the creation time many
2525 * file systems maintain for each file, but there is
2526 * currently no usable API to query this, hence let's
2527 * emulate this via extended attributes. If extended
2528 * attributes are not supported we'll just skip this,
2529 * and rely solely on mtime/atime/ctime of the file.*/
2530
2531 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2532 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2533
2534 #ifdef HAVE_GCRYPT
2535 /* Try to load the FSPRG state, and if we can't, then
2536 * just don't do sealing */
2537 if (f->seal) {
2538 r = journal_file_fss_load(f);
2539 if (r < 0)
2540 f->seal = false;
2541 }
2542 #endif
2543
2544 r = journal_file_init_header(f, template);
2545 if (r < 0)
2546 goto fail;
2547
2548 if (fstat(f->fd, &f->last_stat) < 0) {
2549 r = -errno;
2550 goto fail;
2551 }
2552
2553 newly_created = true;
2554 }
2555
2556 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2557 r = -EIO;
2558 goto fail;
2559 }
2560
2561 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2562 if (f->header == MAP_FAILED) {
2563 f->header = NULL;
2564 r = -errno;
2565 goto fail;
2566 }
2567
2568 if (!newly_created) {
2569 r = journal_file_verify_header(f);
2570 if (r < 0)
2571 goto fail;
2572 }
2573
2574 #ifdef HAVE_GCRYPT
2575 if (!newly_created && f->writable) {
2576 r = journal_file_fss_load(f);
2577 if (r < 0)
2578 goto fail;
2579 }
2580 #endif
2581
2582 if (f->writable) {
2583 if (metrics) {
2584 journal_default_metrics(metrics, f->fd);
2585 f->metrics = *metrics;
2586 } else if (template)
2587 f->metrics = template->metrics;
2588
2589 r = journal_file_refresh_header(f);
2590 if (r < 0)
2591 goto fail;
2592 }
2593
2594 #ifdef HAVE_GCRYPT
2595 r = journal_file_hmac_setup(f);
2596 if (r < 0)
2597 goto fail;
2598 #endif
2599
2600 if (newly_created) {
2601 r = journal_file_setup_field_hash_table(f);
2602 if (r < 0)
2603 goto fail;
2604
2605 r = journal_file_setup_data_hash_table(f);
2606 if (r < 0)
2607 goto fail;
2608
2609 #ifdef HAVE_GCRYPT
2610 r = journal_file_append_first_tag(f);
2611 if (r < 0)
2612 goto fail;
2613 #endif
2614 }
2615
2616 r = journal_file_map_field_hash_table(f);
2617 if (r < 0)
2618 goto fail;
2619
2620 r = journal_file_map_data_hash_table(f);
2621 if (r < 0)
2622 goto fail;
2623
2624 *ret = f;
2625 return 0;
2626
2627 fail:
2628 journal_file_close(f);
2629
2630 return r;
2631 }
2632
2633 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2634 _cleanup_free_ char *p = NULL;
2635 size_t l;
2636 JournalFile *old_file, *new_file = NULL;
2637 int r;
2638
2639 assert(f);
2640 assert(*f);
2641
2642 old_file = *f;
2643
2644 if (!old_file->writable)
2645 return -EINVAL;
2646
2647 if (!endswith(old_file->path, ".journal"))
2648 return -EINVAL;
2649
2650 l = strlen(old_file->path);
2651 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2652 (int) l - 8, old_file->path,
2653 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2654 le64toh((*f)->header->head_entry_seqnum),
2655 le64toh((*f)->header->head_entry_realtime));
2656 if (r < 0)
2657 return -ENOMEM;
2658
2659 r = rename(old_file->path, p);
2660 if (r < 0)
2661 return -errno;
2662
2663 old_file->header->state = STATE_ARCHIVED;
2664
2665 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2666 journal_file_close(old_file);
2667
2668 *f = new_file;
2669 return r;
2670 }
2671
2672 int journal_file_open_reliably(
2673 const char *fname,
2674 int flags,
2675 mode_t mode,
2676 bool compress,
2677 bool seal,
2678 JournalMetrics *metrics,
2679 MMapCache *mmap_cache,
2680 JournalFile *template,
2681 JournalFile **ret) {
2682
2683 int r;
2684 size_t l;
2685 _cleanup_free_ char *p = NULL;
2686
2687 r = journal_file_open(fname, flags, mode, compress, seal,
2688 metrics, mmap_cache, template, ret);
2689 if (r != -EBADMSG && /* corrupted */
2690 r != -ENODATA && /* truncated */
2691 r != -EHOSTDOWN && /* other machine */
2692 r != -EPROTONOSUPPORT && /* incompatible feature */
2693 r != -EBUSY && /* unclean shutdown */
2694 r != -ESHUTDOWN /* already archived */)
2695 return r;
2696
2697 if ((flags & O_ACCMODE) == O_RDONLY)
2698 return r;
2699
2700 if (!(flags & O_CREAT))
2701 return r;
2702
2703 if (!endswith(fname, ".journal"))
2704 return r;
2705
2706 /* The file is corrupted. Rotate it away and try it again (but only once) */
2707
2708 l = strlen(fname);
2709 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2710 (int) l - 8, fname,
2711 (unsigned long long) now(CLOCK_REALTIME),
2712 random_u64()) < 0)
2713 return -ENOMEM;
2714
2715 r = rename(fname, p);
2716 if (r < 0)
2717 return -errno;
2718
2719 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2720
2721 return journal_file_open(fname, flags, mode, compress, seal,
2722 metrics, mmap_cache, template, ret);
2723 }
2724
2725 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2726 uint64_t i, n;
2727 uint64_t q, xor_hash = 0;
2728 int r;
2729 EntryItem *items;
2730 dual_timestamp ts;
2731
2732 assert(from);
2733 assert(to);
2734 assert(o);
2735 assert(p);
2736
2737 if (!to->writable)
2738 return -EPERM;
2739
2740 ts.monotonic = le64toh(o->entry.monotonic);
2741 ts.realtime = le64toh(o->entry.realtime);
2742
2743 n = journal_file_entry_n_items(o);
2744 /* alloca() can't take 0, hence let's allocate at least one */
2745 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2746
2747 for (i = 0; i < n; i++) {
2748 uint64_t l, h;
2749 le64_t le_hash;
2750 size_t t;
2751 void *data;
2752 Object *u;
2753
2754 q = le64toh(o->entry.items[i].object_offset);
2755 le_hash = o->entry.items[i].hash;
2756
2757 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2758 if (r < 0)
2759 return r;
2760
2761 if (le_hash != o->data.hash)
2762 return -EBADMSG;
2763
2764 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2765 t = (size_t) l;
2766
2767 /* We hit the limit on 32bit machines */
2768 if ((uint64_t) t != l)
2769 return -E2BIG;
2770
2771 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2772 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2773 size_t rsize;
2774
2775 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2776 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2777 if (r < 0)
2778 return r;
2779
2780 data = from->compress_buffer;
2781 l = rsize;
2782 #else
2783 return -EPROTONOSUPPORT;
2784 #endif
2785 } else
2786 data = o->data.payload;
2787
2788 r = journal_file_append_data(to, data, l, &u, &h);
2789 if (r < 0)
2790 return r;
2791
2792 xor_hash ^= le64toh(u->data.hash);
2793 items[i].object_offset = htole64(h);
2794 items[i].hash = u->data.hash;
2795
2796 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2797 if (r < 0)
2798 return r;
2799 }
2800
2801 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2802 }
2803
2804 void journal_default_metrics(JournalMetrics *m, int fd) {
2805 uint64_t fs_size = 0;
2806 struct statvfs ss;
2807 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2808
2809 assert(m);
2810 assert(fd >= 0);
2811
2812 if (fstatvfs(fd, &ss) >= 0)
2813 fs_size = ss.f_frsize * ss.f_blocks;
2814
2815 if (m->max_use == (uint64_t) -1) {
2816
2817 if (fs_size > 0) {
2818 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2819
2820 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2821 m->max_use = DEFAULT_MAX_USE_UPPER;
2822
2823 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2824 m->max_use = DEFAULT_MAX_USE_LOWER;
2825 } else
2826 m->max_use = DEFAULT_MAX_USE_LOWER;
2827 } else {
2828 m->max_use = PAGE_ALIGN(m->max_use);
2829
2830 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2831 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2832 }
2833
2834 if (m->max_size == (uint64_t) -1) {
2835 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2836
2837 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2838 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2839 } else
2840 m->max_size = PAGE_ALIGN(m->max_size);
2841
2842 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2843 m->max_size = JOURNAL_FILE_SIZE_MIN;
2844
2845 if (m->max_size*2 > m->max_use)
2846 m->max_use = m->max_size*2;
2847
2848 if (m->min_size == (uint64_t) -1)
2849 m->min_size = JOURNAL_FILE_SIZE_MIN;
2850 else {
2851 m->min_size = PAGE_ALIGN(m->min_size);
2852
2853 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2854 m->min_size = JOURNAL_FILE_SIZE_MIN;
2855
2856 if (m->min_size > m->max_size)
2857 m->max_size = m->min_size;
2858 }
2859
2860 if (m->keep_free == (uint64_t) -1) {
2861
2862 if (fs_size > 0) {
2863 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2864
2865 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2866 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2867
2868 } else
2869 m->keep_free = DEFAULT_KEEP_FREE;
2870 }
2871
2872 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2873 format_bytes(a, sizeof(a), m->max_use),
2874 format_bytes(b, sizeof(b), m->max_size),
2875 format_bytes(c, sizeof(c), m->min_size),
2876 format_bytes(d, sizeof(d), m->keep_free));
2877 }
2878
2879 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2880 assert(f);
2881 assert(from || to);
2882
2883 if (from) {
2884 if (f->header->head_entry_realtime == 0)
2885 return -ENOENT;
2886
2887 *from = le64toh(f->header->head_entry_realtime);
2888 }
2889
2890 if (to) {
2891 if (f->header->tail_entry_realtime == 0)
2892 return -ENOENT;
2893
2894 *to = le64toh(f->header->tail_entry_realtime);
2895 }
2896
2897 return 1;
2898 }
2899
2900 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2901 Object *o;
2902 uint64_t p;
2903 int r;
2904
2905 assert(f);
2906 assert(from || to);
2907
2908 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2909 if (r <= 0)
2910 return r;
2911
2912 if (le64toh(o->data.n_entries) <= 0)
2913 return 0;
2914
2915 if (from) {
2916 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2917 if (r < 0)
2918 return r;
2919
2920 *from = le64toh(o->entry.monotonic);
2921 }
2922
2923 if (to) {
2924 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2925 if (r < 0)
2926 return r;
2927
2928 r = generic_array_get_plus_one(f,
2929 le64toh(o->data.entry_offset),
2930 le64toh(o->data.entry_array_offset),
2931 le64toh(o->data.n_entries)-1,
2932 &o, NULL);
2933 if (r <= 0)
2934 return r;
2935
2936 *to = le64toh(o->entry.monotonic);
2937 }
2938
2939 return 1;
2940 }
2941
2942 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2943 assert(f);
2944
2945 /* If we gained new header fields we gained new features,
2946 * hence suggest a rotation */
2947 if (le64toh(f->header->header_size) < sizeof(Header)) {
2948 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2949 return true;
2950 }
2951
2952 /* Let's check if the hash tables grew over a certain fill
2953 * level (75%, borrowing this value from Java's hash table
2954 * implementation), and if so suggest a rotation. To calculate
2955 * the fill level we need the n_data field, which only exists
2956 * in newer versions. */
2957
2958 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2959 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2960 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2961 f->path,
2962 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2963 le64toh(f->header->n_data),
2964 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2965 (unsigned long long) f->last_stat.st_size,
2966 f->last_stat.st_size / le64toh(f->header->n_data));
2967 return true;
2968 }
2969
2970 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2971 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2972 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2973 f->path,
2974 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2975 le64toh(f->header->n_fields),
2976 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2977 return true;
2978 }
2979
2980 /* Are the data objects properly indexed by field objects? */
2981 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2982 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2983 le64toh(f->header->n_data) > 0 &&
2984 le64toh(f->header->n_fields) == 0)
2985 return true;
2986
2987 if (max_file_usec > 0) {
2988 usec_t t, h;
2989
2990 h = le64toh(f->header->head_entry_realtime);
2991 t = now(CLOCK_REALTIME);
2992
2993 if (h > 0 && t > h + max_file_usec)
2994 return true;
2995 }
2996
2997 return false;
2998 }