]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
treewide: use log_*_errno whenever %m is in the format string
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <sys/xattr.h>
30
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
34 #include "lookup3.h"
35 #include "compress.h"
36 #include "fsprg.h"
37
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
40
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
45
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53
54 /* This is the upper bound if we deduce the keep_free value from the
55 * file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
58 /* This is the keep_free value when we can't determine the system
59 * size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
61
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
64
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
67
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
70
71 static int journal_file_set_online(JournalFile *f) {
72 assert(f);
73
74 if (!f->writable)
75 return -EPERM;
76
77 if (!(f->fd >= 0 && f->header))
78 return -EINVAL;
79
80 switch(f->header->state) {
81 case STATE_ONLINE:
82 return 0;
83
84 case STATE_OFFLINE:
85 f->header->state = STATE_ONLINE;
86 fsync(f->fd);
87 return 0;
88
89 default:
90 return -EINVAL;
91 }
92 }
93
94 int journal_file_set_offline(JournalFile *f) {
95 assert(f);
96
97 if (!f->writable)
98 return -EPERM;
99
100 if (!(f->fd >= 0 && f->header))
101 return -EINVAL;
102
103 if (f->header->state != STATE_ONLINE)
104 return 0;
105
106 fsync(f->fd);
107
108 f->header->state = STATE_OFFLINE;
109
110 fsync(f->fd);
111
112 return 0;
113 }
114
115 void journal_file_close(JournalFile *f) {
116 assert(f);
117
118 #ifdef HAVE_GCRYPT
119 /* Write the final tag */
120 if (f->seal && f->writable)
121 journal_file_append_tag(f);
122 #endif
123
124 /* Sync everything to disk, before we mark the file offline */
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
127
128 journal_file_set_offline(f);
129
130 if (f->header)
131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
132
133 safe_close(f->fd);
134 free(f->path);
135
136 if (f->mmap)
137 mmap_cache_unref(f->mmap);
138
139 ordered_hashmap_free_free(f->chain_cache);
140
141 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
142 free(f->compress_buffer);
143 #endif
144
145 #ifdef HAVE_GCRYPT
146 if (f->fss_file)
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
150
151 free(f->fsprg_seed);
152
153 if (f->hmac)
154 gcry_md_close(f->hmac);
155 #endif
156
157 free(f);
158 }
159
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
161 Header h = {};
162 ssize_t k;
163 int r;
164
165 assert(f);
166
167 memcpy(h.signature, HEADER_SIGNATURE, 8);
168 h.header_size = htole64(ALIGN64(sizeof(h)));
169
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
173
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
176
177 r = sd_id128_randomize(&h.file_id);
178 if (r < 0)
179 return r;
180
181 if (template) {
182 h.seqnum_id = template->header->seqnum_id;
183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
184 } else
185 h.seqnum_id = h.file_id;
186
187 k = pwrite(f->fd, &h, sizeof(h), 0);
188 if (k < 0)
189 return -errno;
190
191 if (k != sizeof(h))
192 return -EIO;
193
194 return 0;
195 }
196
197 static int journal_file_refresh_header(JournalFile *f) {
198 int r;
199 sd_id128_t boot_id;
200
201 assert(f);
202
203 r = sd_id128_get_machine(&f->header->machine_id);
204 if (r < 0)
205 return r;
206
207 r = sd_id128_get_boot(&boot_id);
208 if (r < 0)
209 return r;
210
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
213
214 f->header->boot_id = boot_id;
215
216 journal_file_set_online(f);
217
218 /* Sync the online state to disk */
219 fsync(f->fd);
220
221 return 0;
222 }
223
224 static int journal_file_verify_header(JournalFile *f) {
225 uint32_t flags;
226
227 assert(f);
228
229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
230 return -EBADMSG;
231
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
240 if (flags)
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
243 return -EPROTONOSUPPORT;
244 }
245
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
254 if (flags)
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
258 }
259
260 if (f->header->state >= _STATE_MAX)
261 return -EBADMSG;
262
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
265 return -EBADMSG;
266
267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
268 return -EBADMSG;
269
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
271 return -ENODATA;
272
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
274 return -ENODATA;
275
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
280 return -ENODATA;
281
282 if (f->writable) {
283 uint8_t state;
284 sd_id128_t machine_id;
285 int r;
286
287 r = sd_id128_get_machine(&machine_id);
288 if (r < 0)
289 return r;
290
291 if (!sd_id128_equal(machine_id, f->header->machine_id))
292 return -EHOSTDOWN;
293
294 state = f->header->state;
295
296 if (state == STATE_ONLINE) {
297 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
298 return -EBUSY;
299 } else if (state == STATE_ARCHIVED)
300 return -ESHUTDOWN;
301 else if (state != STATE_OFFLINE) {
302 log_debug("Journal file %s has unknown state %u.", f->path, state);
303 return -EBUSY;
304 }
305 }
306
307 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
309
310 f->seal = JOURNAL_HEADER_SEALED(f->header);
311
312 return 0;
313 }
314
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316 uint64_t old_size, new_size;
317 int r;
318
319 assert(f);
320
321 /* We assume that this file is not sparse, and we know that
322 * for sure, since we always call posix_fallocate()
323 * ourselves */
324
325 old_size =
326 le64toh(f->header->header_size) +
327 le64toh(f->header->arena_size);
328
329 new_size = PAGE_ALIGN(offset + size);
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
332
333 if (new_size <= old_size)
334 return 0;
335
336 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
337 return -E2BIG;
338
339 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
340 struct statvfs svfs;
341
342 if (fstatvfs(f->fd, &svfs) >= 0) {
343 uint64_t available;
344
345 available = svfs.f_bfree * svfs.f_bsize;
346
347 if (available >= f->metrics.keep_free)
348 available -= f->metrics.keep_free;
349 else
350 available = 0;
351
352 if (new_size - old_size > available)
353 return -E2BIG;
354 }
355 }
356
357 /* Increase by larger blocks at once */
358 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360 new_size = f->metrics.max_size;
361
362 /* Note that the glibc fallocate() fallback is very
363 inefficient, hence we try to minimize the allocation area
364 as we can. */
365 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 if (r != 0)
367 return -r;
368
369 if (fstat(f->fd, &f->last_stat) < 0)
370 return -errno;
371
372 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
373
374 return 0;
375 }
376
377 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
378 assert(f);
379 assert(ret);
380
381 if (size <= 0)
382 return -EINVAL;
383
384 /* Avoid SIGBUS on invalid accesses */
385 if (offset + size > (uint64_t) f->last_stat.st_size) {
386 /* Hmm, out of range? Let's refresh the fstat() data
387 * first, before we trust that check. */
388
389 if (fstat(f->fd, &f->last_stat) < 0 ||
390 offset + size > (uint64_t) f->last_stat.st_size)
391 return -EADDRNOTAVAIL;
392 }
393
394 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret, NULL);
395 }
396
397 static uint64_t minimum_header_size(Object *o) {
398
399 static const uint64_t table[] = {
400 [OBJECT_DATA] = sizeof(DataObject),
401 [OBJECT_FIELD] = sizeof(FieldObject),
402 [OBJECT_ENTRY] = sizeof(EntryObject),
403 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
404 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
405 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
406 [OBJECT_TAG] = sizeof(TagObject),
407 };
408
409 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
410 return sizeof(ObjectHeader);
411
412 return table[o->object.type];
413 }
414
415 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
416 int r;
417 void *t;
418 Object *o;
419 uint64_t s;
420
421 assert(f);
422 assert(ret);
423
424 /* Objects may only be located at multiple of 64 bit */
425 if (!VALID64(offset))
426 return -EFAULT;
427
428 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
429 if (r < 0)
430 return r;
431
432 o = (Object*) t;
433 s = le64toh(o->object.size);
434
435 if (s < sizeof(ObjectHeader))
436 return -EBADMSG;
437
438 if (o->object.type <= OBJECT_UNUSED)
439 return -EBADMSG;
440
441 if (s < minimum_header_size(o))
442 return -EBADMSG;
443
444 if (type > 0 && o->object.type != type)
445 return -EBADMSG;
446
447 if (s > sizeof(ObjectHeader)) {
448 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
449 if (r < 0)
450 return r;
451
452 o = (Object*) t;
453 }
454
455 *ret = o;
456 return 0;
457 }
458
459 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
460 uint64_t r;
461
462 assert(f);
463
464 r = le64toh(f->header->tail_entry_seqnum) + 1;
465
466 if (seqnum) {
467 /* If an external seqnum counter was passed, we update
468 * both the local and the external one, and set it to
469 * the maximum of both */
470
471 if (*seqnum + 1 > r)
472 r = *seqnum + 1;
473
474 *seqnum = r;
475 }
476
477 f->header->tail_entry_seqnum = htole64(r);
478
479 if (f->header->head_entry_seqnum == 0)
480 f->header->head_entry_seqnum = htole64(r);
481
482 return r;
483 }
484
485 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
486 int r;
487 uint64_t p;
488 Object *tail, *o;
489 void *t;
490
491 assert(f);
492 assert(type > 0 && type < _OBJECT_TYPE_MAX);
493 assert(size >= sizeof(ObjectHeader));
494 assert(offset);
495 assert(ret);
496
497 r = journal_file_set_online(f);
498 if (r < 0)
499 return r;
500
501 p = le64toh(f->header->tail_object_offset);
502 if (p == 0)
503 p = le64toh(f->header->header_size);
504 else {
505 r = journal_file_move_to_object(f, -1, p, &tail);
506 if (r < 0)
507 return r;
508
509 p += ALIGN64(le64toh(tail->object.size));
510 }
511
512 r = journal_file_allocate(f, p, size);
513 if (r < 0)
514 return r;
515
516 r = journal_file_move_to(f, type, false, p, size, &t);
517 if (r < 0)
518 return r;
519
520 o = (Object*) t;
521
522 zero(o->object);
523 o->object.type = type;
524 o->object.size = htole64(size);
525
526 f->header->tail_object_offset = htole64(p);
527 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
528
529 *ret = o;
530 *offset = p;
531
532 return 0;
533 }
534
535 static int journal_file_setup_data_hash_table(JournalFile *f) {
536 uint64_t s, p;
537 Object *o;
538 int r;
539
540 assert(f);
541
542 /* We estimate that we need 1 hash table entry per 768 of
543 journal file and we want to make sure we never get beyond
544 75% fill level. Calculate the hash table size for the
545 maximum file size based on these metrics. */
546
547 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
548 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
549 s = DEFAULT_DATA_HASH_TABLE_SIZE;
550
551 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
552
553 r = journal_file_append_object(f,
554 OBJECT_DATA_HASH_TABLE,
555 offsetof(Object, hash_table.items) + s,
556 &o, &p);
557 if (r < 0)
558 return r;
559
560 memzero(o->hash_table.items, s);
561
562 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
563 f->header->data_hash_table_size = htole64(s);
564
565 return 0;
566 }
567
568 static int journal_file_setup_field_hash_table(JournalFile *f) {
569 uint64_t s, p;
570 Object *o;
571 int r;
572
573 assert(f);
574
575 /* We use a fixed size hash table for the fields as this
576 * number should grow very slowly only */
577
578 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
579 r = journal_file_append_object(f,
580 OBJECT_FIELD_HASH_TABLE,
581 offsetof(Object, hash_table.items) + s,
582 &o, &p);
583 if (r < 0)
584 return r;
585
586 memzero(o->hash_table.items, s);
587
588 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
589 f->header->field_hash_table_size = htole64(s);
590
591 return 0;
592 }
593
594 static int journal_file_map_data_hash_table(JournalFile *f) {
595 uint64_t s, p;
596 void *t;
597 int r;
598
599 assert(f);
600
601 p = le64toh(f->header->data_hash_table_offset);
602 s = le64toh(f->header->data_hash_table_size);
603
604 r = journal_file_move_to(f,
605 OBJECT_DATA_HASH_TABLE,
606 true,
607 p, s,
608 &t);
609 if (r < 0)
610 return r;
611
612 f->data_hash_table = t;
613 return 0;
614 }
615
616 static int journal_file_map_field_hash_table(JournalFile *f) {
617 uint64_t s, p;
618 void *t;
619 int r;
620
621 assert(f);
622
623 p = le64toh(f->header->field_hash_table_offset);
624 s = le64toh(f->header->field_hash_table_size);
625
626 r = journal_file_move_to(f,
627 OBJECT_FIELD_HASH_TABLE,
628 true,
629 p, s,
630 &t);
631 if (r < 0)
632 return r;
633
634 f->field_hash_table = t;
635 return 0;
636 }
637
638 static int journal_file_link_field(
639 JournalFile *f,
640 Object *o,
641 uint64_t offset,
642 uint64_t hash) {
643
644 uint64_t p, h;
645 int r;
646
647 assert(f);
648 assert(o);
649 assert(offset > 0);
650
651 if (o->object.type != OBJECT_FIELD)
652 return -EINVAL;
653
654 /* This might alter the window we are looking at */
655
656 o->field.next_hash_offset = o->field.head_data_offset = 0;
657
658 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
659 p = le64toh(f->field_hash_table[h].tail_hash_offset);
660 if (p == 0)
661 f->field_hash_table[h].head_hash_offset = htole64(offset);
662 else {
663 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
664 if (r < 0)
665 return r;
666
667 o->field.next_hash_offset = htole64(offset);
668 }
669
670 f->field_hash_table[h].tail_hash_offset = htole64(offset);
671
672 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
673 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
674
675 return 0;
676 }
677
678 static int journal_file_link_data(
679 JournalFile *f,
680 Object *o,
681 uint64_t offset,
682 uint64_t hash) {
683
684 uint64_t p, h;
685 int r;
686
687 assert(f);
688 assert(o);
689 assert(offset > 0);
690
691 if (o->object.type != OBJECT_DATA)
692 return -EINVAL;
693
694 /* This might alter the window we are looking at */
695
696 o->data.next_hash_offset = o->data.next_field_offset = 0;
697 o->data.entry_offset = o->data.entry_array_offset = 0;
698 o->data.n_entries = 0;
699
700 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
701 p = le64toh(f->data_hash_table[h].tail_hash_offset);
702 if (p == 0)
703 /* Only entry in the hash table is easy */
704 f->data_hash_table[h].head_hash_offset = htole64(offset);
705 else {
706 /* Move back to the previous data object, to patch in
707 * pointer */
708
709 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
710 if (r < 0)
711 return r;
712
713 o->data.next_hash_offset = htole64(offset);
714 }
715
716 f->data_hash_table[h].tail_hash_offset = htole64(offset);
717
718 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
719 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
720
721 return 0;
722 }
723
724 int journal_file_find_field_object_with_hash(
725 JournalFile *f,
726 const void *field, uint64_t size, uint64_t hash,
727 Object **ret, uint64_t *offset) {
728
729 uint64_t p, osize, h;
730 int r;
731
732 assert(f);
733 assert(field && size > 0);
734
735 osize = offsetof(Object, field.payload) + size;
736
737 if (f->header->field_hash_table_size == 0)
738 return -EBADMSG;
739
740 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
741 p = le64toh(f->field_hash_table[h].head_hash_offset);
742
743 while (p > 0) {
744 Object *o;
745
746 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
747 if (r < 0)
748 return r;
749
750 if (le64toh(o->field.hash) == hash &&
751 le64toh(o->object.size) == osize &&
752 memcmp(o->field.payload, field, size) == 0) {
753
754 if (ret)
755 *ret = o;
756 if (offset)
757 *offset = p;
758
759 return 1;
760 }
761
762 p = le64toh(o->field.next_hash_offset);
763 }
764
765 return 0;
766 }
767
768 int journal_file_find_field_object(
769 JournalFile *f,
770 const void *field, uint64_t size,
771 Object **ret, uint64_t *offset) {
772
773 uint64_t hash;
774
775 assert(f);
776 assert(field && size > 0);
777
778 hash = hash64(field, size);
779
780 return journal_file_find_field_object_with_hash(f,
781 field, size, hash,
782 ret, offset);
783 }
784
785 int journal_file_find_data_object_with_hash(
786 JournalFile *f,
787 const void *data, uint64_t size, uint64_t hash,
788 Object **ret, uint64_t *offset) {
789
790 uint64_t p, osize, h;
791 int r;
792
793 assert(f);
794 assert(data || size == 0);
795
796 osize = offsetof(Object, data.payload) + size;
797
798 if (f->header->data_hash_table_size == 0)
799 return -EBADMSG;
800
801 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
802 p = le64toh(f->data_hash_table[h].head_hash_offset);
803
804 while (p > 0) {
805 Object *o;
806
807 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
808 if (r < 0)
809 return r;
810
811 if (le64toh(o->data.hash) != hash)
812 goto next;
813
814 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
815 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
816 uint64_t l;
817 size_t rsize;
818
819 l = le64toh(o->object.size);
820 if (l <= offsetof(Object, data.payload))
821 return -EBADMSG;
822
823 l -= offsetof(Object, data.payload);
824
825 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
826 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
827 if (r < 0)
828 return r;
829
830 if (rsize == size &&
831 memcmp(f->compress_buffer, data, size) == 0) {
832
833 if (ret)
834 *ret = o;
835
836 if (offset)
837 *offset = p;
838
839 return 1;
840 }
841 #else
842 return -EPROTONOSUPPORT;
843 #endif
844 } else if (le64toh(o->object.size) == osize &&
845 memcmp(o->data.payload, data, size) == 0) {
846
847 if (ret)
848 *ret = o;
849
850 if (offset)
851 *offset = p;
852
853 return 1;
854 }
855
856 next:
857 p = le64toh(o->data.next_hash_offset);
858 }
859
860 return 0;
861 }
862
863 int journal_file_find_data_object(
864 JournalFile *f,
865 const void *data, uint64_t size,
866 Object **ret, uint64_t *offset) {
867
868 uint64_t hash;
869
870 assert(f);
871 assert(data || size == 0);
872
873 hash = hash64(data, size);
874
875 return journal_file_find_data_object_with_hash(f,
876 data, size, hash,
877 ret, offset);
878 }
879
880 static int journal_file_append_field(
881 JournalFile *f,
882 const void *field, uint64_t size,
883 Object **ret, uint64_t *offset) {
884
885 uint64_t hash, p;
886 uint64_t osize;
887 Object *o;
888 int r;
889
890 assert(f);
891 assert(field && size > 0);
892
893 hash = hash64(field, size);
894
895 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
896 if (r < 0)
897 return r;
898 else if (r > 0) {
899
900 if (ret)
901 *ret = o;
902
903 if (offset)
904 *offset = p;
905
906 return 0;
907 }
908
909 osize = offsetof(Object, field.payload) + size;
910 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
911 if (r < 0)
912 return r;
913
914 o->field.hash = htole64(hash);
915 memcpy(o->field.payload, field, size);
916
917 r = journal_file_link_field(f, o, p, hash);
918 if (r < 0)
919 return r;
920
921 /* The linking might have altered the window, so let's
922 * refresh our pointer */
923 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
924 if (r < 0)
925 return r;
926
927 #ifdef HAVE_GCRYPT
928 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
929 if (r < 0)
930 return r;
931 #endif
932
933 if (ret)
934 *ret = o;
935
936 if (offset)
937 *offset = p;
938
939 return 0;
940 }
941
942 static int journal_file_append_data(
943 JournalFile *f,
944 const void *data, uint64_t size,
945 Object **ret, uint64_t *offset) {
946
947 uint64_t hash, p;
948 uint64_t osize;
949 Object *o;
950 int r, compression = 0;
951 const void *eq;
952
953 assert(f);
954 assert(data || size == 0);
955
956 hash = hash64(data, size);
957
958 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
959 if (r < 0)
960 return r;
961 else if (r > 0) {
962
963 if (ret)
964 *ret = o;
965
966 if (offset)
967 *offset = p;
968
969 return 0;
970 }
971
972 osize = offsetof(Object, data.payload) + size;
973 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
974 if (r < 0)
975 return r;
976
977 o->data.hash = htole64(hash);
978
979 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
980 if (f->compress_xz &&
981 size >= COMPRESSION_SIZE_THRESHOLD) {
982 size_t rsize;
983
984 compression = compress_blob(data, size, o->data.payload, &rsize);
985
986 if (compression) {
987 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
988 o->object.flags |= compression;
989
990 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
991 size, rsize, object_compressed_to_string(compression));
992 }
993 }
994 #endif
995
996 if (!compression && size > 0)
997 memcpy(o->data.payload, data, size);
998
999 r = journal_file_link_data(f, o, p, hash);
1000 if (r < 0)
1001 return r;
1002
1003 /* The linking might have altered the window, so let's
1004 * refresh our pointer */
1005 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1006 if (r < 0)
1007 return r;
1008
1009 if (!data)
1010 eq = NULL;
1011 else
1012 eq = memchr(data, '=', size);
1013 if (eq && eq > data) {
1014 Object *fo = NULL;
1015 uint64_t fp;
1016
1017 /* Create field object ... */
1018 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1019 if (r < 0)
1020 return r;
1021
1022 /* ... and link it in. */
1023 o->data.next_field_offset = fo->field.head_data_offset;
1024 fo->field.head_data_offset = le64toh(p);
1025 }
1026
1027 #ifdef HAVE_GCRYPT
1028 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1029 if (r < 0)
1030 return r;
1031 #endif
1032
1033 if (ret)
1034 *ret = o;
1035
1036 if (offset)
1037 *offset = p;
1038
1039 return 0;
1040 }
1041
1042 uint64_t journal_file_entry_n_items(Object *o) {
1043 assert(o);
1044
1045 if (o->object.type != OBJECT_ENTRY)
1046 return 0;
1047
1048 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1049 }
1050
1051 uint64_t journal_file_entry_array_n_items(Object *o) {
1052 assert(o);
1053
1054 if (o->object.type != OBJECT_ENTRY_ARRAY)
1055 return 0;
1056
1057 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1058 }
1059
1060 uint64_t journal_file_hash_table_n_items(Object *o) {
1061 assert(o);
1062
1063 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1064 o->object.type != OBJECT_FIELD_HASH_TABLE)
1065 return 0;
1066
1067 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1068 }
1069
1070 static int link_entry_into_array(JournalFile *f,
1071 le64_t *first,
1072 le64_t *idx,
1073 uint64_t p) {
1074 int r;
1075 uint64_t n = 0, ap = 0, q, i, a, hidx;
1076 Object *o;
1077
1078 assert(f);
1079 assert(first);
1080 assert(idx);
1081 assert(p > 0);
1082
1083 a = le64toh(*first);
1084 i = hidx = le64toh(*idx);
1085 while (a > 0) {
1086
1087 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1088 if (r < 0)
1089 return r;
1090
1091 n = journal_file_entry_array_n_items(o);
1092 if (i < n) {
1093 o->entry_array.items[i] = htole64(p);
1094 *idx = htole64(hidx + 1);
1095 return 0;
1096 }
1097
1098 i -= n;
1099 ap = a;
1100 a = le64toh(o->entry_array.next_entry_array_offset);
1101 }
1102
1103 if (hidx > n)
1104 n = (hidx+1) * 2;
1105 else
1106 n = n * 2;
1107
1108 if (n < 4)
1109 n = 4;
1110
1111 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1112 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1113 &o, &q);
1114 if (r < 0)
1115 return r;
1116
1117 #ifdef HAVE_GCRYPT
1118 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1119 if (r < 0)
1120 return r;
1121 #endif
1122
1123 o->entry_array.items[i] = htole64(p);
1124
1125 if (ap == 0)
1126 *first = htole64(q);
1127 else {
1128 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1129 if (r < 0)
1130 return r;
1131
1132 o->entry_array.next_entry_array_offset = htole64(q);
1133 }
1134
1135 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1136 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1137
1138 *idx = htole64(hidx + 1);
1139
1140 return 0;
1141 }
1142
1143 static int link_entry_into_array_plus_one(JournalFile *f,
1144 le64_t *extra,
1145 le64_t *first,
1146 le64_t *idx,
1147 uint64_t p) {
1148
1149 int r;
1150
1151 assert(f);
1152 assert(extra);
1153 assert(first);
1154 assert(idx);
1155 assert(p > 0);
1156
1157 if (*idx == 0)
1158 *extra = htole64(p);
1159 else {
1160 le64_t i;
1161
1162 i = htole64(le64toh(*idx) - 1);
1163 r = link_entry_into_array(f, first, &i, p);
1164 if (r < 0)
1165 return r;
1166 }
1167
1168 *idx = htole64(le64toh(*idx) + 1);
1169 return 0;
1170 }
1171
1172 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1173 uint64_t p;
1174 int r;
1175 assert(f);
1176 assert(o);
1177 assert(offset > 0);
1178
1179 p = le64toh(o->entry.items[i].object_offset);
1180 if (p == 0)
1181 return -EINVAL;
1182
1183 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1184 if (r < 0)
1185 return r;
1186
1187 return link_entry_into_array_plus_one(f,
1188 &o->data.entry_offset,
1189 &o->data.entry_array_offset,
1190 &o->data.n_entries,
1191 offset);
1192 }
1193
1194 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1195 uint64_t n, i;
1196 int r;
1197
1198 assert(f);
1199 assert(o);
1200 assert(offset > 0);
1201
1202 if (o->object.type != OBJECT_ENTRY)
1203 return -EINVAL;
1204
1205 __sync_synchronize();
1206
1207 /* Link up the entry itself */
1208 r = link_entry_into_array(f,
1209 &f->header->entry_array_offset,
1210 &f->header->n_entries,
1211 offset);
1212 if (r < 0)
1213 return r;
1214
1215 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1216
1217 if (f->header->head_entry_realtime == 0)
1218 f->header->head_entry_realtime = o->entry.realtime;
1219
1220 f->header->tail_entry_realtime = o->entry.realtime;
1221 f->header->tail_entry_monotonic = o->entry.monotonic;
1222
1223 f->tail_entry_monotonic_valid = true;
1224
1225 /* Link up the items */
1226 n = journal_file_entry_n_items(o);
1227 for (i = 0; i < n; i++) {
1228 r = journal_file_link_entry_item(f, o, offset, i);
1229 if (r < 0)
1230 return r;
1231 }
1232
1233 return 0;
1234 }
1235
1236 static int journal_file_append_entry_internal(
1237 JournalFile *f,
1238 const dual_timestamp *ts,
1239 uint64_t xor_hash,
1240 const EntryItem items[], unsigned n_items,
1241 uint64_t *seqnum,
1242 Object **ret, uint64_t *offset) {
1243 uint64_t np;
1244 uint64_t osize;
1245 Object *o;
1246 int r;
1247
1248 assert(f);
1249 assert(items || n_items == 0);
1250 assert(ts);
1251
1252 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1253
1254 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1255 if (r < 0)
1256 return r;
1257
1258 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1259 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1260 o->entry.realtime = htole64(ts->realtime);
1261 o->entry.monotonic = htole64(ts->monotonic);
1262 o->entry.xor_hash = htole64(xor_hash);
1263 o->entry.boot_id = f->header->boot_id;
1264
1265 #ifdef HAVE_GCRYPT
1266 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1267 if (r < 0)
1268 return r;
1269 #endif
1270
1271 r = journal_file_link_entry(f, o, np);
1272 if (r < 0)
1273 return r;
1274
1275 if (ret)
1276 *ret = o;
1277
1278 if (offset)
1279 *offset = np;
1280
1281 return 0;
1282 }
1283
1284 void journal_file_post_change(JournalFile *f) {
1285 assert(f);
1286
1287 /* inotify() does not receive IN_MODIFY events from file
1288 * accesses done via mmap(). After each access we hence
1289 * trigger IN_MODIFY by truncating the journal file to its
1290 * current size which triggers IN_MODIFY. */
1291
1292 __sync_synchronize();
1293
1294 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1295 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1296 }
1297
1298 static int entry_item_cmp(const void *_a, const void *_b) {
1299 const EntryItem *a = _a, *b = _b;
1300
1301 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1302 return -1;
1303 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1304 return 1;
1305 return 0;
1306 }
1307
1308 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1309 unsigned i;
1310 EntryItem *items;
1311 int r;
1312 uint64_t xor_hash = 0;
1313 struct dual_timestamp _ts;
1314
1315 assert(f);
1316 assert(iovec || n_iovec == 0);
1317
1318 if (!ts) {
1319 dual_timestamp_get(&_ts);
1320 ts = &_ts;
1321 }
1322
1323 if (f->tail_entry_monotonic_valid &&
1324 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1325 return -EINVAL;
1326
1327 #ifdef HAVE_GCRYPT
1328 r = journal_file_maybe_append_tag(f, ts->realtime);
1329 if (r < 0)
1330 return r;
1331 #endif
1332
1333 /* alloca() can't take 0, hence let's allocate at least one */
1334 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1335
1336 for (i = 0; i < n_iovec; i++) {
1337 uint64_t p;
1338 Object *o;
1339
1340 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1341 if (r < 0)
1342 return r;
1343
1344 xor_hash ^= le64toh(o->data.hash);
1345 items[i].object_offset = htole64(p);
1346 items[i].hash = o->data.hash;
1347 }
1348
1349 /* Order by the position on disk, in order to improve seek
1350 * times for rotating media. */
1351 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1352
1353 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1354
1355 journal_file_post_change(f);
1356
1357 return r;
1358 }
1359
1360 typedef struct ChainCacheItem {
1361 uint64_t first; /* the array at the beginning of the chain */
1362 uint64_t array; /* the cached array */
1363 uint64_t begin; /* the first item in the cached array */
1364 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1365 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1366 } ChainCacheItem;
1367
1368 static void chain_cache_put(
1369 OrderedHashmap *h,
1370 ChainCacheItem *ci,
1371 uint64_t first,
1372 uint64_t array,
1373 uint64_t begin,
1374 uint64_t total,
1375 uint64_t last_index) {
1376
1377 if (!ci) {
1378 /* If the chain item to cache for this chain is the
1379 * first one it's not worth caching anything */
1380 if (array == first)
1381 return;
1382
1383 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1384 ci = ordered_hashmap_steal_first(h);
1385 assert(ci);
1386 } else {
1387 ci = new(ChainCacheItem, 1);
1388 if (!ci)
1389 return;
1390 }
1391
1392 ci->first = first;
1393
1394 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1395 free(ci);
1396 return;
1397 }
1398 } else
1399 assert(ci->first == first);
1400
1401 ci->array = array;
1402 ci->begin = begin;
1403 ci->total = total;
1404 ci->last_index = last_index;
1405 }
1406
1407 static int generic_array_get(
1408 JournalFile *f,
1409 uint64_t first,
1410 uint64_t i,
1411 Object **ret, uint64_t *offset) {
1412
1413 Object *o;
1414 uint64_t p = 0, a, t = 0;
1415 int r;
1416 ChainCacheItem *ci;
1417
1418 assert(f);
1419
1420 a = first;
1421
1422 /* Try the chain cache first */
1423 ci = ordered_hashmap_get(f->chain_cache, &first);
1424 if (ci && i > ci->total) {
1425 a = ci->array;
1426 i -= ci->total;
1427 t = ci->total;
1428 }
1429
1430 while (a > 0) {
1431 uint64_t k;
1432
1433 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1434 if (r < 0)
1435 return r;
1436
1437 k = journal_file_entry_array_n_items(o);
1438 if (i < k) {
1439 p = le64toh(o->entry_array.items[i]);
1440 goto found;
1441 }
1442
1443 i -= k;
1444 t += k;
1445 a = le64toh(o->entry_array.next_entry_array_offset);
1446 }
1447
1448 return 0;
1449
1450 found:
1451 /* Let's cache this item for the next invocation */
1452 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1453
1454 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1455 if (r < 0)
1456 return r;
1457
1458 if (ret)
1459 *ret = o;
1460
1461 if (offset)
1462 *offset = p;
1463
1464 return 1;
1465 }
1466
1467 static int generic_array_get_plus_one(
1468 JournalFile *f,
1469 uint64_t extra,
1470 uint64_t first,
1471 uint64_t i,
1472 Object **ret, uint64_t *offset) {
1473
1474 Object *o;
1475
1476 assert(f);
1477
1478 if (i == 0) {
1479 int r;
1480
1481 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1482 if (r < 0)
1483 return r;
1484
1485 if (ret)
1486 *ret = o;
1487
1488 if (offset)
1489 *offset = extra;
1490
1491 return 1;
1492 }
1493
1494 return generic_array_get(f, first, i-1, ret, offset);
1495 }
1496
1497 enum {
1498 TEST_FOUND,
1499 TEST_LEFT,
1500 TEST_RIGHT
1501 };
1502
1503 static int generic_array_bisect(
1504 JournalFile *f,
1505 uint64_t first,
1506 uint64_t n,
1507 uint64_t needle,
1508 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1509 direction_t direction,
1510 Object **ret,
1511 uint64_t *offset,
1512 uint64_t *idx) {
1513
1514 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1515 bool subtract_one = false;
1516 Object *o, *array = NULL;
1517 int r;
1518 ChainCacheItem *ci;
1519
1520 assert(f);
1521 assert(test_object);
1522
1523 /* Start with the first array in the chain */
1524 a = first;
1525
1526 ci = ordered_hashmap_get(f->chain_cache, &first);
1527 if (ci && n > ci->total) {
1528 /* Ah, we have iterated this bisection array chain
1529 * previously! Let's see if we can skip ahead in the
1530 * chain, as far as the last time. But we can't jump
1531 * backwards in the chain, so let's check that
1532 * first. */
1533
1534 r = test_object(f, ci->begin, needle);
1535 if (r < 0)
1536 return r;
1537
1538 if (r == TEST_LEFT) {
1539 /* OK, what we are looking for is right of the
1540 * begin of this EntryArray, so let's jump
1541 * straight to previously cached array in the
1542 * chain */
1543
1544 a = ci->array;
1545 n -= ci->total;
1546 t = ci->total;
1547 last_index = ci->last_index;
1548 }
1549 }
1550
1551 while (a > 0) {
1552 uint64_t left, right, k, lp;
1553
1554 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1555 if (r < 0)
1556 return r;
1557
1558 k = journal_file_entry_array_n_items(array);
1559 right = MIN(k, n);
1560 if (right <= 0)
1561 return 0;
1562
1563 i = right - 1;
1564 lp = p = le64toh(array->entry_array.items[i]);
1565 if (p <= 0)
1566 return -EBADMSG;
1567
1568 r = test_object(f, p, needle);
1569 if (r < 0)
1570 return r;
1571
1572 if (r == TEST_FOUND)
1573 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1574
1575 if (r == TEST_RIGHT) {
1576 left = 0;
1577 right -= 1;
1578
1579 if (last_index != (uint64_t) -1) {
1580 assert(last_index <= right);
1581
1582 /* If we cached the last index we
1583 * looked at, let's try to not to jump
1584 * too wildly around and see if we can
1585 * limit the range to look at early to
1586 * the immediate neighbors of the last
1587 * index we looked at. */
1588
1589 if (last_index > 0) {
1590 uint64_t x = last_index - 1;
1591
1592 p = le64toh(array->entry_array.items[x]);
1593 if (p <= 0)
1594 return -EBADMSG;
1595
1596 r = test_object(f, p, needle);
1597 if (r < 0)
1598 return r;
1599
1600 if (r == TEST_FOUND)
1601 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1602
1603 if (r == TEST_RIGHT)
1604 right = x;
1605 else
1606 left = x + 1;
1607 }
1608
1609 if (last_index < right) {
1610 uint64_t y = last_index + 1;
1611
1612 p = le64toh(array->entry_array.items[y]);
1613 if (p <= 0)
1614 return -EBADMSG;
1615
1616 r = test_object(f, p, needle);
1617 if (r < 0)
1618 return r;
1619
1620 if (r == TEST_FOUND)
1621 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1622
1623 if (r == TEST_RIGHT)
1624 right = y;
1625 else
1626 left = y + 1;
1627 }
1628 }
1629
1630 for (;;) {
1631 if (left == right) {
1632 if (direction == DIRECTION_UP)
1633 subtract_one = true;
1634
1635 i = left;
1636 goto found;
1637 }
1638
1639 assert(left < right);
1640 i = (left + right) / 2;
1641
1642 p = le64toh(array->entry_array.items[i]);
1643 if (p <= 0)
1644 return -EBADMSG;
1645
1646 r = test_object(f, p, needle);
1647 if (r < 0)
1648 return r;
1649
1650 if (r == TEST_FOUND)
1651 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1652
1653 if (r == TEST_RIGHT)
1654 right = i;
1655 else
1656 left = i + 1;
1657 }
1658 }
1659
1660 if (k > n) {
1661 if (direction == DIRECTION_UP) {
1662 i = n;
1663 subtract_one = true;
1664 goto found;
1665 }
1666
1667 return 0;
1668 }
1669
1670 last_p = lp;
1671
1672 n -= k;
1673 t += k;
1674 last_index = (uint64_t) -1;
1675 a = le64toh(array->entry_array.next_entry_array_offset);
1676 }
1677
1678 return 0;
1679
1680 found:
1681 if (subtract_one && t == 0 && i == 0)
1682 return 0;
1683
1684 /* Let's cache this item for the next invocation */
1685 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1686
1687 if (subtract_one && i == 0)
1688 p = last_p;
1689 else if (subtract_one)
1690 p = le64toh(array->entry_array.items[i-1]);
1691 else
1692 p = le64toh(array->entry_array.items[i]);
1693
1694 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1695 if (r < 0)
1696 return r;
1697
1698 if (ret)
1699 *ret = o;
1700
1701 if (offset)
1702 *offset = p;
1703
1704 if (idx)
1705 *idx = t + i + (subtract_one ? -1 : 0);
1706
1707 return 1;
1708 }
1709
1710
1711 static int generic_array_bisect_plus_one(
1712 JournalFile *f,
1713 uint64_t extra,
1714 uint64_t first,
1715 uint64_t n,
1716 uint64_t needle,
1717 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1718 direction_t direction,
1719 Object **ret,
1720 uint64_t *offset,
1721 uint64_t *idx) {
1722
1723 int r;
1724 bool step_back = false;
1725 Object *o;
1726
1727 assert(f);
1728 assert(test_object);
1729
1730 if (n <= 0)
1731 return 0;
1732
1733 /* This bisects the array in object 'first', but first checks
1734 * an extra */
1735 r = test_object(f, extra, needle);
1736 if (r < 0)
1737 return r;
1738
1739 if (r == TEST_FOUND)
1740 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1741
1742 /* if we are looking with DIRECTION_UP then we need to first
1743 see if in the actual array there is a matching entry, and
1744 return the last one of that. But if there isn't any we need
1745 to return this one. Hence remember this, and return it
1746 below. */
1747 if (r == TEST_LEFT)
1748 step_back = direction == DIRECTION_UP;
1749
1750 if (r == TEST_RIGHT) {
1751 if (direction == DIRECTION_DOWN)
1752 goto found;
1753 else
1754 return 0;
1755 }
1756
1757 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1758
1759 if (r == 0 && step_back)
1760 goto found;
1761
1762 if (r > 0 && idx)
1763 (*idx) ++;
1764
1765 return r;
1766
1767 found:
1768 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1769 if (r < 0)
1770 return r;
1771
1772 if (ret)
1773 *ret = o;
1774
1775 if (offset)
1776 *offset = extra;
1777
1778 if (idx)
1779 *idx = 0;
1780
1781 return 1;
1782 }
1783
1784 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1785 assert(f);
1786 assert(p > 0);
1787
1788 if (p == needle)
1789 return TEST_FOUND;
1790 else if (p < needle)
1791 return TEST_LEFT;
1792 else
1793 return TEST_RIGHT;
1794 }
1795
1796 int journal_file_move_to_entry_by_offset(
1797 JournalFile *f,
1798 uint64_t p,
1799 direction_t direction,
1800 Object **ret,
1801 uint64_t *offset) {
1802
1803 return generic_array_bisect(f,
1804 le64toh(f->header->entry_array_offset),
1805 le64toh(f->header->n_entries),
1806 p,
1807 test_object_offset,
1808 direction,
1809 ret, offset, NULL);
1810 }
1811
1812
1813 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1814 Object *o;
1815 int r;
1816
1817 assert(f);
1818 assert(p > 0);
1819
1820 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1821 if (r < 0)
1822 return r;
1823
1824 if (le64toh(o->entry.seqnum) == needle)
1825 return TEST_FOUND;
1826 else if (le64toh(o->entry.seqnum) < needle)
1827 return TEST_LEFT;
1828 else
1829 return TEST_RIGHT;
1830 }
1831
1832 int journal_file_move_to_entry_by_seqnum(
1833 JournalFile *f,
1834 uint64_t seqnum,
1835 direction_t direction,
1836 Object **ret,
1837 uint64_t *offset) {
1838
1839 return generic_array_bisect(f,
1840 le64toh(f->header->entry_array_offset),
1841 le64toh(f->header->n_entries),
1842 seqnum,
1843 test_object_seqnum,
1844 direction,
1845 ret, offset, NULL);
1846 }
1847
1848 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1849 Object *o;
1850 int r;
1851
1852 assert(f);
1853 assert(p > 0);
1854
1855 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1856 if (r < 0)
1857 return r;
1858
1859 if (le64toh(o->entry.realtime) == needle)
1860 return TEST_FOUND;
1861 else if (le64toh(o->entry.realtime) < needle)
1862 return TEST_LEFT;
1863 else
1864 return TEST_RIGHT;
1865 }
1866
1867 int journal_file_move_to_entry_by_realtime(
1868 JournalFile *f,
1869 uint64_t realtime,
1870 direction_t direction,
1871 Object **ret,
1872 uint64_t *offset) {
1873
1874 return generic_array_bisect(f,
1875 le64toh(f->header->entry_array_offset),
1876 le64toh(f->header->n_entries),
1877 realtime,
1878 test_object_realtime,
1879 direction,
1880 ret, offset, NULL);
1881 }
1882
1883 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1884 Object *o;
1885 int r;
1886
1887 assert(f);
1888 assert(p > 0);
1889
1890 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1891 if (r < 0)
1892 return r;
1893
1894 if (le64toh(o->entry.monotonic) == needle)
1895 return TEST_FOUND;
1896 else if (le64toh(o->entry.monotonic) < needle)
1897 return TEST_LEFT;
1898 else
1899 return TEST_RIGHT;
1900 }
1901
1902 static inline int find_data_object_by_boot_id(
1903 JournalFile *f,
1904 sd_id128_t boot_id,
1905 Object **o,
1906 uint64_t *b) {
1907 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1908
1909 sd_id128_to_string(boot_id, t + 9);
1910 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1911 }
1912
1913 int journal_file_move_to_entry_by_monotonic(
1914 JournalFile *f,
1915 sd_id128_t boot_id,
1916 uint64_t monotonic,
1917 direction_t direction,
1918 Object **ret,
1919 uint64_t *offset) {
1920
1921 Object *o;
1922 int r;
1923
1924 assert(f);
1925
1926 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1927 if (r < 0)
1928 return r;
1929 if (r == 0)
1930 return -ENOENT;
1931
1932 return generic_array_bisect_plus_one(f,
1933 le64toh(o->data.entry_offset),
1934 le64toh(o->data.entry_array_offset),
1935 le64toh(o->data.n_entries),
1936 monotonic,
1937 test_object_monotonic,
1938 direction,
1939 ret, offset, NULL);
1940 }
1941
1942 int journal_file_next_entry(
1943 JournalFile *f,
1944 Object *o, uint64_t p,
1945 direction_t direction,
1946 Object **ret, uint64_t *offset) {
1947
1948 uint64_t i, n, ofs;
1949 int r;
1950
1951 assert(f);
1952 assert(p > 0 || !o);
1953
1954 n = le64toh(f->header->n_entries);
1955 if (n <= 0)
1956 return 0;
1957
1958 if (!o)
1959 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1960 else {
1961 if (o->object.type != OBJECT_ENTRY)
1962 return -EINVAL;
1963
1964 r = generic_array_bisect(f,
1965 le64toh(f->header->entry_array_offset),
1966 le64toh(f->header->n_entries),
1967 p,
1968 test_object_offset,
1969 DIRECTION_DOWN,
1970 NULL, NULL,
1971 &i);
1972 if (r <= 0)
1973 return r;
1974
1975 if (direction == DIRECTION_DOWN) {
1976 if (i >= n - 1)
1977 return 0;
1978
1979 i++;
1980 } else {
1981 if (i <= 0)
1982 return 0;
1983
1984 i--;
1985 }
1986 }
1987
1988 /* And jump to it */
1989 r = generic_array_get(f,
1990 le64toh(f->header->entry_array_offset),
1991 i,
1992 ret, &ofs);
1993 if (r <= 0)
1994 return r;
1995
1996 if (p > 0 &&
1997 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1998 log_debug("%s: entry array corrupted at entry %"PRIu64,
1999 f->path, i);
2000 return -EBADMSG;
2001 }
2002
2003 if (offset)
2004 *offset = ofs;
2005
2006 return 1;
2007 }
2008
2009 int journal_file_skip_entry(
2010 JournalFile *f,
2011 Object *o, uint64_t p,
2012 int64_t skip,
2013 Object **ret, uint64_t *offset) {
2014
2015 uint64_t i, n;
2016 int r;
2017
2018 assert(f);
2019 assert(o);
2020 assert(p > 0);
2021
2022 if (o->object.type != OBJECT_ENTRY)
2023 return -EINVAL;
2024
2025 r = generic_array_bisect(f,
2026 le64toh(f->header->entry_array_offset),
2027 le64toh(f->header->n_entries),
2028 p,
2029 test_object_offset,
2030 DIRECTION_DOWN,
2031 NULL, NULL,
2032 &i);
2033 if (r <= 0)
2034 return r;
2035
2036 /* Calculate new index */
2037 if (skip < 0) {
2038 if ((uint64_t) -skip >= i)
2039 i = 0;
2040 else
2041 i = i - (uint64_t) -skip;
2042 } else
2043 i += (uint64_t) skip;
2044
2045 n = le64toh(f->header->n_entries);
2046 if (n <= 0)
2047 return -EBADMSG;
2048
2049 if (i >= n)
2050 i = n-1;
2051
2052 return generic_array_get(f,
2053 le64toh(f->header->entry_array_offset),
2054 i,
2055 ret, offset);
2056 }
2057
2058 int journal_file_next_entry_for_data(
2059 JournalFile *f,
2060 Object *o, uint64_t p,
2061 uint64_t data_offset,
2062 direction_t direction,
2063 Object **ret, uint64_t *offset) {
2064
2065 uint64_t n, i;
2066 int r;
2067 Object *d;
2068
2069 assert(f);
2070 assert(p > 0 || !o);
2071
2072 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2073 if (r < 0)
2074 return r;
2075
2076 n = le64toh(d->data.n_entries);
2077 if (n <= 0)
2078 return n;
2079
2080 if (!o)
2081 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2082 else {
2083 if (o->object.type != OBJECT_ENTRY)
2084 return -EINVAL;
2085
2086 r = generic_array_bisect_plus_one(f,
2087 le64toh(d->data.entry_offset),
2088 le64toh(d->data.entry_array_offset),
2089 le64toh(d->data.n_entries),
2090 p,
2091 test_object_offset,
2092 DIRECTION_DOWN,
2093 NULL, NULL,
2094 &i);
2095
2096 if (r <= 0)
2097 return r;
2098
2099 if (direction == DIRECTION_DOWN) {
2100 if (i >= n - 1)
2101 return 0;
2102
2103 i++;
2104 } else {
2105 if (i <= 0)
2106 return 0;
2107
2108 i--;
2109 }
2110
2111 }
2112
2113 return generic_array_get_plus_one(f,
2114 le64toh(d->data.entry_offset),
2115 le64toh(d->data.entry_array_offset),
2116 i,
2117 ret, offset);
2118 }
2119
2120 int journal_file_move_to_entry_by_offset_for_data(
2121 JournalFile *f,
2122 uint64_t data_offset,
2123 uint64_t p,
2124 direction_t direction,
2125 Object **ret, uint64_t *offset) {
2126
2127 int r;
2128 Object *d;
2129
2130 assert(f);
2131
2132 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2133 if (r < 0)
2134 return r;
2135
2136 return generic_array_bisect_plus_one(f,
2137 le64toh(d->data.entry_offset),
2138 le64toh(d->data.entry_array_offset),
2139 le64toh(d->data.n_entries),
2140 p,
2141 test_object_offset,
2142 direction,
2143 ret, offset, NULL);
2144 }
2145
2146 int journal_file_move_to_entry_by_monotonic_for_data(
2147 JournalFile *f,
2148 uint64_t data_offset,
2149 sd_id128_t boot_id,
2150 uint64_t monotonic,
2151 direction_t direction,
2152 Object **ret, uint64_t *offset) {
2153
2154 Object *o, *d;
2155 int r;
2156 uint64_t b, z;
2157
2158 assert(f);
2159
2160 /* First, seek by time */
2161 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2162 if (r < 0)
2163 return r;
2164 if (r == 0)
2165 return -ENOENT;
2166
2167 r = generic_array_bisect_plus_one(f,
2168 le64toh(o->data.entry_offset),
2169 le64toh(o->data.entry_array_offset),
2170 le64toh(o->data.n_entries),
2171 monotonic,
2172 test_object_monotonic,
2173 direction,
2174 NULL, &z, NULL);
2175 if (r <= 0)
2176 return r;
2177
2178 /* And now, continue seeking until we find an entry that
2179 * exists in both bisection arrays */
2180
2181 for (;;) {
2182 Object *qo;
2183 uint64_t p, q;
2184
2185 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2186 if (r < 0)
2187 return r;
2188
2189 r = generic_array_bisect_plus_one(f,
2190 le64toh(d->data.entry_offset),
2191 le64toh(d->data.entry_array_offset),
2192 le64toh(d->data.n_entries),
2193 z,
2194 test_object_offset,
2195 direction,
2196 NULL, &p, NULL);
2197 if (r <= 0)
2198 return r;
2199
2200 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2201 if (r < 0)
2202 return r;
2203
2204 r = generic_array_bisect_plus_one(f,
2205 le64toh(o->data.entry_offset),
2206 le64toh(o->data.entry_array_offset),
2207 le64toh(o->data.n_entries),
2208 p,
2209 test_object_offset,
2210 direction,
2211 &qo, &q, NULL);
2212
2213 if (r <= 0)
2214 return r;
2215
2216 if (p == q) {
2217 if (ret)
2218 *ret = qo;
2219 if (offset)
2220 *offset = q;
2221
2222 return 1;
2223 }
2224
2225 z = q;
2226 }
2227 }
2228
2229 int journal_file_move_to_entry_by_seqnum_for_data(
2230 JournalFile *f,
2231 uint64_t data_offset,
2232 uint64_t seqnum,
2233 direction_t direction,
2234 Object **ret, uint64_t *offset) {
2235
2236 Object *d;
2237 int r;
2238
2239 assert(f);
2240
2241 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2242 if (r < 0)
2243 return r;
2244
2245 return generic_array_bisect_plus_one(f,
2246 le64toh(d->data.entry_offset),
2247 le64toh(d->data.entry_array_offset),
2248 le64toh(d->data.n_entries),
2249 seqnum,
2250 test_object_seqnum,
2251 direction,
2252 ret, offset, NULL);
2253 }
2254
2255 int journal_file_move_to_entry_by_realtime_for_data(
2256 JournalFile *f,
2257 uint64_t data_offset,
2258 uint64_t realtime,
2259 direction_t direction,
2260 Object **ret, uint64_t *offset) {
2261
2262 Object *d;
2263 int r;
2264
2265 assert(f);
2266
2267 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2268 if (r < 0)
2269 return r;
2270
2271 return generic_array_bisect_plus_one(f,
2272 le64toh(d->data.entry_offset),
2273 le64toh(d->data.entry_array_offset),
2274 le64toh(d->data.n_entries),
2275 realtime,
2276 test_object_realtime,
2277 direction,
2278 ret, offset, NULL);
2279 }
2280
2281 void journal_file_dump(JournalFile *f) {
2282 Object *o;
2283 int r;
2284 uint64_t p;
2285
2286 assert(f);
2287
2288 journal_file_print_header(f);
2289
2290 p = le64toh(f->header->header_size);
2291 while (p != 0) {
2292 r = journal_file_move_to_object(f, -1, p, &o);
2293 if (r < 0)
2294 goto fail;
2295
2296 switch (o->object.type) {
2297
2298 case OBJECT_UNUSED:
2299 printf("Type: OBJECT_UNUSED\n");
2300 break;
2301
2302 case OBJECT_DATA:
2303 printf("Type: OBJECT_DATA\n");
2304 break;
2305
2306 case OBJECT_FIELD:
2307 printf("Type: OBJECT_FIELD\n");
2308 break;
2309
2310 case OBJECT_ENTRY:
2311 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2312 le64toh(o->entry.seqnum),
2313 le64toh(o->entry.monotonic),
2314 le64toh(o->entry.realtime));
2315 break;
2316
2317 case OBJECT_FIELD_HASH_TABLE:
2318 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2319 break;
2320
2321 case OBJECT_DATA_HASH_TABLE:
2322 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2323 break;
2324
2325 case OBJECT_ENTRY_ARRAY:
2326 printf("Type: OBJECT_ENTRY_ARRAY\n");
2327 break;
2328
2329 case OBJECT_TAG:
2330 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2331 le64toh(o->tag.seqnum),
2332 le64toh(o->tag.epoch));
2333 break;
2334
2335 default:
2336 printf("Type: unknown (%u)\n", o->object.type);
2337 break;
2338 }
2339
2340 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2341 printf("Flags: %s\n",
2342 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2343
2344 if (p == le64toh(f->header->tail_object_offset))
2345 p = 0;
2346 else
2347 p = p + ALIGN64(le64toh(o->object.size));
2348 }
2349
2350 return;
2351 fail:
2352 log_error("File corrupt");
2353 }
2354
2355 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2356 const char *x;
2357
2358 x = format_timestamp(buf, l, t);
2359 if (x)
2360 return x;
2361 return " --- ";
2362 }
2363
2364 void journal_file_print_header(JournalFile *f) {
2365 char a[33], b[33], c[33], d[33];
2366 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2367 struct stat st;
2368 char bytes[FORMAT_BYTES_MAX];
2369
2370 assert(f);
2371
2372 printf("File Path: %s\n"
2373 "File ID: %s\n"
2374 "Machine ID: %s\n"
2375 "Boot ID: %s\n"
2376 "Sequential Number ID: %s\n"
2377 "State: %s\n"
2378 "Compatible Flags:%s%s\n"
2379 "Incompatible Flags:%s%s%s\n"
2380 "Header size: %"PRIu64"\n"
2381 "Arena size: %"PRIu64"\n"
2382 "Data Hash Table Size: %"PRIu64"\n"
2383 "Field Hash Table Size: %"PRIu64"\n"
2384 "Rotate Suggested: %s\n"
2385 "Head Sequential Number: %"PRIu64"\n"
2386 "Tail Sequential Number: %"PRIu64"\n"
2387 "Head Realtime Timestamp: %s\n"
2388 "Tail Realtime Timestamp: %s\n"
2389 "Tail Monotonic Timestamp: %s\n"
2390 "Objects: %"PRIu64"\n"
2391 "Entry Objects: %"PRIu64"\n",
2392 f->path,
2393 sd_id128_to_string(f->header->file_id, a),
2394 sd_id128_to_string(f->header->machine_id, b),
2395 sd_id128_to_string(f->header->boot_id, c),
2396 sd_id128_to_string(f->header->seqnum_id, d),
2397 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2398 f->header->state == STATE_ONLINE ? "ONLINE" :
2399 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2400 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2401 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2402 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2403 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2404 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2405 le64toh(f->header->header_size),
2406 le64toh(f->header->arena_size),
2407 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2408 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2409 yes_no(journal_file_rotate_suggested(f, 0)),
2410 le64toh(f->header->head_entry_seqnum),
2411 le64toh(f->header->tail_entry_seqnum),
2412 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2413 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2414 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2415 le64toh(f->header->n_objects),
2416 le64toh(f->header->n_entries));
2417
2418 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2419 printf("Data Objects: %"PRIu64"\n"
2420 "Data Hash Table Fill: %.1f%%\n",
2421 le64toh(f->header->n_data),
2422 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2423
2424 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2425 printf("Field Objects: %"PRIu64"\n"
2426 "Field Hash Table Fill: %.1f%%\n",
2427 le64toh(f->header->n_fields),
2428 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2429
2430 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2431 printf("Tag Objects: %"PRIu64"\n",
2432 le64toh(f->header->n_tags));
2433 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2434 printf("Entry Array Objects: %"PRIu64"\n",
2435 le64toh(f->header->n_entry_arrays));
2436
2437 if (fstat(f->fd, &st) >= 0)
2438 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2439 }
2440
2441 int journal_file_open(
2442 const char *fname,
2443 int flags,
2444 mode_t mode,
2445 bool compress,
2446 bool seal,
2447 JournalMetrics *metrics,
2448 MMapCache *mmap_cache,
2449 JournalFile *template,
2450 JournalFile **ret) {
2451
2452 JournalFile *f;
2453 int r;
2454 bool newly_created = false;
2455
2456 assert(fname);
2457 assert(ret);
2458
2459 if ((flags & O_ACCMODE) != O_RDONLY &&
2460 (flags & O_ACCMODE) != O_RDWR)
2461 return -EINVAL;
2462
2463 if (!endswith(fname, ".journal") &&
2464 !endswith(fname, ".journal~"))
2465 return -EINVAL;
2466
2467 f = new0(JournalFile, 1);
2468 if (!f)
2469 return -ENOMEM;
2470
2471 f->fd = -1;
2472 f->mode = mode;
2473
2474 f->flags = flags;
2475 f->prot = prot_from_flags(flags);
2476 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2477 #if defined(HAVE_LZ4)
2478 f->compress_lz4 = compress;
2479 #elif defined(HAVE_XZ)
2480 f->compress_xz = compress;
2481 #endif
2482 #ifdef HAVE_GCRYPT
2483 f->seal = seal;
2484 #endif
2485
2486 if (mmap_cache)
2487 f->mmap = mmap_cache_ref(mmap_cache);
2488 else {
2489 f->mmap = mmap_cache_new();
2490 if (!f->mmap) {
2491 r = -ENOMEM;
2492 goto fail;
2493 }
2494 }
2495
2496 f->path = strdup(fname);
2497 if (!f->path) {
2498 r = -ENOMEM;
2499 goto fail;
2500 }
2501
2502 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2503 if (!f->chain_cache) {
2504 r = -ENOMEM;
2505 goto fail;
2506 }
2507
2508 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2509 if (f->fd < 0) {
2510 r = -errno;
2511 goto fail;
2512 }
2513
2514 if (fstat(f->fd, &f->last_stat) < 0) {
2515 r = -errno;
2516 goto fail;
2517 }
2518
2519 if (f->last_stat.st_size == 0 && f->writable) {
2520 uint64_t crtime;
2521
2522 /* Let's attach the creation time to the journal file,
2523 * so that the vacuuming code knows the age of this
2524 * file even if the file might end up corrupted one
2525 * day... Ideally we'd just use the creation time many
2526 * file systems maintain for each file, but there is
2527 * currently no usable API to query this, hence let's
2528 * emulate this via extended attributes. If extended
2529 * attributes are not supported we'll just skip this,
2530 * and rely solely on mtime/atime/ctime of the file.*/
2531
2532 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2533 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2534
2535 #ifdef HAVE_GCRYPT
2536 /* Try to load the FSPRG state, and if we can't, then
2537 * just don't do sealing */
2538 if (f->seal) {
2539 r = journal_file_fss_load(f);
2540 if (r < 0)
2541 f->seal = false;
2542 }
2543 #endif
2544
2545 r = journal_file_init_header(f, template);
2546 if (r < 0)
2547 goto fail;
2548
2549 if (fstat(f->fd, &f->last_stat) < 0) {
2550 r = -errno;
2551 goto fail;
2552 }
2553
2554 newly_created = true;
2555 }
2556
2557 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2558 r = -EIO;
2559 goto fail;
2560 }
2561
2562 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2563 if (f->header == MAP_FAILED) {
2564 f->header = NULL;
2565 r = -errno;
2566 goto fail;
2567 }
2568
2569 if (!newly_created) {
2570 r = journal_file_verify_header(f);
2571 if (r < 0)
2572 goto fail;
2573 }
2574
2575 #ifdef HAVE_GCRYPT
2576 if (!newly_created && f->writable) {
2577 r = journal_file_fss_load(f);
2578 if (r < 0)
2579 goto fail;
2580 }
2581 #endif
2582
2583 if (f->writable) {
2584 if (metrics) {
2585 journal_default_metrics(metrics, f->fd);
2586 f->metrics = *metrics;
2587 } else if (template)
2588 f->metrics = template->metrics;
2589
2590 r = journal_file_refresh_header(f);
2591 if (r < 0)
2592 goto fail;
2593 }
2594
2595 #ifdef HAVE_GCRYPT
2596 r = journal_file_hmac_setup(f);
2597 if (r < 0)
2598 goto fail;
2599 #endif
2600
2601 if (newly_created) {
2602 r = journal_file_setup_field_hash_table(f);
2603 if (r < 0)
2604 goto fail;
2605
2606 r = journal_file_setup_data_hash_table(f);
2607 if (r < 0)
2608 goto fail;
2609
2610 #ifdef HAVE_GCRYPT
2611 r = journal_file_append_first_tag(f);
2612 if (r < 0)
2613 goto fail;
2614 #endif
2615 }
2616
2617 r = journal_file_map_field_hash_table(f);
2618 if (r < 0)
2619 goto fail;
2620
2621 r = journal_file_map_data_hash_table(f);
2622 if (r < 0)
2623 goto fail;
2624
2625 *ret = f;
2626 return 0;
2627
2628 fail:
2629 journal_file_close(f);
2630
2631 return r;
2632 }
2633
2634 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2635 _cleanup_free_ char *p = NULL;
2636 size_t l;
2637 JournalFile *old_file, *new_file = NULL;
2638 int r;
2639
2640 assert(f);
2641 assert(*f);
2642
2643 old_file = *f;
2644
2645 if (!old_file->writable)
2646 return -EINVAL;
2647
2648 if (!endswith(old_file->path, ".journal"))
2649 return -EINVAL;
2650
2651 l = strlen(old_file->path);
2652 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2653 (int) l - 8, old_file->path,
2654 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2655 le64toh((*f)->header->head_entry_seqnum),
2656 le64toh((*f)->header->head_entry_realtime));
2657 if (r < 0)
2658 return -ENOMEM;
2659
2660 r = rename(old_file->path, p);
2661 if (r < 0)
2662 return -errno;
2663
2664 old_file->header->state = STATE_ARCHIVED;
2665
2666 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2667 journal_file_close(old_file);
2668
2669 *f = new_file;
2670 return r;
2671 }
2672
2673 int journal_file_open_reliably(
2674 const char *fname,
2675 int flags,
2676 mode_t mode,
2677 bool compress,
2678 bool seal,
2679 JournalMetrics *metrics,
2680 MMapCache *mmap_cache,
2681 JournalFile *template,
2682 JournalFile **ret) {
2683
2684 int r;
2685 size_t l;
2686 _cleanup_free_ char *p = NULL;
2687
2688 r = journal_file_open(fname, flags, mode, compress, seal,
2689 metrics, mmap_cache, template, ret);
2690 if (r != -EBADMSG && /* corrupted */
2691 r != -ENODATA && /* truncated */
2692 r != -EHOSTDOWN && /* other machine */
2693 r != -EPROTONOSUPPORT && /* incompatible feature */
2694 r != -EBUSY && /* unclean shutdown */
2695 r != -ESHUTDOWN /* already archived */)
2696 return r;
2697
2698 if ((flags & O_ACCMODE) == O_RDONLY)
2699 return r;
2700
2701 if (!(flags & O_CREAT))
2702 return r;
2703
2704 if (!endswith(fname, ".journal"))
2705 return r;
2706
2707 /* The file is corrupted. Rotate it away and try it again (but only once) */
2708
2709 l = strlen(fname);
2710 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2711 (int) l - 8, fname,
2712 (unsigned long long) now(CLOCK_REALTIME),
2713 random_u64()) < 0)
2714 return -ENOMEM;
2715
2716 r = rename(fname, p);
2717 if (r < 0)
2718 return -errno;
2719
2720 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2721
2722 return journal_file_open(fname, flags, mode, compress, seal,
2723 metrics, mmap_cache, template, ret);
2724 }
2725
2726 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2727 uint64_t i, n;
2728 uint64_t q, xor_hash = 0;
2729 int r;
2730 EntryItem *items;
2731 dual_timestamp ts;
2732
2733 assert(from);
2734 assert(to);
2735 assert(o);
2736 assert(p);
2737
2738 if (!to->writable)
2739 return -EPERM;
2740
2741 ts.monotonic = le64toh(o->entry.monotonic);
2742 ts.realtime = le64toh(o->entry.realtime);
2743
2744 n = journal_file_entry_n_items(o);
2745 /* alloca() can't take 0, hence let's allocate at least one */
2746 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2747
2748 for (i = 0; i < n; i++) {
2749 uint64_t l, h;
2750 le64_t le_hash;
2751 size_t t;
2752 void *data;
2753 Object *u;
2754
2755 q = le64toh(o->entry.items[i].object_offset);
2756 le_hash = o->entry.items[i].hash;
2757
2758 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2759 if (r < 0)
2760 return r;
2761
2762 if (le_hash != o->data.hash)
2763 return -EBADMSG;
2764
2765 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2766 t = (size_t) l;
2767
2768 /* We hit the limit on 32bit machines */
2769 if ((uint64_t) t != l)
2770 return -E2BIG;
2771
2772 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2773 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2774 size_t rsize;
2775
2776 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2777 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2778 if (r < 0)
2779 return r;
2780
2781 data = from->compress_buffer;
2782 l = rsize;
2783 #else
2784 return -EPROTONOSUPPORT;
2785 #endif
2786 } else
2787 data = o->data.payload;
2788
2789 r = journal_file_append_data(to, data, l, &u, &h);
2790 if (r < 0)
2791 return r;
2792
2793 xor_hash ^= le64toh(u->data.hash);
2794 items[i].object_offset = htole64(h);
2795 items[i].hash = u->data.hash;
2796
2797 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2798 if (r < 0)
2799 return r;
2800 }
2801
2802 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2803 }
2804
2805 void journal_default_metrics(JournalMetrics *m, int fd) {
2806 uint64_t fs_size = 0;
2807 struct statvfs ss;
2808 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2809
2810 assert(m);
2811 assert(fd >= 0);
2812
2813 if (fstatvfs(fd, &ss) >= 0)
2814 fs_size = ss.f_frsize * ss.f_blocks;
2815
2816 if (m->max_use == (uint64_t) -1) {
2817
2818 if (fs_size > 0) {
2819 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2820
2821 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2822 m->max_use = DEFAULT_MAX_USE_UPPER;
2823
2824 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2825 m->max_use = DEFAULT_MAX_USE_LOWER;
2826 } else
2827 m->max_use = DEFAULT_MAX_USE_LOWER;
2828 } else {
2829 m->max_use = PAGE_ALIGN(m->max_use);
2830
2831 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2832 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2833 }
2834
2835 if (m->max_size == (uint64_t) -1) {
2836 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2837
2838 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2839 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2840 } else
2841 m->max_size = PAGE_ALIGN(m->max_size);
2842
2843 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2844 m->max_size = JOURNAL_FILE_SIZE_MIN;
2845
2846 if (m->max_size*2 > m->max_use)
2847 m->max_use = m->max_size*2;
2848
2849 if (m->min_size == (uint64_t) -1)
2850 m->min_size = JOURNAL_FILE_SIZE_MIN;
2851 else {
2852 m->min_size = PAGE_ALIGN(m->min_size);
2853
2854 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2855 m->min_size = JOURNAL_FILE_SIZE_MIN;
2856
2857 if (m->min_size > m->max_size)
2858 m->max_size = m->min_size;
2859 }
2860
2861 if (m->keep_free == (uint64_t) -1) {
2862
2863 if (fs_size > 0) {
2864 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2865
2866 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2867 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2868
2869 } else
2870 m->keep_free = DEFAULT_KEEP_FREE;
2871 }
2872
2873 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2874 format_bytes(a, sizeof(a), m->max_use),
2875 format_bytes(b, sizeof(b), m->max_size),
2876 format_bytes(c, sizeof(c), m->min_size),
2877 format_bytes(d, sizeof(d), m->keep_free));
2878 }
2879
2880 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2881 assert(f);
2882 assert(from || to);
2883
2884 if (from) {
2885 if (f->header->head_entry_realtime == 0)
2886 return -ENOENT;
2887
2888 *from = le64toh(f->header->head_entry_realtime);
2889 }
2890
2891 if (to) {
2892 if (f->header->tail_entry_realtime == 0)
2893 return -ENOENT;
2894
2895 *to = le64toh(f->header->tail_entry_realtime);
2896 }
2897
2898 return 1;
2899 }
2900
2901 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2902 Object *o;
2903 uint64_t p;
2904 int r;
2905
2906 assert(f);
2907 assert(from || to);
2908
2909 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2910 if (r <= 0)
2911 return r;
2912
2913 if (le64toh(o->data.n_entries) <= 0)
2914 return 0;
2915
2916 if (from) {
2917 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2918 if (r < 0)
2919 return r;
2920
2921 *from = le64toh(o->entry.monotonic);
2922 }
2923
2924 if (to) {
2925 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2926 if (r < 0)
2927 return r;
2928
2929 r = generic_array_get_plus_one(f,
2930 le64toh(o->data.entry_offset),
2931 le64toh(o->data.entry_array_offset),
2932 le64toh(o->data.n_entries)-1,
2933 &o, NULL);
2934 if (r <= 0)
2935 return r;
2936
2937 *to = le64toh(o->entry.monotonic);
2938 }
2939
2940 return 1;
2941 }
2942
2943 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2944 assert(f);
2945
2946 /* If we gained new header fields we gained new features,
2947 * hence suggest a rotation */
2948 if (le64toh(f->header->header_size) < sizeof(Header)) {
2949 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2950 return true;
2951 }
2952
2953 /* Let's check if the hash tables grew over a certain fill
2954 * level (75%, borrowing this value from Java's hash table
2955 * implementation), and if so suggest a rotation. To calculate
2956 * the fill level we need the n_data field, which only exists
2957 * in newer versions. */
2958
2959 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2960 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2961 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2962 f->path,
2963 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2964 le64toh(f->header->n_data),
2965 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2966 (unsigned long long) f->last_stat.st_size,
2967 f->last_stat.st_size / le64toh(f->header->n_data));
2968 return true;
2969 }
2970
2971 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2972 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2973 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2974 f->path,
2975 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2976 le64toh(f->header->n_fields),
2977 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2978 return true;
2979 }
2980
2981 /* Are the data objects properly indexed by field objects? */
2982 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2983 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2984 le64toh(f->header->n_data) > 0 &&
2985 le64toh(f->header->n_fields) == 0)
2986 return true;
2987
2988 if (max_file_usec > 0) {
2989 usec_t t, h;
2990
2991 h = le64toh(f->header->head_entry_realtime);
2992 t = now(CLOCK_REALTIME);
2993
2994 if (h > 0 && t > h + max_file_usec)
2995 return true;
2996 }
2997
2998 return false;
2999 }