]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
util: make creation time xattr logic more generic
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
/* Default sizes, in bytes, of the data and the field hash table */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL * sizeof(HashItem))

/* Data payloads shorter than this are never handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have: 4 MiB */
#define JOURNAL_FILE_SIZE_MIN (4ULL * 1024ULL * 1024ULL)

/* Lower and upper bound applied when the max_use value is deduced
 * from the file system size: 1 MiB and 4 GiB */
#define DEFAULT_MAX_USE_LOWER (1ULL * 1024ULL * 1024ULL)
#define DEFAULT_MAX_USE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)

/* Upper bound applied when max_size is deduced from max_use: 128 MiB */
#define DEFAULT_MAX_SIZE_UPPER (128ULL * 1024ULL * 1024ULL)

/* Upper bound applied when the keep_free value is deduced from the
 * file system size: 4 GiB */
#define DEFAULT_KEEP_FREE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)

/* keep_free value used when the file system size cannot be
 * determined: 1 MiB */
#define DEFAULT_KEEP_FREE (1024ULL * 1024ULL)

/* n_data was the first field appended to the header after the initial
 * file format design, hence everything before it must always exist */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* Maximum number of entries kept in the entry array chain cache */
#define CHAIN_CACHE_MAX 20

/* Granularity by which the journal file grows whenever new space has
 * to be allocated: 8 MiB */
#define FILE_SIZE_INCREASE (8ULL * 1024ULL * 1024ULL)
69
70 static int journal_file_set_online(JournalFile *f) {
71 assert(f);
72
73 if (!f->writable)
74 return -EPERM;
75
76 if (!(f->fd >= 0 && f->header))
77 return -EINVAL;
78
79 switch(f->header->state) {
80 case STATE_ONLINE:
81 return 0;
82
83 case STATE_OFFLINE:
84 f->header->state = STATE_ONLINE;
85 fsync(f->fd);
86 return 0;
87
88 default:
89 return -EINVAL;
90 }
91 }
92
93 int journal_file_set_offline(JournalFile *f) {
94 assert(f);
95
96 if (!f->writable)
97 return -EPERM;
98
99 if (!(f->fd >= 0 && f->header))
100 return -EINVAL;
101
102 if (f->header->state != STATE_ONLINE)
103 return 0;
104
105 fsync(f->fd);
106
107 f->header->state = STATE_OFFLINE;
108
109 fsync(f->fd);
110
111 return 0;
112 }
113
114 void journal_file_close(JournalFile *f) {
115 assert(f);
116
117 #ifdef HAVE_GCRYPT
118 /* Write the final tag */
119 if (f->seal && f->writable)
120 journal_file_append_tag(f);
121 #endif
122
123 /* Sync everything to disk, before we mark the file offline */
124 if (f->mmap && f->fd >= 0)
125 mmap_cache_close_fd(f->mmap, f->fd);
126
127 journal_file_set_offline(f);
128
129 if (f->header)
130 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
131
132 safe_close(f->fd);
133 free(f->path);
134
135 if (f->mmap)
136 mmap_cache_unref(f->mmap);
137
138 ordered_hashmap_free_free(f->chain_cache);
139
140 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
141 free(f->compress_buffer);
142 #endif
143
144 #ifdef HAVE_GCRYPT
145 if (f->fss_file)
146 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
147 else if (f->fsprg_state)
148 free(f->fsprg_state);
149
150 free(f->fsprg_seed);
151
152 if (f->hmac)
153 gcry_md_close(f->hmac);
154 #endif
155
156 free(f);
157 }
158
159 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
160 Header h = {};
161 ssize_t k;
162 int r;
163
164 assert(f);
165
166 memcpy(h.signature, HEADER_SIGNATURE, 8);
167 h.header_size = htole64(ALIGN64(sizeof(h)));
168
169 h.incompatible_flags |= htole32(
170 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
171 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
172
173 h.compatible_flags = htole32(
174 f->seal * HEADER_COMPATIBLE_SEALED);
175
176 r = sd_id128_randomize(&h.file_id);
177 if (r < 0)
178 return r;
179
180 if (template) {
181 h.seqnum_id = template->header->seqnum_id;
182 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
183 } else
184 h.seqnum_id = h.file_id;
185
186 k = pwrite(f->fd, &h, sizeof(h), 0);
187 if (k < 0)
188 return -errno;
189
190 if (k != sizeof(h))
191 return -EIO;
192
193 return 0;
194 }
195
196 static int journal_file_refresh_header(JournalFile *f) {
197 int r;
198 sd_id128_t boot_id;
199
200 assert(f);
201
202 r = sd_id128_get_machine(&f->header->machine_id);
203 if (r < 0)
204 return r;
205
206 r = sd_id128_get_boot(&boot_id);
207 if (r < 0)
208 return r;
209
210 if (sd_id128_equal(boot_id, f->header->boot_id))
211 f->tail_entry_monotonic_valid = true;
212
213 f->header->boot_id = boot_id;
214
215 journal_file_set_online(f);
216
217 /* Sync the online state to disk */
218 fsync(f->fd);
219
220 return 0;
221 }
222
223 static int journal_file_verify_header(JournalFile *f) {
224 uint32_t flags;
225
226 assert(f);
227
228 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
229 return -EBADMSG;
230
231 /* In both read and write mode we refuse to open files with
232 * incompatible flags we don't know */
233 flags = le32toh(f->header->incompatible_flags);
234 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
235 if (flags & ~HEADER_INCOMPATIBLE_ANY)
236 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
237 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
238 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
239 if (flags)
240 log_debug("Journal file %s uses incompatible flags %"PRIx32
241 " disabled at compilation time.", f->path, flags);
242 return -EPROTONOSUPPORT;
243 }
244
245 /* When open for writing we refuse to open files with
246 * compatible flags, too */
247 flags = le32toh(f->header->compatible_flags);
248 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
249 if (flags & ~HEADER_COMPATIBLE_ANY)
250 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
251 f->path, flags & ~HEADER_COMPATIBLE_ANY);
252 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
253 if (flags)
254 log_debug("Journal file %s uses compatible flags %"PRIx32
255 " disabled at compilation time.", f->path, flags);
256 return -EPROTONOSUPPORT;
257 }
258
259 if (f->header->state >= _STATE_MAX)
260 return -EBADMSG;
261
262 /* The first addition was n_data, so check that we are at least this large */
263 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
264 return -EBADMSG;
265
266 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
267 return -EBADMSG;
268
269 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
270 return -ENODATA;
271
272 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
273 return -ENODATA;
274
275 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
276 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->tail_object_offset)) ||
278 !VALID64(le64toh(f->header->entry_array_offset)))
279 return -ENODATA;
280
281 if (f->writable) {
282 uint8_t state;
283 sd_id128_t machine_id;
284 int r;
285
286 r = sd_id128_get_machine(&machine_id);
287 if (r < 0)
288 return r;
289
290 if (!sd_id128_equal(machine_id, f->header->machine_id))
291 return -EHOSTDOWN;
292
293 state = f->header->state;
294
295 if (state == STATE_ONLINE) {
296 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
297 return -EBUSY;
298 } else if (state == STATE_ARCHIVED)
299 return -ESHUTDOWN;
300 else if (state != STATE_OFFLINE) {
301 log_debug("Journal file %s has unknown state %u.", f->path, state);
302 return -EBUSY;
303 }
304 }
305
306 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
307 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
308
309 f->seal = JOURNAL_HEADER_SEALED(f->header);
310
311 return 0;
312 }
313
314 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
315 uint64_t old_size, new_size;
316 int r;
317
318 assert(f);
319
320 /* We assume that this file is not sparse, and we know that
321 * for sure, since we always call posix_fallocate()
322 * ourselves */
323
324 old_size =
325 le64toh(f->header->header_size) +
326 le64toh(f->header->arena_size);
327
328 new_size = PAGE_ALIGN(offset + size);
329 if (new_size < le64toh(f->header->header_size))
330 new_size = le64toh(f->header->header_size);
331
332 if (new_size <= old_size)
333 return 0;
334
335 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
336 return -E2BIG;
337
338 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
339 struct statvfs svfs;
340
341 if (fstatvfs(f->fd, &svfs) >= 0) {
342 uint64_t available;
343
344 available = svfs.f_bfree * svfs.f_bsize;
345
346 if (available >= f->metrics.keep_free)
347 available -= f->metrics.keep_free;
348 else
349 available = 0;
350
351 if (new_size - old_size > available)
352 return -E2BIG;
353 }
354 }
355
356 /* Increase by larger blocks at once */
357 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
358 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
359 new_size = f->metrics.max_size;
360
361 /* Note that the glibc fallocate() fallback is very
362 inefficient, hence we try to minimize the allocation area
363 as we can. */
364 r = posix_fallocate(f->fd, old_size, new_size - old_size);
365 if (r != 0)
366 return -r;
367
368 if (fstat(f->fd, &f->last_stat) < 0)
369 return -errno;
370
371 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
372
373 return 0;
374 }
375
376 static unsigned type_to_context(ObjectType type) {
377 /* One context for each type, plus one catch-all for the rest */
378 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
379 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
380 }
381
382 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
383 assert(f);
384 assert(ret);
385
386 if (size <= 0)
387 return -EINVAL;
388
389 /* Avoid SIGBUS on invalid accesses */
390 if (offset + size > (uint64_t) f->last_stat.st_size) {
391 /* Hmm, out of range? Let's refresh the fstat() data
392 * first, before we trust that check. */
393
394 if (fstat(f->fd, &f->last_stat) < 0 ||
395 offset + size > (uint64_t) f->last_stat.st_size)
396 return -EADDRNOTAVAIL;
397 }
398
399 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
400 }
401
402 static uint64_t minimum_header_size(Object *o) {
403
404 static const uint64_t table[] = {
405 [OBJECT_DATA] = sizeof(DataObject),
406 [OBJECT_FIELD] = sizeof(FieldObject),
407 [OBJECT_ENTRY] = sizeof(EntryObject),
408 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
409 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
411 [OBJECT_TAG] = sizeof(TagObject),
412 };
413
414 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
415 return sizeof(ObjectHeader);
416
417 return table[o->object.type];
418 }
419
420 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
421 int r;
422 void *t;
423 Object *o;
424 uint64_t s;
425
426 assert(f);
427 assert(ret);
428
429 /* Objects may only be located at multiple of 64 bit */
430 if (!VALID64(offset))
431 return -EFAULT;
432
433 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
434 if (r < 0)
435 return r;
436
437 o = (Object*) t;
438 s = le64toh(o->object.size);
439
440 if (s < sizeof(ObjectHeader))
441 return -EBADMSG;
442
443 if (o->object.type <= OBJECT_UNUSED)
444 return -EBADMSG;
445
446 if (s < minimum_header_size(o))
447 return -EBADMSG;
448
449 if (type > OBJECT_UNUSED && o->object.type != type)
450 return -EBADMSG;
451
452 if (s > sizeof(ObjectHeader)) {
453 r = journal_file_move_to(f, type, false, offset, s, &t);
454 if (r < 0)
455 return r;
456
457 o = (Object*) t;
458 }
459
460 *ret = o;
461 return 0;
462 }
463
464 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
465 uint64_t r;
466
467 assert(f);
468
469 r = le64toh(f->header->tail_entry_seqnum) + 1;
470
471 if (seqnum) {
472 /* If an external seqnum counter was passed, we update
473 * both the local and the external one, and set it to
474 * the maximum of both */
475
476 if (*seqnum + 1 > r)
477 r = *seqnum + 1;
478
479 *seqnum = r;
480 }
481
482 f->header->tail_entry_seqnum = htole64(r);
483
484 if (f->header->head_entry_seqnum == 0)
485 f->header->head_entry_seqnum = htole64(r);
486
487 return r;
488 }
489
490 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
491 int r;
492 uint64_t p;
493 Object *tail, *o;
494 void *t;
495
496 assert(f);
497 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
498 assert(size >= sizeof(ObjectHeader));
499 assert(offset);
500 assert(ret);
501
502 r = journal_file_set_online(f);
503 if (r < 0)
504 return r;
505
506 p = le64toh(f->header->tail_object_offset);
507 if (p == 0)
508 p = le64toh(f->header->header_size);
509 else {
510 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
511 if (r < 0)
512 return r;
513
514 p += ALIGN64(le64toh(tail->object.size));
515 }
516
517 r = journal_file_allocate(f, p, size);
518 if (r < 0)
519 return r;
520
521 r = journal_file_move_to(f, type, false, p, size, &t);
522 if (r < 0)
523 return r;
524
525 o = (Object*) t;
526
527 zero(o->object);
528 o->object.type = type;
529 o->object.size = htole64(size);
530
531 f->header->tail_object_offset = htole64(p);
532 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
533
534 *ret = o;
535 *offset = p;
536
537 return 0;
538 }
539
540 static int journal_file_setup_data_hash_table(JournalFile *f) {
541 uint64_t s, p;
542 Object *o;
543 int r;
544
545 assert(f);
546
547 /* We estimate that we need 1 hash table entry per 768 of
548 journal file and we want to make sure we never get beyond
549 75% fill level. Calculate the hash table size for the
550 maximum file size based on these metrics. */
551
552 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
553 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
554 s = DEFAULT_DATA_HASH_TABLE_SIZE;
555
556 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
557
558 r = journal_file_append_object(f,
559 OBJECT_DATA_HASH_TABLE,
560 offsetof(Object, hash_table.items) + s,
561 &o, &p);
562 if (r < 0)
563 return r;
564
565 memzero(o->hash_table.items, s);
566
567 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
568 f->header->data_hash_table_size = htole64(s);
569
570 return 0;
571 }
572
573 static int journal_file_setup_field_hash_table(JournalFile *f) {
574 uint64_t s, p;
575 Object *o;
576 int r;
577
578 assert(f);
579
580 /* We use a fixed size hash table for the fields as this
581 * number should grow very slowly only */
582
583 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
584 r = journal_file_append_object(f,
585 OBJECT_FIELD_HASH_TABLE,
586 offsetof(Object, hash_table.items) + s,
587 &o, &p);
588 if (r < 0)
589 return r;
590
591 memzero(o->hash_table.items, s);
592
593 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
594 f->header->field_hash_table_size = htole64(s);
595
596 return 0;
597 }
598
599 static int journal_file_map_data_hash_table(JournalFile *f) {
600 uint64_t s, p;
601 void *t;
602 int r;
603
604 assert(f);
605
606 p = le64toh(f->header->data_hash_table_offset);
607 s = le64toh(f->header->data_hash_table_size);
608
609 r = journal_file_move_to(f,
610 OBJECT_DATA_HASH_TABLE,
611 true,
612 p, s,
613 &t);
614 if (r < 0)
615 return r;
616
617 f->data_hash_table = t;
618 return 0;
619 }
620
621 static int journal_file_map_field_hash_table(JournalFile *f) {
622 uint64_t s, p;
623 void *t;
624 int r;
625
626 assert(f);
627
628 p = le64toh(f->header->field_hash_table_offset);
629 s = le64toh(f->header->field_hash_table_size);
630
631 r = journal_file_move_to(f,
632 OBJECT_FIELD_HASH_TABLE,
633 true,
634 p, s,
635 &t);
636 if (r < 0)
637 return r;
638
639 f->field_hash_table = t;
640 return 0;
641 }
642
643 static int journal_file_link_field(
644 JournalFile *f,
645 Object *o,
646 uint64_t offset,
647 uint64_t hash) {
648
649 uint64_t p, h;
650 int r;
651
652 assert(f);
653 assert(o);
654 assert(offset > 0);
655
656 if (o->object.type != OBJECT_FIELD)
657 return -EINVAL;
658
659 /* This might alter the window we are looking at */
660
661 o->field.next_hash_offset = o->field.head_data_offset = 0;
662
663 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
664 p = le64toh(f->field_hash_table[h].tail_hash_offset);
665 if (p == 0)
666 f->field_hash_table[h].head_hash_offset = htole64(offset);
667 else {
668 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
669 if (r < 0)
670 return r;
671
672 o->field.next_hash_offset = htole64(offset);
673 }
674
675 f->field_hash_table[h].tail_hash_offset = htole64(offset);
676
677 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
678 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
679
680 return 0;
681 }
682
683 static int journal_file_link_data(
684 JournalFile *f,
685 Object *o,
686 uint64_t offset,
687 uint64_t hash) {
688
689 uint64_t p, h;
690 int r;
691
692 assert(f);
693 assert(o);
694 assert(offset > 0);
695
696 if (o->object.type != OBJECT_DATA)
697 return -EINVAL;
698
699 /* This might alter the window we are looking at */
700
701 o->data.next_hash_offset = o->data.next_field_offset = 0;
702 o->data.entry_offset = o->data.entry_array_offset = 0;
703 o->data.n_entries = 0;
704
705 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
706 p = le64toh(f->data_hash_table[h].tail_hash_offset);
707 if (p == 0)
708 /* Only entry in the hash table is easy */
709 f->data_hash_table[h].head_hash_offset = htole64(offset);
710 else {
711 /* Move back to the previous data object, to patch in
712 * pointer */
713
714 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
715 if (r < 0)
716 return r;
717
718 o->data.next_hash_offset = htole64(offset);
719 }
720
721 f->data_hash_table[h].tail_hash_offset = htole64(offset);
722
723 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
724 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
725
726 return 0;
727 }
728
729 int journal_file_find_field_object_with_hash(
730 JournalFile *f,
731 const void *field, uint64_t size, uint64_t hash,
732 Object **ret, uint64_t *offset) {
733
734 uint64_t p, osize, h;
735 int r;
736
737 assert(f);
738 assert(field && size > 0);
739
740 osize = offsetof(Object, field.payload) + size;
741
742 if (f->header->field_hash_table_size == 0)
743 return -EBADMSG;
744
745 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
746 p = le64toh(f->field_hash_table[h].head_hash_offset);
747
748 while (p > 0) {
749 Object *o;
750
751 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
752 if (r < 0)
753 return r;
754
755 if (le64toh(o->field.hash) == hash &&
756 le64toh(o->object.size) == osize &&
757 memcmp(o->field.payload, field, size) == 0) {
758
759 if (ret)
760 *ret = o;
761 if (offset)
762 *offset = p;
763
764 return 1;
765 }
766
767 p = le64toh(o->field.next_hash_offset);
768 }
769
770 return 0;
771 }
772
773 int journal_file_find_field_object(
774 JournalFile *f,
775 const void *field, uint64_t size,
776 Object **ret, uint64_t *offset) {
777
778 uint64_t hash;
779
780 assert(f);
781 assert(field && size > 0);
782
783 hash = hash64(field, size);
784
785 return journal_file_find_field_object_with_hash(f,
786 field, size, hash,
787 ret, offset);
788 }
789
790 int journal_file_find_data_object_with_hash(
791 JournalFile *f,
792 const void *data, uint64_t size, uint64_t hash,
793 Object **ret, uint64_t *offset) {
794
795 uint64_t p, osize, h;
796 int r;
797
798 assert(f);
799 assert(data || size == 0);
800
801 osize = offsetof(Object, data.payload) + size;
802
803 if (f->header->data_hash_table_size == 0)
804 return -EBADMSG;
805
806 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
807 p = le64toh(f->data_hash_table[h].head_hash_offset);
808
809 while (p > 0) {
810 Object *o;
811
812 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
813 if (r < 0)
814 return r;
815
816 if (le64toh(o->data.hash) != hash)
817 goto next;
818
819 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
820 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
821 uint64_t l;
822 size_t rsize;
823
824 l = le64toh(o->object.size);
825 if (l <= offsetof(Object, data.payload))
826 return -EBADMSG;
827
828 l -= offsetof(Object, data.payload);
829
830 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
831 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
832 if (r < 0)
833 return r;
834
835 if (rsize == size &&
836 memcmp(f->compress_buffer, data, size) == 0) {
837
838 if (ret)
839 *ret = o;
840
841 if (offset)
842 *offset = p;
843
844 return 1;
845 }
846 #else
847 return -EPROTONOSUPPORT;
848 #endif
849 } else if (le64toh(o->object.size) == osize &&
850 memcmp(o->data.payload, data, size) == 0) {
851
852 if (ret)
853 *ret = o;
854
855 if (offset)
856 *offset = p;
857
858 return 1;
859 }
860
861 next:
862 p = le64toh(o->data.next_hash_offset);
863 }
864
865 return 0;
866 }
867
868 int journal_file_find_data_object(
869 JournalFile *f,
870 const void *data, uint64_t size,
871 Object **ret, uint64_t *offset) {
872
873 uint64_t hash;
874
875 assert(f);
876 assert(data || size == 0);
877
878 hash = hash64(data, size);
879
880 return journal_file_find_data_object_with_hash(f,
881 data, size, hash,
882 ret, offset);
883 }
884
885 static int journal_file_append_field(
886 JournalFile *f,
887 const void *field, uint64_t size,
888 Object **ret, uint64_t *offset) {
889
890 uint64_t hash, p;
891 uint64_t osize;
892 Object *o;
893 int r;
894
895 assert(f);
896 assert(field && size > 0);
897
898 hash = hash64(field, size);
899
900 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
901 if (r < 0)
902 return r;
903 else if (r > 0) {
904
905 if (ret)
906 *ret = o;
907
908 if (offset)
909 *offset = p;
910
911 return 0;
912 }
913
914 osize = offsetof(Object, field.payload) + size;
915 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
916 if (r < 0)
917 return r;
918
919 o->field.hash = htole64(hash);
920 memcpy(o->field.payload, field, size);
921
922 r = journal_file_link_field(f, o, p, hash);
923 if (r < 0)
924 return r;
925
926 /* The linking might have altered the window, so let's
927 * refresh our pointer */
928 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
929 if (r < 0)
930 return r;
931
932 #ifdef HAVE_GCRYPT
933 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
934 if (r < 0)
935 return r;
936 #endif
937
938 if (ret)
939 *ret = o;
940
941 if (offset)
942 *offset = p;
943
944 return 0;
945 }
946
947 static int journal_file_append_data(
948 JournalFile *f,
949 const void *data, uint64_t size,
950 Object **ret, uint64_t *offset) {
951
952 uint64_t hash, p;
953 uint64_t osize;
954 Object *o;
955 int r, compression = 0;
956 const void *eq;
957
958 assert(f);
959 assert(data || size == 0);
960
961 hash = hash64(data, size);
962
963 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
964 if (r < 0)
965 return r;
966 else if (r > 0) {
967
968 if (ret)
969 *ret = o;
970
971 if (offset)
972 *offset = p;
973
974 return 0;
975 }
976
977 osize = offsetof(Object, data.payload) + size;
978 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
979 if (r < 0)
980 return r;
981
982 o->data.hash = htole64(hash);
983
984 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
985 if (f->compress_xz &&
986 size >= COMPRESSION_SIZE_THRESHOLD) {
987 size_t rsize;
988
989 compression = compress_blob(data, size, o->data.payload, &rsize);
990
991 if (compression) {
992 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
993 o->object.flags |= compression;
994
995 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
996 size, rsize, object_compressed_to_string(compression));
997 }
998 }
999 #endif
1000
1001 if (!compression && size > 0)
1002 memcpy(o->data.payload, data, size);
1003
1004 r = journal_file_link_data(f, o, p, hash);
1005 if (r < 0)
1006 return r;
1007
1008 /* The linking might have altered the window, so let's
1009 * refresh our pointer */
1010 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1011 if (r < 0)
1012 return r;
1013
1014 if (!data)
1015 eq = NULL;
1016 else
1017 eq = memchr(data, '=', size);
1018 if (eq && eq > data) {
1019 Object *fo = NULL;
1020 uint64_t fp;
1021
1022 /* Create field object ... */
1023 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1024 if (r < 0)
1025 return r;
1026
1027 /* ... and link it in. */
1028 o->data.next_field_offset = fo->field.head_data_offset;
1029 fo->field.head_data_offset = le64toh(p);
1030 }
1031
1032 #ifdef HAVE_GCRYPT
1033 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1034 if (r < 0)
1035 return r;
1036 #endif
1037
1038 if (ret)
1039 *ret = o;
1040
1041 if (offset)
1042 *offset = p;
1043
1044 return 0;
1045 }
1046
1047 uint64_t journal_file_entry_n_items(Object *o) {
1048 assert(o);
1049
1050 if (o->object.type != OBJECT_ENTRY)
1051 return 0;
1052
1053 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1054 }
1055
1056 uint64_t journal_file_entry_array_n_items(Object *o) {
1057 assert(o);
1058
1059 if (o->object.type != OBJECT_ENTRY_ARRAY)
1060 return 0;
1061
1062 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1063 }
1064
1065 uint64_t journal_file_hash_table_n_items(Object *o) {
1066 assert(o);
1067
1068 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1069 o->object.type != OBJECT_FIELD_HASH_TABLE)
1070 return 0;
1071
1072 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1073 }
1074
1075 static int link_entry_into_array(JournalFile *f,
1076 le64_t *first,
1077 le64_t *idx,
1078 uint64_t p) {
1079 int r;
1080 uint64_t n = 0, ap = 0, q, i, a, hidx;
1081 Object *o;
1082
1083 assert(f);
1084 assert(first);
1085 assert(idx);
1086 assert(p > 0);
1087
1088 a = le64toh(*first);
1089 i = hidx = le64toh(*idx);
1090 while (a > 0) {
1091
1092 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1093 if (r < 0)
1094 return r;
1095
1096 n = journal_file_entry_array_n_items(o);
1097 if (i < n) {
1098 o->entry_array.items[i] = htole64(p);
1099 *idx = htole64(hidx + 1);
1100 return 0;
1101 }
1102
1103 i -= n;
1104 ap = a;
1105 a = le64toh(o->entry_array.next_entry_array_offset);
1106 }
1107
1108 if (hidx > n)
1109 n = (hidx+1) * 2;
1110 else
1111 n = n * 2;
1112
1113 if (n < 4)
1114 n = 4;
1115
1116 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1117 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1118 &o, &q);
1119 if (r < 0)
1120 return r;
1121
1122 #ifdef HAVE_GCRYPT
1123 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1124 if (r < 0)
1125 return r;
1126 #endif
1127
1128 o->entry_array.items[i] = htole64(p);
1129
1130 if (ap == 0)
1131 *first = htole64(q);
1132 else {
1133 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1134 if (r < 0)
1135 return r;
1136
1137 o->entry_array.next_entry_array_offset = htole64(q);
1138 }
1139
1140 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1141 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1142
1143 *idx = htole64(hidx + 1);
1144
1145 return 0;
1146 }
1147
1148 static int link_entry_into_array_plus_one(JournalFile *f,
1149 le64_t *extra,
1150 le64_t *first,
1151 le64_t *idx,
1152 uint64_t p) {
1153
1154 int r;
1155
1156 assert(f);
1157 assert(extra);
1158 assert(first);
1159 assert(idx);
1160 assert(p > 0);
1161
1162 if (*idx == 0)
1163 *extra = htole64(p);
1164 else {
1165 le64_t i;
1166
1167 i = htole64(le64toh(*idx) - 1);
1168 r = link_entry_into_array(f, first, &i, p);
1169 if (r < 0)
1170 return r;
1171 }
1172
1173 *idx = htole64(le64toh(*idx) + 1);
1174 return 0;
1175 }
1176
1177 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1178 uint64_t p;
1179 int r;
1180 assert(f);
1181 assert(o);
1182 assert(offset > 0);
1183
1184 p = le64toh(o->entry.items[i].object_offset);
1185 if (p == 0)
1186 return -EINVAL;
1187
1188 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1189 if (r < 0)
1190 return r;
1191
1192 return link_entry_into_array_plus_one(f,
1193 &o->data.entry_offset,
1194 &o->data.entry_array_offset,
1195 &o->data.n_entries,
1196 offset);
1197 }
1198
1199 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1200 uint64_t n, i;
1201 int r;
1202
1203 assert(f);
1204 assert(o);
1205 assert(offset > 0);
1206
1207 if (o->object.type != OBJECT_ENTRY)
1208 return -EINVAL;
1209
1210 __sync_synchronize();
1211
1212 /* Link up the entry itself */
1213 r = link_entry_into_array(f,
1214 &f->header->entry_array_offset,
1215 &f->header->n_entries,
1216 offset);
1217 if (r < 0)
1218 return r;
1219
1220 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1221
1222 if (f->header->head_entry_realtime == 0)
1223 f->header->head_entry_realtime = o->entry.realtime;
1224
1225 f->header->tail_entry_realtime = o->entry.realtime;
1226 f->header->tail_entry_monotonic = o->entry.monotonic;
1227
1228 f->tail_entry_monotonic_valid = true;
1229
1230 /* Link up the items */
1231 n = journal_file_entry_n_items(o);
1232 for (i = 0; i < n; i++) {
1233 r = journal_file_link_entry_item(f, o, offset, i);
1234 if (r < 0)
1235 return r;
1236 }
1237
1238 return 0;
1239 }
1240
1241 static int journal_file_append_entry_internal(
1242 JournalFile *f,
1243 const dual_timestamp *ts,
1244 uint64_t xor_hash,
1245 const EntryItem items[], unsigned n_items,
1246 uint64_t *seqnum,
1247 Object **ret, uint64_t *offset) {
1248 uint64_t np;
1249 uint64_t osize;
1250 Object *o;
1251 int r;
1252
1253 assert(f);
1254 assert(items || n_items == 0);
1255 assert(ts);
1256
1257 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1258
1259 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1260 if (r < 0)
1261 return r;
1262
1263 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1264 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1265 o->entry.realtime = htole64(ts->realtime);
1266 o->entry.monotonic = htole64(ts->monotonic);
1267 o->entry.xor_hash = htole64(xor_hash);
1268 o->entry.boot_id = f->header->boot_id;
1269
1270 #ifdef HAVE_GCRYPT
1271 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1272 if (r < 0)
1273 return r;
1274 #endif
1275
1276 r = journal_file_link_entry(f, o, np);
1277 if (r < 0)
1278 return r;
1279
1280 if (ret)
1281 *ret = o;
1282
1283 if (offset)
1284 *offset = np;
1285
1286 return 0;
1287 }
1288
1289 void journal_file_post_change(JournalFile *f) {
1290 assert(f);
1291
1292 /* inotify() does not receive IN_MODIFY events from file
1293 * accesses done via mmap(). After each access we hence
1294 * trigger IN_MODIFY by truncating the journal file to its
1295 * current size which triggers IN_MODIFY. */
1296
1297 __sync_synchronize();
1298
1299 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1300 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1301 }
1302
1303 static int entry_item_cmp(const void *_a, const void *_b) {
1304 const EntryItem *a = _a, *b = _b;
1305
1306 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1307 return -1;
1308 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1309 return 1;
1310 return 0;
1311 }
1312
1313 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1314 unsigned i;
1315 EntryItem *items;
1316 int r;
1317 uint64_t xor_hash = 0;
1318 struct dual_timestamp _ts;
1319
1320 assert(f);
1321 assert(iovec || n_iovec == 0);
1322
1323 if (!ts) {
1324 dual_timestamp_get(&_ts);
1325 ts = &_ts;
1326 }
1327
1328 if (f->tail_entry_monotonic_valid &&
1329 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1330 return -EINVAL;
1331
1332 #ifdef HAVE_GCRYPT
1333 r = journal_file_maybe_append_tag(f, ts->realtime);
1334 if (r < 0)
1335 return r;
1336 #endif
1337
1338 /* alloca() can't take 0, hence let's allocate at least one */
1339 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1340
1341 for (i = 0; i < n_iovec; i++) {
1342 uint64_t p;
1343 Object *o;
1344
1345 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1346 if (r < 0)
1347 return r;
1348
1349 xor_hash ^= le64toh(o->data.hash);
1350 items[i].object_offset = htole64(p);
1351 items[i].hash = o->data.hash;
1352 }
1353
1354 /* Order by the position on disk, in order to improve seek
1355 * times for rotating media. */
1356 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1357
1358 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1359
1360 journal_file_post_change(f);
1361
1362 return r;
1363 }
1364
/* A remembered position inside one chain of entry arrays, so that later
 * lookups on the same chain need not start from its beginning. */
typedef struct ChainCacheItem {
        /* offset of the array the chain starts with */
        uint64_t first;
        /* offset of the array this item remembers */
        uint64_t array;
        /* first item stored in the remembered array */
        uint64_t begin;
        /* number of items in all arrays of the chain preceding the remembered one */
        uint64_t total;
        /* index looked at last, used to keep bisection local */
        uint64_t last_index;
} ChainCacheItem;
1372
1373 static void chain_cache_put(
1374 OrderedHashmap *h,
1375 ChainCacheItem *ci,
1376 uint64_t first,
1377 uint64_t array,
1378 uint64_t begin,
1379 uint64_t total,
1380 uint64_t last_index) {
1381
1382 if (!ci) {
1383 /* If the chain item to cache for this chain is the
1384 * first one it's not worth caching anything */
1385 if (array == first)
1386 return;
1387
1388 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1389 ci = ordered_hashmap_steal_first(h);
1390 assert(ci);
1391 } else {
1392 ci = new(ChainCacheItem, 1);
1393 if (!ci)
1394 return;
1395 }
1396
1397 ci->first = first;
1398
1399 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1400 free(ci);
1401 return;
1402 }
1403 } else
1404 assert(ci->first == first);
1405
1406 ci->array = array;
1407 ci->begin = begin;
1408 ci->total = total;
1409 ci->last_index = last_index;
1410 }
1411
1412 static int generic_array_get(
1413 JournalFile *f,
1414 uint64_t first,
1415 uint64_t i,
1416 Object **ret, uint64_t *offset) {
1417
1418 Object *o;
1419 uint64_t p = 0, a, t = 0;
1420 int r;
1421 ChainCacheItem *ci;
1422
1423 assert(f);
1424
1425 a = first;
1426
1427 /* Try the chain cache first */
1428 ci = ordered_hashmap_get(f->chain_cache, &first);
1429 if (ci && i > ci->total) {
1430 a = ci->array;
1431 i -= ci->total;
1432 t = ci->total;
1433 }
1434
1435 while (a > 0) {
1436 uint64_t k;
1437
1438 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1439 if (r < 0)
1440 return r;
1441
1442 k = journal_file_entry_array_n_items(o);
1443 if (i < k) {
1444 p = le64toh(o->entry_array.items[i]);
1445 goto found;
1446 }
1447
1448 i -= k;
1449 t += k;
1450 a = le64toh(o->entry_array.next_entry_array_offset);
1451 }
1452
1453 return 0;
1454
1455 found:
1456 /* Let's cache this item for the next invocation */
1457 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1458
1459 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1460 if (r < 0)
1461 return r;
1462
1463 if (ret)
1464 *ret = o;
1465
1466 if (offset)
1467 *offset = p;
1468
1469 return 1;
1470 }
1471
1472 static int generic_array_get_plus_one(
1473 JournalFile *f,
1474 uint64_t extra,
1475 uint64_t first,
1476 uint64_t i,
1477 Object **ret, uint64_t *offset) {
1478
1479 Object *o;
1480
1481 assert(f);
1482
1483 if (i == 0) {
1484 int r;
1485
1486 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1487 if (r < 0)
1488 return r;
1489
1490 if (ret)
1491 *ret = o;
1492
1493 if (offset)
1494 *offset = extra;
1495
1496 return 1;
1497 }
1498
1499 return generic_array_get(f, first, i-1, ret, offset);
1500 }
1501
/* Results returned by the test_object callbacks used for bisection */
enum {
        /* the tested object matches the needle */
        TEST_FOUND,
        /* the tested object sorts before the needle */
        TEST_LEFT,
        /* the tested object sorts after the needle */
        TEST_RIGHT
};
1507
1508 static int generic_array_bisect(
1509 JournalFile *f,
1510 uint64_t first,
1511 uint64_t n,
1512 uint64_t needle,
1513 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1514 direction_t direction,
1515 Object **ret,
1516 uint64_t *offset,
1517 uint64_t *idx) {
1518
1519 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1520 bool subtract_one = false;
1521 Object *o, *array = NULL;
1522 int r;
1523 ChainCacheItem *ci;
1524
1525 assert(f);
1526 assert(test_object);
1527
1528 /* Start with the first array in the chain */
1529 a = first;
1530
1531 ci = ordered_hashmap_get(f->chain_cache, &first);
1532 if (ci && n > ci->total) {
1533 /* Ah, we have iterated this bisection array chain
1534 * previously! Let's see if we can skip ahead in the
1535 * chain, as far as the last time. But we can't jump
1536 * backwards in the chain, so let's check that
1537 * first. */
1538
1539 r = test_object(f, ci->begin, needle);
1540 if (r < 0)
1541 return r;
1542
1543 if (r == TEST_LEFT) {
1544 /* OK, what we are looking for is right of the
1545 * begin of this EntryArray, so let's jump
1546 * straight to previously cached array in the
1547 * chain */
1548
1549 a = ci->array;
1550 n -= ci->total;
1551 t = ci->total;
1552 last_index = ci->last_index;
1553 }
1554 }
1555
1556 while (a > 0) {
1557 uint64_t left, right, k, lp;
1558
1559 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1560 if (r < 0)
1561 return r;
1562
1563 k = journal_file_entry_array_n_items(array);
1564 right = MIN(k, n);
1565 if (right <= 0)
1566 return 0;
1567
1568 i = right - 1;
1569 lp = p = le64toh(array->entry_array.items[i]);
1570 if (p <= 0)
1571 return -EBADMSG;
1572
1573 r = test_object(f, p, needle);
1574 if (r < 0)
1575 return r;
1576
1577 if (r == TEST_FOUND)
1578 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1579
1580 if (r == TEST_RIGHT) {
1581 left = 0;
1582 right -= 1;
1583
1584 if (last_index != (uint64_t) -1) {
1585 assert(last_index <= right);
1586
1587 /* If we cached the last index we
1588 * looked at, let's try to not to jump
1589 * too wildly around and see if we can
1590 * limit the range to look at early to
1591 * the immediate neighbors of the last
1592 * index we looked at. */
1593
1594 if (last_index > 0) {
1595 uint64_t x = last_index - 1;
1596
1597 p = le64toh(array->entry_array.items[x]);
1598 if (p <= 0)
1599 return -EBADMSG;
1600
1601 r = test_object(f, p, needle);
1602 if (r < 0)
1603 return r;
1604
1605 if (r == TEST_FOUND)
1606 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1607
1608 if (r == TEST_RIGHT)
1609 right = x;
1610 else
1611 left = x + 1;
1612 }
1613
1614 if (last_index < right) {
1615 uint64_t y = last_index + 1;
1616
1617 p = le64toh(array->entry_array.items[y]);
1618 if (p <= 0)
1619 return -EBADMSG;
1620
1621 r = test_object(f, p, needle);
1622 if (r < 0)
1623 return r;
1624
1625 if (r == TEST_FOUND)
1626 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1627
1628 if (r == TEST_RIGHT)
1629 right = y;
1630 else
1631 left = y + 1;
1632 }
1633 }
1634
1635 for (;;) {
1636 if (left == right) {
1637 if (direction == DIRECTION_UP)
1638 subtract_one = true;
1639
1640 i = left;
1641 goto found;
1642 }
1643
1644 assert(left < right);
1645 i = (left + right) / 2;
1646
1647 p = le64toh(array->entry_array.items[i]);
1648 if (p <= 0)
1649 return -EBADMSG;
1650
1651 r = test_object(f, p, needle);
1652 if (r < 0)
1653 return r;
1654
1655 if (r == TEST_FOUND)
1656 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1657
1658 if (r == TEST_RIGHT)
1659 right = i;
1660 else
1661 left = i + 1;
1662 }
1663 }
1664
1665 if (k >= n) {
1666 if (direction == DIRECTION_UP) {
1667 i = n;
1668 subtract_one = true;
1669 goto found;
1670 }
1671
1672 return 0;
1673 }
1674
1675 last_p = lp;
1676
1677 n -= k;
1678 t += k;
1679 last_index = (uint64_t) -1;
1680 a = le64toh(array->entry_array.next_entry_array_offset);
1681 }
1682
1683 return 0;
1684
1685 found:
1686 if (subtract_one && t == 0 && i == 0)
1687 return 0;
1688
1689 /* Let's cache this item for the next invocation */
1690 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1691
1692 if (subtract_one && i == 0)
1693 p = last_p;
1694 else if (subtract_one)
1695 p = le64toh(array->entry_array.items[i-1]);
1696 else
1697 p = le64toh(array->entry_array.items[i]);
1698
1699 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1700 if (r < 0)
1701 return r;
1702
1703 if (ret)
1704 *ret = o;
1705
1706 if (offset)
1707 *offset = p;
1708
1709 if (idx)
1710 *idx = t + i + (subtract_one ? -1 : 0);
1711
1712 return 1;
1713 }
1714
1715
1716 static int generic_array_bisect_plus_one(
1717 JournalFile *f,
1718 uint64_t extra,
1719 uint64_t first,
1720 uint64_t n,
1721 uint64_t needle,
1722 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1723 direction_t direction,
1724 Object **ret,
1725 uint64_t *offset,
1726 uint64_t *idx) {
1727
1728 int r;
1729 bool step_back = false;
1730 Object *o;
1731
1732 assert(f);
1733 assert(test_object);
1734
1735 if (n <= 0)
1736 return 0;
1737
1738 /* This bisects the array in object 'first', but first checks
1739 * an extra */
1740 r = test_object(f, extra, needle);
1741 if (r < 0)
1742 return r;
1743
1744 if (r == TEST_FOUND)
1745 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1746
1747 /* if we are looking with DIRECTION_UP then we need to first
1748 see if in the actual array there is a matching entry, and
1749 return the last one of that. But if there isn't any we need
1750 to return this one. Hence remember this, and return it
1751 below. */
1752 if (r == TEST_LEFT)
1753 step_back = direction == DIRECTION_UP;
1754
1755 if (r == TEST_RIGHT) {
1756 if (direction == DIRECTION_DOWN)
1757 goto found;
1758 else
1759 return 0;
1760 }
1761
1762 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1763
1764 if (r == 0 && step_back)
1765 goto found;
1766
1767 if (r > 0 && idx)
1768 (*idx) ++;
1769
1770 return r;
1771
1772 found:
1773 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1774 if (r < 0)
1775 return r;
1776
1777 if (ret)
1778 *ret = o;
1779
1780 if (offset)
1781 *offset = extra;
1782
1783 if (idx)
1784 *idx = 0;
1785
1786 return 1;
1787 }
1788
1789 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1790 assert(f);
1791 assert(p > 0);
1792
1793 if (p == needle)
1794 return TEST_FOUND;
1795 else if (p < needle)
1796 return TEST_LEFT;
1797 else
1798 return TEST_RIGHT;
1799 }
1800
1801 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1802 Object *o;
1803 int r;
1804
1805 assert(f);
1806 assert(p > 0);
1807
1808 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1809 if (r < 0)
1810 return r;
1811
1812 if (le64toh(o->entry.seqnum) == needle)
1813 return TEST_FOUND;
1814 else if (le64toh(o->entry.seqnum) < needle)
1815 return TEST_LEFT;
1816 else
1817 return TEST_RIGHT;
1818 }
1819
1820 int journal_file_move_to_entry_by_seqnum(
1821 JournalFile *f,
1822 uint64_t seqnum,
1823 direction_t direction,
1824 Object **ret,
1825 uint64_t *offset) {
1826
1827 return generic_array_bisect(f,
1828 le64toh(f->header->entry_array_offset),
1829 le64toh(f->header->n_entries),
1830 seqnum,
1831 test_object_seqnum,
1832 direction,
1833 ret, offset, NULL);
1834 }
1835
1836 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1837 Object *o;
1838 int r;
1839
1840 assert(f);
1841 assert(p > 0);
1842
1843 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1844 if (r < 0)
1845 return r;
1846
1847 if (le64toh(o->entry.realtime) == needle)
1848 return TEST_FOUND;
1849 else if (le64toh(o->entry.realtime) < needle)
1850 return TEST_LEFT;
1851 else
1852 return TEST_RIGHT;
1853 }
1854
1855 int journal_file_move_to_entry_by_realtime(
1856 JournalFile *f,
1857 uint64_t realtime,
1858 direction_t direction,
1859 Object **ret,
1860 uint64_t *offset) {
1861
1862 return generic_array_bisect(f,
1863 le64toh(f->header->entry_array_offset),
1864 le64toh(f->header->n_entries),
1865 realtime,
1866 test_object_realtime,
1867 direction,
1868 ret, offset, NULL);
1869 }
1870
1871 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1872 Object *o;
1873 int r;
1874
1875 assert(f);
1876 assert(p > 0);
1877
1878 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1879 if (r < 0)
1880 return r;
1881
1882 if (le64toh(o->entry.monotonic) == needle)
1883 return TEST_FOUND;
1884 else if (le64toh(o->entry.monotonic) < needle)
1885 return TEST_LEFT;
1886 else
1887 return TEST_RIGHT;
1888 }
1889
1890 static inline int find_data_object_by_boot_id(
1891 JournalFile *f,
1892 sd_id128_t boot_id,
1893 Object **o,
1894 uint64_t *b) {
1895 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1896
1897 sd_id128_to_string(boot_id, t + 9);
1898 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1899 }
1900
1901 int journal_file_move_to_entry_by_monotonic(
1902 JournalFile *f,
1903 sd_id128_t boot_id,
1904 uint64_t monotonic,
1905 direction_t direction,
1906 Object **ret,
1907 uint64_t *offset) {
1908
1909 Object *o;
1910 int r;
1911
1912 assert(f);
1913
1914 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1915 if (r < 0)
1916 return r;
1917 if (r == 0)
1918 return -ENOENT;
1919
1920 return generic_array_bisect_plus_one(f,
1921 le64toh(o->data.entry_offset),
1922 le64toh(o->data.entry_array_offset),
1923 le64toh(o->data.n_entries),
1924 monotonic,
1925 test_object_monotonic,
1926 direction,
1927 ret, offset, NULL);
1928 }
1929
1930 void journal_file_reset_location(JournalFile *f) {
1931 f->location_type = LOCATION_HEAD;
1932 f->current_offset = 0;
1933 f->current_seqnum = 0;
1934 f->current_realtime = 0;
1935 f->current_monotonic = 0;
1936 zero(f->current_boot_id);
1937 f->current_xor_hash = 0;
1938 }
1939
1940 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
1941 f->last_direction = direction;
1942 f->location_type = LOCATION_SEEK;
1943 f->current_offset = offset;
1944 f->current_seqnum = le64toh(o->entry.seqnum);
1945 f->current_realtime = le64toh(o->entry.realtime);
1946 f->current_monotonic = le64toh(o->entry.monotonic);
1947 f->current_boot_id = o->entry.boot_id;
1948 f->current_xor_hash = le64toh(o->entry.xor_hash);
1949 }
1950
1951 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
1952 assert(af);
1953 assert(bf);
1954 assert(af->location_type == LOCATION_SEEK);
1955 assert(bf->location_type == LOCATION_SEEK);
1956
1957 /* If contents and timestamps match, these entries are
1958 * identical, even if the seqnum does not match */
1959 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
1960 af->current_monotonic == bf->current_monotonic &&
1961 af->current_realtime == bf->current_realtime &&
1962 af->current_xor_hash == bf->current_xor_hash)
1963 return 0;
1964
1965 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
1966
1967 /* If this is from the same seqnum source, compare
1968 * seqnums */
1969 if (af->current_seqnum < bf->current_seqnum)
1970 return -1;
1971 if (af->current_seqnum > bf->current_seqnum)
1972 return 1;
1973
1974 /* Wow! This is weird, different data but the same
1975 * seqnums? Something is borked, but let's make the
1976 * best of it and compare by time. */
1977 }
1978
1979 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
1980
1981 /* If the boot id matches, compare monotonic time */
1982 if (af->current_monotonic < bf->current_monotonic)
1983 return -1;
1984 if (af->current_monotonic > bf->current_monotonic)
1985 return 1;
1986 }
1987
1988 /* Otherwise, compare UTC time */
1989 if (af->current_realtime < bf->current_realtime)
1990 return -1;
1991 if (af->current_realtime > bf->current_realtime)
1992 return 1;
1993
1994 /* Finally, compare by contents */
1995 if (af->current_xor_hash < bf->current_xor_hash)
1996 return -1;
1997 if (af->current_xor_hash > bf->current_xor_hash)
1998 return 1;
1999
2000 return 0;
2001 }
2002
2003 int journal_file_next_entry(
2004 JournalFile *f,
2005 uint64_t p,
2006 direction_t direction,
2007 Object **ret, uint64_t *offset) {
2008
2009 uint64_t i, n, ofs;
2010 int r;
2011
2012 assert(f);
2013
2014 n = le64toh(f->header->n_entries);
2015 if (n <= 0)
2016 return 0;
2017
2018 if (p == 0)
2019 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2020 else {
2021 r = generic_array_bisect(f,
2022 le64toh(f->header->entry_array_offset),
2023 le64toh(f->header->n_entries),
2024 p,
2025 test_object_offset,
2026 DIRECTION_DOWN,
2027 NULL, NULL,
2028 &i);
2029 if (r <= 0)
2030 return r;
2031
2032 if (direction == DIRECTION_DOWN) {
2033 if (i >= n - 1)
2034 return 0;
2035
2036 i++;
2037 } else {
2038 if (i <= 0)
2039 return 0;
2040
2041 i--;
2042 }
2043 }
2044
2045 /* And jump to it */
2046 r = generic_array_get(f,
2047 le64toh(f->header->entry_array_offset),
2048 i,
2049 ret, &ofs);
2050 if (r <= 0)
2051 return r;
2052
2053 if (p > 0 &&
2054 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2055 log_debug("%s: entry array corrupted at entry %"PRIu64,
2056 f->path, i);
2057 return -EBADMSG;
2058 }
2059
2060 if (offset)
2061 *offset = ofs;
2062
2063 return 1;
2064 }
2065
2066 int journal_file_next_entry_for_data(
2067 JournalFile *f,
2068 Object *o, uint64_t p,
2069 uint64_t data_offset,
2070 direction_t direction,
2071 Object **ret, uint64_t *offset) {
2072
2073 uint64_t n, i;
2074 int r;
2075 Object *d;
2076
2077 assert(f);
2078 assert(p > 0 || !o);
2079
2080 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2081 if (r < 0)
2082 return r;
2083
2084 n = le64toh(d->data.n_entries);
2085 if (n <= 0)
2086 return n;
2087
2088 if (!o)
2089 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2090 else {
2091 if (o->object.type != OBJECT_ENTRY)
2092 return -EINVAL;
2093
2094 r = generic_array_bisect_plus_one(f,
2095 le64toh(d->data.entry_offset),
2096 le64toh(d->data.entry_array_offset),
2097 le64toh(d->data.n_entries),
2098 p,
2099 test_object_offset,
2100 DIRECTION_DOWN,
2101 NULL, NULL,
2102 &i);
2103
2104 if (r <= 0)
2105 return r;
2106
2107 if (direction == DIRECTION_DOWN) {
2108 if (i >= n - 1)
2109 return 0;
2110
2111 i++;
2112 } else {
2113 if (i <= 0)
2114 return 0;
2115
2116 i--;
2117 }
2118
2119 }
2120
2121 return generic_array_get_plus_one(f,
2122 le64toh(d->data.entry_offset),
2123 le64toh(d->data.entry_array_offset),
2124 i,
2125 ret, offset);
2126 }
2127
2128 int journal_file_move_to_entry_by_offset_for_data(
2129 JournalFile *f,
2130 uint64_t data_offset,
2131 uint64_t p,
2132 direction_t direction,
2133 Object **ret, uint64_t *offset) {
2134
2135 int r;
2136 Object *d;
2137
2138 assert(f);
2139
2140 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2141 if (r < 0)
2142 return r;
2143
2144 return generic_array_bisect_plus_one(f,
2145 le64toh(d->data.entry_offset),
2146 le64toh(d->data.entry_array_offset),
2147 le64toh(d->data.n_entries),
2148 p,
2149 test_object_offset,
2150 direction,
2151 ret, offset, NULL);
2152 }
2153
2154 int journal_file_move_to_entry_by_monotonic_for_data(
2155 JournalFile *f,
2156 uint64_t data_offset,
2157 sd_id128_t boot_id,
2158 uint64_t monotonic,
2159 direction_t direction,
2160 Object **ret, uint64_t *offset) {
2161
2162 Object *o, *d;
2163 int r;
2164 uint64_t b, z;
2165
2166 assert(f);
2167
2168 /* First, seek by time */
2169 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2170 if (r < 0)
2171 return r;
2172 if (r == 0)
2173 return -ENOENT;
2174
2175 r = generic_array_bisect_plus_one(f,
2176 le64toh(o->data.entry_offset),
2177 le64toh(o->data.entry_array_offset),
2178 le64toh(o->data.n_entries),
2179 monotonic,
2180 test_object_monotonic,
2181 direction,
2182 NULL, &z, NULL);
2183 if (r <= 0)
2184 return r;
2185
2186 /* And now, continue seeking until we find an entry that
2187 * exists in both bisection arrays */
2188
2189 for (;;) {
2190 Object *qo;
2191 uint64_t p, q;
2192
2193 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2194 if (r < 0)
2195 return r;
2196
2197 r = generic_array_bisect_plus_one(f,
2198 le64toh(d->data.entry_offset),
2199 le64toh(d->data.entry_array_offset),
2200 le64toh(d->data.n_entries),
2201 z,
2202 test_object_offset,
2203 direction,
2204 NULL, &p, NULL);
2205 if (r <= 0)
2206 return r;
2207
2208 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2209 if (r < 0)
2210 return r;
2211
2212 r = generic_array_bisect_plus_one(f,
2213 le64toh(o->data.entry_offset),
2214 le64toh(o->data.entry_array_offset),
2215 le64toh(o->data.n_entries),
2216 p,
2217 test_object_offset,
2218 direction,
2219 &qo, &q, NULL);
2220
2221 if (r <= 0)
2222 return r;
2223
2224 if (p == q) {
2225 if (ret)
2226 *ret = qo;
2227 if (offset)
2228 *offset = q;
2229
2230 return 1;
2231 }
2232
2233 z = q;
2234 }
2235 }
2236
2237 int journal_file_move_to_entry_by_seqnum_for_data(
2238 JournalFile *f,
2239 uint64_t data_offset,
2240 uint64_t seqnum,
2241 direction_t direction,
2242 Object **ret, uint64_t *offset) {
2243
2244 Object *d;
2245 int r;
2246
2247 assert(f);
2248
2249 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2250 if (r < 0)
2251 return r;
2252
2253 return generic_array_bisect_plus_one(f,
2254 le64toh(d->data.entry_offset),
2255 le64toh(d->data.entry_array_offset),
2256 le64toh(d->data.n_entries),
2257 seqnum,
2258 test_object_seqnum,
2259 direction,
2260 ret, offset, NULL);
2261 }
2262
2263 int journal_file_move_to_entry_by_realtime_for_data(
2264 JournalFile *f,
2265 uint64_t data_offset,
2266 uint64_t realtime,
2267 direction_t direction,
2268 Object **ret, uint64_t *offset) {
2269
2270 Object *d;
2271 int r;
2272
2273 assert(f);
2274
2275 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2276 if (r < 0)
2277 return r;
2278
2279 return generic_array_bisect_plus_one(f,
2280 le64toh(d->data.entry_offset),
2281 le64toh(d->data.entry_array_offset),
2282 le64toh(d->data.n_entries),
2283 realtime,
2284 test_object_realtime,
2285 direction,
2286 ret, offset, NULL);
2287 }
2288
2289 void journal_file_dump(JournalFile *f) {
2290 Object *o;
2291 int r;
2292 uint64_t p;
2293
2294 assert(f);
2295
2296 journal_file_print_header(f);
2297
2298 p = le64toh(f->header->header_size);
2299 while (p != 0) {
2300 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2301 if (r < 0)
2302 goto fail;
2303
2304 switch (o->object.type) {
2305
2306 case OBJECT_UNUSED:
2307 printf("Type: OBJECT_UNUSED\n");
2308 break;
2309
2310 case OBJECT_DATA:
2311 printf("Type: OBJECT_DATA\n");
2312 break;
2313
2314 case OBJECT_FIELD:
2315 printf("Type: OBJECT_FIELD\n");
2316 break;
2317
2318 case OBJECT_ENTRY:
2319 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2320 le64toh(o->entry.seqnum),
2321 le64toh(o->entry.monotonic),
2322 le64toh(o->entry.realtime));
2323 break;
2324
2325 case OBJECT_FIELD_HASH_TABLE:
2326 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2327 break;
2328
2329 case OBJECT_DATA_HASH_TABLE:
2330 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2331 break;
2332
2333 case OBJECT_ENTRY_ARRAY:
2334 printf("Type: OBJECT_ENTRY_ARRAY\n");
2335 break;
2336
2337 case OBJECT_TAG:
2338 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2339 le64toh(o->tag.seqnum),
2340 le64toh(o->tag.epoch));
2341 break;
2342
2343 default:
2344 printf("Type: unknown (%u)\n", o->object.type);
2345 break;
2346 }
2347
2348 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2349 printf("Flags: %s\n",
2350 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2351
2352 if (p == le64toh(f->header->tail_object_offset))
2353 p = 0;
2354 else
2355 p = p + ALIGN64(le64toh(o->object.size));
2356 }
2357
2358 return;
2359 fail:
2360 log_error("File corrupt");
2361 }
2362
2363 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2364 const char *x;
2365
2366 x = format_timestamp(buf, l, t);
2367 if (x)
2368 return x;
2369 return " --- ";
2370 }
2371
2372 void journal_file_print_header(JournalFile *f) {
2373 char a[33], b[33], c[33], d[33];
2374 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2375 struct stat st;
2376 char bytes[FORMAT_BYTES_MAX];
2377
2378 assert(f);
2379
2380 printf("File Path: %s\n"
2381 "File ID: %s\n"
2382 "Machine ID: %s\n"
2383 "Boot ID: %s\n"
2384 "Sequential Number ID: %s\n"
2385 "State: %s\n"
2386 "Compatible Flags:%s%s\n"
2387 "Incompatible Flags:%s%s%s\n"
2388 "Header size: %"PRIu64"\n"
2389 "Arena size: %"PRIu64"\n"
2390 "Data Hash Table Size: %"PRIu64"\n"
2391 "Field Hash Table Size: %"PRIu64"\n"
2392 "Rotate Suggested: %s\n"
2393 "Head Sequential Number: %"PRIu64"\n"
2394 "Tail Sequential Number: %"PRIu64"\n"
2395 "Head Realtime Timestamp: %s\n"
2396 "Tail Realtime Timestamp: %s\n"
2397 "Tail Monotonic Timestamp: %s\n"
2398 "Objects: %"PRIu64"\n"
2399 "Entry Objects: %"PRIu64"\n",
2400 f->path,
2401 sd_id128_to_string(f->header->file_id, a),
2402 sd_id128_to_string(f->header->machine_id, b),
2403 sd_id128_to_string(f->header->boot_id, c),
2404 sd_id128_to_string(f->header->seqnum_id, d),
2405 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2406 f->header->state == STATE_ONLINE ? "ONLINE" :
2407 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2408 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2409 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2410 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2411 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2412 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2413 le64toh(f->header->header_size),
2414 le64toh(f->header->arena_size),
2415 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2416 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2417 yes_no(journal_file_rotate_suggested(f, 0)),
2418 le64toh(f->header->head_entry_seqnum),
2419 le64toh(f->header->tail_entry_seqnum),
2420 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2421 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2422 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2423 le64toh(f->header->n_objects),
2424 le64toh(f->header->n_entries));
2425
2426 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2427 printf("Data Objects: %"PRIu64"\n"
2428 "Data Hash Table Fill: %.1f%%\n",
2429 le64toh(f->header->n_data),
2430 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2431
2432 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2433 printf("Field Objects: %"PRIu64"\n"
2434 "Field Hash Table Fill: %.1f%%\n",
2435 le64toh(f->header->n_fields),
2436 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2437
2438 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2439 printf("Tag Objects: %"PRIu64"\n",
2440 le64toh(f->header->n_tags));
2441 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2442 printf("Entry Array Objects: %"PRIu64"\n",
2443 le64toh(f->header->n_entry_arrays));
2444
2445 if (fstat(f->fd, &st) >= 0)
2446 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2447 }
2448
2449 int journal_file_open(
2450 const char *fname,
2451 int flags,
2452 mode_t mode,
2453 bool compress,
2454 bool seal,
2455 JournalMetrics *metrics,
2456 MMapCache *mmap_cache,
2457 JournalFile *template,
2458 JournalFile **ret) {
2459
2460 JournalFile *f;
2461 int r;
2462 bool newly_created = false;
2463
2464 assert(fname);
2465 assert(ret);
2466
2467 if ((flags & O_ACCMODE) != O_RDONLY &&
2468 (flags & O_ACCMODE) != O_RDWR)
2469 return -EINVAL;
2470
2471 if (!endswith(fname, ".journal") &&
2472 !endswith(fname, ".journal~"))
2473 return -EINVAL;
2474
2475 f = new0(JournalFile, 1);
2476 if (!f)
2477 return -ENOMEM;
2478
2479 f->fd = -1;
2480 f->mode = mode;
2481
2482 f->flags = flags;
2483 f->prot = prot_from_flags(flags);
2484 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2485 #if defined(HAVE_LZ4)
2486 f->compress_lz4 = compress;
2487 #elif defined(HAVE_XZ)
2488 f->compress_xz = compress;
2489 #endif
2490 #ifdef HAVE_GCRYPT
2491 f->seal = seal;
2492 #endif
2493
2494 if (mmap_cache)
2495 f->mmap = mmap_cache_ref(mmap_cache);
2496 else {
2497 f->mmap = mmap_cache_new();
2498 if (!f->mmap) {
2499 r = -ENOMEM;
2500 goto fail;
2501 }
2502 }
2503
2504 f->path = strdup(fname);
2505 if (!f->path) {
2506 r = -ENOMEM;
2507 goto fail;
2508 }
2509
2510 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2511 if (!f->chain_cache) {
2512 r = -ENOMEM;
2513 goto fail;
2514 }
2515
2516 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2517 if (f->fd < 0) {
2518 r = -errno;
2519 goto fail;
2520 }
2521
2522 if (fstat(f->fd, &f->last_stat) < 0) {
2523 r = -errno;
2524 goto fail;
2525 }
2526
2527 if (f->last_stat.st_size == 0 && f->writable) {
2528 /* Let's attach the creation time to the journal file,
2529 * so that the vacuuming code knows the age of this
2530 * file even if the file might end up corrupted one
2531 * day... Ideally we'd just use the creation time many
2532 * file systems maintain for each file, but there is
2533 * currently no usable API to query this, hence let's
2534 * emulate this via extended attributes. If extended
2535 * attributes are not supported we'll just skip this,
2536 * and rely solely on mtime/atime/ctime of the file. */
2537
2538 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2539
2540 #ifdef HAVE_GCRYPT
2541 /* Try to load the FSPRG state, and if we can't, then
2542 * just don't do sealing */
2543 if (f->seal) {
2544 r = journal_file_fss_load(f);
2545 if (r < 0)
2546 f->seal = false;
2547 }
2548 #endif
2549
2550 r = journal_file_init_header(f, template);
2551 if (r < 0)
2552 goto fail;
2553
2554 if (fstat(f->fd, &f->last_stat) < 0) {
2555 r = -errno;
2556 goto fail;
2557 }
2558
2559 newly_created = true;
2560 }
2561
2562 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2563 r = -EIO;
2564 goto fail;
2565 }
2566
2567 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2568 if (f->header == MAP_FAILED) {
2569 f->header = NULL;
2570 r = -errno;
2571 goto fail;
2572 }
2573
2574 if (!newly_created) {
2575 r = journal_file_verify_header(f);
2576 if (r < 0)
2577 goto fail;
2578 }
2579
2580 #ifdef HAVE_GCRYPT
2581 if (!newly_created && f->writable) {
2582 r = journal_file_fss_load(f);
2583 if (r < 0)
2584 goto fail;
2585 }
2586 #endif
2587
2588 if (f->writable) {
2589 if (metrics) {
2590 journal_default_metrics(metrics, f->fd);
2591 f->metrics = *metrics;
2592 } else if (template)
2593 f->metrics = template->metrics;
2594
2595 r = journal_file_refresh_header(f);
2596 if (r < 0)
2597 goto fail;
2598 }
2599
2600 #ifdef HAVE_GCRYPT
2601 r = journal_file_hmac_setup(f);
2602 if (r < 0)
2603 goto fail;
2604 #endif
2605
2606 if (newly_created) {
2607 r = journal_file_setup_field_hash_table(f);
2608 if (r < 0)
2609 goto fail;
2610
2611 r = journal_file_setup_data_hash_table(f);
2612 if (r < 0)
2613 goto fail;
2614
2615 #ifdef HAVE_GCRYPT
2616 r = journal_file_append_first_tag(f);
2617 if (r < 0)
2618 goto fail;
2619 #endif
2620 }
2621
2622 r = journal_file_map_field_hash_table(f);
2623 if (r < 0)
2624 goto fail;
2625
2626 r = journal_file_map_data_hash_table(f);
2627 if (r < 0)
2628 goto fail;
2629
2630 *ret = f;
2631 return 0;
2632
2633 fail:
2634 journal_file_close(f);
2635
2636 return r;
2637 }
2638
2639 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2640 _cleanup_free_ char *p = NULL;
2641 size_t l;
2642 JournalFile *old_file, *new_file = NULL;
2643 int r;
2644
2645 assert(f);
2646 assert(*f);
2647
2648 old_file = *f;
2649
2650 if (!old_file->writable)
2651 return -EINVAL;
2652
2653 if (!endswith(old_file->path, ".journal"))
2654 return -EINVAL;
2655
2656 l = strlen(old_file->path);
2657 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2658 (int) l - 8, old_file->path,
2659 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2660 le64toh((*f)->header->head_entry_seqnum),
2661 le64toh((*f)->header->head_entry_realtime));
2662 if (r < 0)
2663 return -ENOMEM;
2664
2665 r = rename(old_file->path, p);
2666 if (r < 0)
2667 return -errno;
2668
2669 old_file->header->state = STATE_ARCHIVED;
2670
2671 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2672 journal_file_close(old_file);
2673
2674 *f = new_file;
2675 return r;
2676 }
2677
2678 int journal_file_open_reliably(
2679 const char *fname,
2680 int flags,
2681 mode_t mode,
2682 bool compress,
2683 bool seal,
2684 JournalMetrics *metrics,
2685 MMapCache *mmap_cache,
2686 JournalFile *template,
2687 JournalFile **ret) {
2688
2689 int r;
2690 size_t l;
2691 _cleanup_free_ char *p = NULL;
2692
2693 r = journal_file_open(fname, flags, mode, compress, seal,
2694 metrics, mmap_cache, template, ret);
2695 if (r != -EBADMSG && /* corrupted */
2696 r != -ENODATA && /* truncated */
2697 r != -EHOSTDOWN && /* other machine */
2698 r != -EPROTONOSUPPORT && /* incompatible feature */
2699 r != -EBUSY && /* unclean shutdown */
2700 r != -ESHUTDOWN /* already archived */)
2701 return r;
2702
2703 if ((flags & O_ACCMODE) == O_RDONLY)
2704 return r;
2705
2706 if (!(flags & O_CREAT))
2707 return r;
2708
2709 if (!endswith(fname, ".journal"))
2710 return r;
2711
2712 /* The file is corrupted. Rotate it away and try it again (but only once) */
2713
2714 l = strlen(fname);
2715 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2716 (int) l - 8, fname,
2717 (unsigned long long) now(CLOCK_REALTIME),
2718 random_u64()) < 0)
2719 return -ENOMEM;
2720
2721 r = rename(fname, p);
2722 if (r < 0)
2723 return -errno;
2724
2725 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2726
2727 return journal_file_open(fname, flags, mode, compress, seal,
2728 metrics, mmap_cache, template, ret);
2729 }
2730
2731 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2732 uint64_t i, n;
2733 uint64_t q, xor_hash = 0;
2734 int r;
2735 EntryItem *items;
2736 dual_timestamp ts;
2737
2738 assert(from);
2739 assert(to);
2740 assert(o);
2741 assert(p);
2742
2743 if (!to->writable)
2744 return -EPERM;
2745
2746 ts.monotonic = le64toh(o->entry.monotonic);
2747 ts.realtime = le64toh(o->entry.realtime);
2748
2749 n = journal_file_entry_n_items(o);
2750 /* alloca() can't take 0, hence let's allocate at least one */
2751 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2752
2753 for (i = 0; i < n; i++) {
2754 uint64_t l, h;
2755 le64_t le_hash;
2756 size_t t;
2757 void *data;
2758 Object *u;
2759
2760 q = le64toh(o->entry.items[i].object_offset);
2761 le_hash = o->entry.items[i].hash;
2762
2763 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2764 if (r < 0)
2765 return r;
2766
2767 if (le_hash != o->data.hash)
2768 return -EBADMSG;
2769
2770 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2771 t = (size_t) l;
2772
2773 /* We hit the limit on 32bit machines */
2774 if ((uint64_t) t != l)
2775 return -E2BIG;
2776
2777 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2778 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2779 size_t rsize;
2780
2781 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2782 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2783 if (r < 0)
2784 return r;
2785
2786 data = from->compress_buffer;
2787 l = rsize;
2788 #else
2789 return -EPROTONOSUPPORT;
2790 #endif
2791 } else
2792 data = o->data.payload;
2793
2794 r = journal_file_append_data(to, data, l, &u, &h);
2795 if (r < 0)
2796 return r;
2797
2798 xor_hash ^= le64toh(u->data.hash);
2799 items[i].object_offset = htole64(h);
2800 items[i].hash = u->data.hash;
2801
2802 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2803 if (r < 0)
2804 return r;
2805 }
2806
2807 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2808 }
2809
2810 void journal_default_metrics(JournalMetrics *m, int fd) {
2811 uint64_t fs_size = 0;
2812 struct statvfs ss;
2813 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2814
2815 assert(m);
2816 assert(fd >= 0);
2817
2818 if (fstatvfs(fd, &ss) >= 0)
2819 fs_size = ss.f_frsize * ss.f_blocks;
2820
2821 if (m->max_use == (uint64_t) -1) {
2822
2823 if (fs_size > 0) {
2824 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2825
2826 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2827 m->max_use = DEFAULT_MAX_USE_UPPER;
2828
2829 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2830 m->max_use = DEFAULT_MAX_USE_LOWER;
2831 } else
2832 m->max_use = DEFAULT_MAX_USE_LOWER;
2833 } else {
2834 m->max_use = PAGE_ALIGN(m->max_use);
2835
2836 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2837 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2838 }
2839
2840 if (m->max_size == (uint64_t) -1) {
2841 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2842
2843 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2844 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2845 } else
2846 m->max_size = PAGE_ALIGN(m->max_size);
2847
2848 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2849 m->max_size = JOURNAL_FILE_SIZE_MIN;
2850
2851 if (m->max_size*2 > m->max_use)
2852 m->max_use = m->max_size*2;
2853
2854 if (m->min_size == (uint64_t) -1)
2855 m->min_size = JOURNAL_FILE_SIZE_MIN;
2856 else {
2857 m->min_size = PAGE_ALIGN(m->min_size);
2858
2859 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2860 m->min_size = JOURNAL_FILE_SIZE_MIN;
2861
2862 if (m->min_size > m->max_size)
2863 m->max_size = m->min_size;
2864 }
2865
2866 if (m->keep_free == (uint64_t) -1) {
2867
2868 if (fs_size > 0) {
2869 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2870
2871 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2872 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2873
2874 } else
2875 m->keep_free = DEFAULT_KEEP_FREE;
2876 }
2877
2878 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2879 format_bytes(a, sizeof(a), m->max_use),
2880 format_bytes(b, sizeof(b), m->max_size),
2881 format_bytes(c, sizeof(c), m->min_size),
2882 format_bytes(d, sizeof(d), m->keep_free));
2883 }
2884
2885 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2886 assert(f);
2887 assert(from || to);
2888
2889 if (from) {
2890 if (f->header->head_entry_realtime == 0)
2891 return -ENOENT;
2892
2893 *from = le64toh(f->header->head_entry_realtime);
2894 }
2895
2896 if (to) {
2897 if (f->header->tail_entry_realtime == 0)
2898 return -ENOENT;
2899
2900 *to = le64toh(f->header->tail_entry_realtime);
2901 }
2902
2903 return 1;
2904 }
2905
2906 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2907 Object *o;
2908 uint64_t p;
2909 int r;
2910
2911 assert(f);
2912 assert(from || to);
2913
2914 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2915 if (r <= 0)
2916 return r;
2917
2918 if (le64toh(o->data.n_entries) <= 0)
2919 return 0;
2920
2921 if (from) {
2922 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2923 if (r < 0)
2924 return r;
2925
2926 *from = le64toh(o->entry.monotonic);
2927 }
2928
2929 if (to) {
2930 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2931 if (r < 0)
2932 return r;
2933
2934 r = generic_array_get_plus_one(f,
2935 le64toh(o->data.entry_offset),
2936 le64toh(o->data.entry_array_offset),
2937 le64toh(o->data.n_entries)-1,
2938 &o, NULL);
2939 if (r <= 0)
2940 return r;
2941
2942 *to = le64toh(o->entry.monotonic);
2943 }
2944
2945 return 1;
2946 }
2947
2948 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2949 assert(f);
2950
2951 /* If we gained new header fields we gained new features,
2952 * hence suggest a rotation */
2953 if (le64toh(f->header->header_size) < sizeof(Header)) {
2954 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2955 return true;
2956 }
2957
2958 /* Let's check if the hash tables grew over a certain fill
2959 * level (75%, borrowing this value from Java's hash table
2960 * implementation), and if so suggest a rotation. To calculate
2961 * the fill level we need the n_data field, which only exists
2962 * in newer versions. */
2963
2964 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2965 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2966 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2967 f->path,
2968 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2969 le64toh(f->header->n_data),
2970 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2971 (unsigned long long) f->last_stat.st_size,
2972 f->last_stat.st_size / le64toh(f->header->n_data));
2973 return true;
2974 }
2975
2976 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2977 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2978 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2979 f->path,
2980 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2981 le64toh(f->header->n_fields),
2982 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2983 return true;
2984 }
2985
2986 /* Are the data objects properly indexed by field objects? */
2987 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2988 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2989 le64toh(f->header->n_data) > 0 &&
2990 le64toh(f->header->n_fields) == 0)
2991 return true;
2992
2993 if (max_file_usec > 0) {
2994 usec_t t, h;
2995
2996 h = le64toh(f->header->head_entry_realtime);
2997 t = now(CLOCK_REALTIME);
2998
2999 if (h > 0 && t > h + max_file_usec)
3000 return true;
3001 }
3002
3003 return false;
3004 }