]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
journalctl: print monotonic timestamp in --header
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
/* Sizes in bytes, expressed as a number of HashItem slots: the lower
 * bound used for the data hash table, and the fixed size used for the
 * field hash table */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for
 * compression */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file
 * system size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */

/* n_data was the first entry we added after the initial file format
 * design, hence everything before it must be present in any valid
 * header */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20
70
71 int journal_file_set_online(JournalFile *f) {
72 assert(f);
73
74 if (!f->writable)
75 return -EPERM;
76
77 if (!(f->fd >= 0 && f->header))
78 return -EINVAL;
79
80 switch(f->header->state) {
81 case STATE_ONLINE:
82 return 0;
83
84 case STATE_OFFLINE:
85 f->header->state = STATE_ONLINE;
86 fsync(f->fd);
87 return 0;
88
89 default:
90 return -EINVAL;
91 }
92 }
93
94 int journal_file_set_offline(JournalFile *f) {
95 assert(f);
96
97 if (!f->writable)
98 return -EPERM;
99
100 if (!(f->fd >= 0 && f->header))
101 return -EINVAL;
102
103 if (f->header->state != STATE_ONLINE)
104 return 0;
105
106 fsync(f->fd);
107
108 f->header->state = STATE_OFFLINE;
109
110 fsync(f->fd);
111
112 return 0;
113 }
114
/* Tears down a JournalFile: appends a final seal tag (sealed, writable
 * files only), marks the file offline, drops the header mapping, the
 * file descriptor, the mmap cache reference and all buffers hanging
 * off the object, and finally frees the object itself. The pointer
 * must not be used after this call. */
void journal_file_close(JournalFile *f) {
        assert(f);

#ifdef HAVE_GCRYPT
        /* Write the final tag */
        if (f->seal && f->writable)
                journal_file_append_tag(f);
#endif

        /* Sync everything to disk, before we mark the file offline */
        if (f->mmap && f->fd >= 0)
                mmap_cache_close_fd(f->mmap, f->fd);

        /* The result is deliberately ignored: for files that are not
         * writable or not fully set up this simply fails and changes
         * nothing */
        journal_file_set_offline(f);

        if (f->header)
                munmap(f->header, PAGE_ALIGN(sizeof(Header)));

        if (f->fd >= 0)
                close_nointr_nofail(f->fd);

        free(f->path);

        if (f->mmap)
                mmap_cache_unref(f->mmap);

        hashmap_free_free(f->chain_cache);

#ifdef HAVE_XZ
        free(f->compress_buffer);
#endif

#ifdef HAVE_GCRYPT
        /* NOTE(review): fsprg_state is only freed when no fss_file
         * mapping exists -- presumably it points into that mapping
         * otherwise; confirm against the code that sets it up */
        if (f->fss_file)
                munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
        else if (f->fsprg_state)
                free(f->fsprg_state);

        free(f->fsprg_seed);

        if (f->hmac)
                gcry_md_close(f->hmac);
#endif

        free(f);
}
161
162 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
163 Header h;
164 ssize_t k;
165 int r;
166
167 assert(f);
168
169 zero(h);
170 memcpy(h.signature, HEADER_SIGNATURE, 8);
171 h.header_size = htole64(ALIGN64(sizeof(h)));
172
173 h.incompatible_flags =
174 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
175
176 h.compatible_flags =
177 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
178
179 r = sd_id128_randomize(&h.file_id);
180 if (r < 0)
181 return r;
182
183 if (template) {
184 h.seqnum_id = template->header->seqnum_id;
185 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
186 } else
187 h.seqnum_id = h.file_id;
188
189 k = pwrite(f->fd, &h, sizeof(h), 0);
190 if (k < 0)
191 return -errno;
192
193 if (k != sizeof(h))
194 return -EIO;
195
196 return 0;
197 }
198
199 static int journal_file_refresh_header(JournalFile *f) {
200 int r;
201 sd_id128_t boot_id;
202
203 assert(f);
204
205 r = sd_id128_get_machine(&f->header->machine_id);
206 if (r < 0)
207 return r;
208
209 r = sd_id128_get_boot(&boot_id);
210 if (r < 0)
211 return r;
212
213 if (sd_id128_equal(boot_id, f->header->boot_id))
214 f->tail_entry_monotonic_valid = true;
215
216 f->header->boot_id = boot_id;
217
218 journal_file_set_online(f);
219
220 /* Sync the online state to disk */
221 msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
222 fdatasync(f->fd);
223
224 return 0;
225 }
226
227 static int journal_file_verify_header(JournalFile *f) {
228 assert(f);
229
230 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
231 return -EBADMSG;
232
233 /* In both read and write mode we refuse to open files with
234 * incompatible flags we don't know */
235 #ifdef HAVE_XZ
236 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
237 return -EPROTONOSUPPORT;
238 #else
239 if (f->header->incompatible_flags != 0)
240 return -EPROTONOSUPPORT;
241 #endif
242
243 /* When open for writing we refuse to open files with
244 * compatible flags, too */
245 if (f->writable) {
246 #ifdef HAVE_GCRYPT
247 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
248 return -EPROTONOSUPPORT;
249 #else
250 if (f->header->compatible_flags != 0)
251 return -EPROTONOSUPPORT;
252 #endif
253 }
254
255 if (f->header->state >= _STATE_MAX)
256 return -EBADMSG;
257
258 /* The first addition was n_data, so check that we are at least this large */
259 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
260 return -EBADMSG;
261
262 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
263 return -EBADMSG;
264
265 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
266 return -ENODATA;
267
268 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
269 return -ENODATA;
270
271 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
272 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
273 !VALID64(le64toh(f->header->tail_object_offset)) ||
274 !VALID64(le64toh(f->header->entry_array_offset)))
275 return -ENODATA;
276
277 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
278 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
279 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
280 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
281 return -ENODATA;
282
283 if (f->writable) {
284 uint8_t state;
285 sd_id128_t machine_id;
286 int r;
287
288 r = sd_id128_get_machine(&machine_id);
289 if (r < 0)
290 return r;
291
292 if (!sd_id128_equal(machine_id, f->header->machine_id))
293 return -EHOSTDOWN;
294
295 state = f->header->state;
296
297 if (state == STATE_ONLINE) {
298 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
299 return -EBUSY;
300 } else if (state == STATE_ARCHIVED)
301 return -ESHUTDOWN;
302 else if (state != STATE_OFFLINE) {
303 log_debug("Journal file %s has unknown state %u.", f->path, state);
304 return -EBUSY;
305 }
306 }
307
308 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
309
310 f->seal = JOURNAL_HEADER_SEALED(f->header);
311
312 return 0;
313 }
314
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316 uint64_t old_size, new_size;
317 int r;
318
319 assert(f);
320
321 /* We assume that this file is not sparse, and we know that
322 * for sure, since we always call posix_fallocate()
323 * ourselves */
324
325 old_size =
326 le64toh(f->header->header_size) +
327 le64toh(f->header->arena_size);
328
329 new_size = PAGE_ALIGN(offset + size);
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
332
333 if (new_size <= old_size)
334 return 0;
335
336 if (f->metrics.max_size > 0 &&
337 new_size > f->metrics.max_size)
338 return -E2BIG;
339
340 if (new_size > f->metrics.min_size &&
341 f->metrics.keep_free > 0) {
342 struct statvfs svfs;
343
344 if (fstatvfs(f->fd, &svfs) >= 0) {
345 uint64_t available;
346
347 available = svfs.f_bfree * svfs.f_bsize;
348
349 if (available >= f->metrics.keep_free)
350 available -= f->metrics.keep_free;
351 else
352 available = 0;
353
354 if (new_size - old_size > available)
355 return -E2BIG;
356 }
357 }
358
359 /* Note that the glibc fallocate() fallback is very
360 inefficient, hence we try to minimize the allocation area
361 as we can. */
362 r = posix_fallocate(f->fd, old_size, new_size - old_size);
363 if (r != 0)
364 return -r;
365
366 if (fstat(f->fd, &f->last_stat) < 0)
367 return -errno;
368
369 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
370
371 return 0;
372 }
373
374 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
375 assert(f);
376 assert(ret);
377
378 if (size <= 0)
379 return -EINVAL;
380
381 /* Avoid SIGBUS on invalid accesses */
382 if (offset + size > (uint64_t) f->last_stat.st_size) {
383 /* Hmm, out of range? Let's refresh the fstat() data
384 * first, before we trust that check. */
385
386 if (fstat(f->fd, &f->last_stat) < 0 ||
387 offset + size > (uint64_t) f->last_stat.st_size)
388 return -EADDRNOTAVAIL;
389 }
390
391 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
392 }
393
394 static uint64_t minimum_header_size(Object *o) {
395
396 static uint64_t table[] = {
397 [OBJECT_DATA] = sizeof(DataObject),
398 [OBJECT_FIELD] = sizeof(FieldObject),
399 [OBJECT_ENTRY] = sizeof(EntryObject),
400 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
401 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
402 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
403 [OBJECT_TAG] = sizeof(TagObject),
404 };
405
406 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
407 return sizeof(ObjectHeader);
408
409 return table[o->object.type];
410 }
411
412 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
413 int r;
414 void *t;
415 Object *o;
416 uint64_t s;
417 unsigned context;
418
419 assert(f);
420 assert(ret);
421
422 /* Objects may only be located at multiple of 64 bit */
423 if (!VALID64(offset))
424 return -EFAULT;
425
426 /* One context for each type, plus one catch-all for the rest */
427 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
428
429 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
430 if (r < 0)
431 return r;
432
433 o = (Object*) t;
434 s = le64toh(o->object.size);
435
436 if (s < sizeof(ObjectHeader))
437 return -EBADMSG;
438
439 if (o->object.type <= OBJECT_UNUSED)
440 return -EBADMSG;
441
442 if (s < minimum_header_size(o))
443 return -EBADMSG;
444
445 if (type > 0 && o->object.type != type)
446 return -EBADMSG;
447
448 if (s > sizeof(ObjectHeader)) {
449 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
450 if (r < 0)
451 return r;
452
453 o = (Object*) t;
454 }
455
456 *ret = o;
457 return 0;
458 }
459
460 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
461 uint64_t r;
462
463 assert(f);
464
465 r = le64toh(f->header->tail_entry_seqnum) + 1;
466
467 if (seqnum) {
468 /* If an external seqnum counter was passed, we update
469 * both the local and the external one, and set it to
470 * the maximum of both */
471
472 if (*seqnum + 1 > r)
473 r = *seqnum + 1;
474
475 *seqnum = r;
476 }
477
478 f->header->tail_entry_seqnum = htole64(r);
479
480 if (f->header->head_entry_seqnum == 0)
481 f->header->head_entry_seqnum = htole64(r);
482
483 return r;
484 }
485
486 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
487 int r;
488 uint64_t p;
489 Object *tail, *o;
490 void *t;
491
492 assert(f);
493 assert(type > 0 && type < _OBJECT_TYPE_MAX);
494 assert(size >= sizeof(ObjectHeader));
495 assert(offset);
496 assert(ret);
497
498 r = journal_file_set_online(f);
499 if (r < 0)
500 return r;
501
502 p = le64toh(f->header->tail_object_offset);
503 if (p == 0)
504 p = le64toh(f->header->header_size);
505 else {
506 r = journal_file_move_to_object(f, -1, p, &tail);
507 if (r < 0)
508 return r;
509
510 p += ALIGN64(le64toh(tail->object.size));
511 }
512
513 r = journal_file_allocate(f, p, size);
514 if (r < 0)
515 return r;
516
517 r = journal_file_move_to(f, type, false, p, size, &t);
518 if (r < 0)
519 return r;
520
521 o = (Object*) t;
522
523 zero(o->object);
524 o->object.type = type;
525 o->object.size = htole64(size);
526
527 f->header->tail_object_offset = htole64(p);
528 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
529
530 *ret = o;
531 *offset = p;
532
533 return 0;
534 }
535
536 static int journal_file_setup_data_hash_table(JournalFile *f) {
537 uint64_t s, p;
538 Object *o;
539 int r;
540
541 assert(f);
542
543 /* We estimate that we need 1 hash table entry per 768 of
544 journal file and we want to make sure we never get beyond
545 75% fill level. Calculate the hash table size for the
546 maximum file size based on these metrics. */
547
548 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
549 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
550 s = DEFAULT_DATA_HASH_TABLE_SIZE;
551
552 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
553
554 r = journal_file_append_object(f,
555 OBJECT_DATA_HASH_TABLE,
556 offsetof(Object, hash_table.items) + s,
557 &o, &p);
558 if (r < 0)
559 return r;
560
561 memset(o->hash_table.items, 0, s);
562
563 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
564 f->header->data_hash_table_size = htole64(s);
565
566 return 0;
567 }
568
569 static int journal_file_setup_field_hash_table(JournalFile *f) {
570 uint64_t s, p;
571 Object *o;
572 int r;
573
574 assert(f);
575
576 /* We use a fixed size hash table for the fields as this
577 * number should grow very slowly only */
578
579 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
580 r = journal_file_append_object(f,
581 OBJECT_FIELD_HASH_TABLE,
582 offsetof(Object, hash_table.items) + s,
583 &o, &p);
584 if (r < 0)
585 return r;
586
587 memset(o->hash_table.items, 0, s);
588
589 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
590 f->header->field_hash_table_size = htole64(s);
591
592 return 0;
593 }
594
595 static int journal_file_map_data_hash_table(JournalFile *f) {
596 uint64_t s, p;
597 void *t;
598 int r;
599
600 assert(f);
601
602 p = le64toh(f->header->data_hash_table_offset);
603 s = le64toh(f->header->data_hash_table_size);
604
605 r = journal_file_move_to(f,
606 OBJECT_DATA_HASH_TABLE,
607 true,
608 p, s,
609 &t);
610 if (r < 0)
611 return r;
612
613 f->data_hash_table = t;
614 return 0;
615 }
616
617 static int journal_file_map_field_hash_table(JournalFile *f) {
618 uint64_t s, p;
619 void *t;
620 int r;
621
622 assert(f);
623
624 p = le64toh(f->header->field_hash_table_offset);
625 s = le64toh(f->header->field_hash_table_size);
626
627 r = journal_file_move_to(f,
628 OBJECT_FIELD_HASH_TABLE,
629 true,
630 p, s,
631 &t);
632 if (r < 0)
633 return r;
634
635 f->field_hash_table = t;
636 return 0;
637 }
638
639 static int journal_file_link_field(
640 JournalFile *f,
641 Object *o,
642 uint64_t offset,
643 uint64_t hash) {
644
645 uint64_t p, h;
646 int r;
647
648 assert(f);
649 assert(o);
650 assert(offset > 0);
651
652 if (o->object.type != OBJECT_FIELD)
653 return -EINVAL;
654
655 /* This might alter the window we are looking at */
656
657 o->field.next_hash_offset = o->field.head_data_offset = 0;
658
659 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
660 p = le64toh(f->field_hash_table[h].tail_hash_offset);
661 if (p == 0)
662 f->field_hash_table[h].head_hash_offset = htole64(offset);
663 else {
664 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
665 if (r < 0)
666 return r;
667
668 o->field.next_hash_offset = htole64(offset);
669 }
670
671 f->field_hash_table[h].tail_hash_offset = htole64(offset);
672
673 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
674 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
675
676 return 0;
677 }
678
679 static int journal_file_link_data(
680 JournalFile *f,
681 Object *o,
682 uint64_t offset,
683 uint64_t hash) {
684
685 uint64_t p, h;
686 int r;
687
688 assert(f);
689 assert(o);
690 assert(offset > 0);
691
692 if (o->object.type != OBJECT_DATA)
693 return -EINVAL;
694
695 /* This might alter the window we are looking at */
696
697 o->data.next_hash_offset = o->data.next_field_offset = 0;
698 o->data.entry_offset = o->data.entry_array_offset = 0;
699 o->data.n_entries = 0;
700
701 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
702 p = le64toh(f->data_hash_table[h].tail_hash_offset);
703 if (p == 0)
704 /* Only entry in the hash table is easy */
705 f->data_hash_table[h].head_hash_offset = htole64(offset);
706 else {
707 /* Move back to the previous data object, to patch in
708 * pointer */
709
710 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
711 if (r < 0)
712 return r;
713
714 o->data.next_hash_offset = htole64(offset);
715 }
716
717 f->data_hash_table[h].tail_hash_offset = htole64(offset);
718
719 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
720 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
721
722 return 0;
723 }
724
725 int journal_file_find_field_object_with_hash(
726 JournalFile *f,
727 const void *field, uint64_t size, uint64_t hash,
728 Object **ret, uint64_t *offset) {
729
730 uint64_t p, osize, h;
731 int r;
732
733 assert(f);
734 assert(field && size > 0);
735
736 osize = offsetof(Object, field.payload) + size;
737
738 if (f->header->field_hash_table_size == 0)
739 return -EBADMSG;
740
741 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
742 p = le64toh(f->field_hash_table[h].head_hash_offset);
743
744 while (p > 0) {
745 Object *o;
746
747 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
748 if (r < 0)
749 return r;
750
751 if (le64toh(o->field.hash) == hash &&
752 le64toh(o->object.size) == osize &&
753 memcmp(o->field.payload, field, size) == 0) {
754
755 if (ret)
756 *ret = o;
757 if (offset)
758 *offset = p;
759
760 return 1;
761 }
762
763 p = le64toh(o->field.next_hash_offset);
764 }
765
766 return 0;
767 }
768
769 int journal_file_find_field_object(
770 JournalFile *f,
771 const void *field, uint64_t size,
772 Object **ret, uint64_t *offset) {
773
774 uint64_t hash;
775
776 assert(f);
777 assert(field && size > 0);
778
779 hash = hash64(field, size);
780
781 return journal_file_find_field_object_with_hash(f,
782 field, size, hash,
783 ret, offset);
784 }
785
786 int journal_file_find_data_object_with_hash(
787 JournalFile *f,
788 const void *data, uint64_t size, uint64_t hash,
789 Object **ret, uint64_t *offset) {
790
791 uint64_t p, osize, h;
792 int r;
793
794 assert(f);
795 assert(data || size == 0);
796
797 osize = offsetof(Object, data.payload) + size;
798
799 if (f->header->data_hash_table_size == 0)
800 return -EBADMSG;
801
802 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
803 p = le64toh(f->data_hash_table[h].head_hash_offset);
804
805 while (p > 0) {
806 Object *o;
807
808 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
809 if (r < 0)
810 return r;
811
812 if (le64toh(o->data.hash) != hash)
813 goto next;
814
815 if (o->object.flags & OBJECT_COMPRESSED) {
816 #ifdef HAVE_XZ
817 uint64_t l, rsize;
818
819 l = le64toh(o->object.size);
820 if (l <= offsetof(Object, data.payload))
821 return -EBADMSG;
822
823 l -= offsetof(Object, data.payload);
824
825 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
826 return -EBADMSG;
827
828 if (rsize == size &&
829 memcmp(f->compress_buffer, data, size) == 0) {
830
831 if (ret)
832 *ret = o;
833
834 if (offset)
835 *offset = p;
836
837 return 1;
838 }
839 #else
840 return -EPROTONOSUPPORT;
841 #endif
842
843 } else if (le64toh(o->object.size) == osize &&
844 memcmp(o->data.payload, data, size) == 0) {
845
846 if (ret)
847 *ret = o;
848
849 if (offset)
850 *offset = p;
851
852 return 1;
853 }
854
855 next:
856 p = le64toh(o->data.next_hash_offset);
857 }
858
859 return 0;
860 }
861
862 int journal_file_find_data_object(
863 JournalFile *f,
864 const void *data, uint64_t size,
865 Object **ret, uint64_t *offset) {
866
867 uint64_t hash;
868
869 assert(f);
870 assert(data || size == 0);
871
872 hash = hash64(data, size);
873
874 return journal_file_find_data_object_with_hash(f,
875 data, size, hash,
876 ret, offset);
877 }
878
879 static int journal_file_append_field(
880 JournalFile *f,
881 const void *field, uint64_t size,
882 Object **ret, uint64_t *offset) {
883
884 uint64_t hash, p;
885 uint64_t osize;
886 Object *o;
887 int r;
888
889 assert(f);
890 assert(field && size > 0);
891
892 hash = hash64(field, size);
893
894 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
895 if (r < 0)
896 return r;
897 else if (r > 0) {
898
899 if (ret)
900 *ret = o;
901
902 if (offset)
903 *offset = p;
904
905 return 0;
906 }
907
908 osize = offsetof(Object, field.payload) + size;
909 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
910
911 o->field.hash = htole64(hash);
912 memcpy(o->field.payload, field, size);
913
914 r = journal_file_link_field(f, o, p, hash);
915 if (r < 0)
916 return r;
917
918 /* The linking might have altered the window, so let's
919 * refresh our pointer */
920 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
921 if (r < 0)
922 return r;
923
924 #ifdef HAVE_GCRYPT
925 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
926 if (r < 0)
927 return r;
928 #endif
929
930 if (ret)
931 *ret = o;
932
933 if (offset)
934 *offset = p;
935
936 return 0;
937 }
938
939 static int journal_file_append_data(
940 JournalFile *f,
941 const void *data, uint64_t size,
942 Object **ret, uint64_t *offset) {
943
944 uint64_t hash, p;
945 uint64_t osize;
946 Object *o;
947 int r;
948 bool compressed = false;
949 const void *eq;
950
951 assert(f);
952 assert(data || size == 0);
953
954 hash = hash64(data, size);
955
956 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
957 if (r < 0)
958 return r;
959 else if (r > 0) {
960
961 if (ret)
962 *ret = o;
963
964 if (offset)
965 *offset = p;
966
967 return 0;
968 }
969
970 osize = offsetof(Object, data.payload) + size;
971 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
972 if (r < 0)
973 return r;
974
975 o->data.hash = htole64(hash);
976
977 #ifdef HAVE_XZ
978 if (f->compress &&
979 size >= COMPRESSION_SIZE_THRESHOLD) {
980 uint64_t rsize;
981
982 compressed = compress_blob(data, size, o->data.payload, &rsize);
983
984 if (compressed) {
985 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
986 o->object.flags |= OBJECT_COMPRESSED;
987
988 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
989 }
990 }
991 #endif
992
993 if (!compressed && size > 0)
994 memcpy(o->data.payload, data, size);
995
996 r = journal_file_link_data(f, o, p, hash);
997 if (r < 0)
998 return r;
999
1000 /* The linking might have altered the window, so let's
1001 * refresh our pointer */
1002 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1003 if (r < 0)
1004 return r;
1005
1006 eq = memchr(data, '=', size);
1007 if (eq && eq > data) {
1008 uint64_t fp;
1009 Object *fo;
1010
1011 /* Create field object ... */
1012 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1013 if (r < 0)
1014 return r;
1015
1016 /* ... and link it in. */
1017 o->data.next_field_offset = fo->field.head_data_offset;
1018 fo->field.head_data_offset = le64toh(p);
1019 }
1020
1021 #ifdef HAVE_GCRYPT
1022 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1023 if (r < 0)
1024 return r;
1025 #endif
1026
1027 if (ret)
1028 *ret = o;
1029
1030 if (offset)
1031 *offset = p;
1032
1033 return 0;
1034 }
1035
1036 uint64_t journal_file_entry_n_items(Object *o) {
1037 assert(o);
1038
1039 if (o->object.type != OBJECT_ENTRY)
1040 return 0;
1041
1042 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1043 }
1044
1045 uint64_t journal_file_entry_array_n_items(Object *o) {
1046 assert(o);
1047
1048 if (o->object.type != OBJECT_ENTRY_ARRAY)
1049 return 0;
1050
1051 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1052 }
1053
1054 uint64_t journal_file_hash_table_n_items(Object *o) {
1055 assert(o);
1056
1057 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1058 o->object.type != OBJECT_FIELD_HASH_TABLE)
1059 return 0;
1060
1061 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1062 }
1063
/* Stores the entry offset `p` at logical position *idx of the chain of
 * entry array objects that starts at *first, and increments *idx.
 *
 * The chain is walked until the array containing that position is
 * found. If the chain is too short, a new entry array object is
 * appended to the file and linked to the end of the chain (or stored
 * in *first if the chain was empty).
 *
 * Both *first and *idx are stored little-endian. Returns 0 on success,
 * a negative errno-style code otherwise. */
static int link_entry_into_array(JournalFile *f,
                                 le64_t *first,
                                 le64_t *idx,
                                 uint64_t p) {
        int r;
        uint64_t n = 0, ap = 0, q, i, a, hidx;
        Object *o;

        assert(f);
        assert(first);
        assert(idx);
        assert(p > 0);

        /* a: offset of the array currently looked at
         * i: position still to skip, relative to the start of `a`
         * hidx: the original, absolute position */
        a = le64toh(*first);
        i = hidx = le64toh(*idx);
        while (a > 0) {

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                if (r < 0)
                        return r;

                n = journal_file_entry_array_n_items(o);
                if (i < n) {
                        /* The position lies within this array */
                        o->entry_array.items[i] = htole64(p);
                        *idx = htole64(hidx + 1);
                        return 0;
                }

                /* Not in this array: skip it, remembering it in `ap`
                 * as the potential tail of the chain */
                i -= n;
                ap = a;
                a = le64toh(o->entry_array.next_entry_array_offset);
        }

        /* The chain is exhausted. Size the new array at twice the
         * last array looked at (n), or at twice the position + 1 if
         * the position is beyond that, but at no less than 4 slots. */
        if (hidx > n)
                n = (hidx+1) * 2;
        else
                n = n * 2;

        if (n < 4)
                n = 4;

        r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
                                       offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
                                       &o, &q);
        if (r < 0)
                return r;

#ifdef HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
        if (r < 0)
                return r;
#endif

        /* `i` is the position relative to the start of the new array */
        o->entry_array.items[i] = htole64(p);

        if (ap == 0)
                /* The chain was empty, the new array becomes its head */
                *first = htole64(q);
        else {
                /* Hook the new array in behind the previous tail */
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
                if (r < 0)
                        return r;

                o->entry_array.next_entry_array_offset = htole64(q);
        }

        if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);

        *idx = htole64(hidx + 1);

        return 0;
}
1136
1137 static int link_entry_into_array_plus_one(JournalFile *f,
1138 le64_t *extra,
1139 le64_t *first,
1140 le64_t *idx,
1141 uint64_t p) {
1142
1143 int r;
1144
1145 assert(f);
1146 assert(extra);
1147 assert(first);
1148 assert(idx);
1149 assert(p > 0);
1150
1151 if (*idx == 0)
1152 *extra = htole64(p);
1153 else {
1154 le64_t i;
1155
1156 i = htole64(le64toh(*idx) - 1);
1157 r = link_entry_into_array(f, first, &i, p);
1158 if (r < 0)
1159 return r;
1160 }
1161
1162 *idx = htole64(le64toh(*idx) + 1);
1163 return 0;
1164 }
1165
1166 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1167 uint64_t p;
1168 int r;
1169 assert(f);
1170 assert(o);
1171 assert(offset > 0);
1172
1173 p = le64toh(o->entry.items[i].object_offset);
1174 if (p == 0)
1175 return -EINVAL;
1176
1177 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1178 if (r < 0)
1179 return r;
1180
1181 return link_entry_into_array_plus_one(f,
1182 &o->data.entry_offset,
1183 &o->data.entry_array_offset,
1184 &o->data.n_entries,
1185 offset);
1186 }
1187
1188 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1189 uint64_t n, i;
1190 int r;
1191
1192 assert(f);
1193 assert(o);
1194 assert(offset > 0);
1195
1196 if (o->object.type != OBJECT_ENTRY)
1197 return -EINVAL;
1198
1199 __sync_synchronize();
1200
1201 /* Link up the entry itself */
1202 r = link_entry_into_array(f,
1203 &f->header->entry_array_offset,
1204 &f->header->n_entries,
1205 offset);
1206 if (r < 0)
1207 return r;
1208
1209 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1210
1211 if (f->header->head_entry_realtime == 0)
1212 f->header->head_entry_realtime = o->entry.realtime;
1213
1214 f->header->tail_entry_realtime = o->entry.realtime;
1215 f->header->tail_entry_monotonic = o->entry.monotonic;
1216
1217 f->tail_entry_monotonic_valid = true;
1218
1219 /* Link up the items */
1220 n = journal_file_entry_n_items(o);
1221 for (i = 0; i < n; i++) {
1222 r = journal_file_link_entry_item(f, o, offset, i);
1223 if (r < 0)
1224 return r;
1225 }
1226
1227 return 0;
1228 }
1229
1230 static int journal_file_append_entry_internal(
1231 JournalFile *f,
1232 const dual_timestamp *ts,
1233 uint64_t xor_hash,
1234 const EntryItem items[], unsigned n_items,
1235 uint64_t *seqnum,
1236 Object **ret, uint64_t *offset) {
1237 uint64_t np;
1238 uint64_t osize;
1239 Object *o;
1240 int r;
1241
1242 assert(f);
1243 assert(items || n_items == 0);
1244 assert(ts);
1245
1246 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1247
1248 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1249 if (r < 0)
1250 return r;
1251
1252 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1253 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1254 o->entry.realtime = htole64(ts->realtime);
1255 o->entry.monotonic = htole64(ts->monotonic);
1256 o->entry.xor_hash = htole64(xor_hash);
1257 o->entry.boot_id = f->header->boot_id;
1258
1259 #ifdef HAVE_GCRYPT
1260 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1261 if (r < 0)
1262 return r;
1263 #endif
1264
1265 r = journal_file_link_entry(f, o, np);
1266 if (r < 0)
1267 return r;
1268
1269 if (ret)
1270 *ret = o;
1271
1272 if (offset)
1273 *offset = np;
1274
1275 return 0;
1276 }
1277
1278 void journal_file_post_change(JournalFile *f) {
1279 assert(f);
1280
1281 /* inotify() does not receive IN_MODIFY events from file
1282 * accesses done via mmap(). After each access we hence
1283 * trigger IN_MODIFY by truncating the journal file to its
1284 * current size which triggers IN_MODIFY. */
1285
1286 __sync_synchronize();
1287
1288 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1289 log_error("Failed to truncate file to its own size: %m");
1290 }
1291
1292 static int entry_item_cmp(const void *_a, const void *_b) {
1293 const EntryItem *a = _a, *b = _b;
1294
1295 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1296 return -1;
1297 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1298 return 1;
1299 return 0;
1300 }
1301
1302 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1303 unsigned i;
1304 EntryItem *items;
1305 int r;
1306 uint64_t xor_hash = 0;
1307 struct dual_timestamp _ts;
1308
1309 assert(f);
1310 assert(iovec || n_iovec == 0);
1311
1312 if (!ts) {
1313 dual_timestamp_get(&_ts);
1314 ts = &_ts;
1315 }
1316
1317 if (f->tail_entry_monotonic_valid &&
1318 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1319 return -EINVAL;
1320
1321 #ifdef HAVE_GCRYPT
1322 r = journal_file_maybe_append_tag(f, ts->realtime);
1323 if (r < 0)
1324 return r;
1325 #endif
1326
1327 /* alloca() can't take 0, hence let's allocate at least one */
1328 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1329
1330 for (i = 0; i < n_iovec; i++) {
1331 uint64_t p;
1332 Object *o;
1333
1334 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1335 if (r < 0)
1336 return r;
1337
1338 xor_hash ^= le64toh(o->data.hash);
1339 items[i].object_offset = htole64(p);
1340 items[i].hash = o->data.hash;
1341 }
1342
1343 /* Order by the position on disk, in order to improve seek
1344 * times for rotating media. */
1345 qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1346
1347 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1348
1349 journal_file_post_change(f);
1350
1351 return r;
1352 }
1353
/* One cached position inside a chain of entry arrays, keyed by the offset of
 * the chain's first array. */
typedef struct ChainCacheItem ChainCacheItem;

struct ChainCacheItem {
        /* the array at the beginning of the chain (also the hashmap key) */
        uint64_t first;
        /* the cached array */
        uint64_t array;
        /* the first item in the cached array */
        uint64_t begin;
        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;
};
1360
1361 static void chain_cache_put(
1362 Hashmap *h,
1363 ChainCacheItem *ci,
1364 uint64_t first,
1365 uint64_t array,
1366 uint64_t begin,
1367 uint64_t total) {
1368
1369 if (!ci) {
1370 /* If the chain item to cache for this chain is the
1371 * first one it's not worth caching anything */
1372 if (array == first)
1373 return;
1374
1375 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1376 ci = hashmap_steal_first(h);
1377 else {
1378 ci = new(ChainCacheItem, 1);
1379 if (!ci)
1380 return;
1381 }
1382
1383 ci->first = first;
1384
1385 if (hashmap_put(h, &ci->first, ci) < 0) {
1386 free(ci);
1387 return;
1388 }
1389 } else
1390 assert(ci->first == first);
1391
1392 ci->array = array;
1393 ci->begin = begin;
1394 ci->total = total;
1395 }
1396
1397 static int generic_array_get(JournalFile *f,
1398 uint64_t first,
1399 uint64_t i,
1400 Object **ret, uint64_t *offset) {
1401
1402 Object *o;
1403 uint64_t p = 0, a, t = 0;
1404 int r;
1405 ChainCacheItem *ci;
1406
1407 assert(f);
1408
1409 a = first;
1410
1411 /* Try the chain cache first */
1412 ci = hashmap_get(f->chain_cache, &first);
1413 if (ci && i > ci->total) {
1414 a = ci->array;
1415 i -= ci->total;
1416 t = ci->total;
1417 }
1418
1419 while (a > 0) {
1420 uint64_t k;
1421
1422 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1423 if (r < 0)
1424 return r;
1425
1426 k = journal_file_entry_array_n_items(o);
1427 if (i < k) {
1428 p = le64toh(o->entry_array.items[i]);
1429 goto found;
1430 }
1431
1432 i -= k;
1433 t += k;
1434 a = le64toh(o->entry_array.next_entry_array_offset);
1435 }
1436
1437 return 0;
1438
1439 found:
1440 /* Let's cache this item for the next invocation */
1441 chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1442
1443 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1444 if (r < 0)
1445 return r;
1446
1447 if (ret)
1448 *ret = o;
1449
1450 if (offset)
1451 *offset = p;
1452
1453 return 1;
1454 }
1455
1456 static int generic_array_get_plus_one(JournalFile *f,
1457 uint64_t extra,
1458 uint64_t first,
1459 uint64_t i,
1460 Object **ret, uint64_t *offset) {
1461
1462 Object *o;
1463
1464 assert(f);
1465
1466 if (i == 0) {
1467 int r;
1468
1469 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1470 if (r < 0)
1471 return r;
1472
1473 if (ret)
1474 *ret = o;
1475
1476 if (offset)
1477 *offset = extra;
1478
1479 return 1;
1480 }
1481
1482 return generic_array_get(f, first, i-1, ret, offset);
1483 }
1484
/* Results of the test_object callbacks used by the bisection helpers */
enum {
        TEST_FOUND = 0, /* the tested object equals the needle */
        TEST_LEFT  = 1, /* the tested object is smaller than the needle */
        TEST_RIGHT = 2  /* the tested object is larger than the needle */
};
1490
1491 static int generic_array_bisect(JournalFile *f,
1492 uint64_t first,
1493 uint64_t n,
1494 uint64_t needle,
1495 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1496 direction_t direction,
1497 Object **ret,
1498 uint64_t *offset,
1499 uint64_t *idx) {
1500
1501 uint64_t a, p, t = 0, i = 0, last_p = 0;
1502 bool subtract_one = false;
1503 Object *o, *array = NULL;
1504 int r;
1505 ChainCacheItem *ci;
1506
1507 assert(f);
1508 assert(test_object);
1509
1510 /* Start with the first array in the chain */
1511 a = first;
1512
1513 ci = hashmap_get(f->chain_cache, &first);
1514 if (ci && n > ci->total) {
1515 /* Ah, we have iterated this bisection array chain
1516 * previously! Let's see if we can skip ahead in the
1517 * chain, as far as the last time. But we can't jump
1518 * backwards in the chain, so let's check that
1519 * first. */
1520
1521 r = test_object(f, ci->begin, needle);
1522 if (r < 0)
1523 return r;
1524
1525 if (r == TEST_LEFT) {
1526 /* OK, what we are looking for is right of th
1527 * begin of this EntryArray, so let's jump
1528 * straight to previously cached array in the
1529 * chain */
1530
1531 a = ci->array;
1532 n -= ci->total;
1533 t = ci->total;
1534 }
1535 }
1536
1537 while (a > 0) {
1538 uint64_t left, right, k, lp;
1539
1540 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1541 if (r < 0)
1542 return r;
1543
1544 k = journal_file_entry_array_n_items(array);
1545 right = MIN(k, n);
1546 if (right <= 0)
1547 return 0;
1548
1549 i = right - 1;
1550 lp = p = le64toh(array->entry_array.items[i]);
1551 if (p <= 0)
1552 return -EBADMSG;
1553
1554 r = test_object(f, p, needle);
1555 if (r < 0)
1556 return r;
1557
1558 if (r == TEST_FOUND)
1559 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1560
1561 if (r == TEST_RIGHT) {
1562 left = 0;
1563 right -= 1;
1564 for (;;) {
1565 if (left == right) {
1566 if (direction == DIRECTION_UP)
1567 subtract_one = true;
1568
1569 i = left;
1570 goto found;
1571 }
1572
1573 assert(left < right);
1574
1575 i = (left + right) / 2;
1576 p = le64toh(array->entry_array.items[i]);
1577 if (p <= 0)
1578 return -EBADMSG;
1579
1580 r = test_object(f, p, needle);
1581 if (r < 0)
1582 return r;
1583
1584 if (r == TEST_FOUND)
1585 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1586
1587 if (r == TEST_RIGHT)
1588 right = i;
1589 else
1590 left = i + 1;
1591 }
1592 }
1593
1594 if (k > n) {
1595 if (direction == DIRECTION_UP) {
1596 i = n;
1597 subtract_one = true;
1598 goto found;
1599 }
1600
1601 return 0;
1602 }
1603
1604 last_p = lp;
1605
1606 n -= k;
1607 t += k;
1608 a = le64toh(array->entry_array.next_entry_array_offset);
1609 }
1610
1611 return 0;
1612
1613 found:
1614 if (subtract_one && t == 0 && i == 0)
1615 return 0;
1616
1617 /* Let's cache this item for the next invocation */
1618 chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1619
1620 if (subtract_one && i == 0)
1621 p = last_p;
1622 else if (subtract_one)
1623 p = le64toh(array->entry_array.items[i-1]);
1624 else
1625 p = le64toh(array->entry_array.items[i]);
1626
1627 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1628 if (r < 0)
1629 return r;
1630
1631 if (ret)
1632 *ret = o;
1633
1634 if (offset)
1635 *offset = p;
1636
1637 if (idx)
1638 *idx = t + i + (subtract_one ? -1 : 0);
1639
1640 return 1;
1641 }
1642
1643 static int generic_array_bisect_plus_one(JournalFile *f,
1644 uint64_t extra,
1645 uint64_t first,
1646 uint64_t n,
1647 uint64_t needle,
1648 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1649 direction_t direction,
1650 Object **ret,
1651 uint64_t *offset,
1652 uint64_t *idx) {
1653
1654 int r;
1655 bool step_back = false;
1656 Object *o;
1657
1658 assert(f);
1659 assert(test_object);
1660
1661 if (n <= 0)
1662 return 0;
1663
1664 /* This bisects the array in object 'first', but first checks
1665 * an extra */
1666 r = test_object(f, extra, needle);
1667 if (r < 0)
1668 return r;
1669
1670 if (r == TEST_FOUND)
1671 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1672
1673 /* if we are looking with DIRECTION_UP then we need to first
1674 see if in the actual array there is a matching entry, and
1675 return the last one of that. But if there isn't any we need
1676 to return this one. Hence remember this, and return it
1677 below. */
1678 if (r == TEST_LEFT)
1679 step_back = direction == DIRECTION_UP;
1680
1681 if (r == TEST_RIGHT) {
1682 if (direction == DIRECTION_DOWN)
1683 goto found;
1684 else
1685 return 0;
1686 }
1687
1688 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1689
1690 if (r == 0 && step_back)
1691 goto found;
1692
1693 if (r > 0 && idx)
1694 (*idx) ++;
1695
1696 return r;
1697
1698 found:
1699 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1700 if (r < 0)
1701 return r;
1702
1703 if (ret)
1704 *ret = o;
1705
1706 if (offset)
1707 *offset = extra;
1708
1709 if (idx)
1710 *idx = 0;
1711
1712 return 1;
1713 }
1714
1715 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1716 assert(f);
1717 assert(p > 0);
1718
1719 if (p == needle)
1720 return TEST_FOUND;
1721 else if (p < needle)
1722 return TEST_LEFT;
1723 else
1724 return TEST_RIGHT;
1725 }
1726
1727 int journal_file_move_to_entry_by_offset(
1728 JournalFile *f,
1729 uint64_t p,
1730 direction_t direction,
1731 Object **ret,
1732 uint64_t *offset) {
1733
1734 return generic_array_bisect(f,
1735 le64toh(f->header->entry_array_offset),
1736 le64toh(f->header->n_entries),
1737 p,
1738 test_object_offset,
1739 direction,
1740 ret, offset, NULL);
1741 }
1742
1743
1744 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1745 Object *o;
1746 int r;
1747
1748 assert(f);
1749 assert(p > 0);
1750
1751 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1752 if (r < 0)
1753 return r;
1754
1755 if (le64toh(o->entry.seqnum) == needle)
1756 return TEST_FOUND;
1757 else if (le64toh(o->entry.seqnum) < needle)
1758 return TEST_LEFT;
1759 else
1760 return TEST_RIGHT;
1761 }
1762
1763 int journal_file_move_to_entry_by_seqnum(
1764 JournalFile *f,
1765 uint64_t seqnum,
1766 direction_t direction,
1767 Object **ret,
1768 uint64_t *offset) {
1769
1770 return generic_array_bisect(f,
1771 le64toh(f->header->entry_array_offset),
1772 le64toh(f->header->n_entries),
1773 seqnum,
1774 test_object_seqnum,
1775 direction,
1776 ret, offset, NULL);
1777 }
1778
1779 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1780 Object *o;
1781 int r;
1782
1783 assert(f);
1784 assert(p > 0);
1785
1786 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1787 if (r < 0)
1788 return r;
1789
1790 if (le64toh(o->entry.realtime) == needle)
1791 return TEST_FOUND;
1792 else if (le64toh(o->entry.realtime) < needle)
1793 return TEST_LEFT;
1794 else
1795 return TEST_RIGHT;
1796 }
1797
1798 int journal_file_move_to_entry_by_realtime(
1799 JournalFile *f,
1800 uint64_t realtime,
1801 direction_t direction,
1802 Object **ret,
1803 uint64_t *offset) {
1804
1805 return generic_array_bisect(f,
1806 le64toh(f->header->entry_array_offset),
1807 le64toh(f->header->n_entries),
1808 realtime,
1809 test_object_realtime,
1810 direction,
1811 ret, offset, NULL);
1812 }
1813
1814 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1815 Object *o;
1816 int r;
1817
1818 assert(f);
1819 assert(p > 0);
1820
1821 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1822 if (r < 0)
1823 return r;
1824
1825 if (le64toh(o->entry.monotonic) == needle)
1826 return TEST_FOUND;
1827 else if (le64toh(o->entry.monotonic) < needle)
1828 return TEST_LEFT;
1829 else
1830 return TEST_RIGHT;
1831 }
1832
1833 static inline int find_data_object_by_boot_id(
1834 JournalFile *f,
1835 sd_id128_t boot_id,
1836 Object **o,
1837 uint64_t *b) {
1838 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1839
1840 sd_id128_to_string(boot_id, t + 9);
1841 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1842 }
1843
1844 int journal_file_move_to_entry_by_monotonic(
1845 JournalFile *f,
1846 sd_id128_t boot_id,
1847 uint64_t monotonic,
1848 direction_t direction,
1849 Object **ret,
1850 uint64_t *offset) {
1851
1852 Object *o;
1853 int r;
1854
1855 assert(f);
1856
1857 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1858 if (r < 0)
1859 return r;
1860 if (r == 0)
1861 return -ENOENT;
1862
1863 return generic_array_bisect_plus_one(f,
1864 le64toh(o->data.entry_offset),
1865 le64toh(o->data.entry_array_offset),
1866 le64toh(o->data.n_entries),
1867 monotonic,
1868 test_object_monotonic,
1869 direction,
1870 ret, offset, NULL);
1871 }
1872
1873 int journal_file_next_entry(
1874 JournalFile *f,
1875 Object *o, uint64_t p,
1876 direction_t direction,
1877 Object **ret, uint64_t *offset) {
1878
1879 uint64_t i, n;
1880 int r;
1881
1882 assert(f);
1883 assert(p > 0 || !o);
1884
1885 n = le64toh(f->header->n_entries);
1886 if (n <= 0)
1887 return 0;
1888
1889 if (!o)
1890 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1891 else {
1892 if (o->object.type != OBJECT_ENTRY)
1893 return -EINVAL;
1894
1895 r = generic_array_bisect(f,
1896 le64toh(f->header->entry_array_offset),
1897 le64toh(f->header->n_entries),
1898 p,
1899 test_object_offset,
1900 DIRECTION_DOWN,
1901 NULL, NULL,
1902 &i);
1903 if (r <= 0)
1904 return r;
1905
1906 if (direction == DIRECTION_DOWN) {
1907 if (i >= n - 1)
1908 return 0;
1909
1910 i++;
1911 } else {
1912 if (i <= 0)
1913 return 0;
1914
1915 i--;
1916 }
1917 }
1918
1919 /* And jump to it */
1920 return generic_array_get(f,
1921 le64toh(f->header->entry_array_offset),
1922 i,
1923 ret, offset);
1924 }
1925
1926 int journal_file_skip_entry(
1927 JournalFile *f,
1928 Object *o, uint64_t p,
1929 int64_t skip,
1930 Object **ret, uint64_t *offset) {
1931
1932 uint64_t i, n;
1933 int r;
1934
1935 assert(f);
1936 assert(o);
1937 assert(p > 0);
1938
1939 if (o->object.type != OBJECT_ENTRY)
1940 return -EINVAL;
1941
1942 r = generic_array_bisect(f,
1943 le64toh(f->header->entry_array_offset),
1944 le64toh(f->header->n_entries),
1945 p,
1946 test_object_offset,
1947 DIRECTION_DOWN,
1948 NULL, NULL,
1949 &i);
1950 if (r <= 0)
1951 return r;
1952
1953 /* Calculate new index */
1954 if (skip < 0) {
1955 if ((uint64_t) -skip >= i)
1956 i = 0;
1957 else
1958 i = i - (uint64_t) -skip;
1959 } else
1960 i += (uint64_t) skip;
1961
1962 n = le64toh(f->header->n_entries);
1963 if (n <= 0)
1964 return -EBADMSG;
1965
1966 if (i >= n)
1967 i = n-1;
1968
1969 return generic_array_get(f,
1970 le64toh(f->header->entry_array_offset),
1971 i,
1972 ret, offset);
1973 }
1974
1975 int journal_file_next_entry_for_data(
1976 JournalFile *f,
1977 Object *o, uint64_t p,
1978 uint64_t data_offset,
1979 direction_t direction,
1980 Object **ret, uint64_t *offset) {
1981
1982 uint64_t n, i;
1983 int r;
1984 Object *d;
1985
1986 assert(f);
1987 assert(p > 0 || !o);
1988
1989 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1990 if (r < 0)
1991 return r;
1992
1993 n = le64toh(d->data.n_entries);
1994 if (n <= 0)
1995 return n;
1996
1997 if (!o)
1998 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1999 else {
2000 if (o->object.type != OBJECT_ENTRY)
2001 return -EINVAL;
2002
2003 r = generic_array_bisect_plus_one(f,
2004 le64toh(d->data.entry_offset),
2005 le64toh(d->data.entry_array_offset),
2006 le64toh(d->data.n_entries),
2007 p,
2008 test_object_offset,
2009 DIRECTION_DOWN,
2010 NULL, NULL,
2011 &i);
2012
2013 if (r <= 0)
2014 return r;
2015
2016 if (direction == DIRECTION_DOWN) {
2017 if (i >= n - 1)
2018 return 0;
2019
2020 i++;
2021 } else {
2022 if (i <= 0)
2023 return 0;
2024
2025 i--;
2026 }
2027
2028 }
2029
2030 return generic_array_get_plus_one(f,
2031 le64toh(d->data.entry_offset),
2032 le64toh(d->data.entry_array_offset),
2033 i,
2034 ret, offset);
2035 }
2036
2037 int journal_file_move_to_entry_by_offset_for_data(
2038 JournalFile *f,
2039 uint64_t data_offset,
2040 uint64_t p,
2041 direction_t direction,
2042 Object **ret, uint64_t *offset) {
2043
2044 int r;
2045 Object *d;
2046
2047 assert(f);
2048
2049 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2050 if (r < 0)
2051 return r;
2052
2053 return generic_array_bisect_plus_one(f,
2054 le64toh(d->data.entry_offset),
2055 le64toh(d->data.entry_array_offset),
2056 le64toh(d->data.n_entries),
2057 p,
2058 test_object_offset,
2059 direction,
2060 ret, offset, NULL);
2061 }
2062
2063 int journal_file_move_to_entry_by_monotonic_for_data(
2064 JournalFile *f,
2065 uint64_t data_offset,
2066 sd_id128_t boot_id,
2067 uint64_t monotonic,
2068 direction_t direction,
2069 Object **ret, uint64_t *offset) {
2070
2071 Object *o, *d;
2072 int r;
2073 uint64_t b, z;
2074
2075 assert(f);
2076
2077 /* First, seek by time */
2078 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2079 if (r < 0)
2080 return r;
2081 if (r == 0)
2082 return -ENOENT;
2083
2084 r = generic_array_bisect_plus_one(f,
2085 le64toh(o->data.entry_offset),
2086 le64toh(o->data.entry_array_offset),
2087 le64toh(o->data.n_entries),
2088 monotonic,
2089 test_object_monotonic,
2090 direction,
2091 NULL, &z, NULL);
2092 if (r <= 0)
2093 return r;
2094
2095 /* And now, continue seeking until we find an entry that
2096 * exists in both bisection arrays */
2097
2098 for (;;) {
2099 Object *qo;
2100 uint64_t p, q;
2101
2102 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2103 if (r < 0)
2104 return r;
2105
2106 r = generic_array_bisect_plus_one(f,
2107 le64toh(d->data.entry_offset),
2108 le64toh(d->data.entry_array_offset),
2109 le64toh(d->data.n_entries),
2110 z,
2111 test_object_offset,
2112 direction,
2113 NULL, &p, NULL);
2114 if (r <= 0)
2115 return r;
2116
2117 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2118 if (r < 0)
2119 return r;
2120
2121 r = generic_array_bisect_plus_one(f,
2122 le64toh(o->data.entry_offset),
2123 le64toh(o->data.entry_array_offset),
2124 le64toh(o->data.n_entries),
2125 p,
2126 test_object_offset,
2127 direction,
2128 &qo, &q, NULL);
2129
2130 if (r <= 0)
2131 return r;
2132
2133 if (p == q) {
2134 if (ret)
2135 *ret = qo;
2136 if (offset)
2137 *offset = q;
2138
2139 return 1;
2140 }
2141
2142 z = q;
2143 }
2144
2145 return 0;
2146 }
2147
2148 int journal_file_move_to_entry_by_seqnum_for_data(
2149 JournalFile *f,
2150 uint64_t data_offset,
2151 uint64_t seqnum,
2152 direction_t direction,
2153 Object **ret, uint64_t *offset) {
2154
2155 Object *d;
2156 int r;
2157
2158 assert(f);
2159
2160 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2161 if (r < 0)
2162 return r;
2163
2164 return generic_array_bisect_plus_one(f,
2165 le64toh(d->data.entry_offset),
2166 le64toh(d->data.entry_array_offset),
2167 le64toh(d->data.n_entries),
2168 seqnum,
2169 test_object_seqnum,
2170 direction,
2171 ret, offset, NULL);
2172 }
2173
2174 int journal_file_move_to_entry_by_realtime_for_data(
2175 JournalFile *f,
2176 uint64_t data_offset,
2177 uint64_t realtime,
2178 direction_t direction,
2179 Object **ret, uint64_t *offset) {
2180
2181 Object *d;
2182 int r;
2183
2184 assert(f);
2185
2186 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2187 if (r < 0)
2188 return r;
2189
2190 return generic_array_bisect_plus_one(f,
2191 le64toh(d->data.entry_offset),
2192 le64toh(d->data.entry_array_offset),
2193 le64toh(d->data.n_entries),
2194 realtime,
2195 test_object_realtime,
2196 direction,
2197 ret, offset, NULL);
2198 }
2199
2200 void journal_file_dump(JournalFile *f) {
2201 Object *o;
2202 int r;
2203 uint64_t p;
2204
2205 assert(f);
2206
2207 journal_file_print_header(f);
2208
2209 p = le64toh(f->header->header_size);
2210 while (p != 0) {
2211 r = journal_file_move_to_object(f, -1, p, &o);
2212 if (r < 0)
2213 goto fail;
2214
2215 switch (o->object.type) {
2216
2217 case OBJECT_UNUSED:
2218 printf("Type: OBJECT_UNUSED\n");
2219 break;
2220
2221 case OBJECT_DATA:
2222 printf("Type: OBJECT_DATA\n");
2223 break;
2224
2225 case OBJECT_FIELD:
2226 printf("Type: OBJECT_FIELD\n");
2227 break;
2228
2229 case OBJECT_ENTRY:
2230 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2231 le64toh(o->entry.seqnum),
2232 le64toh(o->entry.monotonic),
2233 le64toh(o->entry.realtime));
2234 break;
2235
2236 case OBJECT_FIELD_HASH_TABLE:
2237 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2238 break;
2239
2240 case OBJECT_DATA_HASH_TABLE:
2241 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2242 break;
2243
2244 case OBJECT_ENTRY_ARRAY:
2245 printf("Type: OBJECT_ENTRY_ARRAY\n");
2246 break;
2247
2248 case OBJECT_TAG:
2249 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2250 le64toh(o->tag.seqnum),
2251 le64toh(o->tag.epoch));
2252 break;
2253
2254 default:
2255 printf("Type: unknown (%u)\n", o->object.type);
2256 break;
2257 }
2258
2259 if (o->object.flags & OBJECT_COMPRESSED)
2260 printf("Flags: COMPRESSED\n");
2261
2262 if (p == le64toh(f->header->tail_object_offset))
2263 p = 0;
2264 else
2265 p = p + ALIGN64(le64toh(o->object.size));
2266 }
2267
2268 return;
2269 fail:
2270 log_error("File corrupt");
2271 }
2272
2273 void journal_file_print_header(JournalFile *f) {
2274 char a[33], b[33], c[33], d[33];
2275 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2276 struct stat st;
2277 char bytes[FORMAT_BYTES_MAX];
2278
2279 assert(f);
2280
2281 printf("File Path: %s\n"
2282 "File ID: %s\n"
2283 "Machine ID: %s\n"
2284 "Boot ID: %s\n"
2285 "Sequential Number ID: %s\n"
2286 "State: %s\n"
2287 "Compatible Flags:%s%s\n"
2288 "Incompatible Flags:%s%s\n"
2289 "Header size: %"PRIu64"\n"
2290 "Arena size: %"PRIu64"\n"
2291 "Data Hash Table Size: %"PRIu64"\n"
2292 "Field Hash Table Size: %"PRIu64"\n"
2293 "Rotate Suggested: %s\n"
2294 "Head Sequential Number: %"PRIu64"\n"
2295 "Tail Sequential Number: %"PRIu64"\n"
2296 "Head Realtime Timestamp: %s\n"
2297 "Tail Realtime Timestamp: %s\n"
2298 "Tail Monotonic Timestamp: %s\n"
2299 "Objects: %"PRIu64"\n"
2300 "Entry Objects: %"PRIu64"\n",
2301 f->path,
2302 sd_id128_to_string(f->header->file_id, a),
2303 sd_id128_to_string(f->header->machine_id, b),
2304 sd_id128_to_string(f->header->boot_id, c),
2305 sd_id128_to_string(f->header->seqnum_id, d),
2306 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2307 f->header->state == STATE_ONLINE ? "ONLINE" :
2308 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2309 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2310 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2311 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2312 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2313 le64toh(f->header->header_size),
2314 le64toh(f->header->arena_size),
2315 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2316 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2317 yes_no(journal_file_rotate_suggested(f, 0)),
2318 le64toh(f->header->head_entry_seqnum),
2319 le64toh(f->header->tail_entry_seqnum),
2320 format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2321 format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2322 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2323 le64toh(f->header->n_objects),
2324 le64toh(f->header->n_entries));
2325
2326 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2327 printf("Data Objects: %"PRIu64"\n"
2328 "Data Hash Table Fill: %.1f%%\n",
2329 le64toh(f->header->n_data),
2330 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2331
2332 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2333 printf("Field Objects: %"PRIu64"\n"
2334 "Field Hash Table Fill: %.1f%%\n",
2335 le64toh(f->header->n_fields),
2336 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2337
2338 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2339 printf("Tag Objects: %"PRIu64"\n",
2340 le64toh(f->header->n_tags));
2341 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2342 printf("Entry Array Objects: %"PRIu64"\n",
2343 le64toh(f->header->n_entry_arrays));
2344
2345 if (fstat(f->fd, &st) >= 0)
2346 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2347 }
2348
2349 int journal_file_open(
2350 const char *fname,
2351 int flags,
2352 mode_t mode,
2353 bool compress,
2354 bool seal,
2355 JournalMetrics *metrics,
2356 MMapCache *mmap_cache,
2357 JournalFile *template,
2358 JournalFile **ret) {
2359
2360 JournalFile *f;
2361 int r;
2362 bool newly_created = false;
2363
2364 assert(fname);
2365 assert(ret);
2366
2367 if ((flags & O_ACCMODE) != O_RDONLY &&
2368 (flags & O_ACCMODE) != O_RDWR)
2369 return -EINVAL;
2370
2371 if (!endswith(fname, ".journal") &&
2372 !endswith(fname, ".journal~"))
2373 return -EINVAL;
2374
2375 f = new0(JournalFile, 1);
2376 if (!f)
2377 return -ENOMEM;
2378
2379 f->fd = -1;
2380 f->mode = mode;
2381
2382 f->flags = flags;
2383 f->prot = prot_from_flags(flags);
2384 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2385 #ifdef HAVE_XZ
2386 f->compress = compress;
2387 #endif
2388 #ifdef HAVE_GCRYPT
2389 f->seal = seal;
2390 #endif
2391
2392 if (mmap_cache)
2393 f->mmap = mmap_cache_ref(mmap_cache);
2394 else {
2395 f->mmap = mmap_cache_new();
2396 if (!f->mmap) {
2397 r = -ENOMEM;
2398 goto fail;
2399 }
2400 }
2401
2402 f->path = strdup(fname);
2403 if (!f->path) {
2404 r = -ENOMEM;
2405 goto fail;
2406 }
2407
2408 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2409 if (!f->chain_cache) {
2410 r = -ENOMEM;
2411 goto fail;
2412 }
2413
2414 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2415 if (f->fd < 0) {
2416 r = -errno;
2417 goto fail;
2418 }
2419
2420 if (fstat(f->fd, &f->last_stat) < 0) {
2421 r = -errno;
2422 goto fail;
2423 }
2424
2425 if (f->last_stat.st_size == 0 && f->writable) {
2426 #ifdef HAVE_XATTR
2427 uint64_t crtime;
2428
2429 /* Let's attach the creation time to the journal file,
2430 * so that the vacuuming code knows the age of this
2431 * file even if the file might end up corrupted one
2432 * day... Ideally we'd just use the creation time many
2433 * file systems maintain for each file, but there is
2434 * currently no usable API to query this, hence let's
2435 * emulate this via extended attributes. If extended
2436 * attributes are not supported we'll just skip this,
2437 * and rely solely on mtime/atime/ctime of the file.*/
2438
2439 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2440 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2441 #endif
2442
2443 #ifdef HAVE_GCRYPT
2444 /* Try to load the FSPRG state, and if we can't, then
2445 * just don't do sealing */
2446 if (f->seal) {
2447 r = journal_file_fss_load(f);
2448 if (r < 0)
2449 f->seal = false;
2450 }
2451 #endif
2452
2453 r = journal_file_init_header(f, template);
2454 if (r < 0)
2455 goto fail;
2456
2457 if (fstat(f->fd, &f->last_stat) < 0) {
2458 r = -errno;
2459 goto fail;
2460 }
2461
2462 newly_created = true;
2463 }
2464
2465 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2466 r = -EIO;
2467 goto fail;
2468 }
2469
2470 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2471 if (f->header == MAP_FAILED) {
2472 f->header = NULL;
2473 r = -errno;
2474 goto fail;
2475 }
2476
2477 if (!newly_created) {
2478 r = journal_file_verify_header(f);
2479 if (r < 0)
2480 goto fail;
2481 }
2482
2483 #ifdef HAVE_GCRYPT
2484 if (!newly_created && f->writable) {
2485 r = journal_file_fss_load(f);
2486 if (r < 0)
2487 goto fail;
2488 }
2489 #endif
2490
2491 if (f->writable) {
2492 if (metrics) {
2493 journal_default_metrics(metrics, f->fd);
2494 f->metrics = *metrics;
2495 } else if (template)
2496 f->metrics = template->metrics;
2497
2498 r = journal_file_refresh_header(f);
2499 if (r < 0)
2500 goto fail;
2501 }
2502
2503 #ifdef HAVE_GCRYPT
2504 r = journal_file_hmac_setup(f);
2505 if (r < 0)
2506 goto fail;
2507 #endif
2508
2509 if (newly_created) {
2510 r = journal_file_setup_field_hash_table(f);
2511 if (r < 0)
2512 goto fail;
2513
2514 r = journal_file_setup_data_hash_table(f);
2515 if (r < 0)
2516 goto fail;
2517
2518 #ifdef HAVE_GCRYPT
2519 r = journal_file_append_first_tag(f);
2520 if (r < 0)
2521 goto fail;
2522 #endif
2523 }
2524
2525 r = journal_file_map_field_hash_table(f);
2526 if (r < 0)
2527 goto fail;
2528
2529 r = journal_file_map_data_hash_table(f);
2530 if (r < 0)
2531 goto fail;
2532
2533 *ret = f;
2534 return 0;
2535
2536 fail:
2537 journal_file_close(f);
2538
2539 return r;
2540 }
2541
2542 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2543 char *p;
2544 size_t l;
2545 JournalFile *old_file, *new_file = NULL;
2546 int r;
2547
2548 assert(f);
2549 assert(*f);
2550
2551 old_file = *f;
2552
2553 if (!old_file->writable)
2554 return -EINVAL;
2555
2556 if (!endswith(old_file->path, ".journal"))
2557 return -EINVAL;
2558
2559 l = strlen(old_file->path);
2560
2561 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2562 if (!p)
2563 return -ENOMEM;
2564
2565 memcpy(p, old_file->path, l - 8);
2566 p[l-8] = '@';
2567 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2568 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2569 "-%016"PRIx64"-%016"PRIx64".journal",
2570 le64toh((*f)->header->head_entry_seqnum),
2571 le64toh((*f)->header->head_entry_realtime));
2572
2573 r = rename(old_file->path, p);
2574 free(p);
2575
2576 if (r < 0)
2577 return -errno;
2578
2579 old_file->header->state = STATE_ARCHIVED;
2580
2581 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2582 journal_file_close(old_file);
2583
2584 *f = new_file;
2585 return r;
2586 }
2587
2588 int journal_file_open_reliably(
2589 const char *fname,
2590 int flags,
2591 mode_t mode,
2592 bool compress,
2593 bool seal,
2594 JournalMetrics *metrics,
2595 MMapCache *mmap_cache,
2596 JournalFile *template,
2597 JournalFile **ret) {
2598
2599 int r;
2600 size_t l;
2601 _cleanup_free_ char *p = NULL;
2602
2603 r = journal_file_open(fname, flags, mode, compress, seal,
2604 metrics, mmap_cache, template, ret);
2605 if (r != -EBADMSG && /* corrupted */
2606 r != -ENODATA && /* truncated */
2607 r != -EHOSTDOWN && /* other machine */
2608 r != -EPROTONOSUPPORT && /* incompatible feature */
2609 r != -EBUSY && /* unclean shutdown */
2610 r != -ESHUTDOWN /* already archived */)
2611 return r;
2612
2613 if ((flags & O_ACCMODE) == O_RDONLY)
2614 return r;
2615
2616 if (!(flags & O_CREAT))
2617 return r;
2618
2619 if (!endswith(fname, ".journal"))
2620 return r;
2621
2622 /* The file is corrupted. Rotate it away and try it again (but only once) */
2623
2624 l = strlen(fname);
2625 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2626 (int) (l-8), fname,
2627 (unsigned long long) now(CLOCK_REALTIME),
2628 random_ull()) < 0)
2629 return -ENOMEM;
2630
2631 r = rename(fname, p);
2632 if (r < 0)
2633 return -errno;
2634
2635 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2636
2637 return journal_file_open(fname, flags, mode, compress, seal,
2638 metrics, mmap_cache, template, ret);
2639 }
2640
2641 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2642 uint64_t i, n;
2643 uint64_t q, xor_hash = 0;
2644 int r;
2645 EntryItem *items;
2646 dual_timestamp ts;
2647
2648 assert(from);
2649 assert(to);
2650 assert(o);
2651 assert(p);
2652
2653 if (!to->writable)
2654 return -EPERM;
2655
2656 ts.monotonic = le64toh(o->entry.monotonic);
2657 ts.realtime = le64toh(o->entry.realtime);
2658
2659 if (to->tail_entry_monotonic_valid &&
2660 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2661 return -EINVAL;
2662
2663 n = journal_file_entry_n_items(o);
2664 items = alloca(sizeof(EntryItem) * n);
2665
2666 for (i = 0; i < n; i++) {
2667 uint64_t l, h;
2668 le64_t le_hash;
2669 size_t t;
2670 void *data;
2671 Object *u;
2672
2673 q = le64toh(o->entry.items[i].object_offset);
2674 le_hash = o->entry.items[i].hash;
2675
2676 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2677 if (r < 0)
2678 return r;
2679
2680 if (le_hash != o->data.hash)
2681 return -EBADMSG;
2682
2683 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2684 t = (size_t) l;
2685
2686 /* We hit the limit on 32bit machines */
2687 if ((uint64_t) t != l)
2688 return -E2BIG;
2689
2690 if (o->object.flags & OBJECT_COMPRESSED) {
2691 #ifdef HAVE_XZ
2692 uint64_t rsize;
2693
2694 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2695 return -EBADMSG;
2696
2697 data = from->compress_buffer;
2698 l = rsize;
2699 #else
2700 return -EPROTONOSUPPORT;
2701 #endif
2702 } else
2703 data = o->data.payload;
2704
2705 r = journal_file_append_data(to, data, l, &u, &h);
2706 if (r < 0)
2707 return r;
2708
2709 xor_hash ^= le64toh(u->data.hash);
2710 items[i].object_offset = htole64(h);
2711 items[i].hash = u->data.hash;
2712
2713 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2714 if (r < 0)
2715 return r;
2716 }
2717
2718 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2719 }
2720
2721 void journal_default_metrics(JournalMetrics *m, int fd) {
2722 uint64_t fs_size = 0;
2723 struct statvfs ss;
2724 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2725
2726 assert(m);
2727 assert(fd >= 0);
2728
2729 if (fstatvfs(fd, &ss) >= 0)
2730 fs_size = ss.f_frsize * ss.f_blocks;
2731
2732 if (m->max_use == (uint64_t) -1) {
2733
2734 if (fs_size > 0) {
2735 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2736
2737 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2738 m->max_use = DEFAULT_MAX_USE_UPPER;
2739
2740 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2741 m->max_use = DEFAULT_MAX_USE_LOWER;
2742 } else
2743 m->max_use = DEFAULT_MAX_USE_LOWER;
2744 } else {
2745 m->max_use = PAGE_ALIGN(m->max_use);
2746
2747 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2748 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2749 }
2750
2751 if (m->max_size == (uint64_t) -1) {
2752 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2753
2754 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2755 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2756 } else
2757 m->max_size = PAGE_ALIGN(m->max_size);
2758
2759 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2760 m->max_size = JOURNAL_FILE_SIZE_MIN;
2761
2762 if (m->max_size*2 > m->max_use)
2763 m->max_use = m->max_size*2;
2764
2765 if (m->min_size == (uint64_t) -1)
2766 m->min_size = JOURNAL_FILE_SIZE_MIN;
2767 else {
2768 m->min_size = PAGE_ALIGN(m->min_size);
2769
2770 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2771 m->min_size = JOURNAL_FILE_SIZE_MIN;
2772
2773 if (m->min_size > m->max_size)
2774 m->max_size = m->min_size;
2775 }
2776
2777 if (m->keep_free == (uint64_t) -1) {
2778
2779 if (fs_size > 0) {
2780 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2781
2782 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2783 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2784
2785 } else
2786 m->keep_free = DEFAULT_KEEP_FREE;
2787 }
2788
2789 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2790 format_bytes(a, sizeof(a), m->max_use),
2791 format_bytes(b, sizeof(b), m->max_size),
2792 format_bytes(c, sizeof(c), m->min_size),
2793 format_bytes(d, sizeof(d), m->keep_free));
2794 }
2795
2796 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2797 assert(f);
2798 assert(from || to);
2799
2800 if (from) {
2801 if (f->header->head_entry_realtime == 0)
2802 return -ENOENT;
2803
2804 *from = le64toh(f->header->head_entry_realtime);
2805 }
2806
2807 if (to) {
2808 if (f->header->tail_entry_realtime == 0)
2809 return -ENOENT;
2810
2811 *to = le64toh(f->header->tail_entry_realtime);
2812 }
2813
2814 return 1;
2815 }
2816
2817 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2818 Object *o;
2819 uint64_t p;
2820 int r;
2821
2822 assert(f);
2823 assert(from || to);
2824
2825 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2826 if (r <= 0)
2827 return r;
2828
2829 if (le64toh(o->data.n_entries) <= 0)
2830 return 0;
2831
2832 if (from) {
2833 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2834 if (r < 0)
2835 return r;
2836
2837 *from = le64toh(o->entry.monotonic);
2838 }
2839
2840 if (to) {
2841 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2842 if (r < 0)
2843 return r;
2844
2845 r = generic_array_get_plus_one(f,
2846 le64toh(o->data.entry_offset),
2847 le64toh(o->data.entry_array_offset),
2848 le64toh(o->data.n_entries)-1,
2849 &o, NULL);
2850 if (r <= 0)
2851 return r;
2852
2853 *to = le64toh(o->entry.monotonic);
2854 }
2855
2856 return 1;
2857 }
2858
2859 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2860 assert(f);
2861
2862 /* If we gained new header fields we gained new features,
2863 * hence suggest a rotation */
2864 if (le64toh(f->header->header_size) < sizeof(Header)) {
2865 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2866 return true;
2867 }
2868
2869 /* Let's check if the hash tables grew over a certain fill
2870 * level (75%, borrowing this value from Java's hash table
2871 * implementation), and if so suggest a rotation. To calculate
2872 * the fill level we need the n_data field, which only exists
2873 * in newer versions. */
2874
2875 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2876 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2877 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2878 f->path,
2879 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2880 le64toh(f->header->n_data),
2881 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2882 (unsigned long long) f->last_stat.st_size,
2883 f->last_stat.st_size / le64toh(f->header->n_data));
2884 return true;
2885 }
2886
2887 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2888 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2889 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2890 f->path,
2891 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2892 le64toh(f->header->n_fields),
2893 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2894 return true;
2895 }
2896
2897 /* Are the data objects properly indexed by field objects? */
2898 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2899 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2900 le64toh(f->header->n_data) > 0 &&
2901 le64toh(f->header->n_fields) == 0)
2902 return true;
2903
2904 if (max_file_usec > 0) {
2905 usec_t t, h;
2906
2907 h = le64toh(f->header->head_entry_realtime);
2908 t = now(CLOCK_REALTIME);
2909
2910 if (h > 0 && t > h + max_file_usec)
2911 return true;
2912 }
2913
2914 return false;
2915 }