]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
util-lib: split string parsing related calls from util.[ch] into parse-util.[ch]
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <linux/fs.h>
25 #include <stddef.h>
26 #include <sys/mman.h>
27 #include <sys/statvfs.h>
28 #include <sys/uio.h>
29 #include <unistd.h>
30
31 #include "btrfs-util.h"
32 #include "compress.h"
33 #include "fd-util.h"
34 #include "journal-authenticate.h"
35 #include "journal-def.h"
36 #include "journal-file.h"
37 #include "lookup3.h"
38 #include "parse-util.h"
39 #include "random-util.h"
40 #include "string-util.h"
41
42 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem)) /* Lower bound, in bytes, for the data hash table */
43 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem)) /* Fixed size, in bytes, of the field hash table */
44
45 #define COMPRESSION_SIZE_THRESHOLD (512ULL) /* Data payloads smaller than this many bytes are never compressed */
46
47 /* This is the minimum journal file size */
48 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
49
50 /* These are the lower and upper bounds if we deduce the max_use value
51 * from the file system size */
52 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
53 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54
55 /* This is the default minimal use limit, i.e. how much we'll use even if keep_free suggests otherwise. */
56 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
57
58 /* This is the upper bound if we deduce max_size from max_use */
59 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
60
61 /* This is the upper bound if we deduce the keep_free value from the
62 * file system size */
63 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
64
65 /* This is the keep_free value when we can't determine the file system
66 * size */
67 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
68
69 /* This is the default maximum number of journal files to keep around. */
70 #define DEFAULT_N_MAX_FILES (100)
71
72 /* n_data was the first entry we added after the initial file format design */
73 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
74
75 /* How many entries to keep in the entry array chain cache at max */
76 #define CHAIN_CACHE_MAX 20
77
78 /* How much to increase the journal file size at once each time we allocate something new. */
79 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
80
81 /* Reread fstat() of the file for detecting deletions at least this often */
82 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
83
84 /* The mmap context to use for the header; we pick the one just above the last defined object type */
85 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
86
87 static int journal_file_set_online(JournalFile *f) {
88 assert(f);
89
90 if (!f->writable)
91 return -EPERM;
92
93 if (!(f->fd >= 0 && f->header))
94 return -EINVAL;
95
96 if (mmap_cache_got_sigbus(f->mmap, f->fd))
97 return -EIO;
98
99 switch(f->header->state) {
100 case STATE_ONLINE:
101 return 0;
102
103 case STATE_OFFLINE:
104 f->header->state = STATE_ONLINE;
105 fsync(f->fd);
106 return 0;
107
108 default:
109 return -EINVAL;
110 }
111 }
112
113 int journal_file_set_offline(JournalFile *f) {
114 assert(f);
115
116 if (!f->writable)
117 return -EPERM;
118
119 if (!(f->fd >= 0 && f->header))
120 return -EINVAL;
121
122 if (f->header->state != STATE_ONLINE)
123 return 0;
124
125 fsync(f->fd);
126
127 if (mmap_cache_got_sigbus(f->mmap, f->fd))
128 return -EIO;
129
130 f->header->state = STATE_OFFLINE;
131
132 if (mmap_cache_got_sigbus(f->mmap, f->fd))
133 return -EIO;
134
135 fsync(f->fd);
136
137 return 0;
138 }
139
140 JournalFile* journal_file_close(JournalFile *f) {
141 assert(f);
142
143 #ifdef HAVE_GCRYPT
144 /* Write the final tag */
145 if (f->seal && f->writable)
146 journal_file_append_tag(f);
147 #endif
148
149 journal_file_set_offline(f);
150
151 if (f->mmap && f->fd >= 0)
152 mmap_cache_close_fd(f->mmap, f->fd);
153
154 if (f->fd >= 0 && f->defrag_on_close) {
155
156 /* Be friendly to btrfs: turn COW back on again now,
157 * and defragment the file. We won't write to the file
158 * ever again, hence remove all fragmentation, and
159 * reenable all the good bits COW usually provides
160 * (such as data checksumming). */
161
162 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
163 (void) btrfs_defrag_fd(f->fd);
164 }
165
166 safe_close(f->fd);
167 free(f->path);
168
169 if (f->mmap)
170 mmap_cache_unref(f->mmap);
171
172 ordered_hashmap_free_free(f->chain_cache);
173
174 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
175 free(f->compress_buffer);
176 #endif
177
178 #ifdef HAVE_GCRYPT
179 if (f->fss_file)
180 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
181 else
182 free(f->fsprg_state);
183
184 free(f->fsprg_seed);
185
186 if (f->hmac)
187 gcry_md_close(f->hmac);
188 #endif
189
190 free(f);
191 return NULL;
192 }
193
194 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
195 Header h = {};
196 ssize_t k;
197 int r;
198
199 assert(f);
200
201 memcpy(h.signature, HEADER_SIGNATURE, 8);
202 h.header_size = htole64(ALIGN64(sizeof(h)));
203
204 h.incompatible_flags |= htole32(
205 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
206 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
207
208 h.compatible_flags = htole32(
209 f->seal * HEADER_COMPATIBLE_SEALED);
210
211 r = sd_id128_randomize(&h.file_id);
212 if (r < 0)
213 return r;
214
215 if (template) {
216 h.seqnum_id = template->header->seqnum_id;
217 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
218 } else
219 h.seqnum_id = h.file_id;
220
221 k = pwrite(f->fd, &h, sizeof(h), 0);
222 if (k < 0)
223 return -errno;
224
225 if (k != sizeof(h))
226 return -EIO;
227
228 return 0;
229 }
230
231 static int journal_file_refresh_header(JournalFile *f) {
232 sd_id128_t boot_id;
233 int r;
234
235 assert(f);
236
237 r = sd_id128_get_machine(&f->header->machine_id);
238 if (r < 0)
239 return r;
240
241 r = sd_id128_get_boot(&boot_id);
242 if (r < 0)
243 return r;
244
245 if (sd_id128_equal(boot_id, f->header->boot_id))
246 f->tail_entry_monotonic_valid = true;
247
248 f->header->boot_id = boot_id;
249
250 r = journal_file_set_online(f);
251
252 /* Sync the online state to disk */
253 fsync(f->fd);
254
255 return r;
256 }
257
258 static int journal_file_verify_header(JournalFile *f) {
259 uint32_t flags;
260
261 assert(f);
262
263 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
264 return -EBADMSG;
265
266 /* In both read and write mode we refuse to open files with
267 * incompatible flags we don't know */
268 flags = le32toh(f->header->incompatible_flags);
269 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
270 if (flags & ~HEADER_INCOMPATIBLE_ANY)
271 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
272 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
273 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
274 if (flags)
275 log_debug("Journal file %s uses incompatible flags %"PRIx32
276 " disabled at compilation time.", f->path, flags);
277 return -EPROTONOSUPPORT;
278 }
279
280 /* When open for writing we refuse to open files with
281 * compatible flags, too */
282 flags = le32toh(f->header->compatible_flags);
283 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
284 if (flags & ~HEADER_COMPATIBLE_ANY)
285 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
286 f->path, flags & ~HEADER_COMPATIBLE_ANY);
287 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
288 if (flags)
289 log_debug("Journal file %s uses compatible flags %"PRIx32
290 " disabled at compilation time.", f->path, flags);
291 return -EPROTONOSUPPORT;
292 }
293
294 if (f->header->state >= _STATE_MAX)
295 return -EBADMSG;
296
297 /* The first addition was n_data, so check that we are at least this large */
298 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
299 return -EBADMSG;
300
301 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
302 return -EBADMSG;
303
304 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
305 return -ENODATA;
306
307 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
308 return -ENODATA;
309
310 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
311 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
312 !VALID64(le64toh(f->header->tail_object_offset)) ||
313 !VALID64(le64toh(f->header->entry_array_offset)))
314 return -ENODATA;
315
316 if (f->writable) {
317 uint8_t state;
318 sd_id128_t machine_id;
319 int r;
320
321 r = sd_id128_get_machine(&machine_id);
322 if (r < 0)
323 return r;
324
325 if (!sd_id128_equal(machine_id, f->header->machine_id))
326 return -EHOSTDOWN;
327
328 state = f->header->state;
329
330 if (state == STATE_ONLINE) {
331 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
332 return -EBUSY;
333 } else if (state == STATE_ARCHIVED)
334 return -ESHUTDOWN;
335 else if (state != STATE_OFFLINE) {
336 log_debug("Journal file %s has unknown state %i.", f->path, state);
337 return -EBUSY;
338 }
339 }
340
341 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
342 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
343
344 f->seal = JOURNAL_HEADER_SEALED(f->header);
345
346 return 0;
347 }
348
349 static int journal_file_fstat(JournalFile *f) {
350 assert(f);
351 assert(f->fd >= 0);
352
353 if (fstat(f->fd, &f->last_stat) < 0)
354 return -errno;
355
356 f->last_stat_usec = now(CLOCK_MONOTONIC);
357
358 /* Refuse appending to files that are already deleted */
359 if (f->last_stat.st_nlink <= 0)
360 return -EIDRM;
361
362 return 0;
363 }
364
365 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
366 uint64_t old_size, new_size;
367 int r;
368
369 assert(f);
370
371 /* We assume that this file is not sparse, and we know that
372 * for sure, since we always call posix_fallocate()
373 * ourselves */
374
375 if (mmap_cache_got_sigbus(f->mmap, f->fd))
376 return -EIO;
377
378 old_size =
379 le64toh(f->header->header_size) +
380 le64toh(f->header->arena_size);
381
382 new_size = PAGE_ALIGN(offset + size);
383 if (new_size < le64toh(f->header->header_size))
384 new_size = le64toh(f->header->header_size);
385
386 if (new_size <= old_size) {
387
388 /* We already pre-allocated enough space, but before
389 * we write to it, let's check with fstat() if the
390 * file got deleted, in order make sure we don't throw
391 * away the data immediately. Don't check fstat() for
392 * all writes though, but only once ever 10s. */
393
394 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
395 return 0;
396
397 return journal_file_fstat(f);
398 }
399
400 /* Allocate more space. */
401
402 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
403 return -E2BIG;
404
405 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
406 struct statvfs svfs;
407
408 if (fstatvfs(f->fd, &svfs) >= 0) {
409 uint64_t available;
410
411 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
412
413 if (new_size - old_size > available)
414 return -E2BIG;
415 }
416 }
417
418 /* Increase by larger blocks at once */
419 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
420 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
421 new_size = f->metrics.max_size;
422
423 /* Note that the glibc fallocate() fallback is very
424 inefficient, hence we try to minimize the allocation area
425 as we can. */
426 r = posix_fallocate(f->fd, old_size, new_size - old_size);
427 if (r != 0)
428 return -r;
429
430 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
431
432 return journal_file_fstat(f);
433 }
434
435 static unsigned type_to_context(ObjectType type) {
436 /* One context for each type, plus one catch-all for the rest */
437 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
438 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
439 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
440 }
441
442 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
443 int r;
444
445 assert(f);
446 assert(ret);
447
448 if (size <= 0)
449 return -EINVAL;
450
451 /* Avoid SIGBUS on invalid accesses */
452 if (offset + size > (uint64_t) f->last_stat.st_size) {
453 /* Hmm, out of range? Let's refresh the fstat() data
454 * first, before we trust that check. */
455
456 r = journal_file_fstat(f);
457 if (r < 0)
458 return r;
459
460 if (offset + size > (uint64_t) f->last_stat.st_size)
461 return -EADDRNOTAVAIL;
462 }
463
464 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
465 }
466
467 static uint64_t minimum_header_size(Object *o) {
468
469 static const uint64_t table[] = {
470 [OBJECT_DATA] = sizeof(DataObject),
471 [OBJECT_FIELD] = sizeof(FieldObject),
472 [OBJECT_ENTRY] = sizeof(EntryObject),
473 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
474 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
475 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
476 [OBJECT_TAG] = sizeof(TagObject),
477 };
478
479 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
480 return sizeof(ObjectHeader);
481
482 return table[o->object.type];
483 }
484
485 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
486 int r;
487 void *t;
488 Object *o;
489 uint64_t s;
490
491 assert(f);
492 assert(ret);
493
494 /* Objects may only be located at multiple of 64 bit */
495 if (!VALID64(offset))
496 return -EFAULT;
497
498 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
499 if (r < 0)
500 return r;
501
502 o = (Object*) t;
503 s = le64toh(o->object.size);
504
505 if (s < sizeof(ObjectHeader))
506 return -EBADMSG;
507
508 if (o->object.type <= OBJECT_UNUSED)
509 return -EBADMSG;
510
511 if (s < minimum_header_size(o))
512 return -EBADMSG;
513
514 if (type > OBJECT_UNUSED && o->object.type != type)
515 return -EBADMSG;
516
517 if (s > sizeof(ObjectHeader)) {
518 r = journal_file_move_to(f, type, false, offset, s, &t);
519 if (r < 0)
520 return r;
521
522 o = (Object*) t;
523 }
524
525 *ret = o;
526 return 0;
527 }
528
529 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
530 uint64_t r;
531
532 assert(f);
533
534 r = le64toh(f->header->tail_entry_seqnum) + 1;
535
536 if (seqnum) {
537 /* If an external seqnum counter was passed, we update
538 * both the local and the external one, and set it to
539 * the maximum of both */
540
541 if (*seqnum + 1 > r)
542 r = *seqnum + 1;
543
544 *seqnum = r;
545 }
546
547 f->header->tail_entry_seqnum = htole64(r);
548
549 if (f->header->head_entry_seqnum == 0)
550 f->header->head_entry_seqnum = htole64(r);
551
552 return r;
553 }
554
555 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
556 int r;
557 uint64_t p;
558 Object *tail, *o;
559 void *t;
560
561 assert(f);
562 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
563 assert(size >= sizeof(ObjectHeader));
564 assert(offset);
565 assert(ret);
566
567 r = journal_file_set_online(f);
568 if (r < 0)
569 return r;
570
571 p = le64toh(f->header->tail_object_offset);
572 if (p == 0)
573 p = le64toh(f->header->header_size);
574 else {
575 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
576 if (r < 0)
577 return r;
578
579 p += ALIGN64(le64toh(tail->object.size));
580 }
581
582 r = journal_file_allocate(f, p, size);
583 if (r < 0)
584 return r;
585
586 r = journal_file_move_to(f, type, false, p, size, &t);
587 if (r < 0)
588 return r;
589
590 o = (Object*) t;
591
592 zero(o->object);
593 o->object.type = type;
594 o->object.size = htole64(size);
595
596 f->header->tail_object_offset = htole64(p);
597 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
598
599 *ret = o;
600 *offset = p;
601
602 return 0;
603 }
604
605 static int journal_file_setup_data_hash_table(JournalFile *f) {
606 uint64_t s, p;
607 Object *o;
608 int r;
609
610 assert(f);
611
612 /* We estimate that we need 1 hash table entry per 768 bytes
613 of journal file and we want to make sure we never get
614 beyond 75% fill level. Calculate the hash table size for
615 the maximum file size based on these metrics. */
616
617 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
618 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
619 s = DEFAULT_DATA_HASH_TABLE_SIZE;
620
621 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
622
623 r = journal_file_append_object(f,
624 OBJECT_DATA_HASH_TABLE,
625 offsetof(Object, hash_table.items) + s,
626 &o, &p);
627 if (r < 0)
628 return r;
629
630 memzero(o->hash_table.items, s);
631
632 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
633 f->header->data_hash_table_size = htole64(s);
634
635 return 0;
636 }
637
638 static int journal_file_setup_field_hash_table(JournalFile *f) {
639 uint64_t s, p;
640 Object *o;
641 int r;
642
643 assert(f);
644
645 /* We use a fixed size hash table for the fields as this
646 * number should grow very slowly only */
647
648 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
649 r = journal_file_append_object(f,
650 OBJECT_FIELD_HASH_TABLE,
651 offsetof(Object, hash_table.items) + s,
652 &o, &p);
653 if (r < 0)
654 return r;
655
656 memzero(o->hash_table.items, s);
657
658 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
659 f->header->field_hash_table_size = htole64(s);
660
661 return 0;
662 }
663
664 int journal_file_map_data_hash_table(JournalFile *f) {
665 uint64_t s, p;
666 void *t;
667 int r;
668
669 assert(f);
670
671 if (f->data_hash_table)
672 return 0;
673
674 p = le64toh(f->header->data_hash_table_offset);
675 s = le64toh(f->header->data_hash_table_size);
676
677 r = journal_file_move_to(f,
678 OBJECT_DATA_HASH_TABLE,
679 true,
680 p, s,
681 &t);
682 if (r < 0)
683 return r;
684
685 f->data_hash_table = t;
686 return 0;
687 }
688
689 int journal_file_map_field_hash_table(JournalFile *f) {
690 uint64_t s, p;
691 void *t;
692 int r;
693
694 assert(f);
695
696 if (f->field_hash_table)
697 return 0;
698
699 p = le64toh(f->header->field_hash_table_offset);
700 s = le64toh(f->header->field_hash_table_size);
701
702 r = journal_file_move_to(f,
703 OBJECT_FIELD_HASH_TABLE,
704 true,
705 p, s,
706 &t);
707 if (r < 0)
708 return r;
709
710 f->field_hash_table = t;
711 return 0;
712 }
713
714 static int journal_file_link_field(
715 JournalFile *f,
716 Object *o,
717 uint64_t offset,
718 uint64_t hash) {
719
720 uint64_t p, h, m;
721 int r;
722
723 assert(f);
724 assert(o);
725 assert(offset > 0);
726
727 if (o->object.type != OBJECT_FIELD)
728 return -EINVAL;
729
730 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
731 if (m <= 0)
732 return -EBADMSG;
733
734 /* This might alter the window we are looking at */
735 o->field.next_hash_offset = o->field.head_data_offset = 0;
736
737 h = hash % m;
738 p = le64toh(f->field_hash_table[h].tail_hash_offset);
739 if (p == 0)
740 f->field_hash_table[h].head_hash_offset = htole64(offset);
741 else {
742 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
743 if (r < 0)
744 return r;
745
746 o->field.next_hash_offset = htole64(offset);
747 }
748
749 f->field_hash_table[h].tail_hash_offset = htole64(offset);
750
751 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
752 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
753
754 return 0;
755 }
756
757 static int journal_file_link_data(
758 JournalFile *f,
759 Object *o,
760 uint64_t offset,
761 uint64_t hash) {
762
763 uint64_t p, h, m;
764 int r;
765
766 assert(f);
767 assert(o);
768 assert(offset > 0);
769
770 if (o->object.type != OBJECT_DATA)
771 return -EINVAL;
772
773 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
774 if (m <= 0)
775 return -EBADMSG;
776
777 /* This might alter the window we are looking at */
778 o->data.next_hash_offset = o->data.next_field_offset = 0;
779 o->data.entry_offset = o->data.entry_array_offset = 0;
780 o->data.n_entries = 0;
781
782 h = hash % m;
783 p = le64toh(f->data_hash_table[h].tail_hash_offset);
784 if (p == 0)
785 /* Only entry in the hash table is easy */
786 f->data_hash_table[h].head_hash_offset = htole64(offset);
787 else {
788 /* Move back to the previous data object, to patch in
789 * pointer */
790
791 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
792 if (r < 0)
793 return r;
794
795 o->data.next_hash_offset = htole64(offset);
796 }
797
798 f->data_hash_table[h].tail_hash_offset = htole64(offset);
799
800 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
801 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
802
803 return 0;
804 }
805
806 int journal_file_find_field_object_with_hash(
807 JournalFile *f,
808 const void *field, uint64_t size, uint64_t hash,
809 Object **ret, uint64_t *offset) {
810
811 uint64_t p, osize, h, m;
812 int r;
813
814 assert(f);
815 assert(field && size > 0);
816
817 /* If the field hash table is empty, we can't find anything */
818 if (le64toh(f->header->field_hash_table_size) <= 0)
819 return 0;
820
821 /* Map the field hash table, if it isn't mapped yet. */
822 r = journal_file_map_field_hash_table(f);
823 if (r < 0)
824 return r;
825
826 osize = offsetof(Object, field.payload) + size;
827
828 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
829 if (m <= 0)
830 return -EBADMSG;
831
832 h = hash % m;
833 p = le64toh(f->field_hash_table[h].head_hash_offset);
834
835 while (p > 0) {
836 Object *o;
837
838 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
839 if (r < 0)
840 return r;
841
842 if (le64toh(o->field.hash) == hash &&
843 le64toh(o->object.size) == osize &&
844 memcmp(o->field.payload, field, size) == 0) {
845
846 if (ret)
847 *ret = o;
848 if (offset)
849 *offset = p;
850
851 return 1;
852 }
853
854 p = le64toh(o->field.next_hash_offset);
855 }
856
857 return 0;
858 }
859
860 int journal_file_find_field_object(
861 JournalFile *f,
862 const void *field, uint64_t size,
863 Object **ret, uint64_t *offset) {
864
865 uint64_t hash;
866
867 assert(f);
868 assert(field && size > 0);
869
870 hash = hash64(field, size);
871
872 return journal_file_find_field_object_with_hash(f,
873 field, size, hash,
874 ret, offset);
875 }
876
877 int journal_file_find_data_object_with_hash(
878 JournalFile *f,
879 const void *data, uint64_t size, uint64_t hash,
880 Object **ret, uint64_t *offset) {
881
882 uint64_t p, osize, h, m;
883 int r;
884
885 assert(f);
886 assert(data || size == 0);
887
888 /* If there's no data hash table, then there's no entry. */
889 if (le64toh(f->header->data_hash_table_size) <= 0)
890 return 0;
891
892 /* Map the data hash table, if it isn't mapped yet. */
893 r = journal_file_map_data_hash_table(f);
894 if (r < 0)
895 return r;
896
897 osize = offsetof(Object, data.payload) + size;
898
899 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
900 if (m <= 0)
901 return -EBADMSG;
902
903 h = hash % m;
904 p = le64toh(f->data_hash_table[h].head_hash_offset);
905
906 while (p > 0) {
907 Object *o;
908
909 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
910 if (r < 0)
911 return r;
912
913 if (le64toh(o->data.hash) != hash)
914 goto next;
915
916 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
917 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
918 uint64_t l;
919 size_t rsize = 0;
920
921 l = le64toh(o->object.size);
922 if (l <= offsetof(Object, data.payload))
923 return -EBADMSG;
924
925 l -= offsetof(Object, data.payload);
926
927 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
928 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
929 if (r < 0)
930 return r;
931
932 if (rsize == size &&
933 memcmp(f->compress_buffer, data, size) == 0) {
934
935 if (ret)
936 *ret = o;
937
938 if (offset)
939 *offset = p;
940
941 return 1;
942 }
943 #else
944 return -EPROTONOSUPPORT;
945 #endif
946 } else if (le64toh(o->object.size) == osize &&
947 memcmp(o->data.payload, data, size) == 0) {
948
949 if (ret)
950 *ret = o;
951
952 if (offset)
953 *offset = p;
954
955 return 1;
956 }
957
958 next:
959 p = le64toh(o->data.next_hash_offset);
960 }
961
962 return 0;
963 }
964
965 int journal_file_find_data_object(
966 JournalFile *f,
967 const void *data, uint64_t size,
968 Object **ret, uint64_t *offset) {
969
970 uint64_t hash;
971
972 assert(f);
973 assert(data || size == 0);
974
975 hash = hash64(data, size);
976
977 return journal_file_find_data_object_with_hash(f,
978 data, size, hash,
979 ret, offset);
980 }
981
982 static int journal_file_append_field(
983 JournalFile *f,
984 const void *field, uint64_t size,
985 Object **ret, uint64_t *offset) {
986
987 uint64_t hash, p;
988 uint64_t osize;
989 Object *o;
990 int r;
991
992 assert(f);
993 assert(field && size > 0);
994
995 hash = hash64(field, size);
996
997 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
998 if (r < 0)
999 return r;
1000 else if (r > 0) {
1001
1002 if (ret)
1003 *ret = o;
1004
1005 if (offset)
1006 *offset = p;
1007
1008 return 0;
1009 }
1010
1011 osize = offsetof(Object, field.payload) + size;
1012 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1013 if (r < 0)
1014 return r;
1015
1016 o->field.hash = htole64(hash);
1017 memcpy(o->field.payload, field, size);
1018
1019 r = journal_file_link_field(f, o, p, hash);
1020 if (r < 0)
1021 return r;
1022
1023 /* The linking might have altered the window, so let's
1024 * refresh our pointer */
1025 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1026 if (r < 0)
1027 return r;
1028
1029 #ifdef HAVE_GCRYPT
1030 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1031 if (r < 0)
1032 return r;
1033 #endif
1034
1035 if (ret)
1036 *ret = o;
1037
1038 if (offset)
1039 *offset = p;
1040
1041 return 0;
1042 }
1043
1044 static int journal_file_append_data(
1045 JournalFile *f,
1046 const void *data, uint64_t size,
1047 Object **ret, uint64_t *offset) {
1048
1049 uint64_t hash, p;
1050 uint64_t osize;
1051 Object *o;
1052 int r, compression = 0;
1053 const void *eq;
1054
1055 assert(f);
1056 assert(data || size == 0);
1057
1058 hash = hash64(data, size);
1059
1060 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1061 if (r < 0)
1062 return r;
1063 if (r > 0) {
1064
1065 if (ret)
1066 *ret = o;
1067
1068 if (offset)
1069 *offset = p;
1070
1071 return 0;
1072 }
1073
1074 osize = offsetof(Object, data.payload) + size;
1075 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1076 if (r < 0)
1077 return r;
1078
1079 o->data.hash = htole64(hash);
1080
1081 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1082 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1083 size_t rsize = 0;
1084
1085 compression = compress_blob(data, size, o->data.payload, &rsize);
1086
1087 if (compression >= 0) {
1088 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1089 o->object.flags |= compression;
1090
1091 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1092 size, rsize, object_compressed_to_string(compression));
1093 } else
1094 /* Compression didn't work, we don't really care why, let's continue without compression */
1095 compression = 0;
1096 }
1097 #endif
1098
1099 if (compression == 0 && size > 0)
1100 memcpy(o->data.payload, data, size);
1101
1102 r = journal_file_link_data(f, o, p, hash);
1103 if (r < 0)
1104 return r;
1105
1106 /* The linking might have altered the window, so let's
1107 * refresh our pointer */
1108 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1109 if (r < 0)
1110 return r;
1111
1112 if (!data)
1113 eq = NULL;
1114 else
1115 eq = memchr(data, '=', size);
1116 if (eq && eq > data) {
1117 Object *fo = NULL;
1118 uint64_t fp;
1119
1120 /* Create field object ... */
1121 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1122 if (r < 0)
1123 return r;
1124
1125 /* ... and link it in. */
1126 o->data.next_field_offset = fo->field.head_data_offset;
1127 fo->field.head_data_offset = le64toh(p);
1128 }
1129
1130 #ifdef HAVE_GCRYPT
1131 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1132 if (r < 0)
1133 return r;
1134 #endif
1135
1136 if (ret)
1137 *ret = o;
1138
1139 if (offset)
1140 *offset = p;
1141
1142 return 0;
1143 }
1144
1145 uint64_t journal_file_entry_n_items(Object *o) {
1146 assert(o);
1147
1148 if (o->object.type != OBJECT_ENTRY)
1149 return 0;
1150
1151 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1152 }
1153
1154 uint64_t journal_file_entry_array_n_items(Object *o) {
1155 assert(o);
1156
1157 if (o->object.type != OBJECT_ENTRY_ARRAY)
1158 return 0;
1159
1160 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1161 }
1162
1163 uint64_t journal_file_hash_table_n_items(Object *o) {
1164 assert(o);
1165
1166 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1167 o->object.type != OBJECT_FIELD_HASH_TABLE)
1168 return 0;
1169
1170 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1171 }
1172
1173 static int link_entry_into_array(JournalFile *f,
1174 le64_t *first,
1175 le64_t *idx,
1176 uint64_t p) {
1177 int r;
1178 uint64_t n = 0, ap = 0, q, i, a, hidx;
1179 Object *o;
1180
1181 assert(f);
1182 assert(first);
1183 assert(idx);
1184 assert(p > 0);
1185
1186 a = le64toh(*first);
1187 i = hidx = le64toh(*idx);
1188 while (a > 0) {
1189
1190 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1191 if (r < 0)
1192 return r;
1193
1194 n = journal_file_entry_array_n_items(o);
1195 if (i < n) {
1196 o->entry_array.items[i] = htole64(p);
1197 *idx = htole64(hidx + 1);
1198 return 0;
1199 }
1200
1201 i -= n;
1202 ap = a;
1203 a = le64toh(o->entry_array.next_entry_array_offset);
1204 }
1205
1206 if (hidx > n)
1207 n = (hidx+1) * 2;
1208 else
1209 n = n * 2;
1210
1211 if (n < 4)
1212 n = 4;
1213
1214 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1215 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1216 &o, &q);
1217 if (r < 0)
1218 return r;
1219
1220 #ifdef HAVE_GCRYPT
1221 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1222 if (r < 0)
1223 return r;
1224 #endif
1225
1226 o->entry_array.items[i] = htole64(p);
1227
1228 if (ap == 0)
1229 *first = htole64(q);
1230 else {
1231 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1232 if (r < 0)
1233 return r;
1234
1235 o->entry_array.next_entry_array_offset = htole64(q);
1236 }
1237
1238 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1239 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1240
1241 *idx = htole64(hidx + 1);
1242
1243 return 0;
1244 }
1245
1246 static int link_entry_into_array_plus_one(JournalFile *f,
1247 le64_t *extra,
1248 le64_t *first,
1249 le64_t *idx,
1250 uint64_t p) {
1251
1252 int r;
1253
1254 assert(f);
1255 assert(extra);
1256 assert(first);
1257 assert(idx);
1258 assert(p > 0);
1259
1260 if (*idx == 0)
1261 *extra = htole64(p);
1262 else {
1263 le64_t i;
1264
1265 i = htole64(le64toh(*idx) - 1);
1266 r = link_entry_into_array(f, first, &i, p);
1267 if (r < 0)
1268 return r;
1269 }
1270
1271 *idx = htole64(le64toh(*idx) + 1);
1272 return 0;
1273 }
1274
1275 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1276 uint64_t p;
1277 int r;
1278 assert(f);
1279 assert(o);
1280 assert(offset > 0);
1281
1282 p = le64toh(o->entry.items[i].object_offset);
1283 if (p == 0)
1284 return -EINVAL;
1285
1286 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1287 if (r < 0)
1288 return r;
1289
1290 return link_entry_into_array_plus_one(f,
1291 &o->data.entry_offset,
1292 &o->data.entry_array_offset,
1293 &o->data.n_entries,
1294 offset);
1295 }
1296
1297 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1298 uint64_t n, i;
1299 int r;
1300
1301 assert(f);
1302 assert(o);
1303 assert(offset > 0);
1304
1305 if (o->object.type != OBJECT_ENTRY)
1306 return -EINVAL;
1307
1308 __sync_synchronize();
1309
1310 /* Link up the entry itself */
1311 r = link_entry_into_array(f,
1312 &f->header->entry_array_offset,
1313 &f->header->n_entries,
1314 offset);
1315 if (r < 0)
1316 return r;
1317
1318 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1319
1320 if (f->header->head_entry_realtime == 0)
1321 f->header->head_entry_realtime = o->entry.realtime;
1322
1323 f->header->tail_entry_realtime = o->entry.realtime;
1324 f->header->tail_entry_monotonic = o->entry.monotonic;
1325
1326 f->tail_entry_monotonic_valid = true;
1327
1328 /* Link up the items */
1329 n = journal_file_entry_n_items(o);
1330 for (i = 0; i < n; i++) {
1331 r = journal_file_link_entry_item(f, o, offset, i);
1332 if (r < 0)
1333 return r;
1334 }
1335
1336 return 0;
1337 }
1338
1339 static int journal_file_append_entry_internal(
1340 JournalFile *f,
1341 const dual_timestamp *ts,
1342 uint64_t xor_hash,
1343 const EntryItem items[], unsigned n_items,
1344 uint64_t *seqnum,
1345 Object **ret, uint64_t *offset) {
1346 uint64_t np;
1347 uint64_t osize;
1348 Object *o;
1349 int r;
1350
1351 assert(f);
1352 assert(items || n_items == 0);
1353 assert(ts);
1354
1355 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1356
1357 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1358 if (r < 0)
1359 return r;
1360
1361 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1362 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1363 o->entry.realtime = htole64(ts->realtime);
1364 o->entry.monotonic = htole64(ts->monotonic);
1365 o->entry.xor_hash = htole64(xor_hash);
1366 o->entry.boot_id = f->header->boot_id;
1367
1368 #ifdef HAVE_GCRYPT
1369 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1370 if (r < 0)
1371 return r;
1372 #endif
1373
1374 r = journal_file_link_entry(f, o, np);
1375 if (r < 0)
1376 return r;
1377
1378 if (ret)
1379 *ret = o;
1380
1381 if (offset)
1382 *offset = np;
1383
1384 return 0;
1385 }
1386
1387 void journal_file_post_change(JournalFile *f) {
1388 assert(f);
1389
1390 /* inotify() does not receive IN_MODIFY events from file
1391 * accesses done via mmap(). After each access we hence
1392 * trigger IN_MODIFY by truncating the journal file to its
1393 * current size which triggers IN_MODIFY. */
1394
1395 __sync_synchronize();
1396
1397 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1398 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1399 }
1400
1401 static int entry_item_cmp(const void *_a, const void *_b) {
1402 const EntryItem *a = _a, *b = _b;
1403
1404 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1405 return -1;
1406 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1407 return 1;
1408 return 0;
1409 }
1410
1411 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1412 unsigned i;
1413 EntryItem *items;
1414 int r;
1415 uint64_t xor_hash = 0;
1416 struct dual_timestamp _ts;
1417
1418 assert(f);
1419 assert(iovec || n_iovec == 0);
1420
1421 if (!ts) {
1422 dual_timestamp_get(&_ts);
1423 ts = &_ts;
1424 }
1425
1426 if (f->tail_entry_monotonic_valid &&
1427 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1428 return -EINVAL;
1429
1430 #ifdef HAVE_GCRYPT
1431 r = journal_file_maybe_append_tag(f, ts->realtime);
1432 if (r < 0)
1433 return r;
1434 #endif
1435
1436 /* alloca() can't take 0, hence let's allocate at least one */
1437 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1438
1439 for (i = 0; i < n_iovec; i++) {
1440 uint64_t p;
1441 Object *o;
1442
1443 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1444 if (r < 0)
1445 return r;
1446
1447 xor_hash ^= le64toh(o->data.hash);
1448 items[i].object_offset = htole64(p);
1449 items[i].hash = o->data.hash;
1450 }
1451
1452 /* Order by the position on disk, in order to improve seek
1453 * times for rotating media. */
1454 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1455
1456 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1457
1458 /* If the memory mapping triggered a SIGBUS then we return an
1459 * IO error and ignore the error code passed down to us, since
1460 * it is very likely just an effect of a nullified replacement
1461 * mapping page */
1462
1463 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1464 r = -EIO;
1465
1466 journal_file_post_change(f);
1467
1468 return r;
1469 }
1470
/* One cached position inside a chain of entry array objects */
typedef struct ChainCacheItem {
        /* Offset of the array at the head of the chain; used as cache key */
        uint64_t first;

        /* Offset of the cached array */
        uint64_t array;

        /* First item stored in the cached array */
        uint64_t begin;

        /* Total number of items in all arrays of the chain preceding the cached one */
        uint64_t total;

        /* Index looked at last, used to improve locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1478
1479 static void chain_cache_put(
1480 OrderedHashmap *h,
1481 ChainCacheItem *ci,
1482 uint64_t first,
1483 uint64_t array,
1484 uint64_t begin,
1485 uint64_t total,
1486 uint64_t last_index) {
1487
1488 if (!ci) {
1489 /* If the chain item to cache for this chain is the
1490 * first one it's not worth caching anything */
1491 if (array == first)
1492 return;
1493
1494 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1495 ci = ordered_hashmap_steal_first(h);
1496 assert(ci);
1497 } else {
1498 ci = new(ChainCacheItem, 1);
1499 if (!ci)
1500 return;
1501 }
1502
1503 ci->first = first;
1504
1505 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1506 free(ci);
1507 return;
1508 }
1509 } else
1510 assert(ci->first == first);
1511
1512 ci->array = array;
1513 ci->begin = begin;
1514 ci->total = total;
1515 ci->last_index = last_index;
1516 }
1517
1518 static int generic_array_get(
1519 JournalFile *f,
1520 uint64_t first,
1521 uint64_t i,
1522 Object **ret, uint64_t *offset) {
1523
1524 Object *o;
1525 uint64_t p = 0, a, t = 0;
1526 int r;
1527 ChainCacheItem *ci;
1528
1529 assert(f);
1530
1531 a = first;
1532
1533 /* Try the chain cache first */
1534 ci = ordered_hashmap_get(f->chain_cache, &first);
1535 if (ci && i > ci->total) {
1536 a = ci->array;
1537 i -= ci->total;
1538 t = ci->total;
1539 }
1540
1541 while (a > 0) {
1542 uint64_t k;
1543
1544 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1545 if (r < 0)
1546 return r;
1547
1548 k = journal_file_entry_array_n_items(o);
1549 if (i < k) {
1550 p = le64toh(o->entry_array.items[i]);
1551 goto found;
1552 }
1553
1554 i -= k;
1555 t += k;
1556 a = le64toh(o->entry_array.next_entry_array_offset);
1557 }
1558
1559 return 0;
1560
1561 found:
1562 /* Let's cache this item for the next invocation */
1563 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1564
1565 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1566 if (r < 0)
1567 return r;
1568
1569 if (ret)
1570 *ret = o;
1571
1572 if (offset)
1573 *offset = p;
1574
1575 return 1;
1576 }
1577
1578 static int generic_array_get_plus_one(
1579 JournalFile *f,
1580 uint64_t extra,
1581 uint64_t first,
1582 uint64_t i,
1583 Object **ret, uint64_t *offset) {
1584
1585 Object *o;
1586
1587 assert(f);
1588
1589 if (i == 0) {
1590 int r;
1591
1592 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1593 if (r < 0)
1594 return r;
1595
1596 if (ret)
1597 *ret = o;
1598
1599 if (offset)
1600 *offset = extra;
1601
1602 return 1;
1603 }
1604
1605 return generic_array_get(f, first, i-1, ret, offset);
1606 }
1607
/* Results of the test_object callbacks used when bisecting */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
1613
1614 static int generic_array_bisect(
1615 JournalFile *f,
1616 uint64_t first,
1617 uint64_t n,
1618 uint64_t needle,
1619 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1620 direction_t direction,
1621 Object **ret,
1622 uint64_t *offset,
1623 uint64_t *idx) {
1624
1625 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1626 bool subtract_one = false;
1627 Object *o, *array = NULL;
1628 int r;
1629 ChainCacheItem *ci;
1630
1631 assert(f);
1632 assert(test_object);
1633
1634 /* Start with the first array in the chain */
1635 a = first;
1636
1637 ci = ordered_hashmap_get(f->chain_cache, &first);
1638 if (ci && n > ci->total) {
1639 /* Ah, we have iterated this bisection array chain
1640 * previously! Let's see if we can skip ahead in the
1641 * chain, as far as the last time. But we can't jump
1642 * backwards in the chain, so let's check that
1643 * first. */
1644
1645 r = test_object(f, ci->begin, needle);
1646 if (r < 0)
1647 return r;
1648
1649 if (r == TEST_LEFT) {
1650 /* OK, what we are looking for is right of the
1651 * begin of this EntryArray, so let's jump
1652 * straight to previously cached array in the
1653 * chain */
1654
1655 a = ci->array;
1656 n -= ci->total;
1657 t = ci->total;
1658 last_index = ci->last_index;
1659 }
1660 }
1661
1662 while (a > 0) {
1663 uint64_t left, right, k, lp;
1664
1665 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1666 if (r < 0)
1667 return r;
1668
1669 k = journal_file_entry_array_n_items(array);
1670 right = MIN(k, n);
1671 if (right <= 0)
1672 return 0;
1673
1674 i = right - 1;
1675 lp = p = le64toh(array->entry_array.items[i]);
1676 if (p <= 0)
1677 return -EBADMSG;
1678
1679 r = test_object(f, p, needle);
1680 if (r < 0)
1681 return r;
1682
1683 if (r == TEST_FOUND)
1684 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1685
1686 if (r == TEST_RIGHT) {
1687 left = 0;
1688 right -= 1;
1689
1690 if (last_index != (uint64_t) -1) {
1691 assert(last_index <= right);
1692
1693 /* If we cached the last index we
1694 * looked at, let's try to not to jump
1695 * too wildly around and see if we can
1696 * limit the range to look at early to
1697 * the immediate neighbors of the last
1698 * index we looked at. */
1699
1700 if (last_index > 0) {
1701 uint64_t x = last_index - 1;
1702
1703 p = le64toh(array->entry_array.items[x]);
1704 if (p <= 0)
1705 return -EBADMSG;
1706
1707 r = test_object(f, p, needle);
1708 if (r < 0)
1709 return r;
1710
1711 if (r == TEST_FOUND)
1712 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1713
1714 if (r == TEST_RIGHT)
1715 right = x;
1716 else
1717 left = x + 1;
1718 }
1719
1720 if (last_index < right) {
1721 uint64_t y = last_index + 1;
1722
1723 p = le64toh(array->entry_array.items[y]);
1724 if (p <= 0)
1725 return -EBADMSG;
1726
1727 r = test_object(f, p, needle);
1728 if (r < 0)
1729 return r;
1730
1731 if (r == TEST_FOUND)
1732 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1733
1734 if (r == TEST_RIGHT)
1735 right = y;
1736 else
1737 left = y + 1;
1738 }
1739 }
1740
1741 for (;;) {
1742 if (left == right) {
1743 if (direction == DIRECTION_UP)
1744 subtract_one = true;
1745
1746 i = left;
1747 goto found;
1748 }
1749
1750 assert(left < right);
1751 i = (left + right) / 2;
1752
1753 p = le64toh(array->entry_array.items[i]);
1754 if (p <= 0)
1755 return -EBADMSG;
1756
1757 r = test_object(f, p, needle);
1758 if (r < 0)
1759 return r;
1760
1761 if (r == TEST_FOUND)
1762 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1763
1764 if (r == TEST_RIGHT)
1765 right = i;
1766 else
1767 left = i + 1;
1768 }
1769 }
1770
1771 if (k >= n) {
1772 if (direction == DIRECTION_UP) {
1773 i = n;
1774 subtract_one = true;
1775 goto found;
1776 }
1777
1778 return 0;
1779 }
1780
1781 last_p = lp;
1782
1783 n -= k;
1784 t += k;
1785 last_index = (uint64_t) -1;
1786 a = le64toh(array->entry_array.next_entry_array_offset);
1787 }
1788
1789 return 0;
1790
1791 found:
1792 if (subtract_one && t == 0 && i == 0)
1793 return 0;
1794
1795 /* Let's cache this item for the next invocation */
1796 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1797
1798 if (subtract_one && i == 0)
1799 p = last_p;
1800 else if (subtract_one)
1801 p = le64toh(array->entry_array.items[i-1]);
1802 else
1803 p = le64toh(array->entry_array.items[i]);
1804
1805 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1806 if (r < 0)
1807 return r;
1808
1809 if (ret)
1810 *ret = o;
1811
1812 if (offset)
1813 *offset = p;
1814
1815 if (idx)
1816 *idx = t + i + (subtract_one ? -1 : 0);
1817
1818 return 1;
1819 }
1820
1821 static int generic_array_bisect_plus_one(
1822 JournalFile *f,
1823 uint64_t extra,
1824 uint64_t first,
1825 uint64_t n,
1826 uint64_t needle,
1827 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1828 direction_t direction,
1829 Object **ret,
1830 uint64_t *offset,
1831 uint64_t *idx) {
1832
1833 int r;
1834 bool step_back = false;
1835 Object *o;
1836
1837 assert(f);
1838 assert(test_object);
1839
1840 if (n <= 0)
1841 return 0;
1842
1843 /* This bisects the array in object 'first', but first checks
1844 * an extra */
1845 r = test_object(f, extra, needle);
1846 if (r < 0)
1847 return r;
1848
1849 if (r == TEST_FOUND)
1850 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1851
1852 /* if we are looking with DIRECTION_UP then we need to first
1853 see if in the actual array there is a matching entry, and
1854 return the last one of that. But if there isn't any we need
1855 to return this one. Hence remember this, and return it
1856 below. */
1857 if (r == TEST_LEFT)
1858 step_back = direction == DIRECTION_UP;
1859
1860 if (r == TEST_RIGHT) {
1861 if (direction == DIRECTION_DOWN)
1862 goto found;
1863 else
1864 return 0;
1865 }
1866
1867 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1868
1869 if (r == 0 && step_back)
1870 goto found;
1871
1872 if (r > 0 && idx)
1873 (*idx) ++;
1874
1875 return r;
1876
1877 found:
1878 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1879 if (r < 0)
1880 return r;
1881
1882 if (ret)
1883 *ret = o;
1884
1885 if (offset)
1886 *offset = extra;
1887
1888 if (idx)
1889 *idx = 0;
1890
1891 return 1;
1892 }
1893
1894 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1895 assert(f);
1896 assert(p > 0);
1897
1898 if (p == needle)
1899 return TEST_FOUND;
1900 else if (p < needle)
1901 return TEST_LEFT;
1902 else
1903 return TEST_RIGHT;
1904 }
1905
1906 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1907 Object *o;
1908 int r;
1909
1910 assert(f);
1911 assert(p > 0);
1912
1913 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1914 if (r < 0)
1915 return r;
1916
1917 if (le64toh(o->entry.seqnum) == needle)
1918 return TEST_FOUND;
1919 else if (le64toh(o->entry.seqnum) < needle)
1920 return TEST_LEFT;
1921 else
1922 return TEST_RIGHT;
1923 }
1924
1925 int journal_file_move_to_entry_by_seqnum(
1926 JournalFile *f,
1927 uint64_t seqnum,
1928 direction_t direction,
1929 Object **ret,
1930 uint64_t *offset) {
1931
1932 return generic_array_bisect(f,
1933 le64toh(f->header->entry_array_offset),
1934 le64toh(f->header->n_entries),
1935 seqnum,
1936 test_object_seqnum,
1937 direction,
1938 ret, offset, NULL);
1939 }
1940
1941 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1942 Object *o;
1943 int r;
1944
1945 assert(f);
1946 assert(p > 0);
1947
1948 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1949 if (r < 0)
1950 return r;
1951
1952 if (le64toh(o->entry.realtime) == needle)
1953 return TEST_FOUND;
1954 else if (le64toh(o->entry.realtime) < needle)
1955 return TEST_LEFT;
1956 else
1957 return TEST_RIGHT;
1958 }
1959
1960 int journal_file_move_to_entry_by_realtime(
1961 JournalFile *f,
1962 uint64_t realtime,
1963 direction_t direction,
1964 Object **ret,
1965 uint64_t *offset) {
1966
1967 return generic_array_bisect(f,
1968 le64toh(f->header->entry_array_offset),
1969 le64toh(f->header->n_entries),
1970 realtime,
1971 test_object_realtime,
1972 direction,
1973 ret, offset, NULL);
1974 }
1975
1976 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1977 Object *o;
1978 int r;
1979
1980 assert(f);
1981 assert(p > 0);
1982
1983 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1984 if (r < 0)
1985 return r;
1986
1987 if (le64toh(o->entry.monotonic) == needle)
1988 return TEST_FOUND;
1989 else if (le64toh(o->entry.monotonic) < needle)
1990 return TEST_LEFT;
1991 else
1992 return TEST_RIGHT;
1993 }
1994
1995 static int find_data_object_by_boot_id(
1996 JournalFile *f,
1997 sd_id128_t boot_id,
1998 Object **o,
1999 uint64_t *b) {
2000
2001 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2002
2003 sd_id128_to_string(boot_id, t + 9);
2004 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2005 }
2006
2007 int journal_file_move_to_entry_by_monotonic(
2008 JournalFile *f,
2009 sd_id128_t boot_id,
2010 uint64_t monotonic,
2011 direction_t direction,
2012 Object **ret,
2013 uint64_t *offset) {
2014
2015 Object *o;
2016 int r;
2017
2018 assert(f);
2019
2020 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2021 if (r < 0)
2022 return r;
2023 if (r == 0)
2024 return -ENOENT;
2025
2026 return generic_array_bisect_plus_one(f,
2027 le64toh(o->data.entry_offset),
2028 le64toh(o->data.entry_array_offset),
2029 le64toh(o->data.n_entries),
2030 monotonic,
2031 test_object_monotonic,
2032 direction,
2033 ret, offset, NULL);
2034 }
2035
2036 void journal_file_reset_location(JournalFile *f) {
2037 f->location_type = LOCATION_HEAD;
2038 f->current_offset = 0;
2039 f->current_seqnum = 0;
2040 f->current_realtime = 0;
2041 f->current_monotonic = 0;
2042 zero(f->current_boot_id);
2043 f->current_xor_hash = 0;
2044 }
2045
2046 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2047 f->location_type = LOCATION_SEEK;
2048 f->current_offset = offset;
2049 f->current_seqnum = le64toh(o->entry.seqnum);
2050 f->current_realtime = le64toh(o->entry.realtime);
2051 f->current_monotonic = le64toh(o->entry.monotonic);
2052 f->current_boot_id = o->entry.boot_id;
2053 f->current_xor_hash = le64toh(o->entry.xor_hash);
2054 }
2055
2056 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2057 assert(af);
2058 assert(bf);
2059 assert(af->location_type == LOCATION_SEEK);
2060 assert(bf->location_type == LOCATION_SEEK);
2061
2062 /* If contents and timestamps match, these entries are
2063 * identical, even if the seqnum does not match */
2064 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2065 af->current_monotonic == bf->current_monotonic &&
2066 af->current_realtime == bf->current_realtime &&
2067 af->current_xor_hash == bf->current_xor_hash)
2068 return 0;
2069
2070 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2071
2072 /* If this is from the same seqnum source, compare
2073 * seqnums */
2074 if (af->current_seqnum < bf->current_seqnum)
2075 return -1;
2076 if (af->current_seqnum > bf->current_seqnum)
2077 return 1;
2078
2079 /* Wow! This is weird, different data but the same
2080 * seqnums? Something is borked, but let's make the
2081 * best of it and compare by time. */
2082 }
2083
2084 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2085
2086 /* If the boot id matches, compare monotonic time */
2087 if (af->current_monotonic < bf->current_monotonic)
2088 return -1;
2089 if (af->current_monotonic > bf->current_monotonic)
2090 return 1;
2091 }
2092
2093 /* Otherwise, compare UTC time */
2094 if (af->current_realtime < bf->current_realtime)
2095 return -1;
2096 if (af->current_realtime > bf->current_realtime)
2097 return 1;
2098
2099 /* Finally, compare by contents */
2100 if (af->current_xor_hash < bf->current_xor_hash)
2101 return -1;
2102 if (af->current_xor_hash > bf->current_xor_hash)
2103 return 1;
2104
2105 return 0;
2106 }
2107
2108 int journal_file_next_entry(
2109 JournalFile *f,
2110 uint64_t p,
2111 direction_t direction,
2112 Object **ret, uint64_t *offset) {
2113
2114 uint64_t i, n, ofs;
2115 int r;
2116
2117 assert(f);
2118
2119 n = le64toh(f->header->n_entries);
2120 if (n <= 0)
2121 return 0;
2122
2123 if (p == 0)
2124 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2125 else {
2126 r = generic_array_bisect(f,
2127 le64toh(f->header->entry_array_offset),
2128 le64toh(f->header->n_entries),
2129 p,
2130 test_object_offset,
2131 DIRECTION_DOWN,
2132 NULL, NULL,
2133 &i);
2134 if (r <= 0)
2135 return r;
2136
2137 if (direction == DIRECTION_DOWN) {
2138 if (i >= n - 1)
2139 return 0;
2140
2141 i++;
2142 } else {
2143 if (i <= 0)
2144 return 0;
2145
2146 i--;
2147 }
2148 }
2149
2150 /* And jump to it */
2151 r = generic_array_get(f,
2152 le64toh(f->header->entry_array_offset),
2153 i,
2154 ret, &ofs);
2155 if (r <= 0)
2156 return r;
2157
2158 if (p > 0 &&
2159 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2160 log_debug("%s: entry array corrupted at entry %"PRIu64,
2161 f->path, i);
2162 return -EBADMSG;
2163 }
2164
2165 if (offset)
2166 *offset = ofs;
2167
2168 return 1;
2169 }
2170
2171 int journal_file_next_entry_for_data(
2172 JournalFile *f,
2173 Object *o, uint64_t p,
2174 uint64_t data_offset,
2175 direction_t direction,
2176 Object **ret, uint64_t *offset) {
2177
2178 uint64_t n, i;
2179 int r;
2180 Object *d;
2181
2182 assert(f);
2183 assert(p > 0 || !o);
2184
2185 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2186 if (r < 0)
2187 return r;
2188
2189 n = le64toh(d->data.n_entries);
2190 if (n <= 0)
2191 return n;
2192
2193 if (!o)
2194 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2195 else {
2196 if (o->object.type != OBJECT_ENTRY)
2197 return -EINVAL;
2198
2199 r = generic_array_bisect_plus_one(f,
2200 le64toh(d->data.entry_offset),
2201 le64toh(d->data.entry_array_offset),
2202 le64toh(d->data.n_entries),
2203 p,
2204 test_object_offset,
2205 DIRECTION_DOWN,
2206 NULL, NULL,
2207 &i);
2208
2209 if (r <= 0)
2210 return r;
2211
2212 if (direction == DIRECTION_DOWN) {
2213 if (i >= n - 1)
2214 return 0;
2215
2216 i++;
2217 } else {
2218 if (i <= 0)
2219 return 0;
2220
2221 i--;
2222 }
2223
2224 }
2225
2226 return generic_array_get_plus_one(f,
2227 le64toh(d->data.entry_offset),
2228 le64toh(d->data.entry_array_offset),
2229 i,
2230 ret, offset);
2231 }
2232
2233 int journal_file_move_to_entry_by_offset_for_data(
2234 JournalFile *f,
2235 uint64_t data_offset,
2236 uint64_t p,
2237 direction_t direction,
2238 Object **ret, uint64_t *offset) {
2239
2240 int r;
2241 Object *d;
2242
2243 assert(f);
2244
2245 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2246 if (r < 0)
2247 return r;
2248
2249 return generic_array_bisect_plus_one(f,
2250 le64toh(d->data.entry_offset),
2251 le64toh(d->data.entry_array_offset),
2252 le64toh(d->data.n_entries),
2253 p,
2254 test_object_offset,
2255 direction,
2256 ret, offset, NULL);
2257 }
2258
2259 int journal_file_move_to_entry_by_monotonic_for_data(
2260 JournalFile *f,
2261 uint64_t data_offset,
2262 sd_id128_t boot_id,
2263 uint64_t monotonic,
2264 direction_t direction,
2265 Object **ret, uint64_t *offset) {
2266
2267 Object *o, *d;
2268 int r;
2269 uint64_t b, z;
2270
2271 assert(f);
2272
2273 /* First, seek by time */
2274 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2275 if (r < 0)
2276 return r;
2277 if (r == 0)
2278 return -ENOENT;
2279
2280 r = generic_array_bisect_plus_one(f,
2281 le64toh(o->data.entry_offset),
2282 le64toh(o->data.entry_array_offset),
2283 le64toh(o->data.n_entries),
2284 monotonic,
2285 test_object_monotonic,
2286 direction,
2287 NULL, &z, NULL);
2288 if (r <= 0)
2289 return r;
2290
2291 /* And now, continue seeking until we find an entry that
2292 * exists in both bisection arrays */
2293
2294 for (;;) {
2295 Object *qo;
2296 uint64_t p, q;
2297
2298 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2299 if (r < 0)
2300 return r;
2301
2302 r = generic_array_bisect_plus_one(f,
2303 le64toh(d->data.entry_offset),
2304 le64toh(d->data.entry_array_offset),
2305 le64toh(d->data.n_entries),
2306 z,
2307 test_object_offset,
2308 direction,
2309 NULL, &p, NULL);
2310 if (r <= 0)
2311 return r;
2312
2313 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2314 if (r < 0)
2315 return r;
2316
2317 r = generic_array_bisect_plus_one(f,
2318 le64toh(o->data.entry_offset),
2319 le64toh(o->data.entry_array_offset),
2320 le64toh(o->data.n_entries),
2321 p,
2322 test_object_offset,
2323 direction,
2324 &qo, &q, NULL);
2325
2326 if (r <= 0)
2327 return r;
2328
2329 if (p == q) {
2330 if (ret)
2331 *ret = qo;
2332 if (offset)
2333 *offset = q;
2334
2335 return 1;
2336 }
2337
2338 z = q;
2339 }
2340 }
2341
2342 int journal_file_move_to_entry_by_seqnum_for_data(
2343 JournalFile *f,
2344 uint64_t data_offset,
2345 uint64_t seqnum,
2346 direction_t direction,
2347 Object **ret, uint64_t *offset) {
2348
2349 Object *d;
2350 int r;
2351
2352 assert(f);
2353
2354 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2355 if (r < 0)
2356 return r;
2357
2358 return generic_array_bisect_plus_one(f,
2359 le64toh(d->data.entry_offset),
2360 le64toh(d->data.entry_array_offset),
2361 le64toh(d->data.n_entries),
2362 seqnum,
2363 test_object_seqnum,
2364 direction,
2365 ret, offset, NULL);
2366 }
2367
2368 int journal_file_move_to_entry_by_realtime_for_data(
2369 JournalFile *f,
2370 uint64_t data_offset,
2371 uint64_t realtime,
2372 direction_t direction,
2373 Object **ret, uint64_t *offset) {
2374
2375 Object *d;
2376 int r;
2377
2378 assert(f);
2379
2380 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2381 if (r < 0)
2382 return r;
2383
2384 return generic_array_bisect_plus_one(f,
2385 le64toh(d->data.entry_offset),
2386 le64toh(d->data.entry_array_offset),
2387 le64toh(d->data.n_entries),
2388 realtime,
2389 test_object_realtime,
2390 direction,
2391 ret, offset, NULL);
2392 }
2393
2394 void journal_file_dump(JournalFile *f) {
2395 Object *o;
2396 int r;
2397 uint64_t p;
2398
2399 assert(f);
2400
2401 journal_file_print_header(f);
2402
2403 p = le64toh(f->header->header_size);
2404 while (p != 0) {
2405 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2406 if (r < 0)
2407 goto fail;
2408
2409 switch (o->object.type) {
2410
2411 case OBJECT_UNUSED:
2412 printf("Type: OBJECT_UNUSED\n");
2413 break;
2414
2415 case OBJECT_DATA:
2416 printf("Type: OBJECT_DATA\n");
2417 break;
2418
2419 case OBJECT_FIELD:
2420 printf("Type: OBJECT_FIELD\n");
2421 break;
2422
2423 case OBJECT_ENTRY:
2424 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2425 le64toh(o->entry.seqnum),
2426 le64toh(o->entry.monotonic),
2427 le64toh(o->entry.realtime));
2428 break;
2429
2430 case OBJECT_FIELD_HASH_TABLE:
2431 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2432 break;
2433
2434 case OBJECT_DATA_HASH_TABLE:
2435 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2436 break;
2437
2438 case OBJECT_ENTRY_ARRAY:
2439 printf("Type: OBJECT_ENTRY_ARRAY\n");
2440 break;
2441
2442 case OBJECT_TAG:
2443 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2444 le64toh(o->tag.seqnum),
2445 le64toh(o->tag.epoch));
2446 break;
2447
2448 default:
2449 printf("Type: unknown (%i)\n", o->object.type);
2450 break;
2451 }
2452
2453 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2454 printf("Flags: %s\n",
2455 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2456
2457 if (p == le64toh(f->header->tail_object_offset))
2458 p = 0;
2459 else
2460 p = p + ALIGN64(le64toh(o->object.size));
2461 }
2462
2463 return;
2464 fail:
2465 log_error("File corrupt");
2466 }
2467
2468 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2469 const char *x;
2470
2471 x = format_timestamp(buf, l, t);
2472 if (x)
2473 return x;
2474 return " --- ";
2475 }
2476
2477 void journal_file_print_header(JournalFile *f) {
2478 char a[33], b[33], c[33], d[33];
2479 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2480 struct stat st;
2481 char bytes[FORMAT_BYTES_MAX];
2482
2483 assert(f);
2484
2485 printf("File Path: %s\n"
2486 "File ID: %s\n"
2487 "Machine ID: %s\n"
2488 "Boot ID: %s\n"
2489 "Sequential Number ID: %s\n"
2490 "State: %s\n"
2491 "Compatible Flags:%s%s\n"
2492 "Incompatible Flags:%s%s%s\n"
2493 "Header size: %"PRIu64"\n"
2494 "Arena size: %"PRIu64"\n"
2495 "Data Hash Table Size: %"PRIu64"\n"
2496 "Field Hash Table Size: %"PRIu64"\n"
2497 "Rotate Suggested: %s\n"
2498 "Head Sequential Number: %"PRIu64"\n"
2499 "Tail Sequential Number: %"PRIu64"\n"
2500 "Head Realtime Timestamp: %s\n"
2501 "Tail Realtime Timestamp: %s\n"
2502 "Tail Monotonic Timestamp: %s\n"
2503 "Objects: %"PRIu64"\n"
2504 "Entry Objects: %"PRIu64"\n",
2505 f->path,
2506 sd_id128_to_string(f->header->file_id, a),
2507 sd_id128_to_string(f->header->machine_id, b),
2508 sd_id128_to_string(f->header->boot_id, c),
2509 sd_id128_to_string(f->header->seqnum_id, d),
2510 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2511 f->header->state == STATE_ONLINE ? "ONLINE" :
2512 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2513 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2514 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2515 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2516 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2517 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2518 le64toh(f->header->header_size),
2519 le64toh(f->header->arena_size),
2520 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2521 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2522 yes_no(journal_file_rotate_suggested(f, 0)),
2523 le64toh(f->header->head_entry_seqnum),
2524 le64toh(f->header->tail_entry_seqnum),
2525 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2526 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2527 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2528 le64toh(f->header->n_objects),
2529 le64toh(f->header->n_entries));
2530
2531 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2532 printf("Data Objects: %"PRIu64"\n"
2533 "Data Hash Table Fill: %.1f%%\n",
2534 le64toh(f->header->n_data),
2535 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2536
2537 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2538 printf("Field Objects: %"PRIu64"\n"
2539 "Field Hash Table Fill: %.1f%%\n",
2540 le64toh(f->header->n_fields),
2541 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2542
2543 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2544 printf("Tag Objects: %"PRIu64"\n",
2545 le64toh(f->header->n_tags));
2546 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2547 printf("Entry Array Objects: %"PRIu64"\n",
2548 le64toh(f->header->n_entry_arrays));
2549
2550 if (fstat(f->fd, &st) >= 0)
2551 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2552 }
2553
2554 static int journal_file_warn_btrfs(JournalFile *f) {
2555 unsigned attrs;
2556 int r;
2557
2558 assert(f);
2559
2560 /* Before we write anything, check if the COW logic is turned
2561 * off on btrfs. Given our write pattern that is quite
2562 * unfriendly to COW file systems this should greatly improve
2563 * performance on COW file systems, such as btrfs, at the
2564 * expense of data integrity features (which shouldn't be too
2565 * bad, given that we do our own checksumming). */
2566
2567 r = btrfs_is_filesystem(f->fd);
2568 if (r < 0)
2569 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2570 if (!r)
2571 return 0;
2572
2573 r = read_attr_fd(f->fd, &attrs);
2574 if (r < 0)
2575 return log_warning_errno(r, "Failed to read file attributes: %m");
2576
2577 if (attrs & FS_NOCOW_FL) {
2578 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2579 return 0;
2580 }
2581
2582 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2583 "This is likely to slow down journal access substantially, please consider turning "
2584 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2585
2586 return 1;
2587 }
2588
2589 int journal_file_open(
2590 const char *fname,
2591 int flags,
2592 mode_t mode,
2593 bool compress,
2594 bool seal,
2595 JournalMetrics *metrics,
2596 MMapCache *mmap_cache,
2597 JournalFile *template,
2598 JournalFile **ret) {
2599
2600 bool newly_created = false;
2601 JournalFile *f;
2602 void *h;
2603 int r;
2604
2605 assert(fname);
2606 assert(ret);
2607
2608 if ((flags & O_ACCMODE) != O_RDONLY &&
2609 (flags & O_ACCMODE) != O_RDWR)
2610 return -EINVAL;
2611
2612 if (!endswith(fname, ".journal") &&
2613 !endswith(fname, ".journal~"))
2614 return -EINVAL;
2615
2616 f = new0(JournalFile, 1);
2617 if (!f)
2618 return -ENOMEM;
2619
2620 f->fd = -1;
2621 f->mode = mode;
2622
2623 f->flags = flags;
2624 f->prot = prot_from_flags(flags);
2625 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2626 #if defined(HAVE_LZ4)
2627 f->compress_lz4 = compress;
2628 #elif defined(HAVE_XZ)
2629 f->compress_xz = compress;
2630 #endif
2631 #ifdef HAVE_GCRYPT
2632 f->seal = seal;
2633 #endif
2634
2635 if (mmap_cache)
2636 f->mmap = mmap_cache_ref(mmap_cache);
2637 else {
2638 f->mmap = mmap_cache_new();
2639 if (!f->mmap) {
2640 r = -ENOMEM;
2641 goto fail;
2642 }
2643 }
2644
2645 f->path = strdup(fname);
2646 if (!f->path) {
2647 r = -ENOMEM;
2648 goto fail;
2649 }
2650
2651 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2652 if (!f->chain_cache) {
2653 r = -ENOMEM;
2654 goto fail;
2655 }
2656
2657 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2658 if (f->fd < 0) {
2659 r = -errno;
2660 goto fail;
2661 }
2662
2663 r = journal_file_fstat(f);
2664 if (r < 0)
2665 goto fail;
2666
2667 if (f->last_stat.st_size == 0 && f->writable) {
2668
2669 (void) journal_file_warn_btrfs(f);
2670
2671 /* Let's attach the creation time to the journal file,
2672 * so that the vacuuming code knows the age of this
2673 * file even if the file might end up corrupted one
2674 * day... Ideally we'd just use the creation time many
2675 * file systems maintain for each file, but there is
2676 * currently no usable API to query this, hence let's
2677 * emulate this via extended attributes. If extended
2678 * attributes are not supported we'll just skip this,
2679 * and rely solely on mtime/atime/ctime of the file. */
2680
2681 fd_setcrtime(f->fd, 0);
2682
2683 #ifdef HAVE_GCRYPT
2684 /* Try to load the FSPRG state, and if we can't, then
2685 * just don't do sealing */
2686 if (f->seal) {
2687 r = journal_file_fss_load(f);
2688 if (r < 0)
2689 f->seal = false;
2690 }
2691 #endif
2692
2693 r = journal_file_init_header(f, template);
2694 if (r < 0)
2695 goto fail;
2696
2697 r = journal_file_fstat(f);
2698 if (r < 0)
2699 goto fail;
2700
2701 newly_created = true;
2702 }
2703
2704 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2705 r = -EIO;
2706 goto fail;
2707 }
2708
2709 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2710 if (r < 0)
2711 goto fail;
2712
2713 f->header = h;
2714
2715 if (!newly_created) {
2716 r = journal_file_verify_header(f);
2717 if (r < 0)
2718 goto fail;
2719 }
2720
2721 #ifdef HAVE_GCRYPT
2722 if (!newly_created && f->writable) {
2723 r = journal_file_fss_load(f);
2724 if (r < 0)
2725 goto fail;
2726 }
2727 #endif
2728
2729 if (f->writable) {
2730 if (metrics) {
2731 journal_default_metrics(metrics, f->fd);
2732 f->metrics = *metrics;
2733 } else if (template)
2734 f->metrics = template->metrics;
2735
2736 r = journal_file_refresh_header(f);
2737 if (r < 0)
2738 goto fail;
2739 }
2740
2741 #ifdef HAVE_GCRYPT
2742 r = journal_file_hmac_setup(f);
2743 if (r < 0)
2744 goto fail;
2745 #endif
2746
2747 if (newly_created) {
2748 r = journal_file_setup_field_hash_table(f);
2749 if (r < 0)
2750 goto fail;
2751
2752 r = journal_file_setup_data_hash_table(f);
2753 if (r < 0)
2754 goto fail;
2755
2756 #ifdef HAVE_GCRYPT
2757 r = journal_file_append_first_tag(f);
2758 if (r < 0)
2759 goto fail;
2760 #endif
2761 }
2762
2763 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2764 r = -EIO;
2765 goto fail;
2766 }
2767
2768 *ret = f;
2769 return 0;
2770
2771 fail:
2772 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2773 r = -EIO;
2774
2775 journal_file_close(f);
2776
2777 return r;
2778 }
2779
2780 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2781 _cleanup_free_ char *p = NULL;
2782 size_t l;
2783 JournalFile *old_file, *new_file = NULL;
2784 int r;
2785
2786 assert(f);
2787 assert(*f);
2788
2789 old_file = *f;
2790
2791 if (!old_file->writable)
2792 return -EINVAL;
2793
2794 if (!endswith(old_file->path, ".journal"))
2795 return -EINVAL;
2796
2797 l = strlen(old_file->path);
2798 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2799 (int) l - 8, old_file->path,
2800 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2801 le64toh((*f)->header->head_entry_seqnum),
2802 le64toh((*f)->header->head_entry_realtime));
2803 if (r < 0)
2804 return -ENOMEM;
2805
2806 /* Try to rename the file to the archived version. If the file
2807 * already was deleted, we'll get ENOENT, let's ignore that
2808 * case. */
2809 r = rename(old_file->path, p);
2810 if (r < 0 && errno != ENOENT)
2811 return -errno;
2812
2813 old_file->header->state = STATE_ARCHIVED;
2814
2815 /* Currently, btrfs is not very good with out write patterns
2816 * and fragments heavily. Let's defrag our journal files when
2817 * we archive them */
2818 old_file->defrag_on_close = true;
2819
2820 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2821 journal_file_close(old_file);
2822
2823 *f = new_file;
2824 return r;
2825 }
2826
2827 int journal_file_open_reliably(
2828 const char *fname,
2829 int flags,
2830 mode_t mode,
2831 bool compress,
2832 bool seal,
2833 JournalMetrics *metrics,
2834 MMapCache *mmap_cache,
2835 JournalFile *template,
2836 JournalFile **ret) {
2837
2838 int r;
2839 size_t l;
2840 _cleanup_free_ char *p = NULL;
2841
2842 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2843 if (!IN_SET(r,
2844 -EBADMSG, /* corrupted */
2845 -ENODATA, /* truncated */
2846 -EHOSTDOWN, /* other machine */
2847 -EPROTONOSUPPORT, /* incompatible feature */
2848 -EBUSY, /* unclean shutdown */
2849 -ESHUTDOWN, /* already archived */
2850 -EIO, /* IO error, including SIGBUS on mmap */
2851 -EIDRM /* File has been deleted */))
2852 return r;
2853
2854 if ((flags & O_ACCMODE) == O_RDONLY)
2855 return r;
2856
2857 if (!(flags & O_CREAT))
2858 return r;
2859
2860 if (!endswith(fname, ".journal"))
2861 return r;
2862
2863 /* The file is corrupted. Rotate it away and try it again (but only once) */
2864
2865 l = strlen(fname);
2866 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2867 (int) l - 8, fname,
2868 now(CLOCK_REALTIME),
2869 random_u64()) < 0)
2870 return -ENOMEM;
2871
2872 if (rename(fname, p) < 0)
2873 return -errno;
2874
2875 /* btrfs doesn't cope well with our write pattern and
2876 * fragments heavily. Let's defrag all files we rotate */
2877
2878 (void) chattr_path(p, false, FS_NOCOW_FL);
2879 (void) btrfs_defrag(p);
2880
2881 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2882
2883 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2884 }
2885
2886 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2887 uint64_t i, n;
2888 uint64_t q, xor_hash = 0;
2889 int r;
2890 EntryItem *items;
2891 dual_timestamp ts;
2892
2893 assert(from);
2894 assert(to);
2895 assert(o);
2896 assert(p);
2897
2898 if (!to->writable)
2899 return -EPERM;
2900
2901 ts.monotonic = le64toh(o->entry.monotonic);
2902 ts.realtime = le64toh(o->entry.realtime);
2903
2904 n = journal_file_entry_n_items(o);
2905 /* alloca() can't take 0, hence let's allocate at least one */
2906 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2907
2908 for (i = 0; i < n; i++) {
2909 uint64_t l, h;
2910 le64_t le_hash;
2911 size_t t;
2912 void *data;
2913 Object *u;
2914
2915 q = le64toh(o->entry.items[i].object_offset);
2916 le_hash = o->entry.items[i].hash;
2917
2918 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2919 if (r < 0)
2920 return r;
2921
2922 if (le_hash != o->data.hash)
2923 return -EBADMSG;
2924
2925 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2926 t = (size_t) l;
2927
2928 /* We hit the limit on 32bit machines */
2929 if ((uint64_t) t != l)
2930 return -E2BIG;
2931
2932 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2933 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2934 size_t rsize = 0;
2935
2936 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2937 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2938 if (r < 0)
2939 return r;
2940
2941 data = from->compress_buffer;
2942 l = rsize;
2943 #else
2944 return -EPROTONOSUPPORT;
2945 #endif
2946 } else
2947 data = o->data.payload;
2948
2949 r = journal_file_append_data(to, data, l, &u, &h);
2950 if (r < 0)
2951 return r;
2952
2953 xor_hash ^= le64toh(u->data.hash);
2954 items[i].object_offset = htole64(h);
2955 items[i].hash = u->data.hash;
2956
2957 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2958 if (r < 0)
2959 return r;
2960 }
2961
2962 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2963
2964 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2965 return -EIO;
2966
2967 return r;
2968 }
2969
2970 void journal_reset_metrics(JournalMetrics *m) {
2971 assert(m);
2972
2973 /* Set everything to "pick automatic values". */
2974
2975 *m = (JournalMetrics) {
2976 .min_use = (uint64_t) -1,
2977 .max_use = (uint64_t) -1,
2978 .min_size = (uint64_t) -1,
2979 .max_size = (uint64_t) -1,
2980 .keep_free = (uint64_t) -1,
2981 .n_max_files = (uint64_t) -1,
2982 };
2983 }
2984
2985 void journal_default_metrics(JournalMetrics *m, int fd) {
2986 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
2987 struct statvfs ss;
2988 uint64_t fs_size;
2989
2990 assert(m);
2991 assert(fd >= 0);
2992
2993 if (fstatvfs(fd, &ss) >= 0)
2994 fs_size = ss.f_frsize * ss.f_blocks;
2995 else {
2996 log_debug_errno(errno, "Failed to detremine disk size: %m");
2997 fs_size = 0;
2998 }
2999
3000 if (m->max_use == (uint64_t) -1) {
3001
3002 if (fs_size > 0) {
3003 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3004
3005 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3006 m->max_use = DEFAULT_MAX_USE_UPPER;
3007
3008 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3009 m->max_use = DEFAULT_MAX_USE_LOWER;
3010 } else
3011 m->max_use = DEFAULT_MAX_USE_LOWER;
3012 } else {
3013 m->max_use = PAGE_ALIGN(m->max_use);
3014
3015 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3016 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3017 }
3018
3019 if (m->min_use == (uint64_t) -1)
3020 m->min_use = DEFAULT_MIN_USE;
3021
3022 if (m->min_use > m->max_use)
3023 m->min_use = m->max_use;
3024
3025 if (m->max_size == (uint64_t) -1) {
3026 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3027
3028 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3029 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3030 } else
3031 m->max_size = PAGE_ALIGN(m->max_size);
3032
3033 if (m->max_size != 0) {
3034 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3035 m->max_size = JOURNAL_FILE_SIZE_MIN;
3036
3037 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3038 m->max_use = m->max_size*2;
3039 }
3040
3041 if (m->min_size == (uint64_t) -1)
3042 m->min_size = JOURNAL_FILE_SIZE_MIN;
3043 else {
3044 m->min_size = PAGE_ALIGN(m->min_size);
3045
3046 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3047 m->min_size = JOURNAL_FILE_SIZE_MIN;
3048
3049 if (m->max_size != 0 && m->min_size > m->max_size)
3050 m->max_size = m->min_size;
3051 }
3052
3053 if (m->keep_free == (uint64_t) -1) {
3054
3055 if (fs_size > 0) {
3056 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3057
3058 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3059 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3060
3061 } else
3062 m->keep_free = DEFAULT_KEEP_FREE;
3063 }
3064
3065 if (m->n_max_files == (uint64_t) -1)
3066 m->n_max_files = DEFAULT_N_MAX_FILES;
3067
3068 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3069 format_bytes(a, sizeof(a), m->min_use),
3070 format_bytes(b, sizeof(b), m->max_use),
3071 format_bytes(c, sizeof(c), m->max_size),
3072 format_bytes(d, sizeof(d), m->min_size),
3073 format_bytes(e, sizeof(e), m->keep_free),
3074 m->n_max_files);
3075 }
3076
3077 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3078 assert(f);
3079 assert(from || to);
3080
3081 if (from) {
3082 if (f->header->head_entry_realtime == 0)
3083 return -ENOENT;
3084
3085 *from = le64toh(f->header->head_entry_realtime);
3086 }
3087
3088 if (to) {
3089 if (f->header->tail_entry_realtime == 0)
3090 return -ENOENT;
3091
3092 *to = le64toh(f->header->tail_entry_realtime);
3093 }
3094
3095 return 1;
3096 }
3097
3098 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3099 Object *o;
3100 uint64_t p;
3101 int r;
3102
3103 assert(f);
3104 assert(from || to);
3105
3106 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3107 if (r <= 0)
3108 return r;
3109
3110 if (le64toh(o->data.n_entries) <= 0)
3111 return 0;
3112
3113 if (from) {
3114 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3115 if (r < 0)
3116 return r;
3117
3118 *from = le64toh(o->entry.monotonic);
3119 }
3120
3121 if (to) {
3122 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3123 if (r < 0)
3124 return r;
3125
3126 r = generic_array_get_plus_one(f,
3127 le64toh(o->data.entry_offset),
3128 le64toh(o->data.entry_array_offset),
3129 le64toh(o->data.n_entries)-1,
3130 &o, NULL);
3131 if (r <= 0)
3132 return r;
3133
3134 *to = le64toh(o->entry.monotonic);
3135 }
3136
3137 return 1;
3138 }
3139
3140 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3141 assert(f);
3142
3143 /* If we gained new header fields we gained new features,
3144 * hence suggest a rotation */
3145 if (le64toh(f->header->header_size) < sizeof(Header)) {
3146 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3147 return true;
3148 }
3149
3150 /* Let's check if the hash tables grew over a certain fill
3151 * level (75%, borrowing this value from Java's hash table
3152 * implementation), and if so suggest a rotation. To calculate
3153 * the fill level we need the n_data field, which only exists
3154 * in newer versions. */
3155
3156 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3157 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3158 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3159 f->path,
3160 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3161 le64toh(f->header->n_data),
3162 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3163 (unsigned long long) f->last_stat.st_size,
3164 f->last_stat.st_size / le64toh(f->header->n_data));
3165 return true;
3166 }
3167
3168 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3169 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3170 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3171 f->path,
3172 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3173 le64toh(f->header->n_fields),
3174 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3175 return true;
3176 }
3177
3178 /* Are the data objects properly indexed by field objects? */
3179 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3180 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3181 le64toh(f->header->n_data) > 0 &&
3182 le64toh(f->header->n_fields) == 0)
3183 return true;
3184
3185 if (max_file_usec > 0) {
3186 usec_t t, h;
3187
3188 h = le64toh(f->header->head_entry_realtime);
3189 t = now(CLOCK_REALTIME);
3190
3191 if (h > 0 && t > h + max_file_usec)
3192 return true;
3193 }
3194
3195 return false;
3196 }