]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
util-lib: split out file attribute calls to chattr-util.[ch]
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <linux/fs.h>
25 #include <stddef.h>
26 #include <sys/mman.h>
27 #include <sys/statvfs.h>
28 #include <sys/uio.h>
29 #include <unistd.h>
30
31 #include "btrfs-util.h"
32 #include "chattr-util.h"
33 #include "compress.h"
34 #include "fd-util.h"
35 #include "journal-authenticate.h"
36 #include "journal-def.h"
37 #include "journal-file.h"
38 #include "lookup3.h"
39 #include "parse-util.h"
40 #include "random-util.h"
41 #include "string-util.h"
42 #include "xattr-util.h"
43
/* Byte sizes of the data and field hash tables, expressed as a number of HashItem slots */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL * sizeof(HashItem))

/* Payloads shorter than this many bytes are never handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (4ULL * 1024ULL * 1024ULL)               /* 4 MiB */

/* Lower and upper clamp applied when max_use is deduced from the
 * size of the file system */
#define DEFAULT_MAX_USE_LOWER (1ULL * 1024ULL * 1024ULL)               /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)     /* 4 GiB */

/* Default minimal use limit: how much we'll use even if keep_free suggests otherwise */
#define DEFAULT_MIN_USE (1ULL * 1024ULL * 1024ULL)                     /* 1 MiB */

/* Upper clamp applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL * 1024ULL * 1024ULL)            /* 128 MiB */

/* Upper clamp applied when keep_free is deduced from the size of the
 * file system */
#define DEFAULT_KEEP_FREE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)   /* 4 GiB */

/* keep_free value used when the size of the file system cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1024ULL * 1024ULL)                          /* 1 MiB */

/* Default maximum number of journal files to keep around */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first field added after the initial file format design,
 * hence every valid header extends at least up to it */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* Maximum number of entries kept in the entry array chain cache */
#define CHAIN_CACHE_MAX 20

/* Granularity by which the journal file grows each time new space is allocated */
#define FILE_SIZE_INCREASE (8ULL * 1024ULL * 1024ULL)                  /* 8 MiB */

/* Re-run fstat() on the file at least this often, to notice deletions */
#define LAST_STAT_REFRESH_USEC (5 * USEC_PER_SEC)

/* mmap context used for the header: one above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
88
89 static int journal_file_set_online(JournalFile *f) {
90 assert(f);
91
92 if (!f->writable)
93 return -EPERM;
94
95 if (!(f->fd >= 0 && f->header))
96 return -EINVAL;
97
98 if (mmap_cache_got_sigbus(f->mmap, f->fd))
99 return -EIO;
100
101 switch(f->header->state) {
102 case STATE_ONLINE:
103 return 0;
104
105 case STATE_OFFLINE:
106 f->header->state = STATE_ONLINE;
107 fsync(f->fd);
108 return 0;
109
110 default:
111 return -EINVAL;
112 }
113 }
114
115 int journal_file_set_offline(JournalFile *f) {
116 assert(f);
117
118 if (!f->writable)
119 return -EPERM;
120
121 if (!(f->fd >= 0 && f->header))
122 return -EINVAL;
123
124 if (f->header->state != STATE_ONLINE)
125 return 0;
126
127 fsync(f->fd);
128
129 if (mmap_cache_got_sigbus(f->mmap, f->fd))
130 return -EIO;
131
132 f->header->state = STATE_OFFLINE;
133
134 if (mmap_cache_got_sigbus(f->mmap, f->fd))
135 return -EIO;
136
137 fsync(f->fd);
138
139 return 0;
140 }
141
142 JournalFile* journal_file_close(JournalFile *f) {
143 assert(f);
144
145 #ifdef HAVE_GCRYPT
146 /* Write the final tag */
147 if (f->seal && f->writable)
148 journal_file_append_tag(f);
149 #endif
150
151 journal_file_set_offline(f);
152
153 if (f->mmap && f->fd >= 0)
154 mmap_cache_close_fd(f->mmap, f->fd);
155
156 if (f->fd >= 0 && f->defrag_on_close) {
157
158 /* Be friendly to btrfs: turn COW back on again now,
159 * and defragment the file. We won't write to the file
160 * ever again, hence remove all fragmentation, and
161 * reenable all the good bits COW usually provides
162 * (such as data checksumming). */
163
164 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
165 (void) btrfs_defrag_fd(f->fd);
166 }
167
168 safe_close(f->fd);
169 free(f->path);
170
171 if (f->mmap)
172 mmap_cache_unref(f->mmap);
173
174 ordered_hashmap_free_free(f->chain_cache);
175
176 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
177 free(f->compress_buffer);
178 #endif
179
180 #ifdef HAVE_GCRYPT
181 if (f->fss_file)
182 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
183 else
184 free(f->fsprg_state);
185
186 free(f->fsprg_seed);
187
188 if (f->hmac)
189 gcry_md_close(f->hmac);
190 #endif
191
192 free(f);
193 return NULL;
194 }
195
196 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
197 Header h = {};
198 ssize_t k;
199 int r;
200
201 assert(f);
202
203 memcpy(h.signature, HEADER_SIGNATURE, 8);
204 h.header_size = htole64(ALIGN64(sizeof(h)));
205
206 h.incompatible_flags |= htole32(
207 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
208 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
209
210 h.compatible_flags = htole32(
211 f->seal * HEADER_COMPATIBLE_SEALED);
212
213 r = sd_id128_randomize(&h.file_id);
214 if (r < 0)
215 return r;
216
217 if (template) {
218 h.seqnum_id = template->header->seqnum_id;
219 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
220 } else
221 h.seqnum_id = h.file_id;
222
223 k = pwrite(f->fd, &h, sizeof(h), 0);
224 if (k < 0)
225 return -errno;
226
227 if (k != sizeof(h))
228 return -EIO;
229
230 return 0;
231 }
232
233 static int journal_file_refresh_header(JournalFile *f) {
234 sd_id128_t boot_id;
235 int r;
236
237 assert(f);
238
239 r = sd_id128_get_machine(&f->header->machine_id);
240 if (r < 0)
241 return r;
242
243 r = sd_id128_get_boot(&boot_id);
244 if (r < 0)
245 return r;
246
247 if (sd_id128_equal(boot_id, f->header->boot_id))
248 f->tail_entry_monotonic_valid = true;
249
250 f->header->boot_id = boot_id;
251
252 r = journal_file_set_online(f);
253
254 /* Sync the online state to disk */
255 fsync(f->fd);
256
257 return r;
258 }
259
260 static int journal_file_verify_header(JournalFile *f) {
261 uint32_t flags;
262
263 assert(f);
264
265 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
266 return -EBADMSG;
267
268 /* In both read and write mode we refuse to open files with
269 * incompatible flags we don't know */
270 flags = le32toh(f->header->incompatible_flags);
271 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
272 if (flags & ~HEADER_INCOMPATIBLE_ANY)
273 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
274 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
275 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
276 if (flags)
277 log_debug("Journal file %s uses incompatible flags %"PRIx32
278 " disabled at compilation time.", f->path, flags);
279 return -EPROTONOSUPPORT;
280 }
281
282 /* When open for writing we refuse to open files with
283 * compatible flags, too */
284 flags = le32toh(f->header->compatible_flags);
285 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
286 if (flags & ~HEADER_COMPATIBLE_ANY)
287 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
288 f->path, flags & ~HEADER_COMPATIBLE_ANY);
289 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
290 if (flags)
291 log_debug("Journal file %s uses compatible flags %"PRIx32
292 " disabled at compilation time.", f->path, flags);
293 return -EPROTONOSUPPORT;
294 }
295
296 if (f->header->state >= _STATE_MAX)
297 return -EBADMSG;
298
299 /* The first addition was n_data, so check that we are at least this large */
300 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
301 return -EBADMSG;
302
303 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
304 return -EBADMSG;
305
306 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
307 return -ENODATA;
308
309 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
310 return -ENODATA;
311
312 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
313 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
314 !VALID64(le64toh(f->header->tail_object_offset)) ||
315 !VALID64(le64toh(f->header->entry_array_offset)))
316 return -ENODATA;
317
318 if (f->writable) {
319 uint8_t state;
320 sd_id128_t machine_id;
321 int r;
322
323 r = sd_id128_get_machine(&machine_id);
324 if (r < 0)
325 return r;
326
327 if (!sd_id128_equal(machine_id, f->header->machine_id))
328 return -EHOSTDOWN;
329
330 state = f->header->state;
331
332 if (state == STATE_ONLINE) {
333 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
334 return -EBUSY;
335 } else if (state == STATE_ARCHIVED)
336 return -ESHUTDOWN;
337 else if (state != STATE_OFFLINE) {
338 log_debug("Journal file %s has unknown state %i.", f->path, state);
339 return -EBUSY;
340 }
341 }
342
343 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
344 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
345
346 f->seal = JOURNAL_HEADER_SEALED(f->header);
347
348 return 0;
349 }
350
351 static int journal_file_fstat(JournalFile *f) {
352 assert(f);
353 assert(f->fd >= 0);
354
355 if (fstat(f->fd, &f->last_stat) < 0)
356 return -errno;
357
358 f->last_stat_usec = now(CLOCK_MONOTONIC);
359
360 /* Refuse appending to files that are already deleted */
361 if (f->last_stat.st_nlink <= 0)
362 return -EIDRM;
363
364 return 0;
365 }
366
367 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
368 uint64_t old_size, new_size;
369 int r;
370
371 assert(f);
372
373 /* We assume that this file is not sparse, and we know that
374 * for sure, since we always call posix_fallocate()
375 * ourselves */
376
377 if (mmap_cache_got_sigbus(f->mmap, f->fd))
378 return -EIO;
379
380 old_size =
381 le64toh(f->header->header_size) +
382 le64toh(f->header->arena_size);
383
384 new_size = PAGE_ALIGN(offset + size);
385 if (new_size < le64toh(f->header->header_size))
386 new_size = le64toh(f->header->header_size);
387
388 if (new_size <= old_size) {
389
390 /* We already pre-allocated enough space, but before
391 * we write to it, let's check with fstat() if the
392 * file got deleted, in order make sure we don't throw
393 * away the data immediately. Don't check fstat() for
394 * all writes though, but only once ever 10s. */
395
396 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
397 return 0;
398
399 return journal_file_fstat(f);
400 }
401
402 /* Allocate more space. */
403
404 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
405 return -E2BIG;
406
407 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
408 struct statvfs svfs;
409
410 if (fstatvfs(f->fd, &svfs) >= 0) {
411 uint64_t available;
412
413 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
414
415 if (new_size - old_size > available)
416 return -E2BIG;
417 }
418 }
419
420 /* Increase by larger blocks at once */
421 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
422 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
423 new_size = f->metrics.max_size;
424
425 /* Note that the glibc fallocate() fallback is very
426 inefficient, hence we try to minimize the allocation area
427 as we can. */
428 r = posix_fallocate(f->fd, old_size, new_size - old_size);
429 if (r != 0)
430 return -r;
431
432 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
433
434 return journal_file_fstat(f);
435 }
436
437 static unsigned type_to_context(ObjectType type) {
438 /* One context for each type, plus one catch-all for the rest */
439 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
440 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
441 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
442 }
443
444 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
445 int r;
446
447 assert(f);
448 assert(ret);
449
450 if (size <= 0)
451 return -EINVAL;
452
453 /* Avoid SIGBUS on invalid accesses */
454 if (offset + size > (uint64_t) f->last_stat.st_size) {
455 /* Hmm, out of range? Let's refresh the fstat() data
456 * first, before we trust that check. */
457
458 r = journal_file_fstat(f);
459 if (r < 0)
460 return r;
461
462 if (offset + size > (uint64_t) f->last_stat.st_size)
463 return -EADDRNOTAVAIL;
464 }
465
466 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
467 }
468
469 static uint64_t minimum_header_size(Object *o) {
470
471 static const uint64_t table[] = {
472 [OBJECT_DATA] = sizeof(DataObject),
473 [OBJECT_FIELD] = sizeof(FieldObject),
474 [OBJECT_ENTRY] = sizeof(EntryObject),
475 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
476 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
477 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
478 [OBJECT_TAG] = sizeof(TagObject),
479 };
480
481 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
482 return sizeof(ObjectHeader);
483
484 return table[o->object.type];
485 }
486
487 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
488 int r;
489 void *t;
490 Object *o;
491 uint64_t s;
492
493 assert(f);
494 assert(ret);
495
496 /* Objects may only be located at multiple of 64 bit */
497 if (!VALID64(offset))
498 return -EFAULT;
499
500 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
501 if (r < 0)
502 return r;
503
504 o = (Object*) t;
505 s = le64toh(o->object.size);
506
507 if (s < sizeof(ObjectHeader))
508 return -EBADMSG;
509
510 if (o->object.type <= OBJECT_UNUSED)
511 return -EBADMSG;
512
513 if (s < minimum_header_size(o))
514 return -EBADMSG;
515
516 if (type > OBJECT_UNUSED && o->object.type != type)
517 return -EBADMSG;
518
519 if (s > sizeof(ObjectHeader)) {
520 r = journal_file_move_to(f, type, false, offset, s, &t);
521 if (r < 0)
522 return r;
523
524 o = (Object*) t;
525 }
526
527 *ret = o;
528 return 0;
529 }
530
531 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
532 uint64_t r;
533
534 assert(f);
535
536 r = le64toh(f->header->tail_entry_seqnum) + 1;
537
538 if (seqnum) {
539 /* If an external seqnum counter was passed, we update
540 * both the local and the external one, and set it to
541 * the maximum of both */
542
543 if (*seqnum + 1 > r)
544 r = *seqnum + 1;
545
546 *seqnum = r;
547 }
548
549 f->header->tail_entry_seqnum = htole64(r);
550
551 if (f->header->head_entry_seqnum == 0)
552 f->header->head_entry_seqnum = htole64(r);
553
554 return r;
555 }
556
557 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
558 int r;
559 uint64_t p;
560 Object *tail, *o;
561 void *t;
562
563 assert(f);
564 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
565 assert(size >= sizeof(ObjectHeader));
566 assert(offset);
567 assert(ret);
568
569 r = journal_file_set_online(f);
570 if (r < 0)
571 return r;
572
573 p = le64toh(f->header->tail_object_offset);
574 if (p == 0)
575 p = le64toh(f->header->header_size);
576 else {
577 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
578 if (r < 0)
579 return r;
580
581 p += ALIGN64(le64toh(tail->object.size));
582 }
583
584 r = journal_file_allocate(f, p, size);
585 if (r < 0)
586 return r;
587
588 r = journal_file_move_to(f, type, false, p, size, &t);
589 if (r < 0)
590 return r;
591
592 o = (Object*) t;
593
594 zero(o->object);
595 o->object.type = type;
596 o->object.size = htole64(size);
597
598 f->header->tail_object_offset = htole64(p);
599 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
600
601 *ret = o;
602 *offset = p;
603
604 return 0;
605 }
606
607 static int journal_file_setup_data_hash_table(JournalFile *f) {
608 uint64_t s, p;
609 Object *o;
610 int r;
611
612 assert(f);
613
614 /* We estimate that we need 1 hash table entry per 768 bytes
615 of journal file and we want to make sure we never get
616 beyond 75% fill level. Calculate the hash table size for
617 the maximum file size based on these metrics. */
618
619 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
620 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
621 s = DEFAULT_DATA_HASH_TABLE_SIZE;
622
623 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
624
625 r = journal_file_append_object(f,
626 OBJECT_DATA_HASH_TABLE,
627 offsetof(Object, hash_table.items) + s,
628 &o, &p);
629 if (r < 0)
630 return r;
631
632 memzero(o->hash_table.items, s);
633
634 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
635 f->header->data_hash_table_size = htole64(s);
636
637 return 0;
638 }
639
640 static int journal_file_setup_field_hash_table(JournalFile *f) {
641 uint64_t s, p;
642 Object *o;
643 int r;
644
645 assert(f);
646
647 /* We use a fixed size hash table for the fields as this
648 * number should grow very slowly only */
649
650 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
651 r = journal_file_append_object(f,
652 OBJECT_FIELD_HASH_TABLE,
653 offsetof(Object, hash_table.items) + s,
654 &o, &p);
655 if (r < 0)
656 return r;
657
658 memzero(o->hash_table.items, s);
659
660 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
661 f->header->field_hash_table_size = htole64(s);
662
663 return 0;
664 }
665
666 int journal_file_map_data_hash_table(JournalFile *f) {
667 uint64_t s, p;
668 void *t;
669 int r;
670
671 assert(f);
672
673 if (f->data_hash_table)
674 return 0;
675
676 p = le64toh(f->header->data_hash_table_offset);
677 s = le64toh(f->header->data_hash_table_size);
678
679 r = journal_file_move_to(f,
680 OBJECT_DATA_HASH_TABLE,
681 true,
682 p, s,
683 &t);
684 if (r < 0)
685 return r;
686
687 f->data_hash_table = t;
688 return 0;
689 }
690
691 int journal_file_map_field_hash_table(JournalFile *f) {
692 uint64_t s, p;
693 void *t;
694 int r;
695
696 assert(f);
697
698 if (f->field_hash_table)
699 return 0;
700
701 p = le64toh(f->header->field_hash_table_offset);
702 s = le64toh(f->header->field_hash_table_size);
703
704 r = journal_file_move_to(f,
705 OBJECT_FIELD_HASH_TABLE,
706 true,
707 p, s,
708 &t);
709 if (r < 0)
710 return r;
711
712 f->field_hash_table = t;
713 return 0;
714 }
715
716 static int journal_file_link_field(
717 JournalFile *f,
718 Object *o,
719 uint64_t offset,
720 uint64_t hash) {
721
722 uint64_t p, h, m;
723 int r;
724
725 assert(f);
726 assert(o);
727 assert(offset > 0);
728
729 if (o->object.type != OBJECT_FIELD)
730 return -EINVAL;
731
732 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
733 if (m <= 0)
734 return -EBADMSG;
735
736 /* This might alter the window we are looking at */
737 o->field.next_hash_offset = o->field.head_data_offset = 0;
738
739 h = hash % m;
740 p = le64toh(f->field_hash_table[h].tail_hash_offset);
741 if (p == 0)
742 f->field_hash_table[h].head_hash_offset = htole64(offset);
743 else {
744 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
745 if (r < 0)
746 return r;
747
748 o->field.next_hash_offset = htole64(offset);
749 }
750
751 f->field_hash_table[h].tail_hash_offset = htole64(offset);
752
753 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
754 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
755
756 return 0;
757 }
758
759 static int journal_file_link_data(
760 JournalFile *f,
761 Object *o,
762 uint64_t offset,
763 uint64_t hash) {
764
765 uint64_t p, h, m;
766 int r;
767
768 assert(f);
769 assert(o);
770 assert(offset > 0);
771
772 if (o->object.type != OBJECT_DATA)
773 return -EINVAL;
774
775 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
776 if (m <= 0)
777 return -EBADMSG;
778
779 /* This might alter the window we are looking at */
780 o->data.next_hash_offset = o->data.next_field_offset = 0;
781 o->data.entry_offset = o->data.entry_array_offset = 0;
782 o->data.n_entries = 0;
783
784 h = hash % m;
785 p = le64toh(f->data_hash_table[h].tail_hash_offset);
786 if (p == 0)
787 /* Only entry in the hash table is easy */
788 f->data_hash_table[h].head_hash_offset = htole64(offset);
789 else {
790 /* Move back to the previous data object, to patch in
791 * pointer */
792
793 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
794 if (r < 0)
795 return r;
796
797 o->data.next_hash_offset = htole64(offset);
798 }
799
800 f->data_hash_table[h].tail_hash_offset = htole64(offset);
801
802 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
803 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
804
805 return 0;
806 }
807
808 int journal_file_find_field_object_with_hash(
809 JournalFile *f,
810 const void *field, uint64_t size, uint64_t hash,
811 Object **ret, uint64_t *offset) {
812
813 uint64_t p, osize, h, m;
814 int r;
815
816 assert(f);
817 assert(field && size > 0);
818
819 /* If the field hash table is empty, we can't find anything */
820 if (le64toh(f->header->field_hash_table_size) <= 0)
821 return 0;
822
823 /* Map the field hash table, if it isn't mapped yet. */
824 r = journal_file_map_field_hash_table(f);
825 if (r < 0)
826 return r;
827
828 osize = offsetof(Object, field.payload) + size;
829
830 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
831 if (m <= 0)
832 return -EBADMSG;
833
834 h = hash % m;
835 p = le64toh(f->field_hash_table[h].head_hash_offset);
836
837 while (p > 0) {
838 Object *o;
839
840 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
841 if (r < 0)
842 return r;
843
844 if (le64toh(o->field.hash) == hash &&
845 le64toh(o->object.size) == osize &&
846 memcmp(o->field.payload, field, size) == 0) {
847
848 if (ret)
849 *ret = o;
850 if (offset)
851 *offset = p;
852
853 return 1;
854 }
855
856 p = le64toh(o->field.next_hash_offset);
857 }
858
859 return 0;
860 }
861
862 int journal_file_find_field_object(
863 JournalFile *f,
864 const void *field, uint64_t size,
865 Object **ret, uint64_t *offset) {
866
867 uint64_t hash;
868
869 assert(f);
870 assert(field && size > 0);
871
872 hash = hash64(field, size);
873
874 return journal_file_find_field_object_with_hash(f,
875 field, size, hash,
876 ret, offset);
877 }
878
879 int journal_file_find_data_object_with_hash(
880 JournalFile *f,
881 const void *data, uint64_t size, uint64_t hash,
882 Object **ret, uint64_t *offset) {
883
884 uint64_t p, osize, h, m;
885 int r;
886
887 assert(f);
888 assert(data || size == 0);
889
890 /* If there's no data hash table, then there's no entry. */
891 if (le64toh(f->header->data_hash_table_size) <= 0)
892 return 0;
893
894 /* Map the data hash table, if it isn't mapped yet. */
895 r = journal_file_map_data_hash_table(f);
896 if (r < 0)
897 return r;
898
899 osize = offsetof(Object, data.payload) + size;
900
901 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
902 if (m <= 0)
903 return -EBADMSG;
904
905 h = hash % m;
906 p = le64toh(f->data_hash_table[h].head_hash_offset);
907
908 while (p > 0) {
909 Object *o;
910
911 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
912 if (r < 0)
913 return r;
914
915 if (le64toh(o->data.hash) != hash)
916 goto next;
917
918 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
919 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
920 uint64_t l;
921 size_t rsize = 0;
922
923 l = le64toh(o->object.size);
924 if (l <= offsetof(Object, data.payload))
925 return -EBADMSG;
926
927 l -= offsetof(Object, data.payload);
928
929 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
930 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
931 if (r < 0)
932 return r;
933
934 if (rsize == size &&
935 memcmp(f->compress_buffer, data, size) == 0) {
936
937 if (ret)
938 *ret = o;
939
940 if (offset)
941 *offset = p;
942
943 return 1;
944 }
945 #else
946 return -EPROTONOSUPPORT;
947 #endif
948 } else if (le64toh(o->object.size) == osize &&
949 memcmp(o->data.payload, data, size) == 0) {
950
951 if (ret)
952 *ret = o;
953
954 if (offset)
955 *offset = p;
956
957 return 1;
958 }
959
960 next:
961 p = le64toh(o->data.next_hash_offset);
962 }
963
964 return 0;
965 }
966
967 int journal_file_find_data_object(
968 JournalFile *f,
969 const void *data, uint64_t size,
970 Object **ret, uint64_t *offset) {
971
972 uint64_t hash;
973
974 assert(f);
975 assert(data || size == 0);
976
977 hash = hash64(data, size);
978
979 return journal_file_find_data_object_with_hash(f,
980 data, size, hash,
981 ret, offset);
982 }
983
984 static int journal_file_append_field(
985 JournalFile *f,
986 const void *field, uint64_t size,
987 Object **ret, uint64_t *offset) {
988
989 uint64_t hash, p;
990 uint64_t osize;
991 Object *o;
992 int r;
993
994 assert(f);
995 assert(field && size > 0);
996
997 hash = hash64(field, size);
998
999 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1000 if (r < 0)
1001 return r;
1002 else if (r > 0) {
1003
1004 if (ret)
1005 *ret = o;
1006
1007 if (offset)
1008 *offset = p;
1009
1010 return 0;
1011 }
1012
1013 osize = offsetof(Object, field.payload) + size;
1014 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1015 if (r < 0)
1016 return r;
1017
1018 o->field.hash = htole64(hash);
1019 memcpy(o->field.payload, field, size);
1020
1021 r = journal_file_link_field(f, o, p, hash);
1022 if (r < 0)
1023 return r;
1024
1025 /* The linking might have altered the window, so let's
1026 * refresh our pointer */
1027 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1028 if (r < 0)
1029 return r;
1030
1031 #ifdef HAVE_GCRYPT
1032 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1033 if (r < 0)
1034 return r;
1035 #endif
1036
1037 if (ret)
1038 *ret = o;
1039
1040 if (offset)
1041 *offset = p;
1042
1043 return 0;
1044 }
1045
1046 static int journal_file_append_data(
1047 JournalFile *f,
1048 const void *data, uint64_t size,
1049 Object **ret, uint64_t *offset) {
1050
1051 uint64_t hash, p;
1052 uint64_t osize;
1053 Object *o;
1054 int r, compression = 0;
1055 const void *eq;
1056
1057 assert(f);
1058 assert(data || size == 0);
1059
1060 hash = hash64(data, size);
1061
1062 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1063 if (r < 0)
1064 return r;
1065 if (r > 0) {
1066
1067 if (ret)
1068 *ret = o;
1069
1070 if (offset)
1071 *offset = p;
1072
1073 return 0;
1074 }
1075
1076 osize = offsetof(Object, data.payload) + size;
1077 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1078 if (r < 0)
1079 return r;
1080
1081 o->data.hash = htole64(hash);
1082
1083 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1084 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1085 size_t rsize = 0;
1086
1087 compression = compress_blob(data, size, o->data.payload, &rsize);
1088
1089 if (compression >= 0) {
1090 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1091 o->object.flags |= compression;
1092
1093 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1094 size, rsize, object_compressed_to_string(compression));
1095 } else
1096 /* Compression didn't work, we don't really care why, let's continue without compression */
1097 compression = 0;
1098 }
1099 #endif
1100
1101 if (compression == 0 && size > 0)
1102 memcpy(o->data.payload, data, size);
1103
1104 r = journal_file_link_data(f, o, p, hash);
1105 if (r < 0)
1106 return r;
1107
1108 /* The linking might have altered the window, so let's
1109 * refresh our pointer */
1110 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1111 if (r < 0)
1112 return r;
1113
1114 if (!data)
1115 eq = NULL;
1116 else
1117 eq = memchr(data, '=', size);
1118 if (eq && eq > data) {
1119 Object *fo = NULL;
1120 uint64_t fp;
1121
1122 /* Create field object ... */
1123 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1124 if (r < 0)
1125 return r;
1126
1127 /* ... and link it in. */
1128 o->data.next_field_offset = fo->field.head_data_offset;
1129 fo->field.head_data_offset = le64toh(p);
1130 }
1131
1132 #ifdef HAVE_GCRYPT
1133 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1134 if (r < 0)
1135 return r;
1136 #endif
1137
1138 if (ret)
1139 *ret = o;
1140
1141 if (offset)
1142 *offset = p;
1143
1144 return 0;
1145 }
1146
1147 uint64_t journal_file_entry_n_items(Object *o) {
1148 assert(o);
1149
1150 if (o->object.type != OBJECT_ENTRY)
1151 return 0;
1152
1153 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1154 }
1155
1156 uint64_t journal_file_entry_array_n_items(Object *o) {
1157 assert(o);
1158
1159 if (o->object.type != OBJECT_ENTRY_ARRAY)
1160 return 0;
1161
1162 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1163 }
1164
1165 uint64_t journal_file_hash_table_n_items(Object *o) {
1166 assert(o);
1167
1168 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1169 o->object.type != OBJECT_FIELD_HASH_TABLE)
1170 return 0;
1171
1172 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1173 }
1174
1175 static int link_entry_into_array(JournalFile *f,
1176 le64_t *first,
1177 le64_t *idx,
1178 uint64_t p) {
1179 int r;
1180 uint64_t n = 0, ap = 0, q, i, a, hidx;
1181 Object *o;
1182
1183 assert(f);
1184 assert(first);
1185 assert(idx);
1186 assert(p > 0);
1187
1188 a = le64toh(*first);
1189 i = hidx = le64toh(*idx);
1190 while (a > 0) {
1191
1192 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1193 if (r < 0)
1194 return r;
1195
1196 n = journal_file_entry_array_n_items(o);
1197 if (i < n) {
1198 o->entry_array.items[i] = htole64(p);
1199 *idx = htole64(hidx + 1);
1200 return 0;
1201 }
1202
1203 i -= n;
1204 ap = a;
1205 a = le64toh(o->entry_array.next_entry_array_offset);
1206 }
1207
1208 if (hidx > n)
1209 n = (hidx+1) * 2;
1210 else
1211 n = n * 2;
1212
1213 if (n < 4)
1214 n = 4;
1215
1216 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1217 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1218 &o, &q);
1219 if (r < 0)
1220 return r;
1221
1222 #ifdef HAVE_GCRYPT
1223 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1224 if (r < 0)
1225 return r;
1226 #endif
1227
1228 o->entry_array.items[i] = htole64(p);
1229
1230 if (ap == 0)
1231 *first = htole64(q);
1232 else {
1233 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1234 if (r < 0)
1235 return r;
1236
1237 o->entry_array.next_entry_array_offset = htole64(q);
1238 }
1239
1240 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1241 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1242
1243 *idx = htole64(hidx + 1);
1244
1245 return 0;
1246 }
1247
1248 static int link_entry_into_array_plus_one(JournalFile *f,
1249 le64_t *extra,
1250 le64_t *first,
1251 le64_t *idx,
1252 uint64_t p) {
1253
1254 int r;
1255
1256 assert(f);
1257 assert(extra);
1258 assert(first);
1259 assert(idx);
1260 assert(p > 0);
1261
1262 if (*idx == 0)
1263 *extra = htole64(p);
1264 else {
1265 le64_t i;
1266
1267 i = htole64(le64toh(*idx) - 1);
1268 r = link_entry_into_array(f, first, &i, p);
1269 if (r < 0)
1270 return r;
1271 }
1272
1273 *idx = htole64(le64toh(*idx) + 1);
1274 return 0;
1275 }
1276
1277 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1278 uint64_t p;
1279 int r;
1280 assert(f);
1281 assert(o);
1282 assert(offset > 0);
1283
1284 p = le64toh(o->entry.items[i].object_offset);
1285 if (p == 0)
1286 return -EINVAL;
1287
1288 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1289 if (r < 0)
1290 return r;
1291
1292 return link_entry_into_array_plus_one(f,
1293 &o->data.entry_offset,
1294 &o->data.entry_array_offset,
1295 &o->data.n_entries,
1296 offset);
1297 }
1298
1299 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1300 uint64_t n, i;
1301 int r;
1302
1303 assert(f);
1304 assert(o);
1305 assert(offset > 0);
1306
1307 if (o->object.type != OBJECT_ENTRY)
1308 return -EINVAL;
1309
1310 __sync_synchronize();
1311
1312 /* Link up the entry itself */
1313 r = link_entry_into_array(f,
1314 &f->header->entry_array_offset,
1315 &f->header->n_entries,
1316 offset);
1317 if (r < 0)
1318 return r;
1319
1320 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1321
1322 if (f->header->head_entry_realtime == 0)
1323 f->header->head_entry_realtime = o->entry.realtime;
1324
1325 f->header->tail_entry_realtime = o->entry.realtime;
1326 f->header->tail_entry_monotonic = o->entry.monotonic;
1327
1328 f->tail_entry_monotonic_valid = true;
1329
1330 /* Link up the items */
1331 n = journal_file_entry_n_items(o);
1332 for (i = 0; i < n; i++) {
1333 r = journal_file_link_entry_item(f, o, offset, i);
1334 if (r < 0)
1335 return r;
1336 }
1337
1338 return 0;
1339 }
1340
1341 static int journal_file_append_entry_internal(
1342 JournalFile *f,
1343 const dual_timestamp *ts,
1344 uint64_t xor_hash,
1345 const EntryItem items[], unsigned n_items,
1346 uint64_t *seqnum,
1347 Object **ret, uint64_t *offset) {
1348 uint64_t np;
1349 uint64_t osize;
1350 Object *o;
1351 int r;
1352
1353 assert(f);
1354 assert(items || n_items == 0);
1355 assert(ts);
1356
1357 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1358
1359 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1360 if (r < 0)
1361 return r;
1362
1363 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1364 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1365 o->entry.realtime = htole64(ts->realtime);
1366 o->entry.monotonic = htole64(ts->monotonic);
1367 o->entry.xor_hash = htole64(xor_hash);
1368 o->entry.boot_id = f->header->boot_id;
1369
1370 #ifdef HAVE_GCRYPT
1371 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1372 if (r < 0)
1373 return r;
1374 #endif
1375
1376 r = journal_file_link_entry(f, o, np);
1377 if (r < 0)
1378 return r;
1379
1380 if (ret)
1381 *ret = o;
1382
1383 if (offset)
1384 *offset = np;
1385
1386 return 0;
1387 }
1388
1389 void journal_file_post_change(JournalFile *f) {
1390 assert(f);
1391
1392 /* inotify() does not receive IN_MODIFY events from file
1393 * accesses done via mmap(). After each access we hence
1394 * trigger IN_MODIFY by truncating the journal file to its
1395 * current size which triggers IN_MODIFY. */
1396
1397 __sync_synchronize();
1398
1399 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1400 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1401 }
1402
1403 static int entry_item_cmp(const void *_a, const void *_b) {
1404 const EntryItem *a = _a, *b = _b;
1405
1406 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1407 return -1;
1408 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1409 return 1;
1410 return 0;
1411 }
1412
1413 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1414 unsigned i;
1415 EntryItem *items;
1416 int r;
1417 uint64_t xor_hash = 0;
1418 struct dual_timestamp _ts;
1419
1420 assert(f);
1421 assert(iovec || n_iovec == 0);
1422
1423 if (!ts) {
1424 dual_timestamp_get(&_ts);
1425 ts = &_ts;
1426 }
1427
1428 if (f->tail_entry_monotonic_valid &&
1429 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1430 return -EINVAL;
1431
1432 #ifdef HAVE_GCRYPT
1433 r = journal_file_maybe_append_tag(f, ts->realtime);
1434 if (r < 0)
1435 return r;
1436 #endif
1437
1438 /* alloca() can't take 0, hence let's allocate at least one */
1439 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1440
1441 for (i = 0; i < n_iovec; i++) {
1442 uint64_t p;
1443 Object *o;
1444
1445 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1446 if (r < 0)
1447 return r;
1448
1449 xor_hash ^= le64toh(o->data.hash);
1450 items[i].object_offset = htole64(p);
1451 items[i].hash = o->data.hash;
1452 }
1453
1454 /* Order by the position on disk, in order to improve seek
1455 * times for rotating media. */
1456 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1457
1458 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1459
1460 /* If the memory mapping triggered a SIGBUS then we return an
1461 * IO error and ignore the error code passed down to us, since
1462 * it is very likely just an effect of a nullified replacement
1463 * mapping page */
1464
1465 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1466 r = -EIO;
1467
1468 journal_file_post_change(f);
1469
1470 return r;
1471 }
1472
/* One cached position inside a chain of entry array objects */
struct ChainCacheItem {
        /* offset of the array the chain starts with */
        uint64_t first;

        /* offset of the array this item caches */
        uint64_t array;

        /* first item stored in the cached array */
        uint64_t begin;

        /* number of items in all arrays of the chain preceding the cached one */
        uint64_t total;

        /* index looked at last, used to improve locality when bisecting */
        uint64_t last_index;
};

typedef struct ChainCacheItem ChainCacheItem;
1480
1481 static void chain_cache_put(
1482 OrderedHashmap *h,
1483 ChainCacheItem *ci,
1484 uint64_t first,
1485 uint64_t array,
1486 uint64_t begin,
1487 uint64_t total,
1488 uint64_t last_index) {
1489
1490 if (!ci) {
1491 /* If the chain item to cache for this chain is the
1492 * first one it's not worth caching anything */
1493 if (array == first)
1494 return;
1495
1496 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1497 ci = ordered_hashmap_steal_first(h);
1498 assert(ci);
1499 } else {
1500 ci = new(ChainCacheItem, 1);
1501 if (!ci)
1502 return;
1503 }
1504
1505 ci->first = first;
1506
1507 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1508 free(ci);
1509 return;
1510 }
1511 } else
1512 assert(ci->first == first);
1513
1514 ci->array = array;
1515 ci->begin = begin;
1516 ci->total = total;
1517 ci->last_index = last_index;
1518 }
1519
1520 static int generic_array_get(
1521 JournalFile *f,
1522 uint64_t first,
1523 uint64_t i,
1524 Object **ret, uint64_t *offset) {
1525
1526 Object *o;
1527 uint64_t p = 0, a, t = 0;
1528 int r;
1529 ChainCacheItem *ci;
1530
1531 assert(f);
1532
1533 a = first;
1534
1535 /* Try the chain cache first */
1536 ci = ordered_hashmap_get(f->chain_cache, &first);
1537 if (ci && i > ci->total) {
1538 a = ci->array;
1539 i -= ci->total;
1540 t = ci->total;
1541 }
1542
1543 while (a > 0) {
1544 uint64_t k;
1545
1546 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1547 if (r < 0)
1548 return r;
1549
1550 k = journal_file_entry_array_n_items(o);
1551 if (i < k) {
1552 p = le64toh(o->entry_array.items[i]);
1553 goto found;
1554 }
1555
1556 i -= k;
1557 t += k;
1558 a = le64toh(o->entry_array.next_entry_array_offset);
1559 }
1560
1561 return 0;
1562
1563 found:
1564 /* Let's cache this item for the next invocation */
1565 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1566
1567 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1568 if (r < 0)
1569 return r;
1570
1571 if (ret)
1572 *ret = o;
1573
1574 if (offset)
1575 *offset = p;
1576
1577 return 1;
1578 }
1579
1580 static int generic_array_get_plus_one(
1581 JournalFile *f,
1582 uint64_t extra,
1583 uint64_t first,
1584 uint64_t i,
1585 Object **ret, uint64_t *offset) {
1586
1587 Object *o;
1588
1589 assert(f);
1590
1591 if (i == 0) {
1592 int r;
1593
1594 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1595 if (r < 0)
1596 return r;
1597
1598 if (ret)
1599 *ret = o;
1600
1601 if (offset)
1602 *offset = extra;
1603
1604 return 1;
1605 }
1606
1607 return generic_array_get(f, first, i-1, ret, offset);
1608 }
1609
/* Results of the test_object() callbacks used for bisection: the probed
 * object matches the needle (FOUND), or sorts before it (LEFT) or after it
 * (RIGHT). */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
1615
1616 static int generic_array_bisect(
1617 JournalFile *f,
1618 uint64_t first,
1619 uint64_t n,
1620 uint64_t needle,
1621 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1622 direction_t direction,
1623 Object **ret,
1624 uint64_t *offset,
1625 uint64_t *idx) {
1626
1627 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1628 bool subtract_one = false;
1629 Object *o, *array = NULL;
1630 int r;
1631 ChainCacheItem *ci;
1632
1633 assert(f);
1634 assert(test_object);
1635
1636 /* Start with the first array in the chain */
1637 a = first;
1638
1639 ci = ordered_hashmap_get(f->chain_cache, &first);
1640 if (ci && n > ci->total) {
1641 /* Ah, we have iterated this bisection array chain
1642 * previously! Let's see if we can skip ahead in the
1643 * chain, as far as the last time. But we can't jump
1644 * backwards in the chain, so let's check that
1645 * first. */
1646
1647 r = test_object(f, ci->begin, needle);
1648 if (r < 0)
1649 return r;
1650
1651 if (r == TEST_LEFT) {
1652 /* OK, what we are looking for is right of the
1653 * begin of this EntryArray, so let's jump
1654 * straight to previously cached array in the
1655 * chain */
1656
1657 a = ci->array;
1658 n -= ci->total;
1659 t = ci->total;
1660 last_index = ci->last_index;
1661 }
1662 }
1663
1664 while (a > 0) {
1665 uint64_t left, right, k, lp;
1666
1667 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1668 if (r < 0)
1669 return r;
1670
1671 k = journal_file_entry_array_n_items(array);
1672 right = MIN(k, n);
1673 if (right <= 0)
1674 return 0;
1675
1676 i = right - 1;
1677 lp = p = le64toh(array->entry_array.items[i]);
1678 if (p <= 0)
1679 return -EBADMSG;
1680
1681 r = test_object(f, p, needle);
1682 if (r < 0)
1683 return r;
1684
1685 if (r == TEST_FOUND)
1686 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1687
1688 if (r == TEST_RIGHT) {
1689 left = 0;
1690 right -= 1;
1691
1692 if (last_index != (uint64_t) -1) {
1693 assert(last_index <= right);
1694
1695 /* If we cached the last index we
1696 * looked at, let's try to not to jump
1697 * too wildly around and see if we can
1698 * limit the range to look at early to
1699 * the immediate neighbors of the last
1700 * index we looked at. */
1701
1702 if (last_index > 0) {
1703 uint64_t x = last_index - 1;
1704
1705 p = le64toh(array->entry_array.items[x]);
1706 if (p <= 0)
1707 return -EBADMSG;
1708
1709 r = test_object(f, p, needle);
1710 if (r < 0)
1711 return r;
1712
1713 if (r == TEST_FOUND)
1714 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1715
1716 if (r == TEST_RIGHT)
1717 right = x;
1718 else
1719 left = x + 1;
1720 }
1721
1722 if (last_index < right) {
1723 uint64_t y = last_index + 1;
1724
1725 p = le64toh(array->entry_array.items[y]);
1726 if (p <= 0)
1727 return -EBADMSG;
1728
1729 r = test_object(f, p, needle);
1730 if (r < 0)
1731 return r;
1732
1733 if (r == TEST_FOUND)
1734 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1735
1736 if (r == TEST_RIGHT)
1737 right = y;
1738 else
1739 left = y + 1;
1740 }
1741 }
1742
1743 for (;;) {
1744 if (left == right) {
1745 if (direction == DIRECTION_UP)
1746 subtract_one = true;
1747
1748 i = left;
1749 goto found;
1750 }
1751
1752 assert(left < right);
1753 i = (left + right) / 2;
1754
1755 p = le64toh(array->entry_array.items[i]);
1756 if (p <= 0)
1757 return -EBADMSG;
1758
1759 r = test_object(f, p, needle);
1760 if (r < 0)
1761 return r;
1762
1763 if (r == TEST_FOUND)
1764 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1765
1766 if (r == TEST_RIGHT)
1767 right = i;
1768 else
1769 left = i + 1;
1770 }
1771 }
1772
1773 if (k >= n) {
1774 if (direction == DIRECTION_UP) {
1775 i = n;
1776 subtract_one = true;
1777 goto found;
1778 }
1779
1780 return 0;
1781 }
1782
1783 last_p = lp;
1784
1785 n -= k;
1786 t += k;
1787 last_index = (uint64_t) -1;
1788 a = le64toh(array->entry_array.next_entry_array_offset);
1789 }
1790
1791 return 0;
1792
1793 found:
1794 if (subtract_one && t == 0 && i == 0)
1795 return 0;
1796
1797 /* Let's cache this item for the next invocation */
1798 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1799
1800 if (subtract_one && i == 0)
1801 p = last_p;
1802 else if (subtract_one)
1803 p = le64toh(array->entry_array.items[i-1]);
1804 else
1805 p = le64toh(array->entry_array.items[i]);
1806
1807 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1808 if (r < 0)
1809 return r;
1810
1811 if (ret)
1812 *ret = o;
1813
1814 if (offset)
1815 *offset = p;
1816
1817 if (idx)
1818 *idx = t + i + (subtract_one ? -1 : 0);
1819
1820 return 1;
1821 }
1822
1823 static int generic_array_bisect_plus_one(
1824 JournalFile *f,
1825 uint64_t extra,
1826 uint64_t first,
1827 uint64_t n,
1828 uint64_t needle,
1829 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1830 direction_t direction,
1831 Object **ret,
1832 uint64_t *offset,
1833 uint64_t *idx) {
1834
1835 int r;
1836 bool step_back = false;
1837 Object *o;
1838
1839 assert(f);
1840 assert(test_object);
1841
1842 if (n <= 0)
1843 return 0;
1844
1845 /* This bisects the array in object 'first', but first checks
1846 * an extra */
1847 r = test_object(f, extra, needle);
1848 if (r < 0)
1849 return r;
1850
1851 if (r == TEST_FOUND)
1852 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1853
1854 /* if we are looking with DIRECTION_UP then we need to first
1855 see if in the actual array there is a matching entry, and
1856 return the last one of that. But if there isn't any we need
1857 to return this one. Hence remember this, and return it
1858 below. */
1859 if (r == TEST_LEFT)
1860 step_back = direction == DIRECTION_UP;
1861
1862 if (r == TEST_RIGHT) {
1863 if (direction == DIRECTION_DOWN)
1864 goto found;
1865 else
1866 return 0;
1867 }
1868
1869 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1870
1871 if (r == 0 && step_back)
1872 goto found;
1873
1874 if (r > 0 && idx)
1875 (*idx) ++;
1876
1877 return r;
1878
1879 found:
1880 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1881 if (r < 0)
1882 return r;
1883
1884 if (ret)
1885 *ret = o;
1886
1887 if (offset)
1888 *offset = extra;
1889
1890 if (idx)
1891 *idx = 0;
1892
1893 return 1;
1894 }
1895
1896 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1897 assert(f);
1898 assert(p > 0);
1899
1900 if (p == needle)
1901 return TEST_FOUND;
1902 else if (p < needle)
1903 return TEST_LEFT;
1904 else
1905 return TEST_RIGHT;
1906 }
1907
1908 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1909 Object *o;
1910 int r;
1911
1912 assert(f);
1913 assert(p > 0);
1914
1915 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1916 if (r < 0)
1917 return r;
1918
1919 if (le64toh(o->entry.seqnum) == needle)
1920 return TEST_FOUND;
1921 else if (le64toh(o->entry.seqnum) < needle)
1922 return TEST_LEFT;
1923 else
1924 return TEST_RIGHT;
1925 }
1926
1927 int journal_file_move_to_entry_by_seqnum(
1928 JournalFile *f,
1929 uint64_t seqnum,
1930 direction_t direction,
1931 Object **ret,
1932 uint64_t *offset) {
1933
1934 return generic_array_bisect(f,
1935 le64toh(f->header->entry_array_offset),
1936 le64toh(f->header->n_entries),
1937 seqnum,
1938 test_object_seqnum,
1939 direction,
1940 ret, offset, NULL);
1941 }
1942
1943 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1944 Object *o;
1945 int r;
1946
1947 assert(f);
1948 assert(p > 0);
1949
1950 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1951 if (r < 0)
1952 return r;
1953
1954 if (le64toh(o->entry.realtime) == needle)
1955 return TEST_FOUND;
1956 else if (le64toh(o->entry.realtime) < needle)
1957 return TEST_LEFT;
1958 else
1959 return TEST_RIGHT;
1960 }
1961
1962 int journal_file_move_to_entry_by_realtime(
1963 JournalFile *f,
1964 uint64_t realtime,
1965 direction_t direction,
1966 Object **ret,
1967 uint64_t *offset) {
1968
1969 return generic_array_bisect(f,
1970 le64toh(f->header->entry_array_offset),
1971 le64toh(f->header->n_entries),
1972 realtime,
1973 test_object_realtime,
1974 direction,
1975 ret, offset, NULL);
1976 }
1977
1978 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1979 Object *o;
1980 int r;
1981
1982 assert(f);
1983 assert(p > 0);
1984
1985 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1986 if (r < 0)
1987 return r;
1988
1989 if (le64toh(o->entry.monotonic) == needle)
1990 return TEST_FOUND;
1991 else if (le64toh(o->entry.monotonic) < needle)
1992 return TEST_LEFT;
1993 else
1994 return TEST_RIGHT;
1995 }
1996
1997 static int find_data_object_by_boot_id(
1998 JournalFile *f,
1999 sd_id128_t boot_id,
2000 Object **o,
2001 uint64_t *b) {
2002
2003 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2004
2005 sd_id128_to_string(boot_id, t + 9);
2006 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2007 }
2008
2009 int journal_file_move_to_entry_by_monotonic(
2010 JournalFile *f,
2011 sd_id128_t boot_id,
2012 uint64_t monotonic,
2013 direction_t direction,
2014 Object **ret,
2015 uint64_t *offset) {
2016
2017 Object *o;
2018 int r;
2019
2020 assert(f);
2021
2022 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2023 if (r < 0)
2024 return r;
2025 if (r == 0)
2026 return -ENOENT;
2027
2028 return generic_array_bisect_plus_one(f,
2029 le64toh(o->data.entry_offset),
2030 le64toh(o->data.entry_array_offset),
2031 le64toh(o->data.n_entries),
2032 monotonic,
2033 test_object_monotonic,
2034 direction,
2035 ret, offset, NULL);
2036 }
2037
2038 void journal_file_reset_location(JournalFile *f) {
2039 f->location_type = LOCATION_HEAD;
2040 f->current_offset = 0;
2041 f->current_seqnum = 0;
2042 f->current_realtime = 0;
2043 f->current_monotonic = 0;
2044 zero(f->current_boot_id);
2045 f->current_xor_hash = 0;
2046 }
2047
2048 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2049 f->location_type = LOCATION_SEEK;
2050 f->current_offset = offset;
2051 f->current_seqnum = le64toh(o->entry.seqnum);
2052 f->current_realtime = le64toh(o->entry.realtime);
2053 f->current_monotonic = le64toh(o->entry.monotonic);
2054 f->current_boot_id = o->entry.boot_id;
2055 f->current_xor_hash = le64toh(o->entry.xor_hash);
2056 }
2057
2058 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2059 assert(af);
2060 assert(bf);
2061 assert(af->location_type == LOCATION_SEEK);
2062 assert(bf->location_type == LOCATION_SEEK);
2063
2064 /* If contents and timestamps match, these entries are
2065 * identical, even if the seqnum does not match */
2066 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2067 af->current_monotonic == bf->current_monotonic &&
2068 af->current_realtime == bf->current_realtime &&
2069 af->current_xor_hash == bf->current_xor_hash)
2070 return 0;
2071
2072 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2073
2074 /* If this is from the same seqnum source, compare
2075 * seqnums */
2076 if (af->current_seqnum < bf->current_seqnum)
2077 return -1;
2078 if (af->current_seqnum > bf->current_seqnum)
2079 return 1;
2080
2081 /* Wow! This is weird, different data but the same
2082 * seqnums? Something is borked, but let's make the
2083 * best of it and compare by time. */
2084 }
2085
2086 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2087
2088 /* If the boot id matches, compare monotonic time */
2089 if (af->current_monotonic < bf->current_monotonic)
2090 return -1;
2091 if (af->current_monotonic > bf->current_monotonic)
2092 return 1;
2093 }
2094
2095 /* Otherwise, compare UTC time */
2096 if (af->current_realtime < bf->current_realtime)
2097 return -1;
2098 if (af->current_realtime > bf->current_realtime)
2099 return 1;
2100
2101 /* Finally, compare by contents */
2102 if (af->current_xor_hash < bf->current_xor_hash)
2103 return -1;
2104 if (af->current_xor_hash > bf->current_xor_hash)
2105 return 1;
2106
2107 return 0;
2108 }
2109
2110 int journal_file_next_entry(
2111 JournalFile *f,
2112 uint64_t p,
2113 direction_t direction,
2114 Object **ret, uint64_t *offset) {
2115
2116 uint64_t i, n, ofs;
2117 int r;
2118
2119 assert(f);
2120
2121 n = le64toh(f->header->n_entries);
2122 if (n <= 0)
2123 return 0;
2124
2125 if (p == 0)
2126 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2127 else {
2128 r = generic_array_bisect(f,
2129 le64toh(f->header->entry_array_offset),
2130 le64toh(f->header->n_entries),
2131 p,
2132 test_object_offset,
2133 DIRECTION_DOWN,
2134 NULL, NULL,
2135 &i);
2136 if (r <= 0)
2137 return r;
2138
2139 if (direction == DIRECTION_DOWN) {
2140 if (i >= n - 1)
2141 return 0;
2142
2143 i++;
2144 } else {
2145 if (i <= 0)
2146 return 0;
2147
2148 i--;
2149 }
2150 }
2151
2152 /* And jump to it */
2153 r = generic_array_get(f,
2154 le64toh(f->header->entry_array_offset),
2155 i,
2156 ret, &ofs);
2157 if (r <= 0)
2158 return r;
2159
2160 if (p > 0 &&
2161 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2162 log_debug("%s: entry array corrupted at entry %"PRIu64,
2163 f->path, i);
2164 return -EBADMSG;
2165 }
2166
2167 if (offset)
2168 *offset = ofs;
2169
2170 return 1;
2171 }
2172
2173 int journal_file_next_entry_for_data(
2174 JournalFile *f,
2175 Object *o, uint64_t p,
2176 uint64_t data_offset,
2177 direction_t direction,
2178 Object **ret, uint64_t *offset) {
2179
2180 uint64_t n, i;
2181 int r;
2182 Object *d;
2183
2184 assert(f);
2185 assert(p > 0 || !o);
2186
2187 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2188 if (r < 0)
2189 return r;
2190
2191 n = le64toh(d->data.n_entries);
2192 if (n <= 0)
2193 return n;
2194
2195 if (!o)
2196 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2197 else {
2198 if (o->object.type != OBJECT_ENTRY)
2199 return -EINVAL;
2200
2201 r = generic_array_bisect_plus_one(f,
2202 le64toh(d->data.entry_offset),
2203 le64toh(d->data.entry_array_offset),
2204 le64toh(d->data.n_entries),
2205 p,
2206 test_object_offset,
2207 DIRECTION_DOWN,
2208 NULL, NULL,
2209 &i);
2210
2211 if (r <= 0)
2212 return r;
2213
2214 if (direction == DIRECTION_DOWN) {
2215 if (i >= n - 1)
2216 return 0;
2217
2218 i++;
2219 } else {
2220 if (i <= 0)
2221 return 0;
2222
2223 i--;
2224 }
2225
2226 }
2227
2228 return generic_array_get_plus_one(f,
2229 le64toh(d->data.entry_offset),
2230 le64toh(d->data.entry_array_offset),
2231 i,
2232 ret, offset);
2233 }
2234
2235 int journal_file_move_to_entry_by_offset_for_data(
2236 JournalFile *f,
2237 uint64_t data_offset,
2238 uint64_t p,
2239 direction_t direction,
2240 Object **ret, uint64_t *offset) {
2241
2242 int r;
2243 Object *d;
2244
2245 assert(f);
2246
2247 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2248 if (r < 0)
2249 return r;
2250
2251 return generic_array_bisect_plus_one(f,
2252 le64toh(d->data.entry_offset),
2253 le64toh(d->data.entry_array_offset),
2254 le64toh(d->data.n_entries),
2255 p,
2256 test_object_offset,
2257 direction,
2258 ret, offset, NULL);
2259 }
2260
2261 int journal_file_move_to_entry_by_monotonic_for_data(
2262 JournalFile *f,
2263 uint64_t data_offset,
2264 sd_id128_t boot_id,
2265 uint64_t monotonic,
2266 direction_t direction,
2267 Object **ret, uint64_t *offset) {
2268
2269 Object *o, *d;
2270 int r;
2271 uint64_t b, z;
2272
2273 assert(f);
2274
2275 /* First, seek by time */
2276 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2277 if (r < 0)
2278 return r;
2279 if (r == 0)
2280 return -ENOENT;
2281
2282 r = generic_array_bisect_plus_one(f,
2283 le64toh(o->data.entry_offset),
2284 le64toh(o->data.entry_array_offset),
2285 le64toh(o->data.n_entries),
2286 monotonic,
2287 test_object_monotonic,
2288 direction,
2289 NULL, &z, NULL);
2290 if (r <= 0)
2291 return r;
2292
2293 /* And now, continue seeking until we find an entry that
2294 * exists in both bisection arrays */
2295
2296 for (;;) {
2297 Object *qo;
2298 uint64_t p, q;
2299
2300 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2301 if (r < 0)
2302 return r;
2303
2304 r = generic_array_bisect_plus_one(f,
2305 le64toh(d->data.entry_offset),
2306 le64toh(d->data.entry_array_offset),
2307 le64toh(d->data.n_entries),
2308 z,
2309 test_object_offset,
2310 direction,
2311 NULL, &p, NULL);
2312 if (r <= 0)
2313 return r;
2314
2315 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2316 if (r < 0)
2317 return r;
2318
2319 r = generic_array_bisect_plus_one(f,
2320 le64toh(o->data.entry_offset),
2321 le64toh(o->data.entry_array_offset),
2322 le64toh(o->data.n_entries),
2323 p,
2324 test_object_offset,
2325 direction,
2326 &qo, &q, NULL);
2327
2328 if (r <= 0)
2329 return r;
2330
2331 if (p == q) {
2332 if (ret)
2333 *ret = qo;
2334 if (offset)
2335 *offset = q;
2336
2337 return 1;
2338 }
2339
2340 z = q;
2341 }
2342 }
2343
2344 int journal_file_move_to_entry_by_seqnum_for_data(
2345 JournalFile *f,
2346 uint64_t data_offset,
2347 uint64_t seqnum,
2348 direction_t direction,
2349 Object **ret, uint64_t *offset) {
2350
2351 Object *d;
2352 int r;
2353
2354 assert(f);
2355
2356 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2357 if (r < 0)
2358 return r;
2359
2360 return generic_array_bisect_plus_one(f,
2361 le64toh(d->data.entry_offset),
2362 le64toh(d->data.entry_array_offset),
2363 le64toh(d->data.n_entries),
2364 seqnum,
2365 test_object_seqnum,
2366 direction,
2367 ret, offset, NULL);
2368 }
2369
2370 int journal_file_move_to_entry_by_realtime_for_data(
2371 JournalFile *f,
2372 uint64_t data_offset,
2373 uint64_t realtime,
2374 direction_t direction,
2375 Object **ret, uint64_t *offset) {
2376
2377 Object *d;
2378 int r;
2379
2380 assert(f);
2381
2382 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2383 if (r < 0)
2384 return r;
2385
2386 return generic_array_bisect_plus_one(f,
2387 le64toh(d->data.entry_offset),
2388 le64toh(d->data.entry_array_offset),
2389 le64toh(d->data.n_entries),
2390 realtime,
2391 test_object_realtime,
2392 direction,
2393 ret, offset, NULL);
2394 }
2395
2396 void journal_file_dump(JournalFile *f) {
2397 Object *o;
2398 int r;
2399 uint64_t p;
2400
2401 assert(f);
2402
2403 journal_file_print_header(f);
2404
2405 p = le64toh(f->header->header_size);
2406 while (p != 0) {
2407 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2408 if (r < 0)
2409 goto fail;
2410
2411 switch (o->object.type) {
2412
2413 case OBJECT_UNUSED:
2414 printf("Type: OBJECT_UNUSED\n");
2415 break;
2416
2417 case OBJECT_DATA:
2418 printf("Type: OBJECT_DATA\n");
2419 break;
2420
2421 case OBJECT_FIELD:
2422 printf("Type: OBJECT_FIELD\n");
2423 break;
2424
2425 case OBJECT_ENTRY:
2426 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2427 le64toh(o->entry.seqnum),
2428 le64toh(o->entry.monotonic),
2429 le64toh(o->entry.realtime));
2430 break;
2431
2432 case OBJECT_FIELD_HASH_TABLE:
2433 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2434 break;
2435
2436 case OBJECT_DATA_HASH_TABLE:
2437 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2438 break;
2439
2440 case OBJECT_ENTRY_ARRAY:
2441 printf("Type: OBJECT_ENTRY_ARRAY\n");
2442 break;
2443
2444 case OBJECT_TAG:
2445 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2446 le64toh(o->tag.seqnum),
2447 le64toh(o->tag.epoch));
2448 break;
2449
2450 default:
2451 printf("Type: unknown (%i)\n", o->object.type);
2452 break;
2453 }
2454
2455 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2456 printf("Flags: %s\n",
2457 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2458
2459 if (p == le64toh(f->header->tail_object_offset))
2460 p = 0;
2461 else
2462 p = p + ALIGN64(le64toh(o->object.size));
2463 }
2464
2465 return;
2466 fail:
2467 log_error("File corrupt");
2468 }
2469
2470 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2471 const char *x;
2472
2473 x = format_timestamp(buf, l, t);
2474 if (x)
2475 return x;
2476 return " --- ";
2477 }
2478
2479 void journal_file_print_header(JournalFile *f) {
2480 char a[33], b[33], c[33], d[33];
2481 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2482 struct stat st;
2483 char bytes[FORMAT_BYTES_MAX];
2484
2485 assert(f);
2486
2487 printf("File Path: %s\n"
2488 "File ID: %s\n"
2489 "Machine ID: %s\n"
2490 "Boot ID: %s\n"
2491 "Sequential Number ID: %s\n"
2492 "State: %s\n"
2493 "Compatible Flags:%s%s\n"
2494 "Incompatible Flags:%s%s%s\n"
2495 "Header size: %"PRIu64"\n"
2496 "Arena size: %"PRIu64"\n"
2497 "Data Hash Table Size: %"PRIu64"\n"
2498 "Field Hash Table Size: %"PRIu64"\n"
2499 "Rotate Suggested: %s\n"
2500 "Head Sequential Number: %"PRIu64"\n"
2501 "Tail Sequential Number: %"PRIu64"\n"
2502 "Head Realtime Timestamp: %s\n"
2503 "Tail Realtime Timestamp: %s\n"
2504 "Tail Monotonic Timestamp: %s\n"
2505 "Objects: %"PRIu64"\n"
2506 "Entry Objects: %"PRIu64"\n",
2507 f->path,
2508 sd_id128_to_string(f->header->file_id, a),
2509 sd_id128_to_string(f->header->machine_id, b),
2510 sd_id128_to_string(f->header->boot_id, c),
2511 sd_id128_to_string(f->header->seqnum_id, d),
2512 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2513 f->header->state == STATE_ONLINE ? "ONLINE" :
2514 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2515 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2516 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2517 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2518 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2519 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2520 le64toh(f->header->header_size),
2521 le64toh(f->header->arena_size),
2522 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2523 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2524 yes_no(journal_file_rotate_suggested(f, 0)),
2525 le64toh(f->header->head_entry_seqnum),
2526 le64toh(f->header->tail_entry_seqnum),
2527 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2528 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2529 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2530 le64toh(f->header->n_objects),
2531 le64toh(f->header->n_entries));
2532
2533 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2534 printf("Data Objects: %"PRIu64"\n"
2535 "Data Hash Table Fill: %.1f%%\n",
2536 le64toh(f->header->n_data),
2537 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2538
2539 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2540 printf("Field Objects: %"PRIu64"\n"
2541 "Field Hash Table Fill: %.1f%%\n",
2542 le64toh(f->header->n_fields),
2543 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2544
2545 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2546 printf("Tag Objects: %"PRIu64"\n",
2547 le64toh(f->header->n_tags));
2548 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2549 printf("Entry Array Objects: %"PRIu64"\n",
2550 le64toh(f->header->n_entry_arrays));
2551
2552 if (fstat(f->fd, &st) >= 0)
2553 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2554 }
2555
2556 static int journal_file_warn_btrfs(JournalFile *f) {
2557 unsigned attrs;
2558 int r;
2559
2560 assert(f);
2561
2562 /* Before we write anything, check if the COW logic is turned
2563 * off on btrfs. Given our write pattern that is quite
2564 * unfriendly to COW file systems this should greatly improve
2565 * performance on COW file systems, such as btrfs, at the
2566 * expense of data integrity features (which shouldn't be too
2567 * bad, given that we do our own checksumming). */
2568
2569 r = btrfs_is_filesystem(f->fd);
2570 if (r < 0)
2571 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2572 if (!r)
2573 return 0;
2574
2575 r = read_attr_fd(f->fd, &attrs);
2576 if (r < 0)
2577 return log_warning_errno(r, "Failed to read file attributes: %m");
2578
2579 if (attrs & FS_NOCOW_FL) {
2580 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2581 return 0;
2582 }
2583
2584 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2585 "This is likely to slow down journal access substantially, please consider turning "
2586 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2587
2588 return 1;
2589 }
2590
2591 int journal_file_open(
2592 const char *fname,
2593 int flags,
2594 mode_t mode,
2595 bool compress,
2596 bool seal,
2597 JournalMetrics *metrics,
2598 MMapCache *mmap_cache,
2599 JournalFile *template,
2600 JournalFile **ret) {
2601
2602 bool newly_created = false;
2603 JournalFile *f;
2604 void *h;
2605 int r;
2606
2607 assert(fname);
2608 assert(ret);
2609
2610 if ((flags & O_ACCMODE) != O_RDONLY &&
2611 (flags & O_ACCMODE) != O_RDWR)
2612 return -EINVAL;
2613
2614 if (!endswith(fname, ".journal") &&
2615 !endswith(fname, ".journal~"))
2616 return -EINVAL;
2617
2618 f = new0(JournalFile, 1);
2619 if (!f)
2620 return -ENOMEM;
2621
2622 f->fd = -1;
2623 f->mode = mode;
2624
2625 f->flags = flags;
2626 f->prot = prot_from_flags(flags);
2627 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2628 #if defined(HAVE_LZ4)
2629 f->compress_lz4 = compress;
2630 #elif defined(HAVE_XZ)
2631 f->compress_xz = compress;
2632 #endif
2633 #ifdef HAVE_GCRYPT
2634 f->seal = seal;
2635 #endif
2636
2637 if (mmap_cache)
2638 f->mmap = mmap_cache_ref(mmap_cache);
2639 else {
2640 f->mmap = mmap_cache_new();
2641 if (!f->mmap) {
2642 r = -ENOMEM;
2643 goto fail;
2644 }
2645 }
2646
2647 f->path = strdup(fname);
2648 if (!f->path) {
2649 r = -ENOMEM;
2650 goto fail;
2651 }
2652
2653 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2654 if (!f->chain_cache) {
2655 r = -ENOMEM;
2656 goto fail;
2657 }
2658
2659 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2660 if (f->fd < 0) {
2661 r = -errno;
2662 goto fail;
2663 }
2664
2665 r = journal_file_fstat(f);
2666 if (r < 0)
2667 goto fail;
2668
2669 if (f->last_stat.st_size == 0 && f->writable) {
2670
2671 (void) journal_file_warn_btrfs(f);
2672
2673 /* Let's attach the creation time to the journal file,
2674 * so that the vacuuming code knows the age of this
2675 * file even if the file might end up corrupted one
2676 * day... Ideally we'd just use the creation time many
2677 * file systems maintain for each file, but there is
2678 * currently no usable API to query this, hence let's
2679 * emulate this via extended attributes. If extended
2680 * attributes are not supported we'll just skip this,
2681 * and rely solely on mtime/atime/ctime of the file. */
2682
2683 fd_setcrtime(f->fd, 0);
2684
2685 #ifdef HAVE_GCRYPT
2686 /* Try to load the FSPRG state, and if we can't, then
2687 * just don't do sealing */
2688 if (f->seal) {
2689 r = journal_file_fss_load(f);
2690 if (r < 0)
2691 f->seal = false;
2692 }
2693 #endif
2694
2695 r = journal_file_init_header(f, template);
2696 if (r < 0)
2697 goto fail;
2698
2699 r = journal_file_fstat(f);
2700 if (r < 0)
2701 goto fail;
2702
2703 newly_created = true;
2704 }
2705
2706 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2707 r = -EIO;
2708 goto fail;
2709 }
2710
2711 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2712 if (r < 0)
2713 goto fail;
2714
2715 f->header = h;
2716
2717 if (!newly_created) {
2718 r = journal_file_verify_header(f);
2719 if (r < 0)
2720 goto fail;
2721 }
2722
2723 #ifdef HAVE_GCRYPT
2724 if (!newly_created && f->writable) {
2725 r = journal_file_fss_load(f);
2726 if (r < 0)
2727 goto fail;
2728 }
2729 #endif
2730
2731 if (f->writable) {
2732 if (metrics) {
2733 journal_default_metrics(metrics, f->fd);
2734 f->metrics = *metrics;
2735 } else if (template)
2736 f->metrics = template->metrics;
2737
2738 r = journal_file_refresh_header(f);
2739 if (r < 0)
2740 goto fail;
2741 }
2742
2743 #ifdef HAVE_GCRYPT
2744 r = journal_file_hmac_setup(f);
2745 if (r < 0)
2746 goto fail;
2747 #endif
2748
2749 if (newly_created) {
2750 r = journal_file_setup_field_hash_table(f);
2751 if (r < 0)
2752 goto fail;
2753
2754 r = journal_file_setup_data_hash_table(f);
2755 if (r < 0)
2756 goto fail;
2757
2758 #ifdef HAVE_GCRYPT
2759 r = journal_file_append_first_tag(f);
2760 if (r < 0)
2761 goto fail;
2762 #endif
2763 }
2764
2765 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2766 r = -EIO;
2767 goto fail;
2768 }
2769
2770 *ret = f;
2771 return 0;
2772
2773 fail:
2774 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2775 r = -EIO;
2776
2777 journal_file_close(f);
2778
2779 return r;
2780 }
2781
2782 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2783 _cleanup_free_ char *p = NULL;
2784 size_t l;
2785 JournalFile *old_file, *new_file = NULL;
2786 int r;
2787
2788 assert(f);
2789 assert(*f);
2790
2791 old_file = *f;
2792
2793 if (!old_file->writable)
2794 return -EINVAL;
2795
2796 if (!endswith(old_file->path, ".journal"))
2797 return -EINVAL;
2798
2799 l = strlen(old_file->path);
2800 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2801 (int) l - 8, old_file->path,
2802 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2803 le64toh((*f)->header->head_entry_seqnum),
2804 le64toh((*f)->header->head_entry_realtime));
2805 if (r < 0)
2806 return -ENOMEM;
2807
2808 /* Try to rename the file to the archived version. If the file
2809 * already was deleted, we'll get ENOENT, let's ignore that
2810 * case. */
2811 r = rename(old_file->path, p);
2812 if (r < 0 && errno != ENOENT)
2813 return -errno;
2814
2815 old_file->header->state = STATE_ARCHIVED;
2816
2817 /* Currently, btrfs is not very good with out write patterns
2818 * and fragments heavily. Let's defrag our journal files when
2819 * we archive them */
2820 old_file->defrag_on_close = true;
2821
2822 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2823 journal_file_close(old_file);
2824
2825 *f = new_file;
2826 return r;
2827 }
2828
2829 int journal_file_open_reliably(
2830 const char *fname,
2831 int flags,
2832 mode_t mode,
2833 bool compress,
2834 bool seal,
2835 JournalMetrics *metrics,
2836 MMapCache *mmap_cache,
2837 JournalFile *template,
2838 JournalFile **ret) {
2839
2840 int r;
2841 size_t l;
2842 _cleanup_free_ char *p = NULL;
2843
2844 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2845 if (!IN_SET(r,
2846 -EBADMSG, /* corrupted */
2847 -ENODATA, /* truncated */
2848 -EHOSTDOWN, /* other machine */
2849 -EPROTONOSUPPORT, /* incompatible feature */
2850 -EBUSY, /* unclean shutdown */
2851 -ESHUTDOWN, /* already archived */
2852 -EIO, /* IO error, including SIGBUS on mmap */
2853 -EIDRM /* File has been deleted */))
2854 return r;
2855
2856 if ((flags & O_ACCMODE) == O_RDONLY)
2857 return r;
2858
2859 if (!(flags & O_CREAT))
2860 return r;
2861
2862 if (!endswith(fname, ".journal"))
2863 return r;
2864
2865 /* The file is corrupted. Rotate it away and try it again (but only once) */
2866
2867 l = strlen(fname);
2868 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2869 (int) l - 8, fname,
2870 now(CLOCK_REALTIME),
2871 random_u64()) < 0)
2872 return -ENOMEM;
2873
2874 if (rename(fname, p) < 0)
2875 return -errno;
2876
2877 /* btrfs doesn't cope well with our write pattern and
2878 * fragments heavily. Let's defrag all files we rotate */
2879
2880 (void) chattr_path(p, false, FS_NOCOW_FL);
2881 (void) btrfs_defrag(p);
2882
2883 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2884
2885 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2886 }
2887
2888 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2889 uint64_t i, n;
2890 uint64_t q, xor_hash = 0;
2891 int r;
2892 EntryItem *items;
2893 dual_timestamp ts;
2894
2895 assert(from);
2896 assert(to);
2897 assert(o);
2898 assert(p);
2899
2900 if (!to->writable)
2901 return -EPERM;
2902
2903 ts.monotonic = le64toh(o->entry.monotonic);
2904 ts.realtime = le64toh(o->entry.realtime);
2905
2906 n = journal_file_entry_n_items(o);
2907 /* alloca() can't take 0, hence let's allocate at least one */
2908 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2909
2910 for (i = 0; i < n; i++) {
2911 uint64_t l, h;
2912 le64_t le_hash;
2913 size_t t;
2914 void *data;
2915 Object *u;
2916
2917 q = le64toh(o->entry.items[i].object_offset);
2918 le_hash = o->entry.items[i].hash;
2919
2920 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2921 if (r < 0)
2922 return r;
2923
2924 if (le_hash != o->data.hash)
2925 return -EBADMSG;
2926
2927 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2928 t = (size_t) l;
2929
2930 /* We hit the limit on 32bit machines */
2931 if ((uint64_t) t != l)
2932 return -E2BIG;
2933
2934 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2935 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2936 size_t rsize = 0;
2937
2938 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2939 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2940 if (r < 0)
2941 return r;
2942
2943 data = from->compress_buffer;
2944 l = rsize;
2945 #else
2946 return -EPROTONOSUPPORT;
2947 #endif
2948 } else
2949 data = o->data.payload;
2950
2951 r = journal_file_append_data(to, data, l, &u, &h);
2952 if (r < 0)
2953 return r;
2954
2955 xor_hash ^= le64toh(u->data.hash);
2956 items[i].object_offset = htole64(h);
2957 items[i].hash = u->data.hash;
2958
2959 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2960 if (r < 0)
2961 return r;
2962 }
2963
2964 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2965
2966 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2967 return -EIO;
2968
2969 return r;
2970 }
2971
2972 void journal_reset_metrics(JournalMetrics *m) {
2973 assert(m);
2974
2975 /* Set everything to "pick automatic values". */
2976
2977 *m = (JournalMetrics) {
2978 .min_use = (uint64_t) -1,
2979 .max_use = (uint64_t) -1,
2980 .min_size = (uint64_t) -1,
2981 .max_size = (uint64_t) -1,
2982 .keep_free = (uint64_t) -1,
2983 .n_max_files = (uint64_t) -1,
2984 };
2985 }
2986
2987 void journal_default_metrics(JournalMetrics *m, int fd) {
2988 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
2989 struct statvfs ss;
2990 uint64_t fs_size;
2991
2992 assert(m);
2993 assert(fd >= 0);
2994
2995 if (fstatvfs(fd, &ss) >= 0)
2996 fs_size = ss.f_frsize * ss.f_blocks;
2997 else {
2998 log_debug_errno(errno, "Failed to detremine disk size: %m");
2999 fs_size = 0;
3000 }
3001
3002 if (m->max_use == (uint64_t) -1) {
3003
3004 if (fs_size > 0) {
3005 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3006
3007 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3008 m->max_use = DEFAULT_MAX_USE_UPPER;
3009
3010 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3011 m->max_use = DEFAULT_MAX_USE_LOWER;
3012 } else
3013 m->max_use = DEFAULT_MAX_USE_LOWER;
3014 } else {
3015 m->max_use = PAGE_ALIGN(m->max_use);
3016
3017 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3018 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3019 }
3020
3021 if (m->min_use == (uint64_t) -1)
3022 m->min_use = DEFAULT_MIN_USE;
3023
3024 if (m->min_use > m->max_use)
3025 m->min_use = m->max_use;
3026
3027 if (m->max_size == (uint64_t) -1) {
3028 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3029
3030 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3031 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3032 } else
3033 m->max_size = PAGE_ALIGN(m->max_size);
3034
3035 if (m->max_size != 0) {
3036 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3037 m->max_size = JOURNAL_FILE_SIZE_MIN;
3038
3039 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3040 m->max_use = m->max_size*2;
3041 }
3042
3043 if (m->min_size == (uint64_t) -1)
3044 m->min_size = JOURNAL_FILE_SIZE_MIN;
3045 else {
3046 m->min_size = PAGE_ALIGN(m->min_size);
3047
3048 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3049 m->min_size = JOURNAL_FILE_SIZE_MIN;
3050
3051 if (m->max_size != 0 && m->min_size > m->max_size)
3052 m->max_size = m->min_size;
3053 }
3054
3055 if (m->keep_free == (uint64_t) -1) {
3056
3057 if (fs_size > 0) {
3058 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3059
3060 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3061 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3062
3063 } else
3064 m->keep_free = DEFAULT_KEEP_FREE;
3065 }
3066
3067 if (m->n_max_files == (uint64_t) -1)
3068 m->n_max_files = DEFAULT_N_MAX_FILES;
3069
3070 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3071 format_bytes(a, sizeof(a), m->min_use),
3072 format_bytes(b, sizeof(b), m->max_use),
3073 format_bytes(c, sizeof(c), m->max_size),
3074 format_bytes(d, sizeof(d), m->min_size),
3075 format_bytes(e, sizeof(e), m->keep_free),
3076 m->n_max_files);
3077 }
3078
3079 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3080 assert(f);
3081 assert(from || to);
3082
3083 if (from) {
3084 if (f->header->head_entry_realtime == 0)
3085 return -ENOENT;
3086
3087 *from = le64toh(f->header->head_entry_realtime);
3088 }
3089
3090 if (to) {
3091 if (f->header->tail_entry_realtime == 0)
3092 return -ENOENT;
3093
3094 *to = le64toh(f->header->tail_entry_realtime);
3095 }
3096
3097 return 1;
3098 }
3099
3100 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3101 Object *o;
3102 uint64_t p;
3103 int r;
3104
3105 assert(f);
3106 assert(from || to);
3107
3108 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3109 if (r <= 0)
3110 return r;
3111
3112 if (le64toh(o->data.n_entries) <= 0)
3113 return 0;
3114
3115 if (from) {
3116 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3117 if (r < 0)
3118 return r;
3119
3120 *from = le64toh(o->entry.monotonic);
3121 }
3122
3123 if (to) {
3124 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3125 if (r < 0)
3126 return r;
3127
3128 r = generic_array_get_plus_one(f,
3129 le64toh(o->data.entry_offset),
3130 le64toh(o->data.entry_array_offset),
3131 le64toh(o->data.n_entries)-1,
3132 &o, NULL);
3133 if (r <= 0)
3134 return r;
3135
3136 *to = le64toh(o->entry.monotonic);
3137 }
3138
3139 return 1;
3140 }
3141
3142 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3143 assert(f);
3144
3145 /* If we gained new header fields we gained new features,
3146 * hence suggest a rotation */
3147 if (le64toh(f->header->header_size) < sizeof(Header)) {
3148 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3149 return true;
3150 }
3151
3152 /* Let's check if the hash tables grew over a certain fill
3153 * level (75%, borrowing this value from Java's hash table
3154 * implementation), and if so suggest a rotation. To calculate
3155 * the fill level we need the n_data field, which only exists
3156 * in newer versions. */
3157
3158 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3159 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3160 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3161 f->path,
3162 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3163 le64toh(f->header->n_data),
3164 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3165 (unsigned long long) f->last_stat.st_size,
3166 f->last_stat.st_size / le64toh(f->header->n_data));
3167 return true;
3168 }
3169
3170 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3171 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3172 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3173 f->path,
3174 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3175 le64toh(f->header->n_fields),
3176 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3177 return true;
3178 }
3179
3180 /* Are the data objects properly indexed by field objects? */
3181 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3182 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3183 le64toh(f->header->n_data) > 0 &&
3184 le64toh(f->header->n_fields) == 0)
3185 return true;
3186
3187 if (max_file_usec > 0) {
3188 usec_t t, h;
3189
3190 h = le64toh(f->header->head_entry_realtime);
3191 t = now(CLOCK_REALTIME);
3192
3193 if (h > 0 && t > h + max_file_usec)
3194 return true;
3195 }
3196
3197 return false;
3198 }