]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
929ad0aa7ce485611a3f917fd6ea9175eed72134
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <linux/fs.h>
25 #include <stddef.h>
26 #include <sys/mman.h>
27 #include <sys/statvfs.h>
28 #include <sys/uio.h>
29 #include <unistd.h>
30
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
34 #include "compress.h"
35 #include "fd-util.h"
36 #include "journal-authenticate.h"
37 #include "journal-def.h"
38 #include "journal-file.h"
39 #include "lookup3.h"
40 #include "parse-util.h"
41 #include "random-util.h"
42 #include "sd-event.h"
43 #include "string-util.h"
44 #include "xattr-util.h"
45
46 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
47 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
48
49 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
50
51 /* This is the minimum journal file size */
52 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
53
54 /* These are the lower and upper bounds if we deduce the max_use value
55 * from the file system size */
56 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
57 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58
59 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
60 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
61
62 /* This is the upper bound if we deduce max_size from max_use */
63 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
64
65 /* This is the upper bound if we deduce the keep_free value from the
66 * file system size */
67 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
68
69 /* This is the keep_free value when we can't determine the file
70 * system size */
71 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
72
73 /* This is the default maximum number of journal files to keep around. */
74 #define DEFAULT_N_MAX_FILES (100)
75
76 /* n_data was the first entry we added after the initial file format design */
77 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
78
79 /* How many entries to keep in the entry array chain cache at max */
80 #define CHAIN_CACHE_MAX 20
81
82 /* How much to increase the journal file size at once each time we allocate something new. */
83 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
84
85 /* Reread fstat() of the file for detecting deletions at least this often */
86 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
87
88 /* The mmap context to use for the header; we pick one above the last defined object type */
89 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
90
91 static int journal_file_set_online(JournalFile *f) {
92 assert(f);
93
94 if (!f->writable)
95 return -EPERM;
96
97 if (!(f->fd >= 0 && f->header))
98 return -EINVAL;
99
100 if (mmap_cache_got_sigbus(f->mmap, f->fd))
101 return -EIO;
102
103 switch(f->header->state) {
104 case STATE_ONLINE:
105 return 0;
106
107 case STATE_OFFLINE:
108 f->header->state = STATE_ONLINE;
109 fsync(f->fd);
110 return 0;
111
112 default:
113 return -EINVAL;
114 }
115 }
116
117 int journal_file_set_offline(JournalFile *f) {
118 assert(f);
119
120 if (!f->writable)
121 return -EPERM;
122
123 if (!(f->fd >= 0 && f->header))
124 return -EINVAL;
125
126 if (f->header->state != STATE_ONLINE)
127 return 0;
128
129 fsync(f->fd);
130
131 if (mmap_cache_got_sigbus(f->mmap, f->fd))
132 return -EIO;
133
134 f->header->state = STATE_OFFLINE;
135
136 if (mmap_cache_got_sigbus(f->mmap, f->fd))
137 return -EIO;
138
139 fsync(f->fd);
140
141 return 0;
142 }
143
144 JournalFile* journal_file_close(JournalFile *f) {
145 assert(f);
146
147 #ifdef HAVE_GCRYPT
148 /* Write the final tag */
149 if (f->seal && f->writable)
150 journal_file_append_tag(f);
151 #endif
152
153 if (f->post_change_timer) {
154 int enabled;
155
156 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
157 if (enabled == SD_EVENT_ONESHOT)
158 journal_file_post_change(f);
159
160 sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
161 sd_event_source_unref(f->post_change_timer);
162 }
163
164 journal_file_set_offline(f);
165
166 if (f->mmap && f->fd >= 0)
167 mmap_cache_close_fd(f->mmap, f->fd);
168
169 if (f->fd >= 0 && f->defrag_on_close) {
170
171 /* Be friendly to btrfs: turn COW back on again now,
172 * and defragment the file. We won't write to the file
173 * ever again, hence remove all fragmentation, and
174 * reenable all the good bits COW usually provides
175 * (such as data checksumming). */
176
177 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
178 (void) btrfs_defrag_fd(f->fd);
179 }
180
181 safe_close(f->fd);
182 free(f->path);
183
184 mmap_cache_unref(f->mmap);
185
186 ordered_hashmap_free_free(f->chain_cache);
187
188 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
189 free(f->compress_buffer);
190 #endif
191
192 #ifdef HAVE_GCRYPT
193 if (f->fss_file)
194 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
195 else
196 free(f->fsprg_state);
197
198 free(f->fsprg_seed);
199
200 if (f->hmac)
201 gcry_md_close(f->hmac);
202 #endif
203
204 free(f);
205 return NULL;
206 }
207
208 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
209 Header h = {};
210 ssize_t k;
211 int r;
212
213 assert(f);
214
215 memcpy(h.signature, HEADER_SIGNATURE, 8);
216 h.header_size = htole64(ALIGN64(sizeof(h)));
217
218 h.incompatible_flags |= htole32(
219 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
220 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
221
222 h.compatible_flags = htole32(
223 f->seal * HEADER_COMPATIBLE_SEALED);
224
225 r = sd_id128_randomize(&h.file_id);
226 if (r < 0)
227 return r;
228
229 if (template) {
230 h.seqnum_id = template->header->seqnum_id;
231 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
232 } else
233 h.seqnum_id = h.file_id;
234
235 k = pwrite(f->fd, &h, sizeof(h), 0);
236 if (k < 0)
237 return -errno;
238
239 if (k != sizeof(h))
240 return -EIO;
241
242 return 0;
243 }
244
245 static int journal_file_refresh_header(JournalFile *f) {
246 sd_id128_t boot_id;
247 int r;
248
249 assert(f);
250
251 r = sd_id128_get_machine(&f->header->machine_id);
252 if (r < 0)
253 return r;
254
255 r = sd_id128_get_boot(&boot_id);
256 if (r < 0)
257 return r;
258
259 if (sd_id128_equal(boot_id, f->header->boot_id))
260 f->tail_entry_monotonic_valid = true;
261
262 f->header->boot_id = boot_id;
263
264 r = journal_file_set_online(f);
265
266 /* Sync the online state to disk */
267 fsync(f->fd);
268
269 return r;
270 }
271
272 static int journal_file_verify_header(JournalFile *f) {
273 uint32_t flags;
274
275 assert(f);
276
277 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
278 return -EBADMSG;
279
280 /* In both read and write mode we refuse to open files with
281 * incompatible flags we don't know */
282 flags = le32toh(f->header->incompatible_flags);
283 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
284 if (flags & ~HEADER_INCOMPATIBLE_ANY)
285 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
286 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
287 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
288 if (flags)
289 log_debug("Journal file %s uses incompatible flags %"PRIx32
290 " disabled at compilation time.", f->path, flags);
291 return -EPROTONOSUPPORT;
292 }
293
294 /* When open for writing we refuse to open files with
295 * compatible flags, too */
296 flags = le32toh(f->header->compatible_flags);
297 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
298 if (flags & ~HEADER_COMPATIBLE_ANY)
299 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
300 f->path, flags & ~HEADER_COMPATIBLE_ANY);
301 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
302 if (flags)
303 log_debug("Journal file %s uses compatible flags %"PRIx32
304 " disabled at compilation time.", f->path, flags);
305 return -EPROTONOSUPPORT;
306 }
307
308 if (f->header->state >= _STATE_MAX)
309 return -EBADMSG;
310
311 /* The first addition was n_data, so check that we are at least this large */
312 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
313 return -EBADMSG;
314
315 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
316 return -EBADMSG;
317
318 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
319 return -ENODATA;
320
321 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
322 return -ENODATA;
323
324 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
325 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
326 !VALID64(le64toh(f->header->tail_object_offset)) ||
327 !VALID64(le64toh(f->header->entry_array_offset)))
328 return -ENODATA;
329
330 if (f->writable) {
331 uint8_t state;
332 sd_id128_t machine_id;
333 int r;
334
335 r = sd_id128_get_machine(&machine_id);
336 if (r < 0)
337 return r;
338
339 if (!sd_id128_equal(machine_id, f->header->machine_id))
340 return -EHOSTDOWN;
341
342 state = f->header->state;
343
344 if (state == STATE_ONLINE) {
345 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
346 return -EBUSY;
347 } else if (state == STATE_ARCHIVED)
348 return -ESHUTDOWN;
349 else if (state != STATE_OFFLINE) {
350 log_debug("Journal file %s has unknown state %i.", f->path, state);
351 return -EBUSY;
352 }
353 }
354
355 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
356 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
357
358 f->seal = JOURNAL_HEADER_SEALED(f->header);
359
360 return 0;
361 }
362
363 static int journal_file_fstat(JournalFile *f) {
364 assert(f);
365 assert(f->fd >= 0);
366
367 if (fstat(f->fd, &f->last_stat) < 0)
368 return -errno;
369
370 f->last_stat_usec = now(CLOCK_MONOTONIC);
371
372 /* Refuse appending to files that are already deleted */
373 if (f->last_stat.st_nlink <= 0)
374 return -EIDRM;
375
376 return 0;
377 }
378
379 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
380 uint64_t old_size, new_size;
381 int r;
382
383 assert(f);
384
385 /* We assume that this file is not sparse, and we know that
386 * for sure, since we always call posix_fallocate()
387 * ourselves */
388
389 if (mmap_cache_got_sigbus(f->mmap, f->fd))
390 return -EIO;
391
392 old_size =
393 le64toh(f->header->header_size) +
394 le64toh(f->header->arena_size);
395
396 new_size = PAGE_ALIGN(offset + size);
397 if (new_size < le64toh(f->header->header_size))
398 new_size = le64toh(f->header->header_size);
399
400 if (new_size <= old_size) {
401
402 /* We already pre-allocated enough space, but before
403 * we write to it, let's check with fstat() if the
404 * file got deleted, in order make sure we don't throw
405 * away the data immediately. Don't check fstat() for
406 * all writes though, but only once ever 10s. */
407
408 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
409 return 0;
410
411 return journal_file_fstat(f);
412 }
413
414 /* Allocate more space. */
415
416 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
417 return -E2BIG;
418
419 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
420 struct statvfs svfs;
421
422 if (fstatvfs(f->fd, &svfs) >= 0) {
423 uint64_t available;
424
425 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
426
427 if (new_size - old_size > available)
428 return -E2BIG;
429 }
430 }
431
432 /* Increase by larger blocks at once */
433 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
434 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
435 new_size = f->metrics.max_size;
436
437 /* Note that the glibc fallocate() fallback is very
438 inefficient, hence we try to minimize the allocation area
439 as we can. */
440 r = posix_fallocate(f->fd, old_size, new_size - old_size);
441 if (r != 0)
442 return -r;
443
444 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
445
446 return journal_file_fstat(f);
447 }
448
449 static unsigned type_to_context(ObjectType type) {
450 /* One context for each type, plus one catch-all for the rest */
451 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
452 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
453 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
454 }
455
456 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
457 int r;
458
459 assert(f);
460 assert(ret);
461
462 if (size <= 0)
463 return -EINVAL;
464
465 /* Avoid SIGBUS on invalid accesses */
466 if (offset + size > (uint64_t) f->last_stat.st_size) {
467 /* Hmm, out of range? Let's refresh the fstat() data
468 * first, before we trust that check. */
469
470 r = journal_file_fstat(f);
471 if (r < 0)
472 return r;
473
474 if (offset + size > (uint64_t) f->last_stat.st_size)
475 return -EADDRNOTAVAIL;
476 }
477
478 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
479 }
480
481 static uint64_t minimum_header_size(Object *o) {
482
483 static const uint64_t table[] = {
484 [OBJECT_DATA] = sizeof(DataObject),
485 [OBJECT_FIELD] = sizeof(FieldObject),
486 [OBJECT_ENTRY] = sizeof(EntryObject),
487 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
488 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
489 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
490 [OBJECT_TAG] = sizeof(TagObject),
491 };
492
493 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
494 return sizeof(ObjectHeader);
495
496 return table[o->object.type];
497 }
498
499 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
500 int r;
501 void *t;
502 Object *o;
503 uint64_t s;
504
505 assert(f);
506 assert(ret);
507
508 /* Objects may only be located at multiple of 64 bit */
509 if (!VALID64(offset))
510 return -EFAULT;
511
512 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
513 if (r < 0)
514 return r;
515
516 o = (Object*) t;
517 s = le64toh(o->object.size);
518
519 if (s < sizeof(ObjectHeader))
520 return -EBADMSG;
521
522 if (o->object.type <= OBJECT_UNUSED)
523 return -EBADMSG;
524
525 if (s < minimum_header_size(o))
526 return -EBADMSG;
527
528 if (type > OBJECT_UNUSED && o->object.type != type)
529 return -EBADMSG;
530
531 if (s > sizeof(ObjectHeader)) {
532 r = journal_file_move_to(f, type, false, offset, s, &t);
533 if (r < 0)
534 return r;
535
536 o = (Object*) t;
537 }
538
539 *ret = o;
540 return 0;
541 }
542
543 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
544 uint64_t r;
545
546 assert(f);
547
548 r = le64toh(f->header->tail_entry_seqnum) + 1;
549
550 if (seqnum) {
551 /* If an external seqnum counter was passed, we update
552 * both the local and the external one, and set it to
553 * the maximum of both */
554
555 if (*seqnum + 1 > r)
556 r = *seqnum + 1;
557
558 *seqnum = r;
559 }
560
561 f->header->tail_entry_seqnum = htole64(r);
562
563 if (f->header->head_entry_seqnum == 0)
564 f->header->head_entry_seqnum = htole64(r);
565
566 return r;
567 }
568
569 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
570 int r;
571 uint64_t p;
572 Object *tail, *o;
573 void *t;
574
575 assert(f);
576 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
577 assert(size >= sizeof(ObjectHeader));
578 assert(offset);
579 assert(ret);
580
581 r = journal_file_set_online(f);
582 if (r < 0)
583 return r;
584
585 p = le64toh(f->header->tail_object_offset);
586 if (p == 0)
587 p = le64toh(f->header->header_size);
588 else {
589 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
590 if (r < 0)
591 return r;
592
593 p += ALIGN64(le64toh(tail->object.size));
594 }
595
596 r = journal_file_allocate(f, p, size);
597 if (r < 0)
598 return r;
599
600 r = journal_file_move_to(f, type, false, p, size, &t);
601 if (r < 0)
602 return r;
603
604 o = (Object*) t;
605
606 zero(o->object);
607 o->object.type = type;
608 o->object.size = htole64(size);
609
610 f->header->tail_object_offset = htole64(p);
611 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
612
613 *ret = o;
614 *offset = p;
615
616 return 0;
617 }
618
619 static int journal_file_setup_data_hash_table(JournalFile *f) {
620 uint64_t s, p;
621 Object *o;
622 int r;
623
624 assert(f);
625
626 /* We estimate that we need 1 hash table entry per 768 bytes
627 of journal file and we want to make sure we never get
628 beyond 75% fill level. Calculate the hash table size for
629 the maximum file size based on these metrics. */
630
631 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
632 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
633 s = DEFAULT_DATA_HASH_TABLE_SIZE;
634
635 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
636
637 r = journal_file_append_object(f,
638 OBJECT_DATA_HASH_TABLE,
639 offsetof(Object, hash_table.items) + s,
640 &o, &p);
641 if (r < 0)
642 return r;
643
644 memzero(o->hash_table.items, s);
645
646 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
647 f->header->data_hash_table_size = htole64(s);
648
649 return 0;
650 }
651
652 static int journal_file_setup_field_hash_table(JournalFile *f) {
653 uint64_t s, p;
654 Object *o;
655 int r;
656
657 assert(f);
658
659 /* We use a fixed size hash table for the fields as this
660 * number should grow very slowly only */
661
662 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
663 r = journal_file_append_object(f,
664 OBJECT_FIELD_HASH_TABLE,
665 offsetof(Object, hash_table.items) + s,
666 &o, &p);
667 if (r < 0)
668 return r;
669
670 memzero(o->hash_table.items, s);
671
672 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
673 f->header->field_hash_table_size = htole64(s);
674
675 return 0;
676 }
677
678 int journal_file_map_data_hash_table(JournalFile *f) {
679 uint64_t s, p;
680 void *t;
681 int r;
682
683 assert(f);
684
685 if (f->data_hash_table)
686 return 0;
687
688 p = le64toh(f->header->data_hash_table_offset);
689 s = le64toh(f->header->data_hash_table_size);
690
691 r = journal_file_move_to(f,
692 OBJECT_DATA_HASH_TABLE,
693 true,
694 p, s,
695 &t);
696 if (r < 0)
697 return r;
698
699 f->data_hash_table = t;
700 return 0;
701 }
702
703 int journal_file_map_field_hash_table(JournalFile *f) {
704 uint64_t s, p;
705 void *t;
706 int r;
707
708 assert(f);
709
710 if (f->field_hash_table)
711 return 0;
712
713 p = le64toh(f->header->field_hash_table_offset);
714 s = le64toh(f->header->field_hash_table_size);
715
716 r = journal_file_move_to(f,
717 OBJECT_FIELD_HASH_TABLE,
718 true,
719 p, s,
720 &t);
721 if (r < 0)
722 return r;
723
724 f->field_hash_table = t;
725 return 0;
726 }
727
728 static int journal_file_link_field(
729 JournalFile *f,
730 Object *o,
731 uint64_t offset,
732 uint64_t hash) {
733
734 uint64_t p, h, m;
735 int r;
736
737 assert(f);
738 assert(o);
739 assert(offset > 0);
740
741 if (o->object.type != OBJECT_FIELD)
742 return -EINVAL;
743
744 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
745 if (m <= 0)
746 return -EBADMSG;
747
748 /* This might alter the window we are looking at */
749 o->field.next_hash_offset = o->field.head_data_offset = 0;
750
751 h = hash % m;
752 p = le64toh(f->field_hash_table[h].tail_hash_offset);
753 if (p == 0)
754 f->field_hash_table[h].head_hash_offset = htole64(offset);
755 else {
756 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
757 if (r < 0)
758 return r;
759
760 o->field.next_hash_offset = htole64(offset);
761 }
762
763 f->field_hash_table[h].tail_hash_offset = htole64(offset);
764
765 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
766 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
767
768 return 0;
769 }
770
771 static int journal_file_link_data(
772 JournalFile *f,
773 Object *o,
774 uint64_t offset,
775 uint64_t hash) {
776
777 uint64_t p, h, m;
778 int r;
779
780 assert(f);
781 assert(o);
782 assert(offset > 0);
783
784 if (o->object.type != OBJECT_DATA)
785 return -EINVAL;
786
787 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
788 if (m <= 0)
789 return -EBADMSG;
790
791 /* This might alter the window we are looking at */
792 o->data.next_hash_offset = o->data.next_field_offset = 0;
793 o->data.entry_offset = o->data.entry_array_offset = 0;
794 o->data.n_entries = 0;
795
796 h = hash % m;
797 p = le64toh(f->data_hash_table[h].tail_hash_offset);
798 if (p == 0)
799 /* Only entry in the hash table is easy */
800 f->data_hash_table[h].head_hash_offset = htole64(offset);
801 else {
802 /* Move back to the previous data object, to patch in
803 * pointer */
804
805 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
806 if (r < 0)
807 return r;
808
809 o->data.next_hash_offset = htole64(offset);
810 }
811
812 f->data_hash_table[h].tail_hash_offset = htole64(offset);
813
814 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
815 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
816
817 return 0;
818 }
819
820 int journal_file_find_field_object_with_hash(
821 JournalFile *f,
822 const void *field, uint64_t size, uint64_t hash,
823 Object **ret, uint64_t *offset) {
824
825 uint64_t p, osize, h, m;
826 int r;
827
828 assert(f);
829 assert(field && size > 0);
830
831 /* If the field hash table is empty, we can't find anything */
832 if (le64toh(f->header->field_hash_table_size) <= 0)
833 return 0;
834
835 /* Map the field hash table, if it isn't mapped yet. */
836 r = journal_file_map_field_hash_table(f);
837 if (r < 0)
838 return r;
839
840 osize = offsetof(Object, field.payload) + size;
841
842 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
843 if (m <= 0)
844 return -EBADMSG;
845
846 h = hash % m;
847 p = le64toh(f->field_hash_table[h].head_hash_offset);
848
849 while (p > 0) {
850 Object *o;
851
852 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
853 if (r < 0)
854 return r;
855
856 if (le64toh(o->field.hash) == hash &&
857 le64toh(o->object.size) == osize &&
858 memcmp(o->field.payload, field, size) == 0) {
859
860 if (ret)
861 *ret = o;
862 if (offset)
863 *offset = p;
864
865 return 1;
866 }
867
868 p = le64toh(o->field.next_hash_offset);
869 }
870
871 return 0;
872 }
873
874 int journal_file_find_field_object(
875 JournalFile *f,
876 const void *field, uint64_t size,
877 Object **ret, uint64_t *offset) {
878
879 uint64_t hash;
880
881 assert(f);
882 assert(field && size > 0);
883
884 hash = hash64(field, size);
885
886 return journal_file_find_field_object_with_hash(f,
887 field, size, hash,
888 ret, offset);
889 }
890
891 int journal_file_find_data_object_with_hash(
892 JournalFile *f,
893 const void *data, uint64_t size, uint64_t hash,
894 Object **ret, uint64_t *offset) {
895
896 uint64_t p, osize, h, m;
897 int r;
898
899 assert(f);
900 assert(data || size == 0);
901
902 /* If there's no data hash table, then there's no entry. */
903 if (le64toh(f->header->data_hash_table_size) <= 0)
904 return 0;
905
906 /* Map the data hash table, if it isn't mapped yet. */
907 r = journal_file_map_data_hash_table(f);
908 if (r < 0)
909 return r;
910
911 osize = offsetof(Object, data.payload) + size;
912
913 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
914 if (m <= 0)
915 return -EBADMSG;
916
917 h = hash % m;
918 p = le64toh(f->data_hash_table[h].head_hash_offset);
919
920 while (p > 0) {
921 Object *o;
922
923 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
924 if (r < 0)
925 return r;
926
927 if (le64toh(o->data.hash) != hash)
928 goto next;
929
930 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
931 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
932 uint64_t l;
933 size_t rsize = 0;
934
935 l = le64toh(o->object.size);
936 if (l <= offsetof(Object, data.payload))
937 return -EBADMSG;
938
939 l -= offsetof(Object, data.payload);
940
941 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
942 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
943 if (r < 0)
944 return r;
945
946 if (rsize == size &&
947 memcmp(f->compress_buffer, data, size) == 0) {
948
949 if (ret)
950 *ret = o;
951
952 if (offset)
953 *offset = p;
954
955 return 1;
956 }
957 #else
958 return -EPROTONOSUPPORT;
959 #endif
960 } else if (le64toh(o->object.size) == osize &&
961 memcmp(o->data.payload, data, size) == 0) {
962
963 if (ret)
964 *ret = o;
965
966 if (offset)
967 *offset = p;
968
969 return 1;
970 }
971
972 next:
973 p = le64toh(o->data.next_hash_offset);
974 }
975
976 return 0;
977 }
978
979 int journal_file_find_data_object(
980 JournalFile *f,
981 const void *data, uint64_t size,
982 Object **ret, uint64_t *offset) {
983
984 uint64_t hash;
985
986 assert(f);
987 assert(data || size == 0);
988
989 hash = hash64(data, size);
990
991 return journal_file_find_data_object_with_hash(f,
992 data, size, hash,
993 ret, offset);
994 }
995
996 static int journal_file_append_field(
997 JournalFile *f,
998 const void *field, uint64_t size,
999 Object **ret, uint64_t *offset) {
1000
1001 uint64_t hash, p;
1002 uint64_t osize;
1003 Object *o;
1004 int r;
1005
1006 assert(f);
1007 assert(field && size > 0);
1008
1009 hash = hash64(field, size);
1010
1011 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1012 if (r < 0)
1013 return r;
1014 else if (r > 0) {
1015
1016 if (ret)
1017 *ret = o;
1018
1019 if (offset)
1020 *offset = p;
1021
1022 return 0;
1023 }
1024
1025 osize = offsetof(Object, field.payload) + size;
1026 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1027 if (r < 0)
1028 return r;
1029
1030 o->field.hash = htole64(hash);
1031 memcpy(o->field.payload, field, size);
1032
1033 r = journal_file_link_field(f, o, p, hash);
1034 if (r < 0)
1035 return r;
1036
1037 /* The linking might have altered the window, so let's
1038 * refresh our pointer */
1039 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1040 if (r < 0)
1041 return r;
1042
1043 #ifdef HAVE_GCRYPT
1044 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1045 if (r < 0)
1046 return r;
1047 #endif
1048
1049 if (ret)
1050 *ret = o;
1051
1052 if (offset)
1053 *offset = p;
1054
1055 return 0;
1056 }
1057
1058 static int journal_file_append_data(
1059 JournalFile *f,
1060 const void *data, uint64_t size,
1061 Object **ret, uint64_t *offset) {
1062
1063 uint64_t hash, p;
1064 uint64_t osize;
1065 Object *o;
1066 int r, compression = 0;
1067 const void *eq;
1068
1069 assert(f);
1070 assert(data || size == 0);
1071
1072 hash = hash64(data, size);
1073
1074 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1075 if (r < 0)
1076 return r;
1077 if (r > 0) {
1078
1079 if (ret)
1080 *ret = o;
1081
1082 if (offset)
1083 *offset = p;
1084
1085 return 0;
1086 }
1087
1088 osize = offsetof(Object, data.payload) + size;
1089 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1090 if (r < 0)
1091 return r;
1092
1093 o->data.hash = htole64(hash);
1094
1095 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1096 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1097 size_t rsize = 0;
1098
1099 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1100
1101 if (compression >= 0) {
1102 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1103 o->object.flags |= compression;
1104
1105 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1106 size, rsize, object_compressed_to_string(compression));
1107 } else
1108 /* Compression didn't work, we don't really care why, let's continue without compression */
1109 compression = 0;
1110 }
1111 #endif
1112
1113 if (compression == 0 && size > 0)
1114 memcpy(o->data.payload, data, size);
1115
1116 r = journal_file_link_data(f, o, p, hash);
1117 if (r < 0)
1118 return r;
1119
1120 /* The linking might have altered the window, so let's
1121 * refresh our pointer */
1122 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1123 if (r < 0)
1124 return r;
1125
1126 if (!data)
1127 eq = NULL;
1128 else
1129 eq = memchr(data, '=', size);
1130 if (eq && eq > data) {
1131 Object *fo = NULL;
1132 uint64_t fp;
1133
1134 /* Create field object ... */
1135 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1136 if (r < 0)
1137 return r;
1138
1139 /* ... and link it in. */
1140 o->data.next_field_offset = fo->field.head_data_offset;
1141 fo->field.head_data_offset = le64toh(p);
1142 }
1143
1144 #ifdef HAVE_GCRYPT
1145 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1146 if (r < 0)
1147 return r;
1148 #endif
1149
1150 if (ret)
1151 *ret = o;
1152
1153 if (offset)
1154 *offset = p;
1155
1156 return 0;
1157 }
1158
1159 uint64_t journal_file_entry_n_items(Object *o) {
1160 assert(o);
1161
1162 if (o->object.type != OBJECT_ENTRY)
1163 return 0;
1164
1165 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1166 }
1167
1168 uint64_t journal_file_entry_array_n_items(Object *o) {
1169 assert(o);
1170
1171 if (o->object.type != OBJECT_ENTRY_ARRAY)
1172 return 0;
1173
1174 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1175 }
1176
1177 uint64_t journal_file_hash_table_n_items(Object *o) {
1178 assert(o);
1179
1180 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1181 o->object.type != OBJECT_FIELD_HASH_TABLE)
1182 return 0;
1183
1184 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1185 }
1186
1187 static int link_entry_into_array(JournalFile *f,
1188 le64_t *first,
1189 le64_t *idx,
1190 uint64_t p) {
1191 int r;
1192 uint64_t n = 0, ap = 0, q, i, a, hidx;
1193 Object *o;
1194
1195 assert(f);
1196 assert(first);
1197 assert(idx);
1198 assert(p > 0);
1199
1200 a = le64toh(*first);
1201 i = hidx = le64toh(*idx);
1202 while (a > 0) {
1203
1204 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1205 if (r < 0)
1206 return r;
1207
1208 n = journal_file_entry_array_n_items(o);
1209 if (i < n) {
1210 o->entry_array.items[i] = htole64(p);
1211 *idx = htole64(hidx + 1);
1212 return 0;
1213 }
1214
1215 i -= n;
1216 ap = a;
1217 a = le64toh(o->entry_array.next_entry_array_offset);
1218 }
1219
1220 if (hidx > n)
1221 n = (hidx+1) * 2;
1222 else
1223 n = n * 2;
1224
1225 if (n < 4)
1226 n = 4;
1227
1228 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1229 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1230 &o, &q);
1231 if (r < 0)
1232 return r;
1233
1234 #ifdef HAVE_GCRYPT
1235 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1236 if (r < 0)
1237 return r;
1238 #endif
1239
1240 o->entry_array.items[i] = htole64(p);
1241
1242 if (ap == 0)
1243 *first = htole64(q);
1244 else {
1245 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1246 if (r < 0)
1247 return r;
1248
1249 o->entry_array.next_entry_array_offset = htole64(q);
1250 }
1251
1252 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1253 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1254
1255 *idx = htole64(hidx + 1);
1256
1257 return 0;
1258 }
1259
1260 static int link_entry_into_array_plus_one(JournalFile *f,
1261 le64_t *extra,
1262 le64_t *first,
1263 le64_t *idx,
1264 uint64_t p) {
1265
1266 int r;
1267
1268 assert(f);
1269 assert(extra);
1270 assert(first);
1271 assert(idx);
1272 assert(p > 0);
1273
1274 if (*idx == 0)
1275 *extra = htole64(p);
1276 else {
1277 le64_t i;
1278
1279 i = htole64(le64toh(*idx) - 1);
1280 r = link_entry_into_array(f, first, &i, p);
1281 if (r < 0)
1282 return r;
1283 }
1284
1285 *idx = htole64(le64toh(*idx) + 1);
1286 return 0;
1287 }
1288
1289 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1290 uint64_t p;
1291 int r;
1292 assert(f);
1293 assert(o);
1294 assert(offset > 0);
1295
1296 p = le64toh(o->entry.items[i].object_offset);
1297 if (p == 0)
1298 return -EINVAL;
1299
1300 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1301 if (r < 0)
1302 return r;
1303
1304 return link_entry_into_array_plus_one(f,
1305 &o->data.entry_offset,
1306 &o->data.entry_array_offset,
1307 &o->data.n_entries,
1308 offset);
1309 }
1310
1311 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1312 uint64_t n, i;
1313 int r;
1314
1315 assert(f);
1316 assert(o);
1317 assert(offset > 0);
1318
1319 if (o->object.type != OBJECT_ENTRY)
1320 return -EINVAL;
1321
1322 __sync_synchronize();
1323
1324 /* Link up the entry itself */
1325 r = link_entry_into_array(f,
1326 &f->header->entry_array_offset,
1327 &f->header->n_entries,
1328 offset);
1329 if (r < 0)
1330 return r;
1331
1332 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1333
1334 if (f->header->head_entry_realtime == 0)
1335 f->header->head_entry_realtime = o->entry.realtime;
1336
1337 f->header->tail_entry_realtime = o->entry.realtime;
1338 f->header->tail_entry_monotonic = o->entry.monotonic;
1339
1340 f->tail_entry_monotonic_valid = true;
1341
1342 /* Link up the items */
1343 n = journal_file_entry_n_items(o);
1344 for (i = 0; i < n; i++) {
1345 r = journal_file_link_entry_item(f, o, offset, i);
1346 if (r < 0)
1347 return r;
1348 }
1349
1350 return 0;
1351 }
1352
1353 static int journal_file_append_entry_internal(
1354 JournalFile *f,
1355 const dual_timestamp *ts,
1356 uint64_t xor_hash,
1357 const EntryItem items[], unsigned n_items,
1358 uint64_t *seqnum,
1359 Object **ret, uint64_t *offset) {
1360 uint64_t np;
1361 uint64_t osize;
1362 Object *o;
1363 int r;
1364
1365 assert(f);
1366 assert(items || n_items == 0);
1367 assert(ts);
1368
1369 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1370
1371 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1372 if (r < 0)
1373 return r;
1374
1375 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1376 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1377 o->entry.realtime = htole64(ts->realtime);
1378 o->entry.monotonic = htole64(ts->monotonic);
1379 o->entry.xor_hash = htole64(xor_hash);
1380 o->entry.boot_id = f->header->boot_id;
1381
1382 #ifdef HAVE_GCRYPT
1383 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1384 if (r < 0)
1385 return r;
1386 #endif
1387
1388 r = journal_file_link_entry(f, o, np);
1389 if (r < 0)
1390 return r;
1391
1392 if (ret)
1393 *ret = o;
1394
1395 if (offset)
1396 *offset = np;
1397
1398 return 0;
1399 }
1400
1401 void journal_file_post_change(JournalFile *f) {
1402 assert(f);
1403
1404 /* inotify() does not receive IN_MODIFY events from file
1405 * accesses done via mmap(). After each access we hence
1406 * trigger IN_MODIFY by truncating the journal file to its
1407 * current size which triggers IN_MODIFY. */
1408
1409 __sync_synchronize();
1410
1411 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1412 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1413 }
1414
1415 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1416 assert(userdata);
1417
1418 journal_file_post_change(userdata);
1419
1420 return 1;
1421 }
1422
1423 static void schedule_post_change(JournalFile *f) {
1424 sd_event_source *timer;
1425 int enabled, r;
1426 uint64_t now;
1427
1428 assert(f);
1429 assert(f->post_change_timer);
1430
1431 timer = f->post_change_timer;
1432
1433 r = sd_event_source_get_enabled(timer, &enabled);
1434 if (r < 0) {
1435 log_error_errno(-r, "Failed to get ftruncate timer state: %m");
1436 return;
1437 }
1438
1439 if (enabled == SD_EVENT_ONESHOT)
1440 return;
1441
1442 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1443 if (r < 0) {
1444 log_error_errno(-r, "Failed to get clock's now for scheduling ftruncate: %m");
1445 return;
1446 }
1447
1448 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1449 if (r < 0) {
1450 log_error_errno(-r, "Failed to set time for scheduling ftruncate: %m");
1451 return;
1452 }
1453
1454 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1455 if (r < 0) {
1456 log_error_errno(-r, "Failed to enable scheduled ftruncate: %m");
1457 return;
1458 }
1459 }
1460
1461 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1462 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1463 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1464 int r;
1465
1466 assert(f);
1467 assert_return(!f->post_change_timer, -EINVAL);
1468 assert(e);
1469 assert(t);
1470
1471 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1472 if (r < 0)
1473 return r;
1474
1475 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1476 if (r < 0)
1477 return r;
1478
1479 f->post_change_timer = timer;
1480 timer = NULL;
1481 f->post_change_timer_period = t;
1482
1483 return r;
1484 }
1485
1486 static int entry_item_cmp(const void *_a, const void *_b) {
1487 const EntryItem *a = _a, *b = _b;
1488
1489 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1490 return -1;
1491 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1492 return 1;
1493 return 0;
1494 }
1495
1496 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1497 unsigned i;
1498 EntryItem *items;
1499 int r;
1500 uint64_t xor_hash = 0;
1501 struct dual_timestamp _ts;
1502
1503 assert(f);
1504 assert(iovec || n_iovec == 0);
1505
1506 if (!ts) {
1507 dual_timestamp_get(&_ts);
1508 ts = &_ts;
1509 }
1510
1511 if (f->tail_entry_monotonic_valid &&
1512 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1513 return -EINVAL;
1514
1515 #ifdef HAVE_GCRYPT
1516 r = journal_file_maybe_append_tag(f, ts->realtime);
1517 if (r < 0)
1518 return r;
1519 #endif
1520
1521 /* alloca() can't take 0, hence let's allocate at least one */
1522 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1523
1524 for (i = 0; i < n_iovec; i++) {
1525 uint64_t p;
1526 Object *o;
1527
1528 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1529 if (r < 0)
1530 return r;
1531
1532 xor_hash ^= le64toh(o->data.hash);
1533 items[i].object_offset = htole64(p);
1534 items[i].hash = o->data.hash;
1535 }
1536
1537 /* Order by the position on disk, in order to improve seek
1538 * times for rotating media. */
1539 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1540
1541 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1542
1543 /* If the memory mapping triggered a SIGBUS then we return an
1544 * IO error and ignore the error code passed down to us, since
1545 * it is very likely just an effect of a nullified replacement
1546 * mapping page */
1547
1548 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1549 r = -EIO;
1550
1551 if (f->post_change_timer)
1552 schedule_post_change(f);
1553 else
1554 journal_file_post_change(f);
1555
1556 return r;
1557 }
1558
/* One record of the per-file chain cache, stored in an OrderedHashmap keyed
 * by 'first'. It remembers which array of an entry array chain was used
 * last, so that generic_array_get() and generic_array_bisect() may skip
 * ahead to it instead of walking the chain from its beginning. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1566
1567 static void chain_cache_put(
1568 OrderedHashmap *h,
1569 ChainCacheItem *ci,
1570 uint64_t first,
1571 uint64_t array,
1572 uint64_t begin,
1573 uint64_t total,
1574 uint64_t last_index) {
1575
1576 if (!ci) {
1577 /* If the chain item to cache for this chain is the
1578 * first one it's not worth caching anything */
1579 if (array == first)
1580 return;
1581
1582 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1583 ci = ordered_hashmap_steal_first(h);
1584 assert(ci);
1585 } else {
1586 ci = new(ChainCacheItem, 1);
1587 if (!ci)
1588 return;
1589 }
1590
1591 ci->first = first;
1592
1593 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1594 free(ci);
1595 return;
1596 }
1597 } else
1598 assert(ci->first == first);
1599
1600 ci->array = array;
1601 ci->begin = begin;
1602 ci->total = total;
1603 ci->last_index = last_index;
1604 }
1605
1606 static int generic_array_get(
1607 JournalFile *f,
1608 uint64_t first,
1609 uint64_t i,
1610 Object **ret, uint64_t *offset) {
1611
1612 Object *o;
1613 uint64_t p = 0, a, t = 0;
1614 int r;
1615 ChainCacheItem *ci;
1616
1617 assert(f);
1618
1619 a = first;
1620
1621 /* Try the chain cache first */
1622 ci = ordered_hashmap_get(f->chain_cache, &first);
1623 if (ci && i > ci->total) {
1624 a = ci->array;
1625 i -= ci->total;
1626 t = ci->total;
1627 }
1628
1629 while (a > 0) {
1630 uint64_t k;
1631
1632 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1633 if (r < 0)
1634 return r;
1635
1636 k = journal_file_entry_array_n_items(o);
1637 if (i < k) {
1638 p = le64toh(o->entry_array.items[i]);
1639 goto found;
1640 }
1641
1642 i -= k;
1643 t += k;
1644 a = le64toh(o->entry_array.next_entry_array_offset);
1645 }
1646
1647 return 0;
1648
1649 found:
1650 /* Let's cache this item for the next invocation */
1651 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1652
1653 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1654 if (r < 0)
1655 return r;
1656
1657 if (ret)
1658 *ret = o;
1659
1660 if (offset)
1661 *offset = p;
1662
1663 return 1;
1664 }
1665
1666 static int generic_array_get_plus_one(
1667 JournalFile *f,
1668 uint64_t extra,
1669 uint64_t first,
1670 uint64_t i,
1671 Object **ret, uint64_t *offset) {
1672
1673 Object *o;
1674
1675 assert(f);
1676
1677 if (i == 0) {
1678 int r;
1679
1680 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1681 if (r < 0)
1682 return r;
1683
1684 if (ret)
1685 *ret = o;
1686
1687 if (offset)
1688 *offset = extra;
1689
1690 return 1;
1691 }
1692
1693 return generic_array_get(f, first, i-1, ret, offset);
1694 }
1695
/* Results of the test_object() callbacks used by the bisection code below.
 * They describe how the tested object relates to the needle. */
enum {
        TEST_FOUND,     /* the tested object matches the needle */
        TEST_LEFT,      /* the tested object sorts before the needle */
        TEST_RIGHT      /* the tested object sorts after the needle */
};
1701
/* Bisects the first 'n' items of the entry array chain starting at offset
 * 'first', using test_object() to compare the entry an item refers to
 * against 'needle'.
 *
 * An exact match is treated as TEST_RIGHT for DIRECTION_DOWN and as
 * TEST_LEFT for DIRECTION_UP. Hence with DIRECTION_DOWN the first item
 * that does not sort before the needle is located, and with DIRECTION_UP
 * the last item that does not sort after it.
 *
 * Returns 1 if an item was found, in which case the entry object, its
 * offset and the item's index within the whole chain are stored in *ret,
 * *offset and *idx (each may be NULL). Returns 0 if there is no such
 * item, -EBADMSG if a zero item is encountered, or another negative value
 * passed up from test_object() or the object mapping. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *offset,
                uint64_t *idx) {

        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        /* From here on 'n' counts the items still to consider starting
         * with array 'a', and 't' the items in the arrays before it. */
        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Test the last usable item of this array first, to find
                 * out whether what we look for is in this array at all. */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        return -EBADMSG;

                r = test_object(f, p, needle);
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        /* The boundary is within this array, bisect
                         * the range [left, right] to locate it. */
                        left = 0;
                        right -= 1;

                        if (last_index != (uint64_t) -1) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        for (;;) {
                                if (left == right) {
                                        /* The range collapsed to the first
                                         * item that tested TEST_RIGHT. When
                                         * going up we want the item in front
                                         * of it. */
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        return -EBADMSG;

                                r = test_object(f, p, needle);
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* The last usable item of this array tested TEST_LEFT. If
                 * this array covers all remaining items there is nothing
                 * further to look at: when going up the very last item
                 * (index n-1) is the result, when going down there is
                 * none. */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Remember the last item of this array, in case the result
                 * turns out to be the item in front of the first item of
                 * the next array. */
                last_p = lp;

                n -= k;
                t += k;
                last_index = (uint64_t) -1;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* There is no item in front of the very first one */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);

        /* Pick the item at 'i', or the one in front of it if subtract_one
         * is set; the latter is the last item of the previous array if 'i'
         * is the first item of this one. */
        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        if (idx)
                *idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
1908
1909 static int generic_array_bisect_plus_one(
1910 JournalFile *f,
1911 uint64_t extra,
1912 uint64_t first,
1913 uint64_t n,
1914 uint64_t needle,
1915 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1916 direction_t direction,
1917 Object **ret,
1918 uint64_t *offset,
1919 uint64_t *idx) {
1920
1921 int r;
1922 bool step_back = false;
1923 Object *o;
1924
1925 assert(f);
1926 assert(test_object);
1927
1928 if (n <= 0)
1929 return 0;
1930
1931 /* This bisects the array in object 'first', but first checks
1932 * an extra */
1933 r = test_object(f, extra, needle);
1934 if (r < 0)
1935 return r;
1936
1937 if (r == TEST_FOUND)
1938 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1939
1940 /* if we are looking with DIRECTION_UP then we need to first
1941 see if in the actual array there is a matching entry, and
1942 return the last one of that. But if there isn't any we need
1943 to return this one. Hence remember this, and return it
1944 below. */
1945 if (r == TEST_LEFT)
1946 step_back = direction == DIRECTION_UP;
1947
1948 if (r == TEST_RIGHT) {
1949 if (direction == DIRECTION_DOWN)
1950 goto found;
1951 else
1952 return 0;
1953 }
1954
1955 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1956
1957 if (r == 0 && step_back)
1958 goto found;
1959
1960 if (r > 0 && idx)
1961 (*idx) ++;
1962
1963 return r;
1964
1965 found:
1966 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1967 if (r < 0)
1968 return r;
1969
1970 if (ret)
1971 *ret = o;
1972
1973 if (offset)
1974 *offset = extra;
1975
1976 if (idx)
1977 *idx = 0;
1978
1979 return 1;
1980 }
1981
1982 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1983 assert(f);
1984 assert(p > 0);
1985
1986 if (p == needle)
1987 return TEST_FOUND;
1988 else if (p < needle)
1989 return TEST_LEFT;
1990 else
1991 return TEST_RIGHT;
1992 }
1993
1994 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1995 Object *o;
1996 int r;
1997
1998 assert(f);
1999 assert(p > 0);
2000
2001 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2002 if (r < 0)
2003 return r;
2004
2005 if (le64toh(o->entry.seqnum) == needle)
2006 return TEST_FOUND;
2007 else if (le64toh(o->entry.seqnum) < needle)
2008 return TEST_LEFT;
2009 else
2010 return TEST_RIGHT;
2011 }
2012
2013 int journal_file_move_to_entry_by_seqnum(
2014 JournalFile *f,
2015 uint64_t seqnum,
2016 direction_t direction,
2017 Object **ret,
2018 uint64_t *offset) {
2019
2020 return generic_array_bisect(f,
2021 le64toh(f->header->entry_array_offset),
2022 le64toh(f->header->n_entries),
2023 seqnum,
2024 test_object_seqnum,
2025 direction,
2026 ret, offset, NULL);
2027 }
2028
2029 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2030 Object *o;
2031 int r;
2032
2033 assert(f);
2034 assert(p > 0);
2035
2036 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2037 if (r < 0)
2038 return r;
2039
2040 if (le64toh(o->entry.realtime) == needle)
2041 return TEST_FOUND;
2042 else if (le64toh(o->entry.realtime) < needle)
2043 return TEST_LEFT;
2044 else
2045 return TEST_RIGHT;
2046 }
2047
2048 int journal_file_move_to_entry_by_realtime(
2049 JournalFile *f,
2050 uint64_t realtime,
2051 direction_t direction,
2052 Object **ret,
2053 uint64_t *offset) {
2054
2055 return generic_array_bisect(f,
2056 le64toh(f->header->entry_array_offset),
2057 le64toh(f->header->n_entries),
2058 realtime,
2059 test_object_realtime,
2060 direction,
2061 ret, offset, NULL);
2062 }
2063
2064 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2065 Object *o;
2066 int r;
2067
2068 assert(f);
2069 assert(p > 0);
2070
2071 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2072 if (r < 0)
2073 return r;
2074
2075 if (le64toh(o->entry.monotonic) == needle)
2076 return TEST_FOUND;
2077 else if (le64toh(o->entry.monotonic) < needle)
2078 return TEST_LEFT;
2079 else
2080 return TEST_RIGHT;
2081 }
2082
2083 static int find_data_object_by_boot_id(
2084 JournalFile *f,
2085 sd_id128_t boot_id,
2086 Object **o,
2087 uint64_t *b) {
2088
2089 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2090
2091 sd_id128_to_string(boot_id, t + 9);
2092 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2093 }
2094
2095 int journal_file_move_to_entry_by_monotonic(
2096 JournalFile *f,
2097 sd_id128_t boot_id,
2098 uint64_t monotonic,
2099 direction_t direction,
2100 Object **ret,
2101 uint64_t *offset) {
2102
2103 Object *o;
2104 int r;
2105
2106 assert(f);
2107
2108 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2109 if (r < 0)
2110 return r;
2111 if (r == 0)
2112 return -ENOENT;
2113
2114 return generic_array_bisect_plus_one(f,
2115 le64toh(o->data.entry_offset),
2116 le64toh(o->data.entry_array_offset),
2117 le64toh(o->data.n_entries),
2118 monotonic,
2119 test_object_monotonic,
2120 direction,
2121 ret, offset, NULL);
2122 }
2123
2124 void journal_file_reset_location(JournalFile *f) {
2125 f->location_type = LOCATION_HEAD;
2126 f->current_offset = 0;
2127 f->current_seqnum = 0;
2128 f->current_realtime = 0;
2129 f->current_monotonic = 0;
2130 zero(f->current_boot_id);
2131 f->current_xor_hash = 0;
2132 }
2133
2134 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2135 f->location_type = LOCATION_SEEK;
2136 f->current_offset = offset;
2137 f->current_seqnum = le64toh(o->entry.seqnum);
2138 f->current_realtime = le64toh(o->entry.realtime);
2139 f->current_monotonic = le64toh(o->entry.monotonic);
2140 f->current_boot_id = o->entry.boot_id;
2141 f->current_xor_hash = le64toh(o->entry.xor_hash);
2142 }
2143
2144 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2145 assert(af);
2146 assert(bf);
2147 assert(af->location_type == LOCATION_SEEK);
2148 assert(bf->location_type == LOCATION_SEEK);
2149
2150 /* If contents and timestamps match, these entries are
2151 * identical, even if the seqnum does not match */
2152 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2153 af->current_monotonic == bf->current_monotonic &&
2154 af->current_realtime == bf->current_realtime &&
2155 af->current_xor_hash == bf->current_xor_hash)
2156 return 0;
2157
2158 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2159
2160 /* If this is from the same seqnum source, compare
2161 * seqnums */
2162 if (af->current_seqnum < bf->current_seqnum)
2163 return -1;
2164 if (af->current_seqnum > bf->current_seqnum)
2165 return 1;
2166
2167 /* Wow! This is weird, different data but the same
2168 * seqnums? Something is borked, but let's make the
2169 * best of it and compare by time. */
2170 }
2171
2172 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2173
2174 /* If the boot id matches, compare monotonic time */
2175 if (af->current_monotonic < bf->current_monotonic)
2176 return -1;
2177 if (af->current_monotonic > bf->current_monotonic)
2178 return 1;
2179 }
2180
2181 /* Otherwise, compare UTC time */
2182 if (af->current_realtime < bf->current_realtime)
2183 return -1;
2184 if (af->current_realtime > bf->current_realtime)
2185 return 1;
2186
2187 /* Finally, compare by contents */
2188 if (af->current_xor_hash < bf->current_xor_hash)
2189 return -1;
2190 if (af->current_xor_hash > bf->current_xor_hash)
2191 return 1;
2192
2193 return 0;
2194 }
2195
2196 int journal_file_next_entry(
2197 JournalFile *f,
2198 uint64_t p,
2199 direction_t direction,
2200 Object **ret, uint64_t *offset) {
2201
2202 uint64_t i, n, ofs;
2203 int r;
2204
2205 assert(f);
2206
2207 n = le64toh(f->header->n_entries);
2208 if (n <= 0)
2209 return 0;
2210
2211 if (p == 0)
2212 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2213 else {
2214 r = generic_array_bisect(f,
2215 le64toh(f->header->entry_array_offset),
2216 le64toh(f->header->n_entries),
2217 p,
2218 test_object_offset,
2219 DIRECTION_DOWN,
2220 NULL, NULL,
2221 &i);
2222 if (r <= 0)
2223 return r;
2224
2225 if (direction == DIRECTION_DOWN) {
2226 if (i >= n - 1)
2227 return 0;
2228
2229 i++;
2230 } else {
2231 if (i <= 0)
2232 return 0;
2233
2234 i--;
2235 }
2236 }
2237
2238 /* And jump to it */
2239 r = generic_array_get(f,
2240 le64toh(f->header->entry_array_offset),
2241 i,
2242 ret, &ofs);
2243 if (r <= 0)
2244 return r;
2245
2246 if (p > 0 &&
2247 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2248 log_debug("%s: entry array corrupted at entry %"PRIu64,
2249 f->path, i);
2250 return -EBADMSG;
2251 }
2252
2253 if (offset)
2254 *offset = ofs;
2255
2256 return 1;
2257 }
2258
2259 int journal_file_next_entry_for_data(
2260 JournalFile *f,
2261 Object *o, uint64_t p,
2262 uint64_t data_offset,
2263 direction_t direction,
2264 Object **ret, uint64_t *offset) {
2265
2266 uint64_t n, i;
2267 int r;
2268 Object *d;
2269
2270 assert(f);
2271 assert(p > 0 || !o);
2272
2273 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2274 if (r < 0)
2275 return r;
2276
2277 n = le64toh(d->data.n_entries);
2278 if (n <= 0)
2279 return n;
2280
2281 if (!o)
2282 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2283 else {
2284 if (o->object.type != OBJECT_ENTRY)
2285 return -EINVAL;
2286
2287 r = generic_array_bisect_plus_one(f,
2288 le64toh(d->data.entry_offset),
2289 le64toh(d->data.entry_array_offset),
2290 le64toh(d->data.n_entries),
2291 p,
2292 test_object_offset,
2293 DIRECTION_DOWN,
2294 NULL, NULL,
2295 &i);
2296
2297 if (r <= 0)
2298 return r;
2299
2300 if (direction == DIRECTION_DOWN) {
2301 if (i >= n - 1)
2302 return 0;
2303
2304 i++;
2305 } else {
2306 if (i <= 0)
2307 return 0;
2308
2309 i--;
2310 }
2311
2312 }
2313
2314 return generic_array_get_plus_one(f,
2315 le64toh(d->data.entry_offset),
2316 le64toh(d->data.entry_array_offset),
2317 i,
2318 ret, offset);
2319 }
2320
2321 int journal_file_move_to_entry_by_offset_for_data(
2322 JournalFile *f,
2323 uint64_t data_offset,
2324 uint64_t p,
2325 direction_t direction,
2326 Object **ret, uint64_t *offset) {
2327
2328 int r;
2329 Object *d;
2330
2331 assert(f);
2332
2333 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2334 if (r < 0)
2335 return r;
2336
2337 return generic_array_bisect_plus_one(f,
2338 le64toh(d->data.entry_offset),
2339 le64toh(d->data.entry_array_offset),
2340 le64toh(d->data.n_entries),
2341 p,
2342 test_object_offset,
2343 direction,
2344 ret, offset, NULL);
2345 }
2346
2347 int journal_file_move_to_entry_by_monotonic_for_data(
2348 JournalFile *f,
2349 uint64_t data_offset,
2350 sd_id128_t boot_id,
2351 uint64_t monotonic,
2352 direction_t direction,
2353 Object **ret, uint64_t *offset) {
2354
2355 Object *o, *d;
2356 int r;
2357 uint64_t b, z;
2358
2359 assert(f);
2360
2361 /* First, seek by time */
2362 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2363 if (r < 0)
2364 return r;
2365 if (r == 0)
2366 return -ENOENT;
2367
2368 r = generic_array_bisect_plus_one(f,
2369 le64toh(o->data.entry_offset),
2370 le64toh(o->data.entry_array_offset),
2371 le64toh(o->data.n_entries),
2372 monotonic,
2373 test_object_monotonic,
2374 direction,
2375 NULL, &z, NULL);
2376 if (r <= 0)
2377 return r;
2378
2379 /* And now, continue seeking until we find an entry that
2380 * exists in both bisection arrays */
2381
2382 for (;;) {
2383 Object *qo;
2384 uint64_t p, q;
2385
2386 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2387 if (r < 0)
2388 return r;
2389
2390 r = generic_array_bisect_plus_one(f,
2391 le64toh(d->data.entry_offset),
2392 le64toh(d->data.entry_array_offset),
2393 le64toh(d->data.n_entries),
2394 z,
2395 test_object_offset,
2396 direction,
2397 NULL, &p, NULL);
2398 if (r <= 0)
2399 return r;
2400
2401 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2402 if (r < 0)
2403 return r;
2404
2405 r = generic_array_bisect_plus_one(f,
2406 le64toh(o->data.entry_offset),
2407 le64toh(o->data.entry_array_offset),
2408 le64toh(o->data.n_entries),
2409 p,
2410 test_object_offset,
2411 direction,
2412 &qo, &q, NULL);
2413
2414 if (r <= 0)
2415 return r;
2416
2417 if (p == q) {
2418 if (ret)
2419 *ret = qo;
2420 if (offset)
2421 *offset = q;
2422
2423 return 1;
2424 }
2425
2426 z = q;
2427 }
2428 }
2429
2430 int journal_file_move_to_entry_by_seqnum_for_data(
2431 JournalFile *f,
2432 uint64_t data_offset,
2433 uint64_t seqnum,
2434 direction_t direction,
2435 Object **ret, uint64_t *offset) {
2436
2437 Object *d;
2438 int r;
2439
2440 assert(f);
2441
2442 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2443 if (r < 0)
2444 return r;
2445
2446 return generic_array_bisect_plus_one(f,
2447 le64toh(d->data.entry_offset),
2448 le64toh(d->data.entry_array_offset),
2449 le64toh(d->data.n_entries),
2450 seqnum,
2451 test_object_seqnum,
2452 direction,
2453 ret, offset, NULL);
2454 }
2455
2456 int journal_file_move_to_entry_by_realtime_for_data(
2457 JournalFile *f,
2458 uint64_t data_offset,
2459 uint64_t realtime,
2460 direction_t direction,
2461 Object **ret, uint64_t *offset) {
2462
2463 Object *d;
2464 int r;
2465
2466 assert(f);
2467
2468 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2469 if (r < 0)
2470 return r;
2471
2472 return generic_array_bisect_plus_one(f,
2473 le64toh(d->data.entry_offset),
2474 le64toh(d->data.entry_array_offset),
2475 le64toh(d->data.n_entries),
2476 realtime,
2477 test_object_realtime,
2478 direction,
2479 ret, offset, NULL);
2480 }
2481
2482 void journal_file_dump(JournalFile *f) {
2483 Object *o;
2484 int r;
2485 uint64_t p;
2486
2487 assert(f);
2488
2489 journal_file_print_header(f);
2490
2491 p = le64toh(f->header->header_size);
2492 while (p != 0) {
2493 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2494 if (r < 0)
2495 goto fail;
2496
2497 switch (o->object.type) {
2498
2499 case OBJECT_UNUSED:
2500 printf("Type: OBJECT_UNUSED\n");
2501 break;
2502
2503 case OBJECT_DATA:
2504 printf("Type: OBJECT_DATA\n");
2505 break;
2506
2507 case OBJECT_FIELD:
2508 printf("Type: OBJECT_FIELD\n");
2509 break;
2510
2511 case OBJECT_ENTRY:
2512 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2513 le64toh(o->entry.seqnum),
2514 le64toh(o->entry.monotonic),
2515 le64toh(o->entry.realtime));
2516 break;
2517
2518 case OBJECT_FIELD_HASH_TABLE:
2519 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2520 break;
2521
2522 case OBJECT_DATA_HASH_TABLE:
2523 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2524 break;
2525
2526 case OBJECT_ENTRY_ARRAY:
2527 printf("Type: OBJECT_ENTRY_ARRAY\n");
2528 break;
2529
2530 case OBJECT_TAG:
2531 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2532 le64toh(o->tag.seqnum),
2533 le64toh(o->tag.epoch));
2534 break;
2535
2536 default:
2537 printf("Type: unknown (%i)\n", o->object.type);
2538 break;
2539 }
2540
2541 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2542 printf("Flags: %s\n",
2543 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2544
2545 if (p == le64toh(f->header->tail_object_offset))
2546 p = 0;
2547 else
2548 p = p + ALIGN64(le64toh(o->object.size));
2549 }
2550
2551 return;
2552 fail:
2553 log_error("File corrupt");
2554 }
2555
2556 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2557 const char *x;
2558
2559 x = format_timestamp(buf, l, t);
2560 if (x)
2561 return x;
2562 return " --- ";
2563 }
2564
2565 void journal_file_print_header(JournalFile *f) {
2566 char a[33], b[33], c[33], d[33];
2567 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2568 struct stat st;
2569 char bytes[FORMAT_BYTES_MAX];
2570
2571 assert(f);
2572
2573 printf("File Path: %s\n"
2574 "File ID: %s\n"
2575 "Machine ID: %s\n"
2576 "Boot ID: %s\n"
2577 "Sequential Number ID: %s\n"
2578 "State: %s\n"
2579 "Compatible Flags:%s%s\n"
2580 "Incompatible Flags:%s%s%s\n"
2581 "Header size: %"PRIu64"\n"
2582 "Arena size: %"PRIu64"\n"
2583 "Data Hash Table Size: %"PRIu64"\n"
2584 "Field Hash Table Size: %"PRIu64"\n"
2585 "Rotate Suggested: %s\n"
2586 "Head Sequential Number: %"PRIu64"\n"
2587 "Tail Sequential Number: %"PRIu64"\n"
2588 "Head Realtime Timestamp: %s\n"
2589 "Tail Realtime Timestamp: %s\n"
2590 "Tail Monotonic Timestamp: %s\n"
2591 "Objects: %"PRIu64"\n"
2592 "Entry Objects: %"PRIu64"\n",
2593 f->path,
2594 sd_id128_to_string(f->header->file_id, a),
2595 sd_id128_to_string(f->header->machine_id, b),
2596 sd_id128_to_string(f->header->boot_id, c),
2597 sd_id128_to_string(f->header->seqnum_id, d),
2598 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2599 f->header->state == STATE_ONLINE ? "ONLINE" :
2600 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2601 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2602 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2603 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2604 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2605 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2606 le64toh(f->header->header_size),
2607 le64toh(f->header->arena_size),
2608 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2609 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2610 yes_no(journal_file_rotate_suggested(f, 0)),
2611 le64toh(f->header->head_entry_seqnum),
2612 le64toh(f->header->tail_entry_seqnum),
2613 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2614 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2615 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2616 le64toh(f->header->n_objects),
2617 le64toh(f->header->n_entries));
2618
2619 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2620 printf("Data Objects: %"PRIu64"\n"
2621 "Data Hash Table Fill: %.1f%%\n",
2622 le64toh(f->header->n_data),
2623 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2624
2625 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2626 printf("Field Objects: %"PRIu64"\n"
2627 "Field Hash Table Fill: %.1f%%\n",
2628 le64toh(f->header->n_fields),
2629 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2630
2631 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2632 printf("Tag Objects: %"PRIu64"\n",
2633 le64toh(f->header->n_tags));
2634 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2635 printf("Entry Array Objects: %"PRIu64"\n",
2636 le64toh(f->header->n_entry_arrays));
2637
2638 if (fstat(f->fd, &st) >= 0)
2639 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2640 }
2641
2642 static int journal_file_warn_btrfs(JournalFile *f) {
2643 unsigned attrs;
2644 int r;
2645
2646 assert(f);
2647
2648 /* Before we write anything, check if the COW logic is turned
2649 * off on btrfs. Given our write pattern that is quite
2650 * unfriendly to COW file systems this should greatly improve
2651 * performance on COW file systems, such as btrfs, at the
2652 * expense of data integrity features (which shouldn't be too
2653 * bad, given that we do our own checksumming). */
2654
2655 r = btrfs_is_filesystem(f->fd);
2656 if (r < 0)
2657 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2658 if (!r)
2659 return 0;
2660
2661 r = read_attr_fd(f->fd, &attrs);
2662 if (r < 0)
2663 return log_warning_errno(r, "Failed to read file attributes: %m");
2664
2665 if (attrs & FS_NOCOW_FL) {
2666 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2667 return 0;
2668 }
2669
2670 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2671 "This is likely to slow down journal access substantially, please consider turning "
2672 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2673
2674 return 1;
2675 }
2676
2677 int journal_file_open(
2678 const char *fname,
2679 int flags,
2680 mode_t mode,
2681 bool compress,
2682 bool seal,
2683 JournalMetrics *metrics,
2684 MMapCache *mmap_cache,
2685 JournalFile *template,
2686 JournalFile **ret) {
2687
2688 bool newly_created = false;
2689 JournalFile *f;
2690 void *h;
2691 int r;
2692
2693 assert(fname);
2694 assert(ret);
2695
2696 if ((flags & O_ACCMODE) != O_RDONLY &&
2697 (flags & O_ACCMODE) != O_RDWR)
2698 return -EINVAL;
2699
2700 if (!endswith(fname, ".journal") &&
2701 !endswith(fname, ".journal~"))
2702 return -EINVAL;
2703
2704 f = new0(JournalFile, 1);
2705 if (!f)
2706 return -ENOMEM;
2707
2708 f->fd = -1;
2709 f->mode = mode;
2710
2711 f->flags = flags;
2712 f->prot = prot_from_flags(flags);
2713 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2714 #if defined(HAVE_LZ4)
2715 f->compress_lz4 = compress;
2716 #elif defined(HAVE_XZ)
2717 f->compress_xz = compress;
2718 #endif
2719 #ifdef HAVE_GCRYPT
2720 f->seal = seal;
2721 #endif
2722
2723 if (mmap_cache)
2724 f->mmap = mmap_cache_ref(mmap_cache);
2725 else {
2726 f->mmap = mmap_cache_new();
2727 if (!f->mmap) {
2728 r = -ENOMEM;
2729 goto fail;
2730 }
2731 }
2732
2733 f->path = strdup(fname);
2734 if (!f->path) {
2735 r = -ENOMEM;
2736 goto fail;
2737 }
2738
2739 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2740 if (!f->chain_cache) {
2741 r = -ENOMEM;
2742 goto fail;
2743 }
2744
2745 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2746 if (f->fd < 0) {
2747 r = -errno;
2748 goto fail;
2749 }
2750
2751 r = journal_file_fstat(f);
2752 if (r < 0)
2753 goto fail;
2754
2755 if (f->last_stat.st_size == 0 && f->writable) {
2756
2757 (void) journal_file_warn_btrfs(f);
2758
2759 /* Let's attach the creation time to the journal file,
2760 * so that the vacuuming code knows the age of this
2761 * file even if the file might end up corrupted one
2762 * day... Ideally we'd just use the creation time many
2763 * file systems maintain for each file, but there is
2764 * currently no usable API to query this, hence let's
2765 * emulate this via extended attributes. If extended
2766 * attributes are not supported we'll just skip this,
2767 * and rely solely on mtime/atime/ctime of the file. */
2768
2769 fd_setcrtime(f->fd, 0);
2770
2771 #ifdef HAVE_GCRYPT
2772 /* Try to load the FSPRG state, and if we can't, then
2773 * just don't do sealing */
2774 if (f->seal) {
2775 r = journal_file_fss_load(f);
2776 if (r < 0)
2777 f->seal = false;
2778 }
2779 #endif
2780
2781 r = journal_file_init_header(f, template);
2782 if (r < 0)
2783 goto fail;
2784
2785 r = journal_file_fstat(f);
2786 if (r < 0)
2787 goto fail;
2788
2789 newly_created = true;
2790 }
2791
2792 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2793 r = -ENODATA;
2794 goto fail;
2795 }
2796
2797 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2798 if (r < 0)
2799 goto fail;
2800
2801 f->header = h;
2802
2803 if (!newly_created) {
2804 r = journal_file_verify_header(f);
2805 if (r < 0)
2806 goto fail;
2807 }
2808
2809 #ifdef HAVE_GCRYPT
2810 if (!newly_created && f->writable) {
2811 r = journal_file_fss_load(f);
2812 if (r < 0)
2813 goto fail;
2814 }
2815 #endif
2816
2817 if (f->writable) {
2818 if (metrics) {
2819 journal_default_metrics(metrics, f->fd);
2820 f->metrics = *metrics;
2821 } else if (template)
2822 f->metrics = template->metrics;
2823
2824 r = journal_file_refresh_header(f);
2825 if (r < 0)
2826 goto fail;
2827 }
2828
2829 #ifdef HAVE_GCRYPT
2830 r = journal_file_hmac_setup(f);
2831 if (r < 0)
2832 goto fail;
2833 #endif
2834
2835 if (newly_created) {
2836 r = journal_file_setup_field_hash_table(f);
2837 if (r < 0)
2838 goto fail;
2839
2840 r = journal_file_setup_data_hash_table(f);
2841 if (r < 0)
2842 goto fail;
2843
2844 #ifdef HAVE_GCRYPT
2845 r = journal_file_append_first_tag(f);
2846 if (r < 0)
2847 goto fail;
2848 #endif
2849 }
2850
2851 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2852 r = -EIO;
2853 goto fail;
2854 }
2855
2856 if (template && template->post_change_timer) {
2857 sd_event *e = sd_event_source_get_event(template->post_change_timer);
2858
2859 r = journal_file_enable_post_change_timer(f, e, template->post_change_timer_period);
2860 if (r < 0)
2861 goto fail;
2862 }
2863
2864 *ret = f;
2865 return 0;
2866
2867 fail:
2868 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2869 r = -EIO;
2870
2871 journal_file_close(f);
2872
2873 return r;
2874 }
2875
2876 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2877 _cleanup_free_ char *p = NULL;
2878 size_t l;
2879 JournalFile *old_file, *new_file = NULL;
2880 int r;
2881
2882 assert(f);
2883 assert(*f);
2884
2885 old_file = *f;
2886
2887 if (!old_file->writable)
2888 return -EINVAL;
2889
2890 if (!endswith(old_file->path, ".journal"))
2891 return -EINVAL;
2892
2893 l = strlen(old_file->path);
2894 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2895 (int) l - 8, old_file->path,
2896 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2897 le64toh((*f)->header->head_entry_seqnum),
2898 le64toh((*f)->header->head_entry_realtime));
2899 if (r < 0)
2900 return -ENOMEM;
2901
2902 /* Try to rename the file to the archived version. If the file
2903 * already was deleted, we'll get ENOENT, let's ignore that
2904 * case. */
2905 r = rename(old_file->path, p);
2906 if (r < 0 && errno != ENOENT)
2907 return -errno;
2908
2909 old_file->header->state = STATE_ARCHIVED;
2910
2911 /* Currently, btrfs is not very good with out write patterns
2912 * and fragments heavily. Let's defrag our journal files when
2913 * we archive them */
2914 old_file->defrag_on_close = true;
2915
2916 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2917 journal_file_close(old_file);
2918
2919 *f = new_file;
2920 return r;
2921 }
2922
2923 int journal_file_open_reliably(
2924 const char *fname,
2925 int flags,
2926 mode_t mode,
2927 bool compress,
2928 bool seal,
2929 JournalMetrics *metrics,
2930 MMapCache *mmap_cache,
2931 JournalFile *template,
2932 JournalFile **ret) {
2933
2934 int r;
2935 size_t l;
2936 _cleanup_free_ char *p = NULL;
2937
2938 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2939 if (!IN_SET(r,
2940 -EBADMSG, /* corrupted */
2941 -ENODATA, /* truncated */
2942 -EHOSTDOWN, /* other machine */
2943 -EPROTONOSUPPORT, /* incompatible feature */
2944 -EBUSY, /* unclean shutdown */
2945 -ESHUTDOWN, /* already archived */
2946 -EIO, /* IO error, including SIGBUS on mmap */
2947 -EIDRM /* File has been deleted */))
2948 return r;
2949
2950 if ((flags & O_ACCMODE) == O_RDONLY)
2951 return r;
2952
2953 if (!(flags & O_CREAT))
2954 return r;
2955
2956 if (!endswith(fname, ".journal"))
2957 return r;
2958
2959 /* The file is corrupted. Rotate it away and try it again (but only once) */
2960
2961 l = strlen(fname);
2962 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2963 (int) l - 8, fname,
2964 now(CLOCK_REALTIME),
2965 random_u64()) < 0)
2966 return -ENOMEM;
2967
2968 if (rename(fname, p) < 0)
2969 return -errno;
2970
2971 /* btrfs doesn't cope well with our write pattern and
2972 * fragments heavily. Let's defrag all files we rotate */
2973
2974 (void) chattr_path(p, false, FS_NOCOW_FL);
2975 (void) btrfs_defrag(p);
2976
2977 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2978
2979 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2980 }
2981
2982 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2983 uint64_t i, n;
2984 uint64_t q, xor_hash = 0;
2985 int r;
2986 EntryItem *items;
2987 dual_timestamp ts;
2988
2989 assert(from);
2990 assert(to);
2991 assert(o);
2992 assert(p);
2993
2994 if (!to->writable)
2995 return -EPERM;
2996
2997 ts.monotonic = le64toh(o->entry.monotonic);
2998 ts.realtime = le64toh(o->entry.realtime);
2999
3000 n = journal_file_entry_n_items(o);
3001 /* alloca() can't take 0, hence let's allocate at least one */
3002 items = alloca(sizeof(EntryItem) * MAX(1u, n));
3003
3004 for (i = 0; i < n; i++) {
3005 uint64_t l, h;
3006 le64_t le_hash;
3007 size_t t;
3008 void *data;
3009 Object *u;
3010
3011 q = le64toh(o->entry.items[i].object_offset);
3012 le_hash = o->entry.items[i].hash;
3013
3014 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3015 if (r < 0)
3016 return r;
3017
3018 if (le_hash != o->data.hash)
3019 return -EBADMSG;
3020
3021 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3022 t = (size_t) l;
3023
3024 /* We hit the limit on 32bit machines */
3025 if ((uint64_t) t != l)
3026 return -E2BIG;
3027
3028 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3029 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
3030 size_t rsize = 0;
3031
3032 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3033 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3034 if (r < 0)
3035 return r;
3036
3037 data = from->compress_buffer;
3038 l = rsize;
3039 #else
3040 return -EPROTONOSUPPORT;
3041 #endif
3042 } else
3043 data = o->data.payload;
3044
3045 r = journal_file_append_data(to, data, l, &u, &h);
3046 if (r < 0)
3047 return r;
3048
3049 xor_hash ^= le64toh(u->data.hash);
3050 items[i].object_offset = htole64(h);
3051 items[i].hash = u->data.hash;
3052
3053 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3054 if (r < 0)
3055 return r;
3056 }
3057
3058 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3059
3060 if (mmap_cache_got_sigbus(to->mmap, to->fd))
3061 return -EIO;
3062
3063 return r;
3064 }
3065
3066 void journal_reset_metrics(JournalMetrics *m) {
3067 assert(m);
3068
3069 /* Set everything to "pick automatic values". */
3070
3071 *m = (JournalMetrics) {
3072 .min_use = (uint64_t) -1,
3073 .max_use = (uint64_t) -1,
3074 .min_size = (uint64_t) -1,
3075 .max_size = (uint64_t) -1,
3076 .keep_free = (uint64_t) -1,
3077 .n_max_files = (uint64_t) -1,
3078 };
3079 }
3080
3081 void journal_default_metrics(JournalMetrics *m, int fd) {
3082 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3083 struct statvfs ss;
3084 uint64_t fs_size;
3085
3086 assert(m);
3087 assert(fd >= 0);
3088
3089 if (fstatvfs(fd, &ss) >= 0)
3090 fs_size = ss.f_frsize * ss.f_blocks;
3091 else {
3092 log_debug_errno(errno, "Failed to detremine disk size: %m");
3093 fs_size = 0;
3094 }
3095
3096 if (m->max_use == (uint64_t) -1) {
3097
3098 if (fs_size > 0) {
3099 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3100
3101 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3102 m->max_use = DEFAULT_MAX_USE_UPPER;
3103
3104 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3105 m->max_use = DEFAULT_MAX_USE_LOWER;
3106 } else
3107 m->max_use = DEFAULT_MAX_USE_LOWER;
3108 } else {
3109 m->max_use = PAGE_ALIGN(m->max_use);
3110
3111 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3112 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3113 }
3114
3115 if (m->min_use == (uint64_t) -1)
3116 m->min_use = DEFAULT_MIN_USE;
3117
3118 if (m->min_use > m->max_use)
3119 m->min_use = m->max_use;
3120
3121 if (m->max_size == (uint64_t) -1) {
3122 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3123
3124 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3125 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3126 } else
3127 m->max_size = PAGE_ALIGN(m->max_size);
3128
3129 if (m->max_size != 0) {
3130 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3131 m->max_size = JOURNAL_FILE_SIZE_MIN;
3132
3133 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3134 m->max_use = m->max_size*2;
3135 }
3136
3137 if (m->min_size == (uint64_t) -1)
3138 m->min_size = JOURNAL_FILE_SIZE_MIN;
3139 else {
3140 m->min_size = PAGE_ALIGN(m->min_size);
3141
3142 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3143 m->min_size = JOURNAL_FILE_SIZE_MIN;
3144
3145 if (m->max_size != 0 && m->min_size > m->max_size)
3146 m->max_size = m->min_size;
3147 }
3148
3149 if (m->keep_free == (uint64_t) -1) {
3150
3151 if (fs_size > 0) {
3152 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3153
3154 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3155 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3156
3157 } else
3158 m->keep_free = DEFAULT_KEEP_FREE;
3159 }
3160
3161 if (m->n_max_files == (uint64_t) -1)
3162 m->n_max_files = DEFAULT_N_MAX_FILES;
3163
3164 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3165 format_bytes(a, sizeof(a), m->min_use),
3166 format_bytes(b, sizeof(b), m->max_use),
3167 format_bytes(c, sizeof(c), m->max_size),
3168 format_bytes(d, sizeof(d), m->min_size),
3169 format_bytes(e, sizeof(e), m->keep_free),
3170 m->n_max_files);
3171 }
3172
3173 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3174 assert(f);
3175 assert(from || to);
3176
3177 if (from) {
3178 if (f->header->head_entry_realtime == 0)
3179 return -ENOENT;
3180
3181 *from = le64toh(f->header->head_entry_realtime);
3182 }
3183
3184 if (to) {
3185 if (f->header->tail_entry_realtime == 0)
3186 return -ENOENT;
3187
3188 *to = le64toh(f->header->tail_entry_realtime);
3189 }
3190
3191 return 1;
3192 }
3193
3194 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3195 Object *o;
3196 uint64_t p;
3197 int r;
3198
3199 assert(f);
3200 assert(from || to);
3201
3202 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3203 if (r <= 0)
3204 return r;
3205
3206 if (le64toh(o->data.n_entries) <= 0)
3207 return 0;
3208
3209 if (from) {
3210 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3211 if (r < 0)
3212 return r;
3213
3214 *from = le64toh(o->entry.monotonic);
3215 }
3216
3217 if (to) {
3218 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3219 if (r < 0)
3220 return r;
3221
3222 r = generic_array_get_plus_one(f,
3223 le64toh(o->data.entry_offset),
3224 le64toh(o->data.entry_array_offset),
3225 le64toh(o->data.n_entries)-1,
3226 &o, NULL);
3227 if (r <= 0)
3228 return r;
3229
3230 *to = le64toh(o->entry.monotonic);
3231 }
3232
3233 return 1;
3234 }
3235
3236 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3237 assert(f);
3238
3239 /* If we gained new header fields we gained new features,
3240 * hence suggest a rotation */
3241 if (le64toh(f->header->header_size) < sizeof(Header)) {
3242 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3243 return true;
3244 }
3245
3246 /* Let's check if the hash tables grew over a certain fill
3247 * level (75%, borrowing this value from Java's hash table
3248 * implementation), and if so suggest a rotation. To calculate
3249 * the fill level we need the n_data field, which only exists
3250 * in newer versions. */
3251
3252 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3253 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3254 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3255 f->path,
3256 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3257 le64toh(f->header->n_data),
3258 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3259 (unsigned long long) f->last_stat.st_size,
3260 f->last_stat.st_size / le64toh(f->header->n_data));
3261 return true;
3262 }
3263
3264 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3265 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3266 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3267 f->path,
3268 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3269 le64toh(f->header->n_fields),
3270 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3271 return true;
3272 }
3273
3274 /* Are the data objects properly indexed by field objects? */
3275 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3276 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3277 le64toh(f->header->n_data) > 0 &&
3278 le64toh(f->header->n_fields) == 0)
3279 return true;
3280
3281 if (max_file_usec > 0) {
3282 usec_t t, h;
3283
3284 h = le64toh(f->header->head_entry_realtime);
3285 t = now(CLOCK_REALTIME);
3286
3287 if (h > 0 && t > h + max_file_usec)
3288 return true;
3289 }
3290
3291 return false;
3292 }