]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
Merge pull request #2306 from walyong/exec_v01
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <linux/fs.h>
25 #include <stddef.h>
26 #include <sys/mman.h>
27 #include <sys/statvfs.h>
28 #include <sys/uio.h>
29 #include <unistd.h>
30
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
34 #include "compress.h"
35 #include "fd-util.h"
36 #include "journal-authenticate.h"
37 #include "journal-def.h"
38 #include "journal-file.h"
39 #include "lookup3.h"
40 #include "parse-util.h"
41 #include "random-util.h"
42 #include "sd-event.h"
43 #include "string-util.h"
44 #include "xattr-util.h"
45
/* Hash table sizes in bytes. The data hash table is never created smaller than
 * DEFAULT_DATA_HASH_TABLE_SIZE; the field hash table is always created with
 * exactly DEFAULT_FIELD_HASH_TABLE_SIZE. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL * sizeof(HashItem))

/* Data payloads smaller than this many bytes are never handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (512ULL * 1024ULL)                          /* 512 KiB */

/* Lower and upper bounds applied when max_use is deduced from the
 * size of the file system */
#define DEFAULT_MAX_USE_LOWER (1ULL * 1024ULL * 1024ULL)                  /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)        /* 4 GiB */

/* Default minimal use limit: how much we will use even if keep_free suggests otherwise */
#define DEFAULT_MIN_USE (1ULL * 1024ULL * 1024ULL)                        /* 1 MiB */

/* Upper bound applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL * 1024ULL * 1024ULL)               /* 128 MiB */

/* Upper bound applied when keep_free is deduced from the size of the
 * file system */
#define DEFAULT_KEEP_FREE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)      /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1024ULL * 1024ULL)                             /* 1 MiB */

/* Default maximum number of journal files to keep around */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first field added after the initial file format design,
 * hence a valid header reaches at least up to it */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* Maximum number of entries kept in the entry array chain cache */
#define CHAIN_CACHE_MAX 20

/* Granularity by which the journal file grows each time more space is allocated */
#define FILE_SIZE_INCREASE (8ULL * 1024ULL * 1024ULL)                     /* 8 MiB */

/* Re-run fstat() on the file at least this often, to detect deletions */
#define LAST_STAT_REFRESH_USEC (5 * USEC_PER_SEC)

/* The mmap context used for the header: we pick the value one above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
90
91 static int journal_file_set_online(JournalFile *f) {
92 assert(f);
93
94 if (!f->writable)
95 return -EPERM;
96
97 if (!(f->fd >= 0 && f->header))
98 return -EINVAL;
99
100 if (mmap_cache_got_sigbus(f->mmap, f->fd))
101 return -EIO;
102
103 switch(f->header->state) {
104 case STATE_ONLINE:
105 return 0;
106
107 case STATE_OFFLINE:
108 f->header->state = STATE_ONLINE;
109 fsync(f->fd);
110 return 0;
111
112 default:
113 return -EINVAL;
114 }
115 }
116
117 int journal_file_set_offline(JournalFile *f) {
118 assert(f);
119
120 if (!f->writable)
121 return -EPERM;
122
123 if (!(f->fd >= 0 && f->header))
124 return -EINVAL;
125
126 if (f->header->state != STATE_ONLINE)
127 return 0;
128
129 fsync(f->fd);
130
131 if (mmap_cache_got_sigbus(f->mmap, f->fd))
132 return -EIO;
133
134 f->header->state = STATE_OFFLINE;
135
136 if (mmap_cache_got_sigbus(f->mmap, f->fd))
137 return -EIO;
138
139 fsync(f->fd);
140
141 return 0;
142 }
143
144 JournalFile* journal_file_close(JournalFile *f) {
145 assert(f);
146
147 #ifdef HAVE_GCRYPT
148 /* Write the final tag */
149 if (f->seal && f->writable)
150 journal_file_append_tag(f);
151 #endif
152
153 if (f->post_change_timer) {
154 int enabled;
155
156 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
157 if (enabled == SD_EVENT_ONESHOT)
158 journal_file_post_change(f);
159
160 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
161 sd_event_source_unref(f->post_change_timer);
162 }
163
164 journal_file_set_offline(f);
165
166 if (f->mmap && f->fd >= 0)
167 mmap_cache_close_fd(f->mmap, f->fd);
168
169 if (f->fd >= 0 && f->defrag_on_close) {
170
171 /* Be friendly to btrfs: turn COW back on again now,
172 * and defragment the file. We won't write to the file
173 * ever again, hence remove all fragmentation, and
174 * reenable all the good bits COW usually provides
175 * (such as data checksumming). */
176
177 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
178 (void) btrfs_defrag_fd(f->fd);
179 }
180
181 safe_close(f->fd);
182 free(f->path);
183
184 mmap_cache_unref(f->mmap);
185
186 ordered_hashmap_free_free(f->chain_cache);
187
188 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
189 free(f->compress_buffer);
190 #endif
191
192 #ifdef HAVE_GCRYPT
193 if (f->fss_file)
194 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
195 else
196 free(f->fsprg_state);
197
198 free(f->fsprg_seed);
199
200 if (f->hmac)
201 gcry_md_close(f->hmac);
202 #endif
203
204 free(f);
205 return NULL;
206 }
207
208 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
209 Header h = {};
210 ssize_t k;
211 int r;
212
213 assert(f);
214
215 memcpy(h.signature, HEADER_SIGNATURE, 8);
216 h.header_size = htole64(ALIGN64(sizeof(h)));
217
218 h.incompatible_flags |= htole32(
219 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
220 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
221
222 h.compatible_flags = htole32(
223 f->seal * HEADER_COMPATIBLE_SEALED);
224
225 r = sd_id128_randomize(&h.file_id);
226 if (r < 0)
227 return r;
228
229 if (template) {
230 h.seqnum_id = template->header->seqnum_id;
231 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
232 } else
233 h.seqnum_id = h.file_id;
234
235 k = pwrite(f->fd, &h, sizeof(h), 0);
236 if (k < 0)
237 return -errno;
238
239 if (k != sizeof(h))
240 return -EIO;
241
242 return 0;
243 }
244
245 static int journal_file_refresh_header(JournalFile *f) {
246 sd_id128_t boot_id;
247 int r;
248
249 assert(f);
250
251 r = sd_id128_get_machine(&f->header->machine_id);
252 if (r < 0)
253 return r;
254
255 r = sd_id128_get_boot(&boot_id);
256 if (r < 0)
257 return r;
258
259 if (sd_id128_equal(boot_id, f->header->boot_id))
260 f->tail_entry_monotonic_valid = true;
261
262 f->header->boot_id = boot_id;
263
264 r = journal_file_set_online(f);
265
266 /* Sync the online state to disk */
267 fsync(f->fd);
268
269 return r;
270 }
271
272 static int journal_file_verify_header(JournalFile *f) {
273 uint32_t flags;
274
275 assert(f);
276
277 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
278 return -EBADMSG;
279
280 /* In both read and write mode we refuse to open files with
281 * incompatible flags we don't know */
282 flags = le32toh(f->header->incompatible_flags);
283 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
284 if (flags & ~HEADER_INCOMPATIBLE_ANY)
285 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
286 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
287 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
288 if (flags)
289 log_debug("Journal file %s uses incompatible flags %"PRIx32
290 " disabled at compilation time.", f->path, flags);
291 return -EPROTONOSUPPORT;
292 }
293
294 /* When open for writing we refuse to open files with
295 * compatible flags, too */
296 flags = le32toh(f->header->compatible_flags);
297 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
298 if (flags & ~HEADER_COMPATIBLE_ANY)
299 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
300 f->path, flags & ~HEADER_COMPATIBLE_ANY);
301 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
302 if (flags)
303 log_debug("Journal file %s uses compatible flags %"PRIx32
304 " disabled at compilation time.", f->path, flags);
305 return -EPROTONOSUPPORT;
306 }
307
308 if (f->header->state >= _STATE_MAX)
309 return -EBADMSG;
310
311 /* The first addition was n_data, so check that we are at least this large */
312 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
313 return -EBADMSG;
314
315 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
316 return -EBADMSG;
317
318 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
319 return -ENODATA;
320
321 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
322 return -ENODATA;
323
324 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
325 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
326 !VALID64(le64toh(f->header->tail_object_offset)) ||
327 !VALID64(le64toh(f->header->entry_array_offset)))
328 return -ENODATA;
329
330 if (f->writable) {
331 uint8_t state;
332 sd_id128_t machine_id;
333 int r;
334
335 r = sd_id128_get_machine(&machine_id);
336 if (r < 0)
337 return r;
338
339 if (!sd_id128_equal(machine_id, f->header->machine_id))
340 return -EHOSTDOWN;
341
342 state = f->header->state;
343
344 if (state == STATE_ONLINE) {
345 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
346 return -EBUSY;
347 } else if (state == STATE_ARCHIVED)
348 return -ESHUTDOWN;
349 else if (state != STATE_OFFLINE) {
350 log_debug("Journal file %s has unknown state %i.", f->path, state);
351 return -EBUSY;
352 }
353 }
354
355 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
356 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
357
358 f->seal = JOURNAL_HEADER_SEALED(f->header);
359
360 return 0;
361 }
362
363 static int journal_file_fstat(JournalFile *f) {
364 assert(f);
365 assert(f->fd >= 0);
366
367 if (fstat(f->fd, &f->last_stat) < 0)
368 return -errno;
369
370 f->last_stat_usec = now(CLOCK_MONOTONIC);
371
372 /* Refuse appending to files that are already deleted */
373 if (f->last_stat.st_nlink <= 0)
374 return -EIDRM;
375
376 return 0;
377 }
378
379 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
380 uint64_t old_size, new_size;
381 int r;
382
383 assert(f);
384
385 /* We assume that this file is not sparse, and we know that
386 * for sure, since we always call posix_fallocate()
387 * ourselves */
388
389 if (mmap_cache_got_sigbus(f->mmap, f->fd))
390 return -EIO;
391
392 old_size =
393 le64toh(f->header->header_size) +
394 le64toh(f->header->arena_size);
395
396 new_size = PAGE_ALIGN(offset + size);
397 if (new_size < le64toh(f->header->header_size))
398 new_size = le64toh(f->header->header_size);
399
400 if (new_size <= old_size) {
401
402 /* We already pre-allocated enough space, but before
403 * we write to it, let's check with fstat() if the
404 * file got deleted, in order make sure we don't throw
405 * away the data immediately. Don't check fstat() for
406 * all writes though, but only once ever 10s. */
407
408 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
409 return 0;
410
411 return journal_file_fstat(f);
412 }
413
414 /* Allocate more space. */
415
416 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
417 return -E2BIG;
418
419 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
420 struct statvfs svfs;
421
422 if (fstatvfs(f->fd, &svfs) >= 0) {
423 uint64_t available;
424
425 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
426
427 if (new_size - old_size > available)
428 return -E2BIG;
429 }
430 }
431
432 /* Increase by larger blocks at once */
433 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
434 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
435 new_size = f->metrics.max_size;
436
437 /* Note that the glibc fallocate() fallback is very
438 inefficient, hence we try to minimize the allocation area
439 as we can. */
440 r = posix_fallocate(f->fd, old_size, new_size - old_size);
441 if (r != 0)
442 return -r;
443
444 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
445
446 return journal_file_fstat(f);
447 }
448
449 static unsigned type_to_context(ObjectType type) {
450 /* One context for each type, plus one catch-all for the rest */
451 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
452 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
453 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
454 }
455
456 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
457 int r;
458
459 assert(f);
460 assert(ret);
461
462 if (size <= 0)
463 return -EINVAL;
464
465 /* Avoid SIGBUS on invalid accesses */
466 if (offset + size > (uint64_t) f->last_stat.st_size) {
467 /* Hmm, out of range? Let's refresh the fstat() data
468 * first, before we trust that check. */
469
470 r = journal_file_fstat(f);
471 if (r < 0)
472 return r;
473
474 if (offset + size > (uint64_t) f->last_stat.st_size)
475 return -EADDRNOTAVAIL;
476 }
477
478 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
479 }
480
481 static uint64_t minimum_header_size(Object *o) {
482
483 static const uint64_t table[] = {
484 [OBJECT_DATA] = sizeof(DataObject),
485 [OBJECT_FIELD] = sizeof(FieldObject),
486 [OBJECT_ENTRY] = sizeof(EntryObject),
487 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
488 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
489 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
490 [OBJECT_TAG] = sizeof(TagObject),
491 };
492
493 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
494 return sizeof(ObjectHeader);
495
496 return table[o->object.type];
497 }
498
499 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
500 int r;
501 void *t;
502 Object *o;
503 uint64_t s;
504
505 assert(f);
506 assert(ret);
507
508 /* Objects may only be located at multiple of 64 bit */
509 if (!VALID64(offset))
510 return -EFAULT;
511
512 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
513 if (r < 0)
514 return r;
515
516 o = (Object*) t;
517 s = le64toh(o->object.size);
518
519 if (s < sizeof(ObjectHeader))
520 return -EBADMSG;
521
522 if (o->object.type <= OBJECT_UNUSED)
523 return -EBADMSG;
524
525 if (s < minimum_header_size(o))
526 return -EBADMSG;
527
528 if (type > OBJECT_UNUSED && o->object.type != type)
529 return -EBADMSG;
530
531 if (s > sizeof(ObjectHeader)) {
532 r = journal_file_move_to(f, type, false, offset, s, &t);
533 if (r < 0)
534 return r;
535
536 o = (Object*) t;
537 }
538
539 *ret = o;
540 return 0;
541 }
542
543 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
544 uint64_t r;
545
546 assert(f);
547
548 r = le64toh(f->header->tail_entry_seqnum) + 1;
549
550 if (seqnum) {
551 /* If an external seqnum counter was passed, we update
552 * both the local and the external one, and set it to
553 * the maximum of both */
554
555 if (*seqnum + 1 > r)
556 r = *seqnum + 1;
557
558 *seqnum = r;
559 }
560
561 f->header->tail_entry_seqnum = htole64(r);
562
563 if (f->header->head_entry_seqnum == 0)
564 f->header->head_entry_seqnum = htole64(r);
565
566 return r;
567 }
568
569 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
570 int r;
571 uint64_t p;
572 Object *tail, *o;
573 void *t;
574
575 assert(f);
576 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
577 assert(size >= sizeof(ObjectHeader));
578 assert(offset);
579 assert(ret);
580
581 r = journal_file_set_online(f);
582 if (r < 0)
583 return r;
584
585 p = le64toh(f->header->tail_object_offset);
586 if (p == 0)
587 p = le64toh(f->header->header_size);
588 else {
589 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
590 if (r < 0)
591 return r;
592
593 p += ALIGN64(le64toh(tail->object.size));
594 }
595
596 r = journal_file_allocate(f, p, size);
597 if (r < 0)
598 return r;
599
600 r = journal_file_move_to(f, type, false, p, size, &t);
601 if (r < 0)
602 return r;
603
604 o = (Object*) t;
605
606 zero(o->object);
607 o->object.type = type;
608 o->object.size = htole64(size);
609
610 f->header->tail_object_offset = htole64(p);
611 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
612
613 *ret = o;
614 *offset = p;
615
616 return 0;
617 }
618
619 static int journal_file_setup_data_hash_table(JournalFile *f) {
620 uint64_t s, p;
621 Object *o;
622 int r;
623
624 assert(f);
625
626 /* We estimate that we need 1 hash table entry per 768 bytes
627 of journal file and we want to make sure we never get
628 beyond 75% fill level. Calculate the hash table size for
629 the maximum file size based on these metrics. */
630
631 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
632 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
633 s = DEFAULT_DATA_HASH_TABLE_SIZE;
634
635 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
636
637 r = journal_file_append_object(f,
638 OBJECT_DATA_HASH_TABLE,
639 offsetof(Object, hash_table.items) + s,
640 &o, &p);
641 if (r < 0)
642 return r;
643
644 memzero(o->hash_table.items, s);
645
646 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
647 f->header->data_hash_table_size = htole64(s);
648
649 return 0;
650 }
651
652 static int journal_file_setup_field_hash_table(JournalFile *f) {
653 uint64_t s, p;
654 Object *o;
655 int r;
656
657 assert(f);
658
659 /* We use a fixed size hash table for the fields as this
660 * number should grow very slowly only */
661
662 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
663 r = journal_file_append_object(f,
664 OBJECT_FIELD_HASH_TABLE,
665 offsetof(Object, hash_table.items) + s,
666 &o, &p);
667 if (r < 0)
668 return r;
669
670 memzero(o->hash_table.items, s);
671
672 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
673 f->header->field_hash_table_size = htole64(s);
674
675 return 0;
676 }
677
678 int journal_file_map_data_hash_table(JournalFile *f) {
679 uint64_t s, p;
680 void *t;
681 int r;
682
683 assert(f);
684
685 if (f->data_hash_table)
686 return 0;
687
688 p = le64toh(f->header->data_hash_table_offset);
689 s = le64toh(f->header->data_hash_table_size);
690
691 r = journal_file_move_to(f,
692 OBJECT_DATA_HASH_TABLE,
693 true,
694 p, s,
695 &t);
696 if (r < 0)
697 return r;
698
699 f->data_hash_table = t;
700 return 0;
701 }
702
703 int journal_file_map_field_hash_table(JournalFile *f) {
704 uint64_t s, p;
705 void *t;
706 int r;
707
708 assert(f);
709
710 if (f->field_hash_table)
711 return 0;
712
713 p = le64toh(f->header->field_hash_table_offset);
714 s = le64toh(f->header->field_hash_table_size);
715
716 r = journal_file_move_to(f,
717 OBJECT_FIELD_HASH_TABLE,
718 true,
719 p, s,
720 &t);
721 if (r < 0)
722 return r;
723
724 f->field_hash_table = t;
725 return 0;
726 }
727
728 static int journal_file_link_field(
729 JournalFile *f,
730 Object *o,
731 uint64_t offset,
732 uint64_t hash) {
733
734 uint64_t p, h, m;
735 int r;
736
737 assert(f);
738 assert(o);
739 assert(offset > 0);
740
741 if (o->object.type != OBJECT_FIELD)
742 return -EINVAL;
743
744 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
745 if (m <= 0)
746 return -EBADMSG;
747
748 /* This might alter the window we are looking at */
749 o->field.next_hash_offset = o->field.head_data_offset = 0;
750
751 h = hash % m;
752 p = le64toh(f->field_hash_table[h].tail_hash_offset);
753 if (p == 0)
754 f->field_hash_table[h].head_hash_offset = htole64(offset);
755 else {
756 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
757 if (r < 0)
758 return r;
759
760 o->field.next_hash_offset = htole64(offset);
761 }
762
763 f->field_hash_table[h].tail_hash_offset = htole64(offset);
764
765 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
766 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
767
768 return 0;
769 }
770
771 static int journal_file_link_data(
772 JournalFile *f,
773 Object *o,
774 uint64_t offset,
775 uint64_t hash) {
776
777 uint64_t p, h, m;
778 int r;
779
780 assert(f);
781 assert(o);
782 assert(offset > 0);
783
784 if (o->object.type != OBJECT_DATA)
785 return -EINVAL;
786
787 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
788 if (m <= 0)
789 return -EBADMSG;
790
791 /* This might alter the window we are looking at */
792 o->data.next_hash_offset = o->data.next_field_offset = 0;
793 o->data.entry_offset = o->data.entry_array_offset = 0;
794 o->data.n_entries = 0;
795
796 h = hash % m;
797 p = le64toh(f->data_hash_table[h].tail_hash_offset);
798 if (p == 0)
799 /* Only entry in the hash table is easy */
800 f->data_hash_table[h].head_hash_offset = htole64(offset);
801 else {
802 /* Move back to the previous data object, to patch in
803 * pointer */
804
805 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
806 if (r < 0)
807 return r;
808
809 o->data.next_hash_offset = htole64(offset);
810 }
811
812 f->data_hash_table[h].tail_hash_offset = htole64(offset);
813
814 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
815 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
816
817 return 0;
818 }
819
820 int journal_file_find_field_object_with_hash(
821 JournalFile *f,
822 const void *field, uint64_t size, uint64_t hash,
823 Object **ret, uint64_t *offset) {
824
825 uint64_t p, osize, h, m;
826 int r;
827
828 assert(f);
829 assert(field && size > 0);
830
831 /* If the field hash table is empty, we can't find anything */
832 if (le64toh(f->header->field_hash_table_size) <= 0)
833 return 0;
834
835 /* Map the field hash table, if it isn't mapped yet. */
836 r = journal_file_map_field_hash_table(f);
837 if (r < 0)
838 return r;
839
840 osize = offsetof(Object, field.payload) + size;
841
842 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
843 if (m <= 0)
844 return -EBADMSG;
845
846 h = hash % m;
847 p = le64toh(f->field_hash_table[h].head_hash_offset);
848
849 while (p > 0) {
850 Object *o;
851
852 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
853 if (r < 0)
854 return r;
855
856 if (le64toh(o->field.hash) == hash &&
857 le64toh(o->object.size) == osize &&
858 memcmp(o->field.payload, field, size) == 0) {
859
860 if (ret)
861 *ret = o;
862 if (offset)
863 *offset = p;
864
865 return 1;
866 }
867
868 p = le64toh(o->field.next_hash_offset);
869 }
870
871 return 0;
872 }
873
874 int journal_file_find_field_object(
875 JournalFile *f,
876 const void *field, uint64_t size,
877 Object **ret, uint64_t *offset) {
878
879 uint64_t hash;
880
881 assert(f);
882 assert(field && size > 0);
883
884 hash = hash64(field, size);
885
886 return journal_file_find_field_object_with_hash(f,
887 field, size, hash,
888 ret, offset);
889 }
890
891 int journal_file_find_data_object_with_hash(
892 JournalFile *f,
893 const void *data, uint64_t size, uint64_t hash,
894 Object **ret, uint64_t *offset) {
895
896 uint64_t p, osize, h, m;
897 int r;
898
899 assert(f);
900 assert(data || size == 0);
901
902 /* If there's no data hash table, then there's no entry. */
903 if (le64toh(f->header->data_hash_table_size) <= 0)
904 return 0;
905
906 /* Map the data hash table, if it isn't mapped yet. */
907 r = journal_file_map_data_hash_table(f);
908 if (r < 0)
909 return r;
910
911 osize = offsetof(Object, data.payload) + size;
912
913 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
914 if (m <= 0)
915 return -EBADMSG;
916
917 h = hash % m;
918 p = le64toh(f->data_hash_table[h].head_hash_offset);
919
920 while (p > 0) {
921 Object *o;
922
923 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
924 if (r < 0)
925 return r;
926
927 if (le64toh(o->data.hash) != hash)
928 goto next;
929
930 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
931 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
932 uint64_t l;
933 size_t rsize = 0;
934
935 l = le64toh(o->object.size);
936 if (l <= offsetof(Object, data.payload))
937 return -EBADMSG;
938
939 l -= offsetof(Object, data.payload);
940
941 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
942 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
943 if (r < 0)
944 return r;
945
946 if (rsize == size &&
947 memcmp(f->compress_buffer, data, size) == 0) {
948
949 if (ret)
950 *ret = o;
951
952 if (offset)
953 *offset = p;
954
955 return 1;
956 }
957 #else
958 return -EPROTONOSUPPORT;
959 #endif
960 } else if (le64toh(o->object.size) == osize &&
961 memcmp(o->data.payload, data, size) == 0) {
962
963 if (ret)
964 *ret = o;
965
966 if (offset)
967 *offset = p;
968
969 return 1;
970 }
971
972 next:
973 p = le64toh(o->data.next_hash_offset);
974 }
975
976 return 0;
977 }
978
979 int journal_file_find_data_object(
980 JournalFile *f,
981 const void *data, uint64_t size,
982 Object **ret, uint64_t *offset) {
983
984 uint64_t hash;
985
986 assert(f);
987 assert(data || size == 0);
988
989 hash = hash64(data, size);
990
991 return journal_file_find_data_object_with_hash(f,
992 data, size, hash,
993 ret, offset);
994 }
995
996 static int journal_file_append_field(
997 JournalFile *f,
998 const void *field, uint64_t size,
999 Object **ret, uint64_t *offset) {
1000
1001 uint64_t hash, p;
1002 uint64_t osize;
1003 Object *o;
1004 int r;
1005
1006 assert(f);
1007 assert(field && size > 0);
1008
1009 hash = hash64(field, size);
1010
1011 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1012 if (r < 0)
1013 return r;
1014 else if (r > 0) {
1015
1016 if (ret)
1017 *ret = o;
1018
1019 if (offset)
1020 *offset = p;
1021
1022 return 0;
1023 }
1024
1025 osize = offsetof(Object, field.payload) + size;
1026 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1027 if (r < 0)
1028 return r;
1029
1030 o->field.hash = htole64(hash);
1031 memcpy(o->field.payload, field, size);
1032
1033 r = journal_file_link_field(f, o, p, hash);
1034 if (r < 0)
1035 return r;
1036
1037 /* The linking might have altered the window, so let's
1038 * refresh our pointer */
1039 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1040 if (r < 0)
1041 return r;
1042
1043 #ifdef HAVE_GCRYPT
1044 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1045 if (r < 0)
1046 return r;
1047 #endif
1048
1049 if (ret)
1050 *ret = o;
1051
1052 if (offset)
1053 *offset = p;
1054
1055 return 0;
1056 }
1057
1058 static int journal_file_append_data(
1059 JournalFile *f,
1060 const void *data, uint64_t size,
1061 Object **ret, uint64_t *offset) {
1062
1063 uint64_t hash, p;
1064 uint64_t osize;
1065 Object *o;
1066 int r, compression = 0;
1067 const void *eq;
1068
1069 assert(f);
1070 assert(data || size == 0);
1071
1072 hash = hash64(data, size);
1073
1074 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1075 if (r < 0)
1076 return r;
1077 if (r > 0) {
1078
1079 if (ret)
1080 *ret = o;
1081
1082 if (offset)
1083 *offset = p;
1084
1085 return 0;
1086 }
1087
1088 osize = offsetof(Object, data.payload) + size;
1089 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1090 if (r < 0)
1091 return r;
1092
1093 o->data.hash = htole64(hash);
1094
1095 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1096 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1097 size_t rsize = 0;
1098
1099 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1100
1101 if (compression >= 0) {
1102 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1103 o->object.flags |= compression;
1104
1105 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1106 size, rsize, object_compressed_to_string(compression));
1107 } else
1108 /* Compression didn't work, we don't really care why, let's continue without compression */
1109 compression = 0;
1110 }
1111 #endif
1112
1113 if (compression == 0 && size > 0)
1114 memcpy(o->data.payload, data, size);
1115
1116 r = journal_file_link_data(f, o, p, hash);
1117 if (r < 0)
1118 return r;
1119
1120 /* The linking might have altered the window, so let's
1121 * refresh our pointer */
1122 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1123 if (r < 0)
1124 return r;
1125
1126 if (!data)
1127 eq = NULL;
1128 else
1129 eq = memchr(data, '=', size);
1130 if (eq && eq > data) {
1131 Object *fo = NULL;
1132 uint64_t fp;
1133
1134 /* Create field object ... */
1135 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1136 if (r < 0)
1137 return r;
1138
1139 /* ... and link it in. */
1140 o->data.next_field_offset = fo->field.head_data_offset;
1141 fo->field.head_data_offset = le64toh(p);
1142 }
1143
1144 #ifdef HAVE_GCRYPT
1145 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1146 if (r < 0)
1147 return r;
1148 #endif
1149
1150 if (ret)
1151 *ret = o;
1152
1153 if (offset)
1154 *offset = p;
1155
1156 return 0;
1157 }
1158
1159 uint64_t journal_file_entry_n_items(Object *o) {
1160 assert(o);
1161
1162 if (o->object.type != OBJECT_ENTRY)
1163 return 0;
1164
1165 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1166 }
1167
1168 uint64_t journal_file_entry_array_n_items(Object *o) {
1169 assert(o);
1170
1171 if (o->object.type != OBJECT_ENTRY_ARRAY)
1172 return 0;
1173
1174 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1175 }
1176
1177 uint64_t journal_file_hash_table_n_items(Object *o) {
1178 assert(o);
1179
1180 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1181 o->object.type != OBJECT_FIELD_HASH_TABLE)
1182 return 0;
1183
1184 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1185 }
1186
1187 static int link_entry_into_array(JournalFile *f,
1188 le64_t *first,
1189 le64_t *idx,
1190 uint64_t p) {
1191 int r;
1192 uint64_t n = 0, ap = 0, q, i, a, hidx;
1193 Object *o;
1194
1195 assert(f);
1196 assert(first);
1197 assert(idx);
1198 assert(p > 0);
1199
1200 a = le64toh(*first);
1201 i = hidx = le64toh(*idx);
1202 while (a > 0) {
1203
1204 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1205 if (r < 0)
1206 return r;
1207
1208 n = journal_file_entry_array_n_items(o);
1209 if (i < n) {
1210 o->entry_array.items[i] = htole64(p);
1211 *idx = htole64(hidx + 1);
1212 return 0;
1213 }
1214
1215 i -= n;
1216 ap = a;
1217 a = le64toh(o->entry_array.next_entry_array_offset);
1218 }
1219
1220 if (hidx > n)
1221 n = (hidx+1) * 2;
1222 else
1223 n = n * 2;
1224
1225 if (n < 4)
1226 n = 4;
1227
1228 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1229 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1230 &o, &q);
1231 if (r < 0)
1232 return r;
1233
1234 #ifdef HAVE_GCRYPT
1235 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1236 if (r < 0)
1237 return r;
1238 #endif
1239
1240 o->entry_array.items[i] = htole64(p);
1241
1242 if (ap == 0)
1243 *first = htole64(q);
1244 else {
1245 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1246 if (r < 0)
1247 return r;
1248
1249 o->entry_array.next_entry_array_offset = htole64(q);
1250 }
1251
1252 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1253 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1254
1255 *idx = htole64(hidx + 1);
1256
1257 return 0;
1258 }
1259
1260 static int link_entry_into_array_plus_one(JournalFile *f,
1261 le64_t *extra,
1262 le64_t *first,
1263 le64_t *idx,
1264 uint64_t p) {
1265
1266 int r;
1267
1268 assert(f);
1269 assert(extra);
1270 assert(first);
1271 assert(idx);
1272 assert(p > 0);
1273
1274 if (*idx == 0)
1275 *extra = htole64(p);
1276 else {
1277 le64_t i;
1278
1279 i = htole64(le64toh(*idx) - 1);
1280 r = link_entry_into_array(f, first, &i, p);
1281 if (r < 0)
1282 return r;
1283 }
1284
1285 *idx = htole64(le64toh(*idx) + 1);
1286 return 0;
1287 }
1288
1289 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1290 uint64_t p;
1291 int r;
1292 assert(f);
1293 assert(o);
1294 assert(offset > 0);
1295
1296 p = le64toh(o->entry.items[i].object_offset);
1297 if (p == 0)
1298 return -EINVAL;
1299
1300 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1301 if (r < 0)
1302 return r;
1303
1304 return link_entry_into_array_plus_one(f,
1305 &o->data.entry_offset,
1306 &o->data.entry_array_offset,
1307 &o->data.n_entries,
1308 offset);
1309 }
1310
1311 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1312 uint64_t n, i;
1313 int r;
1314
1315 assert(f);
1316 assert(o);
1317 assert(offset > 0);
1318
1319 if (o->object.type != OBJECT_ENTRY)
1320 return -EINVAL;
1321
1322 __sync_synchronize();
1323
1324 /* Link up the entry itself */
1325 r = link_entry_into_array(f,
1326 &f->header->entry_array_offset,
1327 &f->header->n_entries,
1328 offset);
1329 if (r < 0)
1330 return r;
1331
1332 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1333
1334 if (f->header->head_entry_realtime == 0)
1335 f->header->head_entry_realtime = o->entry.realtime;
1336
1337 f->header->tail_entry_realtime = o->entry.realtime;
1338 f->header->tail_entry_monotonic = o->entry.monotonic;
1339
1340 f->tail_entry_monotonic_valid = true;
1341
1342 /* Link up the items */
1343 n = journal_file_entry_n_items(o);
1344 for (i = 0; i < n; i++) {
1345 r = journal_file_link_entry_item(f, o, offset, i);
1346 if (r < 0)
1347 return r;
1348 }
1349
1350 return 0;
1351 }
1352
1353 static int journal_file_append_entry_internal(
1354 JournalFile *f,
1355 const dual_timestamp *ts,
1356 uint64_t xor_hash,
1357 const EntryItem items[], unsigned n_items,
1358 uint64_t *seqnum,
1359 Object **ret, uint64_t *offset) {
1360 uint64_t np;
1361 uint64_t osize;
1362 Object *o;
1363 int r;
1364
1365 assert(f);
1366 assert(items || n_items == 0);
1367 assert(ts);
1368
1369 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1370
1371 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1372 if (r < 0)
1373 return r;
1374
1375 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1376 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1377 o->entry.realtime = htole64(ts->realtime);
1378 o->entry.monotonic = htole64(ts->monotonic);
1379 o->entry.xor_hash = htole64(xor_hash);
1380 o->entry.boot_id = f->header->boot_id;
1381
1382 #ifdef HAVE_GCRYPT
1383 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1384 if (r < 0)
1385 return r;
1386 #endif
1387
1388 r = journal_file_link_entry(f, o, np);
1389 if (r < 0)
1390 return r;
1391
1392 if (ret)
1393 *ret = o;
1394
1395 if (offset)
1396 *offset = np;
1397
1398 return 0;
1399 }
1400
1401 void journal_file_post_change(JournalFile *f) {
1402 assert(f);
1403
1404 /* inotify() does not receive IN_MODIFY events from file
1405 * accesses done via mmap(). After each access we hence
1406 * trigger IN_MODIFY by truncating the journal file to its
1407 * current size which triggers IN_MODIFY. */
1408
1409 __sync_synchronize();
1410
1411 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1412 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1413 }
1414
1415 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1416 assert(userdata);
1417
1418 journal_file_post_change(userdata);
1419
1420 return 1;
1421 }
1422
1423 static void schedule_post_change(JournalFile *f) {
1424 sd_event_source *timer;
1425 int enabled, r;
1426 uint64_t now;
1427
1428 assert(f);
1429 assert(f->post_change_timer);
1430
1431 timer = f->post_change_timer;
1432
1433 r = sd_event_source_get_enabled(timer, &enabled);
1434 if (r < 0) {
1435 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1436 goto fail;
1437 }
1438
1439 if (enabled == SD_EVENT_ONESHOT)
1440 return;
1441
1442 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1443 if (r < 0) {
1444 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1445 goto fail;
1446 }
1447
1448 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1449 if (r < 0) {
1450 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1451 goto fail;
1452 }
1453
1454 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1455 if (r < 0) {
1456 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1457 goto fail;
1458 }
1459
1460 return;
1461
1462 fail:
1463 /* On failure, let's simply post the change immediately. */
1464 journal_file_post_change(f);
1465 }
1466
1467 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1468 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1469 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1470 int r;
1471
1472 assert(f);
1473 assert_return(!f->post_change_timer, -EINVAL);
1474 assert(e);
1475 assert(t);
1476
1477 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1478 if (r < 0)
1479 return r;
1480
1481 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1482 if (r < 0)
1483 return r;
1484
1485 f->post_change_timer = timer;
1486 timer = NULL;
1487 f->post_change_timer_period = t;
1488
1489 return r;
1490 }
1491
1492 static int entry_item_cmp(const void *_a, const void *_b) {
1493 const EntryItem *a = _a, *b = _b;
1494
1495 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1496 return -1;
1497 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1498 return 1;
1499 return 0;
1500 }
1501
1502 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1503 unsigned i;
1504 EntryItem *items;
1505 int r;
1506 uint64_t xor_hash = 0;
1507 struct dual_timestamp _ts;
1508
1509 assert(f);
1510 assert(iovec || n_iovec == 0);
1511
1512 if (!ts) {
1513 dual_timestamp_get(&_ts);
1514 ts = &_ts;
1515 }
1516
1517 if (f->tail_entry_monotonic_valid &&
1518 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1519 return -EINVAL;
1520
1521 #ifdef HAVE_GCRYPT
1522 r = journal_file_maybe_append_tag(f, ts->realtime);
1523 if (r < 0)
1524 return r;
1525 #endif
1526
1527 /* alloca() can't take 0, hence let's allocate at least one */
1528 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1529
1530 for (i = 0; i < n_iovec; i++) {
1531 uint64_t p;
1532 Object *o;
1533
1534 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1535 if (r < 0)
1536 return r;
1537
1538 xor_hash ^= le64toh(o->data.hash);
1539 items[i].object_offset = htole64(p);
1540 items[i].hash = o->data.hash;
1541 }
1542
1543 /* Order by the position on disk, in order to improve seek
1544 * times for rotating media. */
1545 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1546
1547 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1548
1549 /* If the memory mapping triggered a SIGBUS then we return an
1550 * IO error and ignore the error code passed down to us, since
1551 * it is very likely just an effect of a nullified replacement
1552 * mapping page */
1553
1554 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1555 r = -EIO;
1556
1557 if (f->post_change_timer)
1558 schedule_post_change(f);
1559 else
1560 journal_file_post_change(f);
1561
1562 return r;
1563 }
1564
/* One cached position within a chain of entry array objects */
typedef struct ChainCacheItem {
        /* The array at the beginning of the chain */
        uint64_t first;
        /* The cached array */
        uint64_t array;
        /* The first item in the cached array */
        uint64_t begin;
        /* The total number of items in all arrays before this one in the chain */
        uint64_t total;
        /* The last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1572
1573 static void chain_cache_put(
1574 OrderedHashmap *h,
1575 ChainCacheItem *ci,
1576 uint64_t first,
1577 uint64_t array,
1578 uint64_t begin,
1579 uint64_t total,
1580 uint64_t last_index) {
1581
1582 if (!ci) {
1583 /* If the chain item to cache for this chain is the
1584 * first one it's not worth caching anything */
1585 if (array == first)
1586 return;
1587
1588 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1589 ci = ordered_hashmap_steal_first(h);
1590 assert(ci);
1591 } else {
1592 ci = new(ChainCacheItem, 1);
1593 if (!ci)
1594 return;
1595 }
1596
1597 ci->first = first;
1598
1599 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1600 free(ci);
1601 return;
1602 }
1603 } else
1604 assert(ci->first == first);
1605
1606 ci->array = array;
1607 ci->begin = begin;
1608 ci->total = total;
1609 ci->last_index = last_index;
1610 }
1611
1612 static int generic_array_get(
1613 JournalFile *f,
1614 uint64_t first,
1615 uint64_t i,
1616 Object **ret, uint64_t *offset) {
1617
1618 Object *o;
1619 uint64_t p = 0, a, t = 0;
1620 int r;
1621 ChainCacheItem *ci;
1622
1623 assert(f);
1624
1625 a = first;
1626
1627 /* Try the chain cache first */
1628 ci = ordered_hashmap_get(f->chain_cache, &first);
1629 if (ci && i > ci->total) {
1630 a = ci->array;
1631 i -= ci->total;
1632 t = ci->total;
1633 }
1634
1635 while (a > 0) {
1636 uint64_t k;
1637
1638 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1639 if (r < 0)
1640 return r;
1641
1642 k = journal_file_entry_array_n_items(o);
1643 if (i < k) {
1644 p = le64toh(o->entry_array.items[i]);
1645 goto found;
1646 }
1647
1648 i -= k;
1649 t += k;
1650 a = le64toh(o->entry_array.next_entry_array_offset);
1651 }
1652
1653 return 0;
1654
1655 found:
1656 /* Let's cache this item for the next invocation */
1657 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1658
1659 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1660 if (r < 0)
1661 return r;
1662
1663 if (ret)
1664 *ret = o;
1665
1666 if (offset)
1667 *offset = p;
1668
1669 return 1;
1670 }
1671
1672 static int generic_array_get_plus_one(
1673 JournalFile *f,
1674 uint64_t extra,
1675 uint64_t first,
1676 uint64_t i,
1677 Object **ret, uint64_t *offset) {
1678
1679 Object *o;
1680
1681 assert(f);
1682
1683 if (i == 0) {
1684 int r;
1685
1686 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1687 if (r < 0)
1688 return r;
1689
1690 if (ret)
1691 *ret = o;
1692
1693 if (offset)
1694 *offset = extra;
1695
1696 return 1;
1697 }
1698
1699 return generic_array_get(f, first, i-1, ret, offset);
1700 }
1701
/* Results of the test_object() callbacks used when bisecting */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
1707
1708 static int generic_array_bisect(
1709 JournalFile *f,
1710 uint64_t first,
1711 uint64_t n,
1712 uint64_t needle,
1713 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1714 direction_t direction,
1715 Object **ret,
1716 uint64_t *offset,
1717 uint64_t *idx) {
1718
1719 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1720 bool subtract_one = false;
1721 Object *o, *array = NULL;
1722 int r;
1723 ChainCacheItem *ci;
1724
1725 assert(f);
1726 assert(test_object);
1727
1728 /* Start with the first array in the chain */
1729 a = first;
1730
1731 ci = ordered_hashmap_get(f->chain_cache, &first);
1732 if (ci && n > ci->total) {
1733 /* Ah, we have iterated this bisection array chain
1734 * previously! Let's see if we can skip ahead in the
1735 * chain, as far as the last time. But we can't jump
1736 * backwards in the chain, so let's check that
1737 * first. */
1738
1739 r = test_object(f, ci->begin, needle);
1740 if (r < 0)
1741 return r;
1742
1743 if (r == TEST_LEFT) {
1744 /* OK, what we are looking for is right of the
1745 * begin of this EntryArray, so let's jump
1746 * straight to previously cached array in the
1747 * chain */
1748
1749 a = ci->array;
1750 n -= ci->total;
1751 t = ci->total;
1752 last_index = ci->last_index;
1753 }
1754 }
1755
1756 while (a > 0) {
1757 uint64_t left, right, k, lp;
1758
1759 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1760 if (r < 0)
1761 return r;
1762
1763 k = journal_file_entry_array_n_items(array);
1764 right = MIN(k, n);
1765 if (right <= 0)
1766 return 0;
1767
1768 i = right - 1;
1769 lp = p = le64toh(array->entry_array.items[i]);
1770 if (p <= 0)
1771 return -EBADMSG;
1772
1773 r = test_object(f, p, needle);
1774 if (r < 0)
1775 return r;
1776
1777 if (r == TEST_FOUND)
1778 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1779
1780 if (r == TEST_RIGHT) {
1781 left = 0;
1782 right -= 1;
1783
1784 if (last_index != (uint64_t) -1) {
1785 assert(last_index <= right);
1786
1787 /* If we cached the last index we
1788 * looked at, let's try to not to jump
1789 * too wildly around and see if we can
1790 * limit the range to look at early to
1791 * the immediate neighbors of the last
1792 * index we looked at. */
1793
1794 if (last_index > 0) {
1795 uint64_t x = last_index - 1;
1796
1797 p = le64toh(array->entry_array.items[x]);
1798 if (p <= 0)
1799 return -EBADMSG;
1800
1801 r = test_object(f, p, needle);
1802 if (r < 0)
1803 return r;
1804
1805 if (r == TEST_FOUND)
1806 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1807
1808 if (r == TEST_RIGHT)
1809 right = x;
1810 else
1811 left = x + 1;
1812 }
1813
1814 if (last_index < right) {
1815 uint64_t y = last_index + 1;
1816
1817 p = le64toh(array->entry_array.items[y]);
1818 if (p <= 0)
1819 return -EBADMSG;
1820
1821 r = test_object(f, p, needle);
1822 if (r < 0)
1823 return r;
1824
1825 if (r == TEST_FOUND)
1826 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1827
1828 if (r == TEST_RIGHT)
1829 right = y;
1830 else
1831 left = y + 1;
1832 }
1833 }
1834
1835 for (;;) {
1836 if (left == right) {
1837 if (direction == DIRECTION_UP)
1838 subtract_one = true;
1839
1840 i = left;
1841 goto found;
1842 }
1843
1844 assert(left < right);
1845 i = (left + right) / 2;
1846
1847 p = le64toh(array->entry_array.items[i]);
1848 if (p <= 0)
1849 return -EBADMSG;
1850
1851 r = test_object(f, p, needle);
1852 if (r < 0)
1853 return r;
1854
1855 if (r == TEST_FOUND)
1856 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1857
1858 if (r == TEST_RIGHT)
1859 right = i;
1860 else
1861 left = i + 1;
1862 }
1863 }
1864
1865 if (k >= n) {
1866 if (direction == DIRECTION_UP) {
1867 i = n;
1868 subtract_one = true;
1869 goto found;
1870 }
1871
1872 return 0;
1873 }
1874
1875 last_p = lp;
1876
1877 n -= k;
1878 t += k;
1879 last_index = (uint64_t) -1;
1880 a = le64toh(array->entry_array.next_entry_array_offset);
1881 }
1882
1883 return 0;
1884
1885 found:
1886 if (subtract_one && t == 0 && i == 0)
1887 return 0;
1888
1889 /* Let's cache this item for the next invocation */
1890 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1891
1892 if (subtract_one && i == 0)
1893 p = last_p;
1894 else if (subtract_one)
1895 p = le64toh(array->entry_array.items[i-1]);
1896 else
1897 p = le64toh(array->entry_array.items[i]);
1898
1899 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1900 if (r < 0)
1901 return r;
1902
1903 if (ret)
1904 *ret = o;
1905
1906 if (offset)
1907 *offset = p;
1908
1909 if (idx)
1910 *idx = t + i + (subtract_one ? -1 : 0);
1911
1912 return 1;
1913 }
1914
1915 static int generic_array_bisect_plus_one(
1916 JournalFile *f,
1917 uint64_t extra,
1918 uint64_t first,
1919 uint64_t n,
1920 uint64_t needle,
1921 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1922 direction_t direction,
1923 Object **ret,
1924 uint64_t *offset,
1925 uint64_t *idx) {
1926
1927 int r;
1928 bool step_back = false;
1929 Object *o;
1930
1931 assert(f);
1932 assert(test_object);
1933
1934 if (n <= 0)
1935 return 0;
1936
1937 /* This bisects the array in object 'first', but first checks
1938 * an extra */
1939 r = test_object(f, extra, needle);
1940 if (r < 0)
1941 return r;
1942
1943 if (r == TEST_FOUND)
1944 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1945
1946 /* if we are looking with DIRECTION_UP then we need to first
1947 see if in the actual array there is a matching entry, and
1948 return the last one of that. But if there isn't any we need
1949 to return this one. Hence remember this, and return it
1950 below. */
1951 if (r == TEST_LEFT)
1952 step_back = direction == DIRECTION_UP;
1953
1954 if (r == TEST_RIGHT) {
1955 if (direction == DIRECTION_DOWN)
1956 goto found;
1957 else
1958 return 0;
1959 }
1960
1961 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1962
1963 if (r == 0 && step_back)
1964 goto found;
1965
1966 if (r > 0 && idx)
1967 (*idx) ++;
1968
1969 return r;
1970
1971 found:
1972 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1973 if (r < 0)
1974 return r;
1975
1976 if (ret)
1977 *ret = o;
1978
1979 if (offset)
1980 *offset = extra;
1981
1982 if (idx)
1983 *idx = 0;
1984
1985 return 1;
1986 }
1987
1988 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1989 assert(f);
1990 assert(p > 0);
1991
1992 if (p == needle)
1993 return TEST_FOUND;
1994 else if (p < needle)
1995 return TEST_LEFT;
1996 else
1997 return TEST_RIGHT;
1998 }
1999
2000 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2001 Object *o;
2002 int r;
2003
2004 assert(f);
2005 assert(p > 0);
2006
2007 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2008 if (r < 0)
2009 return r;
2010
2011 if (le64toh(o->entry.seqnum) == needle)
2012 return TEST_FOUND;
2013 else if (le64toh(o->entry.seqnum) < needle)
2014 return TEST_LEFT;
2015 else
2016 return TEST_RIGHT;
2017 }
2018
2019 int journal_file_move_to_entry_by_seqnum(
2020 JournalFile *f,
2021 uint64_t seqnum,
2022 direction_t direction,
2023 Object **ret,
2024 uint64_t *offset) {
2025
2026 return generic_array_bisect(f,
2027 le64toh(f->header->entry_array_offset),
2028 le64toh(f->header->n_entries),
2029 seqnum,
2030 test_object_seqnum,
2031 direction,
2032 ret, offset, NULL);
2033 }
2034
2035 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2036 Object *o;
2037 int r;
2038
2039 assert(f);
2040 assert(p > 0);
2041
2042 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2043 if (r < 0)
2044 return r;
2045
2046 if (le64toh(o->entry.realtime) == needle)
2047 return TEST_FOUND;
2048 else if (le64toh(o->entry.realtime) < needle)
2049 return TEST_LEFT;
2050 else
2051 return TEST_RIGHT;
2052 }
2053
2054 int journal_file_move_to_entry_by_realtime(
2055 JournalFile *f,
2056 uint64_t realtime,
2057 direction_t direction,
2058 Object **ret,
2059 uint64_t *offset) {
2060
2061 return generic_array_bisect(f,
2062 le64toh(f->header->entry_array_offset),
2063 le64toh(f->header->n_entries),
2064 realtime,
2065 test_object_realtime,
2066 direction,
2067 ret, offset, NULL);
2068 }
2069
2070 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2071 Object *o;
2072 int r;
2073
2074 assert(f);
2075 assert(p > 0);
2076
2077 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2078 if (r < 0)
2079 return r;
2080
2081 if (le64toh(o->entry.monotonic) == needle)
2082 return TEST_FOUND;
2083 else if (le64toh(o->entry.monotonic) < needle)
2084 return TEST_LEFT;
2085 else
2086 return TEST_RIGHT;
2087 }
2088
2089 static int find_data_object_by_boot_id(
2090 JournalFile *f,
2091 sd_id128_t boot_id,
2092 Object **o,
2093 uint64_t *b) {
2094
2095 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2096
2097 sd_id128_to_string(boot_id, t + 9);
2098 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2099 }
2100
2101 int journal_file_move_to_entry_by_monotonic(
2102 JournalFile *f,
2103 sd_id128_t boot_id,
2104 uint64_t monotonic,
2105 direction_t direction,
2106 Object **ret,
2107 uint64_t *offset) {
2108
2109 Object *o;
2110 int r;
2111
2112 assert(f);
2113
2114 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2115 if (r < 0)
2116 return r;
2117 if (r == 0)
2118 return -ENOENT;
2119
2120 return generic_array_bisect_plus_one(f,
2121 le64toh(o->data.entry_offset),
2122 le64toh(o->data.entry_array_offset),
2123 le64toh(o->data.n_entries),
2124 monotonic,
2125 test_object_monotonic,
2126 direction,
2127 ret, offset, NULL);
2128 }
2129
2130 void journal_file_reset_location(JournalFile *f) {
2131 f->location_type = LOCATION_HEAD;
2132 f->current_offset = 0;
2133 f->current_seqnum = 0;
2134 f->current_realtime = 0;
2135 f->current_monotonic = 0;
2136 zero(f->current_boot_id);
2137 f->current_xor_hash = 0;
2138 }
2139
2140 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2141 f->location_type = LOCATION_SEEK;
2142 f->current_offset = offset;
2143 f->current_seqnum = le64toh(o->entry.seqnum);
2144 f->current_realtime = le64toh(o->entry.realtime);
2145 f->current_monotonic = le64toh(o->entry.monotonic);
2146 f->current_boot_id = o->entry.boot_id;
2147 f->current_xor_hash = le64toh(o->entry.xor_hash);
2148 }
2149
2150 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2151 assert(af);
2152 assert(bf);
2153 assert(af->location_type == LOCATION_SEEK);
2154 assert(bf->location_type == LOCATION_SEEK);
2155
2156 /* If contents and timestamps match, these entries are
2157 * identical, even if the seqnum does not match */
2158 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2159 af->current_monotonic == bf->current_monotonic &&
2160 af->current_realtime == bf->current_realtime &&
2161 af->current_xor_hash == bf->current_xor_hash)
2162 return 0;
2163
2164 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2165
2166 /* If this is from the same seqnum source, compare
2167 * seqnums */
2168 if (af->current_seqnum < bf->current_seqnum)
2169 return -1;
2170 if (af->current_seqnum > bf->current_seqnum)
2171 return 1;
2172
2173 /* Wow! This is weird, different data but the same
2174 * seqnums? Something is borked, but let's make the
2175 * best of it and compare by time. */
2176 }
2177
2178 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2179
2180 /* If the boot id matches, compare monotonic time */
2181 if (af->current_monotonic < bf->current_monotonic)
2182 return -1;
2183 if (af->current_monotonic > bf->current_monotonic)
2184 return 1;
2185 }
2186
2187 /* Otherwise, compare UTC time */
2188 if (af->current_realtime < bf->current_realtime)
2189 return -1;
2190 if (af->current_realtime > bf->current_realtime)
2191 return 1;
2192
2193 /* Finally, compare by contents */
2194 if (af->current_xor_hash < bf->current_xor_hash)
2195 return -1;
2196 if (af->current_xor_hash > bf->current_xor_hash)
2197 return 1;
2198
2199 return 0;
2200 }
2201
2202 int journal_file_next_entry(
2203 JournalFile *f,
2204 uint64_t p,
2205 direction_t direction,
2206 Object **ret, uint64_t *offset) {
2207
2208 uint64_t i, n, ofs;
2209 int r;
2210
2211 assert(f);
2212
2213 n = le64toh(f->header->n_entries);
2214 if (n <= 0)
2215 return 0;
2216
2217 if (p == 0)
2218 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2219 else {
2220 r = generic_array_bisect(f,
2221 le64toh(f->header->entry_array_offset),
2222 le64toh(f->header->n_entries),
2223 p,
2224 test_object_offset,
2225 DIRECTION_DOWN,
2226 NULL, NULL,
2227 &i);
2228 if (r <= 0)
2229 return r;
2230
2231 if (direction == DIRECTION_DOWN) {
2232 if (i >= n - 1)
2233 return 0;
2234
2235 i++;
2236 } else {
2237 if (i <= 0)
2238 return 0;
2239
2240 i--;
2241 }
2242 }
2243
2244 /* And jump to it */
2245 r = generic_array_get(f,
2246 le64toh(f->header->entry_array_offset),
2247 i,
2248 ret, &ofs);
2249 if (r <= 0)
2250 return r;
2251
2252 if (p > 0 &&
2253 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2254 log_debug("%s: entry array corrupted at entry %"PRIu64,
2255 f->path, i);
2256 return -EBADMSG;
2257 }
2258
2259 if (offset)
2260 *offset = ofs;
2261
2262 return 1;
2263 }
2264
2265 int journal_file_next_entry_for_data(
2266 JournalFile *f,
2267 Object *o, uint64_t p,
2268 uint64_t data_offset,
2269 direction_t direction,
2270 Object **ret, uint64_t *offset) {
2271
2272 uint64_t n, i;
2273 int r;
2274 Object *d;
2275
2276 assert(f);
2277 assert(p > 0 || !o);
2278
2279 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2280 if (r < 0)
2281 return r;
2282
2283 n = le64toh(d->data.n_entries);
2284 if (n <= 0)
2285 return n;
2286
2287 if (!o)
2288 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2289 else {
2290 if (o->object.type != OBJECT_ENTRY)
2291 return -EINVAL;
2292
2293 r = generic_array_bisect_plus_one(f,
2294 le64toh(d->data.entry_offset),
2295 le64toh(d->data.entry_array_offset),
2296 le64toh(d->data.n_entries),
2297 p,
2298 test_object_offset,
2299 DIRECTION_DOWN,
2300 NULL, NULL,
2301 &i);
2302
2303 if (r <= 0)
2304 return r;
2305
2306 if (direction == DIRECTION_DOWN) {
2307 if (i >= n - 1)
2308 return 0;
2309
2310 i++;
2311 } else {
2312 if (i <= 0)
2313 return 0;
2314
2315 i--;
2316 }
2317
2318 }
2319
2320 return generic_array_get_plus_one(f,
2321 le64toh(d->data.entry_offset),
2322 le64toh(d->data.entry_array_offset),
2323 i,
2324 ret, offset);
2325 }
2326
2327 int journal_file_move_to_entry_by_offset_for_data(
2328 JournalFile *f,
2329 uint64_t data_offset,
2330 uint64_t p,
2331 direction_t direction,
2332 Object **ret, uint64_t *offset) {
2333
2334 int r;
2335 Object *d;
2336
2337 assert(f);
2338
2339 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2340 if (r < 0)
2341 return r;
2342
2343 return generic_array_bisect_plus_one(f,
2344 le64toh(d->data.entry_offset),
2345 le64toh(d->data.entry_array_offset),
2346 le64toh(d->data.n_entries),
2347 p,
2348 test_object_offset,
2349 direction,
2350 ret, offset, NULL);
2351 }
2352
2353 int journal_file_move_to_entry_by_monotonic_for_data(
2354 JournalFile *f,
2355 uint64_t data_offset,
2356 sd_id128_t boot_id,
2357 uint64_t monotonic,
2358 direction_t direction,
2359 Object **ret, uint64_t *offset) {
2360
2361 Object *o, *d;
2362 int r;
2363 uint64_t b, z;
2364
2365 assert(f);
2366
2367 /* First, seek by time */
2368 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2369 if (r < 0)
2370 return r;
2371 if (r == 0)
2372 return -ENOENT;
2373
2374 r = generic_array_bisect_plus_one(f,
2375 le64toh(o->data.entry_offset),
2376 le64toh(o->data.entry_array_offset),
2377 le64toh(o->data.n_entries),
2378 monotonic,
2379 test_object_monotonic,
2380 direction,
2381 NULL, &z, NULL);
2382 if (r <= 0)
2383 return r;
2384
2385 /* And now, continue seeking until we find an entry that
2386 * exists in both bisection arrays */
2387
2388 for (;;) {
2389 Object *qo;
2390 uint64_t p, q;
2391
2392 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2393 if (r < 0)
2394 return r;
2395
2396 r = generic_array_bisect_plus_one(f,
2397 le64toh(d->data.entry_offset),
2398 le64toh(d->data.entry_array_offset),
2399 le64toh(d->data.n_entries),
2400 z,
2401 test_object_offset,
2402 direction,
2403 NULL, &p, NULL);
2404 if (r <= 0)
2405 return r;
2406
2407 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2408 if (r < 0)
2409 return r;
2410
2411 r = generic_array_bisect_plus_one(f,
2412 le64toh(o->data.entry_offset),
2413 le64toh(o->data.entry_array_offset),
2414 le64toh(o->data.n_entries),
2415 p,
2416 test_object_offset,
2417 direction,
2418 &qo, &q, NULL);
2419
2420 if (r <= 0)
2421 return r;
2422
2423 if (p == q) {
2424 if (ret)
2425 *ret = qo;
2426 if (offset)
2427 *offset = q;
2428
2429 return 1;
2430 }
2431
2432 z = q;
2433 }
2434 }
2435
2436 int journal_file_move_to_entry_by_seqnum_for_data(
2437 JournalFile *f,
2438 uint64_t data_offset,
2439 uint64_t seqnum,
2440 direction_t direction,
2441 Object **ret, uint64_t *offset) {
2442
2443 Object *d;
2444 int r;
2445
2446 assert(f);
2447
2448 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2449 if (r < 0)
2450 return r;
2451
2452 return generic_array_bisect_plus_one(f,
2453 le64toh(d->data.entry_offset),
2454 le64toh(d->data.entry_array_offset),
2455 le64toh(d->data.n_entries),
2456 seqnum,
2457 test_object_seqnum,
2458 direction,
2459 ret, offset, NULL);
2460 }
2461
2462 int journal_file_move_to_entry_by_realtime_for_data(
2463 JournalFile *f,
2464 uint64_t data_offset,
2465 uint64_t realtime,
2466 direction_t direction,
2467 Object **ret, uint64_t *offset) {
2468
2469 Object *d;
2470 int r;
2471
2472 assert(f);
2473
2474 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2475 if (r < 0)
2476 return r;
2477
2478 return generic_array_bisect_plus_one(f,
2479 le64toh(d->data.entry_offset),
2480 le64toh(d->data.entry_array_offset),
2481 le64toh(d->data.n_entries),
2482 realtime,
2483 test_object_realtime,
2484 direction,
2485 ret, offset, NULL);
2486 }
2487
2488 void journal_file_dump(JournalFile *f) {
2489 Object *o;
2490 int r;
2491 uint64_t p;
2492
2493 assert(f);
2494
2495 journal_file_print_header(f);
2496
2497 p = le64toh(f->header->header_size);
2498 while (p != 0) {
2499 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2500 if (r < 0)
2501 goto fail;
2502
2503 switch (o->object.type) {
2504
2505 case OBJECT_UNUSED:
2506 printf("Type: OBJECT_UNUSED\n");
2507 break;
2508
2509 case OBJECT_DATA:
2510 printf("Type: OBJECT_DATA\n");
2511 break;
2512
2513 case OBJECT_FIELD:
2514 printf("Type: OBJECT_FIELD\n");
2515 break;
2516
2517 case OBJECT_ENTRY:
2518 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2519 le64toh(o->entry.seqnum),
2520 le64toh(o->entry.monotonic),
2521 le64toh(o->entry.realtime));
2522 break;
2523
2524 case OBJECT_FIELD_HASH_TABLE:
2525 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2526 break;
2527
2528 case OBJECT_DATA_HASH_TABLE:
2529 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2530 break;
2531
2532 case OBJECT_ENTRY_ARRAY:
2533 printf("Type: OBJECT_ENTRY_ARRAY\n");
2534 break;
2535
2536 case OBJECT_TAG:
2537 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2538 le64toh(o->tag.seqnum),
2539 le64toh(o->tag.epoch));
2540 break;
2541
2542 default:
2543 printf("Type: unknown (%i)\n", o->object.type);
2544 break;
2545 }
2546
2547 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2548 printf("Flags: %s\n",
2549 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2550
2551 if (p == le64toh(f->header->tail_object_offset))
2552 p = 0;
2553 else
2554 p = p + ALIGN64(le64toh(o->object.size));
2555 }
2556
2557 return;
2558 fail:
2559 log_error("File corrupt");
2560 }
2561
2562 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2563 const char *x;
2564
2565 x = format_timestamp(buf, l, t);
2566 if (x)
2567 return x;
2568 return " --- ";
2569 }
2570
2571 void journal_file_print_header(JournalFile *f) {
2572 char a[33], b[33], c[33], d[33];
2573 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2574 struct stat st;
2575 char bytes[FORMAT_BYTES_MAX];
2576
2577 assert(f);
2578
2579 printf("File Path: %s\n"
2580 "File ID: %s\n"
2581 "Machine ID: %s\n"
2582 "Boot ID: %s\n"
2583 "Sequential Number ID: %s\n"
2584 "State: %s\n"
2585 "Compatible Flags:%s%s\n"
2586 "Incompatible Flags:%s%s%s\n"
2587 "Header size: %"PRIu64"\n"
2588 "Arena size: %"PRIu64"\n"
2589 "Data Hash Table Size: %"PRIu64"\n"
2590 "Field Hash Table Size: %"PRIu64"\n"
2591 "Rotate Suggested: %s\n"
2592 "Head Sequential Number: %"PRIu64"\n"
2593 "Tail Sequential Number: %"PRIu64"\n"
2594 "Head Realtime Timestamp: %s\n"
2595 "Tail Realtime Timestamp: %s\n"
2596 "Tail Monotonic Timestamp: %s\n"
2597 "Objects: %"PRIu64"\n"
2598 "Entry Objects: %"PRIu64"\n",
2599 f->path,
2600 sd_id128_to_string(f->header->file_id, a),
2601 sd_id128_to_string(f->header->machine_id, b),
2602 sd_id128_to_string(f->header->boot_id, c),
2603 sd_id128_to_string(f->header->seqnum_id, d),
2604 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2605 f->header->state == STATE_ONLINE ? "ONLINE" :
2606 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2607 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2608 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2609 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2610 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2611 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2612 le64toh(f->header->header_size),
2613 le64toh(f->header->arena_size),
2614 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2615 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2616 yes_no(journal_file_rotate_suggested(f, 0)),
2617 le64toh(f->header->head_entry_seqnum),
2618 le64toh(f->header->tail_entry_seqnum),
2619 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2620 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2621 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2622 le64toh(f->header->n_objects),
2623 le64toh(f->header->n_entries));
2624
2625 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2626 printf("Data Objects: %"PRIu64"\n"
2627 "Data Hash Table Fill: %.1f%%\n",
2628 le64toh(f->header->n_data),
2629 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2630
2631 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2632 printf("Field Objects: %"PRIu64"\n"
2633 "Field Hash Table Fill: %.1f%%\n",
2634 le64toh(f->header->n_fields),
2635 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2636
2637 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2638 printf("Tag Objects: %"PRIu64"\n",
2639 le64toh(f->header->n_tags));
2640 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2641 printf("Entry Array Objects: %"PRIu64"\n",
2642 le64toh(f->header->n_entry_arrays));
2643
2644 if (fstat(f->fd, &st) >= 0)
2645 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
2646 }
2647
2648 static int journal_file_warn_btrfs(JournalFile *f) {
2649 unsigned attrs;
2650 int r;
2651
2652 assert(f);
2653
2654 /* Before we write anything, check if the COW logic is turned
2655 * off on btrfs. Given our write pattern that is quite
2656 * unfriendly to COW file systems this should greatly improve
2657 * performance on COW file systems, such as btrfs, at the
2658 * expense of data integrity features (which shouldn't be too
2659 * bad, given that we do our own checksumming). */
2660
2661 r = btrfs_is_filesystem(f->fd);
2662 if (r < 0)
2663 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2664 if (!r)
2665 return 0;
2666
2667 r = read_attr_fd(f->fd, &attrs);
2668 if (r < 0)
2669 return log_warning_errno(r, "Failed to read file attributes: %m");
2670
2671 if (attrs & FS_NOCOW_FL) {
2672 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2673 return 0;
2674 }
2675
2676 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2677 "This is likely to slow down journal access substantially, please consider turning "
2678 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2679
2680 return 1;
2681 }
2682
2683 int journal_file_open(
2684 const char *fname,
2685 int flags,
2686 mode_t mode,
2687 bool compress,
2688 bool seal,
2689 JournalMetrics *metrics,
2690 MMapCache *mmap_cache,
2691 JournalFile *template,
2692 JournalFile **ret) {
2693
2694 bool newly_created = false;
2695 JournalFile *f;
2696 void *h;
2697 int r;
2698
2699 assert(fname);
2700 assert(ret);
2701
2702 if ((flags & O_ACCMODE) != O_RDONLY &&
2703 (flags & O_ACCMODE) != O_RDWR)
2704 return -EINVAL;
2705
2706 if (!endswith(fname, ".journal") &&
2707 !endswith(fname, ".journal~"))
2708 return -EINVAL;
2709
2710 f = new0(JournalFile, 1);
2711 if (!f)
2712 return -ENOMEM;
2713
2714 f->fd = -1;
2715 f->mode = mode;
2716
2717 f->flags = flags;
2718 f->prot = prot_from_flags(flags);
2719 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2720 #if defined(HAVE_LZ4)
2721 f->compress_lz4 = compress;
2722 #elif defined(HAVE_XZ)
2723 f->compress_xz = compress;
2724 #endif
2725 #ifdef HAVE_GCRYPT
2726 f->seal = seal;
2727 #endif
2728
2729 if (mmap_cache)
2730 f->mmap = mmap_cache_ref(mmap_cache);
2731 else {
2732 f->mmap = mmap_cache_new();
2733 if (!f->mmap) {
2734 r = -ENOMEM;
2735 goto fail;
2736 }
2737 }
2738
2739 f->path = strdup(fname);
2740 if (!f->path) {
2741 r = -ENOMEM;
2742 goto fail;
2743 }
2744
2745 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2746 if (!f->chain_cache) {
2747 r = -ENOMEM;
2748 goto fail;
2749 }
2750
2751 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2752 if (f->fd < 0) {
2753 r = -errno;
2754 goto fail;
2755 }
2756
2757 r = journal_file_fstat(f);
2758 if (r < 0)
2759 goto fail;
2760
2761 if (f->last_stat.st_size == 0 && f->writable) {
2762
2763 (void) journal_file_warn_btrfs(f);
2764
2765 /* Let's attach the creation time to the journal file,
2766 * so that the vacuuming code knows the age of this
2767 * file even if the file might end up corrupted one
2768 * day... Ideally we'd just use the creation time many
2769 * file systems maintain for each file, but there is
2770 * currently no usable API to query this, hence let's
2771 * emulate this via extended attributes. If extended
2772 * attributes are not supported we'll just skip this,
2773 * and rely solely on mtime/atime/ctime of the file. */
2774
2775 fd_setcrtime(f->fd, 0);
2776
2777 #ifdef HAVE_GCRYPT
2778 /* Try to load the FSPRG state, and if we can't, then
2779 * just don't do sealing */
2780 if (f->seal) {
2781 r = journal_file_fss_load(f);
2782 if (r < 0)
2783 f->seal = false;
2784 }
2785 #endif
2786
2787 r = journal_file_init_header(f, template);
2788 if (r < 0)
2789 goto fail;
2790
2791 r = journal_file_fstat(f);
2792 if (r < 0)
2793 goto fail;
2794
2795 newly_created = true;
2796 }
2797
2798 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2799 r = -ENODATA;
2800 goto fail;
2801 }
2802
2803 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2804 if (r < 0)
2805 goto fail;
2806
2807 f->header = h;
2808
2809 if (!newly_created) {
2810 r = journal_file_verify_header(f);
2811 if (r < 0)
2812 goto fail;
2813 }
2814
2815 #ifdef HAVE_GCRYPT
2816 if (!newly_created && f->writable) {
2817 r = journal_file_fss_load(f);
2818 if (r < 0)
2819 goto fail;
2820 }
2821 #endif
2822
2823 if (f->writable) {
2824 if (metrics) {
2825 journal_default_metrics(metrics, f->fd);
2826 f->metrics = *metrics;
2827 } else if (template)
2828 f->metrics = template->metrics;
2829
2830 r = journal_file_refresh_header(f);
2831 if (r < 0)
2832 goto fail;
2833 }
2834
2835 #ifdef HAVE_GCRYPT
2836 r = journal_file_hmac_setup(f);
2837 if (r < 0)
2838 goto fail;
2839 #endif
2840
2841 if (newly_created) {
2842 r = journal_file_setup_field_hash_table(f);
2843 if (r < 0)
2844 goto fail;
2845
2846 r = journal_file_setup_data_hash_table(f);
2847 if (r < 0)
2848 goto fail;
2849
2850 #ifdef HAVE_GCRYPT
2851 r = journal_file_append_first_tag(f);
2852 if (r < 0)
2853 goto fail;
2854 #endif
2855 }
2856
2857 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2858 r = -EIO;
2859 goto fail;
2860 }
2861
2862 if (template && template->post_change_timer) {
2863 r = journal_file_enable_post_change_timer(
2864 f,
2865 sd_event_source_get_event(template->post_change_timer),
2866 template->post_change_timer_period);
2867
2868 if (r < 0)
2869 goto fail;
2870 }
2871
2872 *ret = f;
2873 return 0;
2874
2875 fail:
2876 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2877 r = -EIO;
2878
2879 journal_file_close(f);
2880
2881 return r;
2882 }
2883
2884 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2885 _cleanup_free_ char *p = NULL;
2886 size_t l;
2887 JournalFile *old_file, *new_file = NULL;
2888 int r;
2889
2890 assert(f);
2891 assert(*f);
2892
2893 old_file = *f;
2894
2895 if (!old_file->writable)
2896 return -EINVAL;
2897
2898 if (!endswith(old_file->path, ".journal"))
2899 return -EINVAL;
2900
2901 l = strlen(old_file->path);
2902 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2903 (int) l - 8, old_file->path,
2904 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2905 le64toh((*f)->header->head_entry_seqnum),
2906 le64toh((*f)->header->head_entry_realtime));
2907 if (r < 0)
2908 return -ENOMEM;
2909
2910 /* Try to rename the file to the archived version. If the file
2911 * already was deleted, we'll get ENOENT, let's ignore that
2912 * case. */
2913 r = rename(old_file->path, p);
2914 if (r < 0 && errno != ENOENT)
2915 return -errno;
2916
2917 old_file->header->state = STATE_ARCHIVED;
2918
2919 /* Currently, btrfs is not very good with out write patterns
2920 * and fragments heavily. Let's defrag our journal files when
2921 * we archive them */
2922 old_file->defrag_on_close = true;
2923
2924 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2925 journal_file_close(old_file);
2926
2927 *f = new_file;
2928 return r;
2929 }
2930
2931 int journal_file_open_reliably(
2932 const char *fname,
2933 int flags,
2934 mode_t mode,
2935 bool compress,
2936 bool seal,
2937 JournalMetrics *metrics,
2938 MMapCache *mmap_cache,
2939 JournalFile *template,
2940 JournalFile **ret) {
2941
2942 int r;
2943 size_t l;
2944 _cleanup_free_ char *p = NULL;
2945
2946 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2947 if (!IN_SET(r,
2948 -EBADMSG, /* corrupted */
2949 -ENODATA, /* truncated */
2950 -EHOSTDOWN, /* other machine */
2951 -EPROTONOSUPPORT, /* incompatible feature */
2952 -EBUSY, /* unclean shutdown */
2953 -ESHUTDOWN, /* already archived */
2954 -EIO, /* IO error, including SIGBUS on mmap */
2955 -EIDRM /* File has been deleted */))
2956 return r;
2957
2958 if ((flags & O_ACCMODE) == O_RDONLY)
2959 return r;
2960
2961 if (!(flags & O_CREAT))
2962 return r;
2963
2964 if (!endswith(fname, ".journal"))
2965 return r;
2966
2967 /* The file is corrupted. Rotate it away and try it again (but only once) */
2968
2969 l = strlen(fname);
2970 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2971 (int) l - 8, fname,
2972 now(CLOCK_REALTIME),
2973 random_u64()) < 0)
2974 return -ENOMEM;
2975
2976 if (rename(fname, p) < 0)
2977 return -errno;
2978
2979 /* btrfs doesn't cope well with our write pattern and
2980 * fragments heavily. Let's defrag all files we rotate */
2981
2982 (void) chattr_path(p, false, FS_NOCOW_FL);
2983 (void) btrfs_defrag(p);
2984
2985 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2986
2987 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
2988 }
2989
2990 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2991 uint64_t i, n;
2992 uint64_t q, xor_hash = 0;
2993 int r;
2994 EntryItem *items;
2995 dual_timestamp ts;
2996
2997 assert(from);
2998 assert(to);
2999 assert(o);
3000 assert(p);
3001
3002 if (!to->writable)
3003 return -EPERM;
3004
3005 ts.monotonic = le64toh(o->entry.monotonic);
3006 ts.realtime = le64toh(o->entry.realtime);
3007
3008 n = journal_file_entry_n_items(o);
3009 /* alloca() can't take 0, hence let's allocate at least one */
3010 items = alloca(sizeof(EntryItem) * MAX(1u, n));
3011
3012 for (i = 0; i < n; i++) {
3013 uint64_t l, h;
3014 le64_t le_hash;
3015 size_t t;
3016 void *data;
3017 Object *u;
3018
3019 q = le64toh(o->entry.items[i].object_offset);
3020 le_hash = o->entry.items[i].hash;
3021
3022 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3023 if (r < 0)
3024 return r;
3025
3026 if (le_hash != o->data.hash)
3027 return -EBADMSG;
3028
3029 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3030 t = (size_t) l;
3031
3032 /* We hit the limit on 32bit machines */
3033 if ((uint64_t) t != l)
3034 return -E2BIG;
3035
3036 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3037 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
3038 size_t rsize = 0;
3039
3040 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3041 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3042 if (r < 0)
3043 return r;
3044
3045 data = from->compress_buffer;
3046 l = rsize;
3047 #else
3048 return -EPROTONOSUPPORT;
3049 #endif
3050 } else
3051 data = o->data.payload;
3052
3053 r = journal_file_append_data(to, data, l, &u, &h);
3054 if (r < 0)
3055 return r;
3056
3057 xor_hash ^= le64toh(u->data.hash);
3058 items[i].object_offset = htole64(h);
3059 items[i].hash = u->data.hash;
3060
3061 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3062 if (r < 0)
3063 return r;
3064 }
3065
3066 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3067
3068 if (mmap_cache_got_sigbus(to->mmap, to->fd))
3069 return -EIO;
3070
3071 return r;
3072 }
3073
3074 void journal_reset_metrics(JournalMetrics *m) {
3075 assert(m);
3076
3077 /* Set everything to "pick automatic values". */
3078
3079 *m = (JournalMetrics) {
3080 .min_use = (uint64_t) -1,
3081 .max_use = (uint64_t) -1,
3082 .min_size = (uint64_t) -1,
3083 .max_size = (uint64_t) -1,
3084 .keep_free = (uint64_t) -1,
3085 .n_max_files = (uint64_t) -1,
3086 };
3087 }
3088
3089 void journal_default_metrics(JournalMetrics *m, int fd) {
3090 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3091 struct statvfs ss;
3092 uint64_t fs_size;
3093
3094 assert(m);
3095 assert(fd >= 0);
3096
3097 if (fstatvfs(fd, &ss) >= 0)
3098 fs_size = ss.f_frsize * ss.f_blocks;
3099 else {
3100 log_debug_errno(errno, "Failed to detremine disk size: %m");
3101 fs_size = 0;
3102 }
3103
3104 if (m->max_use == (uint64_t) -1) {
3105
3106 if (fs_size > 0) {
3107 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3108
3109 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3110 m->max_use = DEFAULT_MAX_USE_UPPER;
3111
3112 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3113 m->max_use = DEFAULT_MAX_USE_LOWER;
3114 } else
3115 m->max_use = DEFAULT_MAX_USE_LOWER;
3116 } else {
3117 m->max_use = PAGE_ALIGN(m->max_use);
3118
3119 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3120 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3121 }
3122
3123 if (m->min_use == (uint64_t) -1)
3124 m->min_use = DEFAULT_MIN_USE;
3125
3126 if (m->min_use > m->max_use)
3127 m->min_use = m->max_use;
3128
3129 if (m->max_size == (uint64_t) -1) {
3130 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3131
3132 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3133 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3134 } else
3135 m->max_size = PAGE_ALIGN(m->max_size);
3136
3137 if (m->max_size != 0) {
3138 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3139 m->max_size = JOURNAL_FILE_SIZE_MIN;
3140
3141 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3142 m->max_use = m->max_size*2;
3143 }
3144
3145 if (m->min_size == (uint64_t) -1)
3146 m->min_size = JOURNAL_FILE_SIZE_MIN;
3147 else {
3148 m->min_size = PAGE_ALIGN(m->min_size);
3149
3150 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3151 m->min_size = JOURNAL_FILE_SIZE_MIN;
3152
3153 if (m->max_size != 0 && m->min_size > m->max_size)
3154 m->max_size = m->min_size;
3155 }
3156
3157 if (m->keep_free == (uint64_t) -1) {
3158
3159 if (fs_size > 0) {
3160 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3161
3162 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3163 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3164
3165 } else
3166 m->keep_free = DEFAULT_KEEP_FREE;
3167 }
3168
3169 if (m->n_max_files == (uint64_t) -1)
3170 m->n_max_files = DEFAULT_N_MAX_FILES;
3171
3172 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3173 format_bytes(a, sizeof(a), m->min_use),
3174 format_bytes(b, sizeof(b), m->max_use),
3175 format_bytes(c, sizeof(c), m->max_size),
3176 format_bytes(d, sizeof(d), m->min_size),
3177 format_bytes(e, sizeof(e), m->keep_free),
3178 m->n_max_files);
3179 }
3180
3181 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3182 assert(f);
3183 assert(from || to);
3184
3185 if (from) {
3186 if (f->header->head_entry_realtime == 0)
3187 return -ENOENT;
3188
3189 *from = le64toh(f->header->head_entry_realtime);
3190 }
3191
3192 if (to) {
3193 if (f->header->tail_entry_realtime == 0)
3194 return -ENOENT;
3195
3196 *to = le64toh(f->header->tail_entry_realtime);
3197 }
3198
3199 return 1;
3200 }
3201
3202 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3203 Object *o;
3204 uint64_t p;
3205 int r;
3206
3207 assert(f);
3208 assert(from || to);
3209
3210 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3211 if (r <= 0)
3212 return r;
3213
3214 if (le64toh(o->data.n_entries) <= 0)
3215 return 0;
3216
3217 if (from) {
3218 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3219 if (r < 0)
3220 return r;
3221
3222 *from = le64toh(o->entry.monotonic);
3223 }
3224
3225 if (to) {
3226 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3227 if (r < 0)
3228 return r;
3229
3230 r = generic_array_get_plus_one(f,
3231 le64toh(o->data.entry_offset),
3232 le64toh(o->data.entry_array_offset),
3233 le64toh(o->data.n_entries)-1,
3234 &o, NULL);
3235 if (r <= 0)
3236 return r;
3237
3238 *to = le64toh(o->entry.monotonic);
3239 }
3240
3241 return 1;
3242 }
3243
3244 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3245 assert(f);
3246
3247 /* If we gained new header fields we gained new features,
3248 * hence suggest a rotation */
3249 if (le64toh(f->header->header_size) < sizeof(Header)) {
3250 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3251 return true;
3252 }
3253
3254 /* Let's check if the hash tables grew over a certain fill
3255 * level (75%, borrowing this value from Java's hash table
3256 * implementation), and if so suggest a rotation. To calculate
3257 * the fill level we need the n_data field, which only exists
3258 * in newer versions. */
3259
3260 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3261 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3262 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3263 f->path,
3264 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3265 le64toh(f->header->n_data),
3266 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3267 (unsigned long long) f->last_stat.st_size,
3268 f->last_stat.st_size / le64toh(f->header->n_data));
3269 return true;
3270 }
3271
3272 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3273 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3274 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3275 f->path,
3276 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3277 le64toh(f->header->n_fields),
3278 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3279 return true;
3280 }
3281
3282 /* Are the data objects properly indexed by field objects? */
3283 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3284 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3285 le64toh(f->header->n_data) > 0 &&
3286 le64toh(f->header->n_fields) == 0)
3287 return true;
3288
3289 if (max_file_usec > 0) {
3290 usec_t t, h;
3291
3292 h = le64toh(f->header->head_entry_realtime);
3293 t = now(CLOCK_REALTIME);
3294
3295 if (h > 0 && t > h + max_file_usec)
3296 return true;
3297 }
3298
3299 return false;
3300 }