]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
Merge pull request #13994 from keszybz/bpf-refactor
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/fs.h>
6 #include <pthread.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/statvfs.h>
10 #include <sys/uio.h>
11 #include <unistd.h>
12
13 #include "sd-event.h"
14
15 #include "alloc-util.h"
16 #include "btrfs-util.h"
17 #include "chattr-util.h"
18 #include "compress.h"
19 #include "fd-util.h"
20 #include "format-util.h"
21 #include "fs-util.h"
22 #include "journal-authenticate.h"
23 #include "journal-def.h"
24 #include "journal-file.h"
25 #include "lookup3.h"
26 #include "memory-util.h"
27 #include "path-util.h"
28 #include "random-util.h"
29 #include "set.h"
30 #include "sort-util.h"
31 #include "stat-util.h"
32 #include "string-util.h"
33 #include "strv.h"
34 #include "xattr-util.h"
35
36 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
37 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
38
39 #define DEFAULT_COMPRESS_THRESHOLD (512ULL)
40 #define MIN_COMPRESS_THRESHOLD (8ULL)
41
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL) /* 512 KiB */
44
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define MAX_USE_LOWER (1 * 1024 * 1024ULL) /* 1 MiB */
48 #define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
49
50 /* Those are the lower and upper bounds for the minimal use limit,
51 * i.e. how much we'll use even if keep_free suggests otherwise. */
52 #define MIN_USE_LOW (1 * 1024 * 1024ULL) /* 1 MiB */
53 #define MIN_USE_HIGH (16 * 1024 * 1024ULL) /* 16 MiB */
54
55 /* This is the upper bound if we deduce max_size from max_use */
56 #define MAX_SIZE_UPPER (128 * 1024 * 1024ULL) /* 128 MiB */
57
58 /* This is the upper bound if we deduce the keep_free value from the
59 * file system size */
60 #define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
61
62 /* This is the keep_free value when we can't determine the system
63 * size */
64 #define DEFAULT_KEEP_FREE (1024 * 1024ULL) /* 1 MB */
65
66 /* This is the default maximum number of journal files to keep around. */
67 #define DEFAULT_N_MAX_FILES 100
68
69 /* n_data was the first entry we added after the initial file format design */
70 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
71
72 /* How many entries to keep in the entry array chain cache at max */
73 #define CHAIN_CACHE_MAX 20
74
75 /* How much to increase the journal file size at once each time we allocate something new. */
76 #define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL) /* 8MB */
77
78 /* Reread fstat() of the file for detecting deletions at least this often */
79 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
80
81 /* The mmap context to use for the header: we pick the one just above the last defined object type */
82 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
83
84 #ifdef __clang__
85 # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
86 #endif
87
88 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
89 * As a result we use atomic operations on f->offline_state for inter-thread communications with
90 * journal_file_set_offline() and journal_file_set_online(). */
91 static void journal_file_set_offline_internal(JournalFile *f) {
92 assert(f);
93 assert(f->fd >= 0);
94 assert(f->header);
95
96 for (;;) {
97 switch (f->offline_state) {
98 case OFFLINE_CANCEL:
99 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
100 continue;
101 return;
102
103 case OFFLINE_AGAIN_FROM_SYNCING:
104 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
105 continue;
106 break;
107
108 case OFFLINE_AGAIN_FROM_OFFLINING:
109 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
110 continue;
111 break;
112
113 case OFFLINE_SYNCING:
114 (void) fsync(f->fd);
115
116 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
117 continue;
118
119 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
120 (void) fsync(f->fd);
121 break;
122
123 case OFFLINE_OFFLINING:
124 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
125 continue;
126 _fallthrough_;
127 case OFFLINE_DONE:
128 return;
129
130 case OFFLINE_JOINED:
131 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
132 return;
133 }
134 }
135 }
136
137 static void * journal_file_set_offline_thread(void *arg) {
138 JournalFile *f = arg;
139
140 (void) pthread_setname_np(pthread_self(), "journal-offline");
141
142 journal_file_set_offline_internal(f);
143
144 return NULL;
145 }
146
147 static int journal_file_set_offline_thread_join(JournalFile *f) {
148 int r;
149
150 assert(f);
151
152 if (f->offline_state == OFFLINE_JOINED)
153 return 0;
154
155 r = pthread_join(f->offline_thread, NULL);
156 if (r)
157 return -r;
158
159 f->offline_state = OFFLINE_JOINED;
160
161 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
162 return -EIO;
163
164 return 0;
165 }
166
167 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
168 static bool journal_file_set_offline_try_restart(JournalFile *f) {
169 for (;;) {
170 switch (f->offline_state) {
171 case OFFLINE_AGAIN_FROM_SYNCING:
172 case OFFLINE_AGAIN_FROM_OFFLINING:
173 return true;
174
175 case OFFLINE_CANCEL:
176 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
177 continue;
178 return true;
179
180 case OFFLINE_SYNCING:
181 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
182 continue;
183 return true;
184
185 case OFFLINE_OFFLINING:
186 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
187 continue;
188 return true;
189
190 default:
191 return false;
192 }
193 }
194 }
195
196 /* Sets a journal offline.
197 *
198 * If wait is false then an offline is dispatched in a separate thread for a
199 * subsequent journal_file_set_offline() or journal_file_set_online() of the
200 * same journal to synchronize with.
201 *
202 * If wait is true, then either an existing offline thread will be restarted
203 * and joined, or if none exists the offline is simply performed in this
204 * context without involving another thread.
205 */
206 int journal_file_set_offline(JournalFile *f, bool wait) {
207 bool restarted;
208 int r;
209
210 assert(f);
211
212 if (!f->writable)
213 return -EPERM;
214
215 if (f->fd < 0 || !f->header)
216 return -EINVAL;
217
218 /* An offlining journal is implicitly online and may modify f->header->state,
219 * we must also join any potentially lingering offline thread when not online. */
220 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
221 return journal_file_set_offline_thread_join(f);
222
223 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
224 restarted = journal_file_set_offline_try_restart(f);
225 if ((restarted && wait) || !restarted) {
226 r = journal_file_set_offline_thread_join(f);
227 if (r < 0)
228 return r;
229 }
230
231 if (restarted)
232 return 0;
233
234 /* Initiate a new offline. */
235 f->offline_state = OFFLINE_SYNCING;
236
237 if (wait) /* Without using a thread if waiting. */
238 journal_file_set_offline_internal(f);
239 else {
240 sigset_t ss, saved_ss;
241 int k;
242
243 assert_se(sigfillset(&ss) >= 0);
244 /* Don't block SIGBUS since the offlining thread accesses a memory mapped file.
245 * Asynchronous SIGBUS signals can safely be handled by either thread. */
246 assert_se(sigdelset(&ss, SIGBUS) >= 0);
247
248 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
249 if (r > 0)
250 return -r;
251
252 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
253
254 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
255 if (r > 0) {
256 f->offline_state = OFFLINE_JOINED;
257 return -r;
258 }
259 if (k > 0)
260 return -k;
261 }
262
263 return 0;
264 }
265
266 static int journal_file_set_online(JournalFile *f) {
267 bool wait = true;
268
269 assert(f);
270
271 if (!f->writable)
272 return -EPERM;
273
274 if (f->fd < 0 || !f->header)
275 return -EINVAL;
276
277 while (wait) {
278 switch (f->offline_state) {
279 case OFFLINE_JOINED:
280 /* No offline thread, no need to wait. */
281 wait = false;
282 break;
283
284 case OFFLINE_SYNCING:
285 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
286 continue;
287 /* Canceled syncing prior to offlining, no need to wait. */
288 wait = false;
289 break;
290
291 case OFFLINE_AGAIN_FROM_SYNCING:
292 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
293 continue;
294 /* Canceled restart from syncing, no need to wait. */
295 wait = false;
296 break;
297
298 case OFFLINE_AGAIN_FROM_OFFLINING:
299 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
300 continue;
301 /* Canceled restart from offlining, must wait for offlining to complete however. */
302 _fallthrough_;
303 default: {
304 int r;
305
306 r = journal_file_set_offline_thread_join(f);
307 if (r < 0)
308 return r;
309
310 wait = false;
311 break;
312 }
313 }
314 }
315
316 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
317 return -EIO;
318
319 switch (f->header->state) {
320 case STATE_ONLINE:
321 return 0;
322
323 case STATE_OFFLINE:
324 f->header->state = STATE_ONLINE;
325 (void) fsync(f->fd);
326 return 0;
327
328 default:
329 return -EINVAL;
330 }
331 }
332
333 bool journal_file_is_offlining(JournalFile *f) {
334 assert(f);
335
336 __sync_synchronize();
337
338 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
339 return false;
340
341 return true;
342 }
343
344 JournalFile* journal_file_close(JournalFile *f) {
345 if (!f)
346 return NULL;
347
348 #if HAVE_GCRYPT
349 /* Write the final tag */
350 if (f->seal && f->writable) {
351 int r;
352
353 r = journal_file_append_tag(f);
354 if (r < 0)
355 log_error_errno(r, "Failed to append tag when closing journal: %m");
356 }
357 #endif
358
359 if (f->post_change_timer) {
360 if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 0)
361 journal_file_post_change(f);
362
363 sd_event_source_disable_unref(f->post_change_timer);
364 }
365
366 journal_file_set_offline(f, true);
367
368 if (f->mmap && f->cache_fd)
369 mmap_cache_free_fd(f->mmap, f->cache_fd);
370
371 if (f->fd >= 0 && f->defrag_on_close) {
372
373 /* Be friendly to btrfs: turn COW back on again now,
374 * and defragment the file. We won't write to the file
375 * ever again, hence remove all fragmentation, and
376 * reenable all the good bits COW usually provides
377 * (such as data checksumming). */
378
379 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL);
380 (void) btrfs_defrag_fd(f->fd);
381 }
382
383 if (f->close_fd)
384 safe_close(f->fd);
385 free(f->path);
386
387 mmap_cache_unref(f->mmap);
388
389 ordered_hashmap_free_free(f->chain_cache);
390
391 #if HAVE_XZ || HAVE_LZ4
392 free(f->compress_buffer);
393 #endif
394
395 #if HAVE_GCRYPT
396 if (f->fss_file)
397 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
398 else
399 free(f->fsprg_state);
400
401 free(f->fsprg_seed);
402
403 if (f->hmac)
404 gcry_md_close(f->hmac);
405 #endif
406
407 return mfree(f);
408 }
409
410 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
411 Header h = {};
412 ssize_t k;
413 int r;
414
415 assert(f);
416
417 memcpy(h.signature, HEADER_SIGNATURE, 8);
418 h.header_size = htole64(ALIGN64(sizeof(h)));
419
420 h.incompatible_flags |= htole32(
421 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
422 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
423
424 h.compatible_flags = htole32(
425 f->seal * HEADER_COMPATIBLE_SEALED);
426
427 r = sd_id128_randomize(&h.file_id);
428 if (r < 0)
429 return r;
430
431 if (template) {
432 h.seqnum_id = template->header->seqnum_id;
433 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
434 } else
435 h.seqnum_id = h.file_id;
436
437 k = pwrite(f->fd, &h, sizeof(h), 0);
438 if (k < 0)
439 return -errno;
440
441 if (k != sizeof(h))
442 return -EIO;
443
444 return 0;
445 }
446
447 static int journal_file_refresh_header(JournalFile *f) {
448 sd_id128_t boot_id;
449 int r;
450
451 assert(f);
452 assert(f->header);
453
454 r = sd_id128_get_machine(&f->header->machine_id);
455 if (IN_SET(r, -ENOENT, -ENOMEDIUM))
456 /* We don't have a machine-id, let's continue without */
457 zero(f->header->machine_id);
458 else if (r < 0)
459 return r;
460
461 r = sd_id128_get_boot(&boot_id);
462 if (r < 0)
463 return r;
464
465 f->header->boot_id = boot_id;
466
467 r = journal_file_set_online(f);
468
469 /* Sync the online state to disk */
470 (void) fsync(f->fd);
471
472 /* We likely just created a new file, also sync the directory this file is located in. */
473 (void) fsync_directory_of_file(f->fd);
474
475 return r;
476 }
477
478 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
479 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
480 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
481 const char *type = compatible ? "compatible" : "incompatible";
482 uint32_t flags;
483
484 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
485
486 if (flags & ~supported) {
487 if (flags & ~any)
488 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
489 f->path, type, flags & ~any);
490 flags = (flags & any) & ~supported;
491 if (flags) {
492 const char* strv[3];
493 unsigned n = 0;
494 _cleanup_free_ char *t = NULL;
495
496 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
497 strv[n++] = "sealed";
498 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
499 strv[n++] = "xz-compressed";
500 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
501 strv[n++] = "lz4-compressed";
502 strv[n] = NULL;
503 assert(n < ELEMENTSOF(strv));
504
505 t = strv_join((char**) strv, ", ");
506 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
507 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
508 }
509 return true;
510 }
511
512 return false;
513 }
514
515 static int journal_file_verify_header(JournalFile *f) {
516 uint64_t arena_size, header_size;
517
518 assert(f);
519 assert(f->header);
520
521 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
522 return -EBADMSG;
523
524 /* In both read and write mode we refuse to open files with incompatible
525 * flags we don't know. */
526 if (warn_wrong_flags(f, false))
527 return -EPROTONOSUPPORT;
528
529 /* When open for writing we refuse to open files with compatible flags, too. */
530 if (f->writable && warn_wrong_flags(f, true))
531 return -EPROTONOSUPPORT;
532
533 if (f->header->state >= _STATE_MAX)
534 return -EBADMSG;
535
536 header_size = le64toh(f->header->header_size);
537
538 /* The first addition was n_data, so check that we are at least this large */
539 if (header_size < HEADER_SIZE_MIN)
540 return -EBADMSG;
541
542 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
543 return -EBADMSG;
544
545 arena_size = le64toh(f->header->arena_size);
546
547 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
548 return -ENODATA;
549
550 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
551 return -ENODATA;
552
553 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
554 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
555 !VALID64(le64toh(f->header->tail_object_offset)) ||
556 !VALID64(le64toh(f->header->entry_array_offset)))
557 return -ENODATA;
558
559 if (f->writable) {
560 sd_id128_t machine_id;
561 uint8_t state;
562 int r;
563
564 r = sd_id128_get_machine(&machine_id);
565 if (r < 0)
566 return r;
567
568 if (!sd_id128_equal(machine_id, f->header->machine_id))
569 return -EHOSTDOWN;
570
571 state = f->header->state;
572
573 if (state == STATE_ARCHIVED)
574 return -ESHUTDOWN; /* Already archived */
575 else if (state == STATE_ONLINE)
576 return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
577 "Journal file %s is already online. Assuming unclean closing.",
578 f->path);
579 else if (state != STATE_OFFLINE)
580 return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
581 "Journal file %s has unknown state %i.",
582 f->path, state);
583
584 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
585 return -EBADMSG;
586
587 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
588 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
589 * bisection. */
590 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
591 return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
592 "Journal file %s is from the future, refusing to append new data to it that'd be older.",
593 f->path);
594 }
595
596 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
597 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
598
599 f->seal = JOURNAL_HEADER_SEALED(f->header);
600
601 return 0;
602 }
603
604 static int journal_file_fstat(JournalFile *f) {
605 int r;
606
607 assert(f);
608 assert(f->fd >= 0);
609
610 if (fstat(f->fd, &f->last_stat) < 0)
611 return -errno;
612
613 f->last_stat_usec = now(CLOCK_MONOTONIC);
614
615 /* Refuse dealing with with files that aren't regular */
616 r = stat_verify_regular(&f->last_stat);
617 if (r < 0)
618 return r;
619
620 /* Refuse appending to files that are already deleted */
621 if (f->last_stat.st_nlink <= 0)
622 return -EIDRM;
623
624 return 0;
625 }
626
627 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
628 uint64_t old_size, new_size;
629 int r;
630
631 assert(f);
632 assert(f->header);
633
634 /* We assume that this file is not sparse, and we know that
635 * for sure, since we always call posix_fallocate()
636 * ourselves */
637
638 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
639 return -EIO;
640
641 old_size =
642 le64toh(f->header->header_size) +
643 le64toh(f->header->arena_size);
644
645 new_size = PAGE_ALIGN(offset + size);
646 if (new_size < le64toh(f->header->header_size))
647 new_size = le64toh(f->header->header_size);
648
649 if (new_size <= old_size) {
650
651 /* We already pre-allocated enough space, but before
652 * we write to it, let's check with fstat() if the
653 * file got deleted, in order make sure we don't throw
654 * away the data immediately. Don't check fstat() for
655 * all writes though, but only once ever 10s. */
656
657 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
658 return 0;
659
660 return journal_file_fstat(f);
661 }
662
663 /* Allocate more space. */
664
665 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
666 return -E2BIG;
667
668 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
669 struct statvfs svfs;
670
671 if (fstatvfs(f->fd, &svfs) >= 0) {
672 uint64_t available;
673
674 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
675
676 if (new_size - old_size > available)
677 return -E2BIG;
678 }
679 }
680
681 /* Increase by larger blocks at once */
682 new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
683 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
684 new_size = f->metrics.max_size;
685
686 /* Note that the glibc fallocate() fallback is very
687 inefficient, hence we try to minimize the allocation area
688 as we can. */
689 r = posix_fallocate(f->fd, old_size, new_size - old_size);
690 if (r != 0)
691 return -r;
692
693 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
694
695 return journal_file_fstat(f);
696 }
697
698 static unsigned type_to_context(ObjectType type) {
699 /* One context for each type, plus one catch-all for the rest */
700 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
701 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
702 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
703 }
704
705 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
706 int r;
707
708 assert(f);
709 assert(ret);
710
711 if (size <= 0)
712 return -EINVAL;
713
714 /* Avoid SIGBUS on invalid accesses */
715 if (offset + size > (uint64_t) f->last_stat.st_size) {
716 /* Hmm, out of range? Let's refresh the fstat() data
717 * first, before we trust that check. */
718
719 r = journal_file_fstat(f);
720 if (r < 0)
721 return r;
722
723 if (offset + size > (uint64_t) f->last_stat.st_size)
724 return -EADDRNOTAVAIL;
725 }
726
727 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
728 }
729
730 static uint64_t minimum_header_size(Object *o) {
731
732 static const uint64_t table[] = {
733 [OBJECT_DATA] = sizeof(DataObject),
734 [OBJECT_FIELD] = sizeof(FieldObject),
735 [OBJECT_ENTRY] = sizeof(EntryObject),
736 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
737 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
738 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
739 [OBJECT_TAG] = sizeof(TagObject),
740 };
741
742 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
743 return sizeof(ObjectHeader);
744
745 return table[o->object.type];
746 }
747
748 /* Lightweight object checks. We want this to be fast, so that we won't
749 * slowdown every journal_file_move_to_object() call too much. */
750 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
751 assert(f);
752 assert(o);
753
754 switch (o->object.type) {
755
756 case OBJECT_DATA:
757 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
758 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
759 "Bad n_entries: %" PRIu64 ": %" PRIu64,
760 le64toh(o->data.n_entries),
761 offset);
762
763 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0)
764 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
765 "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
766 offsetof(DataObject, payload),
767 le64toh(o->object.size),
768 offset);
769
770 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
771 !VALID64(le64toh(o->data.next_field_offset)) ||
772 !VALID64(le64toh(o->data.entry_offset)) ||
773 !VALID64(le64toh(o->data.entry_array_offset)))
774 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
775 "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
776 le64toh(o->data.next_hash_offset),
777 le64toh(o->data.next_field_offset),
778 le64toh(o->data.entry_offset),
779 le64toh(o->data.entry_array_offset),
780 offset);
781
782 break;
783
784 case OBJECT_FIELD:
785 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0)
786 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
787 "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
788 offsetof(FieldObject, payload),
789 le64toh(o->object.size),
790 offset);
791
792 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
793 !VALID64(le64toh(o->field.head_data_offset)))
794 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
795 "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
796 le64toh(o->field.next_hash_offset),
797 le64toh(o->field.head_data_offset),
798 offset);
799 break;
800
801 case OBJECT_ENTRY:
802 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0)
803 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
804 "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
805 offsetof(EntryObject, items),
806 le64toh(o->object.size),
807 offset);
808
809 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0)
810 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
811 "Invalid number items in entry: %" PRIu64 ": %" PRIu64,
812 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
813 offset);
814
815 if (le64toh(o->entry.seqnum) <= 0)
816 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
817 "Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
818 le64toh(o->entry.seqnum),
819 offset);
820
821 if (!VALID_REALTIME(le64toh(o->entry.realtime)))
822 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
823 "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
824 le64toh(o->entry.realtime),
825 offset);
826
827 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
828 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
829 "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
830 le64toh(o->entry.monotonic),
831 offset);
832
833 break;
834
835 case OBJECT_DATA_HASH_TABLE:
836 case OBJECT_FIELD_HASH_TABLE:
837 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
838 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0)
839 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
840 "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
841 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
842 le64toh(o->object.size),
843 offset);
844
845 break;
846
847 case OBJECT_ENTRY_ARRAY:
848 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
849 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0)
850 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
851 "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
852 le64toh(o->object.size),
853 offset);
854
855 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
856 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
857 "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
858 le64toh(o->entry_array.next_entry_array_offset),
859 offset);
860
861 break;
862
863 case OBJECT_TAG:
864 if (le64toh(o->object.size) != sizeof(TagObject))
865 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
866 "Invalid object tag size: %" PRIu64 ": %" PRIu64,
867 le64toh(o->object.size),
868 offset);
869
870 if (!VALID_EPOCH(le64toh(o->tag.epoch)))
871 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
872 "Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
873 le64toh(o->tag.epoch), offset);
874
875 break;
876 }
877
878 return 0;
879 }
880
881 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
882 int r;
883 void *t;
884 size_t tsize;
885 Object *o;
886 uint64_t s;
887
888 assert(f);
889 assert(ret);
890
891 /* Objects may only be located at multiple of 64 bit */
892 if (!VALID64(offset))
893 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
894 "Attempt to move to object at non-64bit boundary: %" PRIu64,
895 offset);
896
897 /* Object may not be located in the file header */
898 if (offset < le64toh(f->header->header_size))
899 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
900 "Attempt to move to object located in file header: %" PRIu64,
901 offset);
902
903 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
904 if (r < 0)
905 return r;
906
907 o = (Object*) t;
908 s = le64toh(o->object.size);
909
910 if (s == 0)
911 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
912 "Attempt to move to uninitialized object: %" PRIu64,
913 offset);
914 if (s < sizeof(ObjectHeader))
915 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
916 "Attempt to move to overly short object: %" PRIu64,
917 offset);
918
919 if (o->object.type <= OBJECT_UNUSED)
920 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
921 "Attempt to move to object with invalid type: %" PRIu64,
922 offset);
923
924 if (s < minimum_header_size(o))
925 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
926 "Attempt to move to truncated object: %" PRIu64,
927 offset);
928
929 if (type > OBJECT_UNUSED && o->object.type != type)
930 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
931 "Attempt to move to object of unexpected type: %" PRIu64,
932 offset);
933
934 if (s > tsize) {
935 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
936 if (r < 0)
937 return r;
938
939 o = (Object*) t;
940 }
941
942 r = journal_file_check_object(f, offset, o);
943 if (r < 0)
944 return r;
945
946 *ret = o;
947 return 0;
948 }
949
950 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
951 uint64_t r;
952
953 assert(f);
954 assert(f->header);
955
956 r = le64toh(f->header->tail_entry_seqnum) + 1;
957
958 if (seqnum) {
959 /* If an external seqnum counter was passed, we update
960 * both the local and the external one, and set it to
961 * the maximum of both */
962
963 if (*seqnum + 1 > r)
964 r = *seqnum + 1;
965
966 *seqnum = r;
967 }
968
969 f->header->tail_entry_seqnum = htole64(r);
970
971 if (f->header->head_entry_seqnum == 0)
972 f->header->head_entry_seqnum = htole64(r);
973
974 return r;
975 }
976
977 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
978 int r;
979 uint64_t p;
980 Object *tail, *o;
981 void *t;
982
983 assert(f);
984 assert(f->header);
985 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
986 assert(size >= sizeof(ObjectHeader));
987 assert(offset);
988 assert(ret);
989
990 r = journal_file_set_online(f);
991 if (r < 0)
992 return r;
993
994 p = le64toh(f->header->tail_object_offset);
995 if (p == 0)
996 p = le64toh(f->header->header_size);
997 else {
998 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
999 if (r < 0)
1000 return r;
1001
1002 p += ALIGN64(le64toh(tail->object.size));
1003 }
1004
1005 r = journal_file_allocate(f, p, size);
1006 if (r < 0)
1007 return r;
1008
1009 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
1010 if (r < 0)
1011 return r;
1012
1013 o = (Object*) t;
1014
1015 zero(o->object);
1016 o->object.type = type;
1017 o->object.size = htole64(size);
1018
1019 f->header->tail_object_offset = htole64(p);
1020 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1021
1022 *ret = o;
1023 *offset = p;
1024
1025 return 0;
1026 }
1027
1028 static int journal_file_setup_data_hash_table(JournalFile *f) {
1029 uint64_t s, p;
1030 Object *o;
1031 int r;
1032
1033 assert(f);
1034 assert(f->header);
1035
1036 /* We estimate that we need 1 hash table entry per 768 bytes
1037 of journal file and we want to make sure we never get
1038 beyond 75% fill level. Calculate the hash table size for
1039 the maximum file size based on these metrics. */
1040
1041 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
1042 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1043 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1044
1045 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
1046
1047 r = journal_file_append_object(f,
1048 OBJECT_DATA_HASH_TABLE,
1049 offsetof(Object, hash_table.items) + s,
1050 &o, &p);
1051 if (r < 0)
1052 return r;
1053
1054 memzero(o->hash_table.items, s);
1055
1056 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1057 f->header->data_hash_table_size = htole64(s);
1058
1059 return 0;
1060 }
1061
1062 static int journal_file_setup_field_hash_table(JournalFile *f) {
1063 uint64_t s, p;
1064 Object *o;
1065 int r;
1066
1067 assert(f);
1068 assert(f->header);
1069
1070 /* We use a fixed size hash table for the fields as this
1071 * number should grow very slowly only */
1072
1073 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1074 r = journal_file_append_object(f,
1075 OBJECT_FIELD_HASH_TABLE,
1076 offsetof(Object, hash_table.items) + s,
1077 &o, &p);
1078 if (r < 0)
1079 return r;
1080
1081 memzero(o->hash_table.items, s);
1082
1083 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1084 f->header->field_hash_table_size = htole64(s);
1085
1086 return 0;
1087 }
1088
1089 int journal_file_map_data_hash_table(JournalFile *f) {
1090 uint64_t s, p;
1091 void *t;
1092 int r;
1093
1094 assert(f);
1095 assert(f->header);
1096
1097 if (f->data_hash_table)
1098 return 0;
1099
1100 p = le64toh(f->header->data_hash_table_offset);
1101 s = le64toh(f->header->data_hash_table_size);
1102
1103 r = journal_file_move_to(f,
1104 OBJECT_DATA_HASH_TABLE,
1105 true,
1106 p, s,
1107 &t, NULL);
1108 if (r < 0)
1109 return r;
1110
1111 f->data_hash_table = t;
1112 return 0;
1113 }
1114
1115 int journal_file_map_field_hash_table(JournalFile *f) {
1116 uint64_t s, p;
1117 void *t;
1118 int r;
1119
1120 assert(f);
1121 assert(f->header);
1122
1123 if (f->field_hash_table)
1124 return 0;
1125
1126 p = le64toh(f->header->field_hash_table_offset);
1127 s = le64toh(f->header->field_hash_table_size);
1128
1129 r = journal_file_move_to(f,
1130 OBJECT_FIELD_HASH_TABLE,
1131 true,
1132 p, s,
1133 &t, NULL);
1134 if (r < 0)
1135 return r;
1136
1137 f->field_hash_table = t;
1138 return 0;
1139 }
1140
1141 static int journal_file_link_field(
1142 JournalFile *f,
1143 Object *o,
1144 uint64_t offset,
1145 uint64_t hash) {
1146
1147 uint64_t p, h, m;
1148 int r;
1149
1150 assert(f);
1151 assert(f->header);
1152 assert(f->field_hash_table);
1153 assert(o);
1154 assert(offset > 0);
1155
1156 if (o->object.type != OBJECT_FIELD)
1157 return -EINVAL;
1158
1159 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1160 if (m <= 0)
1161 return -EBADMSG;
1162
1163 /* This might alter the window we are looking at */
1164 o->field.next_hash_offset = o->field.head_data_offset = 0;
1165
1166 h = hash % m;
1167 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1168 if (p == 0)
1169 f->field_hash_table[h].head_hash_offset = htole64(offset);
1170 else {
1171 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1172 if (r < 0)
1173 return r;
1174
1175 o->field.next_hash_offset = htole64(offset);
1176 }
1177
1178 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1179
1180 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1181 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1182
1183 return 0;
1184 }
1185
1186 static int journal_file_link_data(
1187 JournalFile *f,
1188 Object *o,
1189 uint64_t offset,
1190 uint64_t hash) {
1191
1192 uint64_t p, h, m;
1193 int r;
1194
1195 assert(f);
1196 assert(f->header);
1197 assert(f->data_hash_table);
1198 assert(o);
1199 assert(offset > 0);
1200
1201 if (o->object.type != OBJECT_DATA)
1202 return -EINVAL;
1203
1204 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1205 if (m <= 0)
1206 return -EBADMSG;
1207
1208 /* This might alter the window we are looking at */
1209 o->data.next_hash_offset = o->data.next_field_offset = 0;
1210 o->data.entry_offset = o->data.entry_array_offset = 0;
1211 o->data.n_entries = 0;
1212
1213 h = hash % m;
1214 p = le64toh(f->data_hash_table[h].tail_hash_offset);
1215 if (p == 0)
1216 /* Only entry in the hash table is easy */
1217 f->data_hash_table[h].head_hash_offset = htole64(offset);
1218 else {
1219 /* Move back to the previous data object, to patch in
1220 * pointer */
1221
1222 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1223 if (r < 0)
1224 return r;
1225
1226 o->data.next_hash_offset = htole64(offset);
1227 }
1228
1229 f->data_hash_table[h].tail_hash_offset = htole64(offset);
1230
1231 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1232 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1233
1234 return 0;
1235 }
1236
1237 int journal_file_find_field_object_with_hash(
1238 JournalFile *f,
1239 const void *field, uint64_t size, uint64_t hash,
1240 Object **ret, uint64_t *offset) {
1241
1242 uint64_t p, osize, h, m;
1243 int r;
1244
1245 assert(f);
1246 assert(f->header);
1247 assert(field && size > 0);
1248
1249 /* If the field hash table is empty, we can't find anything */
1250 if (le64toh(f->header->field_hash_table_size) <= 0)
1251 return 0;
1252
1253 /* Map the field hash table, if it isn't mapped yet. */
1254 r = journal_file_map_field_hash_table(f);
1255 if (r < 0)
1256 return r;
1257
1258 osize = offsetof(Object, field.payload) + size;
1259
1260 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1261 if (m <= 0)
1262 return -EBADMSG;
1263
1264 h = hash % m;
1265 p = le64toh(f->field_hash_table[h].head_hash_offset);
1266
1267 while (p > 0) {
1268 Object *o;
1269
1270 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1271 if (r < 0)
1272 return r;
1273
1274 if (le64toh(o->field.hash) == hash &&
1275 le64toh(o->object.size) == osize &&
1276 memcmp(o->field.payload, field, size) == 0) {
1277
1278 if (ret)
1279 *ret = o;
1280 if (offset)
1281 *offset = p;
1282
1283 return 1;
1284 }
1285
1286 p = le64toh(o->field.next_hash_offset);
1287 }
1288
1289 return 0;
1290 }
1291
1292 int journal_file_find_field_object(
1293 JournalFile *f,
1294 const void *field, uint64_t size,
1295 Object **ret, uint64_t *offset) {
1296
1297 uint64_t hash;
1298
1299 assert(f);
1300 assert(field && size > 0);
1301
1302 hash = hash64(field, size);
1303
1304 return journal_file_find_field_object_with_hash(f,
1305 field, size, hash,
1306 ret, offset);
1307 }
1308
1309 int journal_file_find_data_object_with_hash(
1310 JournalFile *f,
1311 const void *data, uint64_t size, uint64_t hash,
1312 Object **ret, uint64_t *offset) {
1313
1314 uint64_t p, osize, h, m;
1315 int r;
1316
1317 assert(f);
1318 assert(f->header);
1319 assert(data || size == 0);
1320
1321 /* If there's no data hash table, then there's no entry. */
1322 if (le64toh(f->header->data_hash_table_size) <= 0)
1323 return 0;
1324
1325 /* Map the data hash table, if it isn't mapped yet. */
1326 r = journal_file_map_data_hash_table(f);
1327 if (r < 0)
1328 return r;
1329
1330 osize = offsetof(Object, data.payload) + size;
1331
1332 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1333 if (m <= 0)
1334 return -EBADMSG;
1335
1336 h = hash % m;
1337 p = le64toh(f->data_hash_table[h].head_hash_offset);
1338
1339 while (p > 0) {
1340 Object *o;
1341
1342 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1343 if (r < 0)
1344 return r;
1345
1346 if (le64toh(o->data.hash) != hash)
1347 goto next;
1348
1349 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
1350 #if HAVE_XZ || HAVE_LZ4
1351 uint64_t l;
1352 size_t rsize = 0;
1353
1354 l = le64toh(o->object.size);
1355 if (l <= offsetof(Object, data.payload))
1356 return -EBADMSG;
1357
1358 l -= offsetof(Object, data.payload);
1359
1360 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1361 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1362 if (r < 0)
1363 return r;
1364
1365 if (rsize == size &&
1366 memcmp(f->compress_buffer, data, size) == 0) {
1367
1368 if (ret)
1369 *ret = o;
1370
1371 if (offset)
1372 *offset = p;
1373
1374 return 1;
1375 }
1376 #else
1377 return -EPROTONOSUPPORT;
1378 #endif
1379 } else if (le64toh(o->object.size) == osize &&
1380 memcmp(o->data.payload, data, size) == 0) {
1381
1382 if (ret)
1383 *ret = o;
1384
1385 if (offset)
1386 *offset = p;
1387
1388 return 1;
1389 }
1390
1391 next:
1392 p = le64toh(o->data.next_hash_offset);
1393 }
1394
1395 return 0;
1396 }
1397
1398 int journal_file_find_data_object(
1399 JournalFile *f,
1400 const void *data, uint64_t size,
1401 Object **ret, uint64_t *offset) {
1402
1403 uint64_t hash;
1404
1405 assert(f);
1406 assert(data || size == 0);
1407
1408 hash = hash64(data, size);
1409
1410 return journal_file_find_data_object_with_hash(f,
1411 data, size, hash,
1412 ret, offset);
1413 }
1414
1415 static int journal_file_append_field(
1416 JournalFile *f,
1417 const void *field, uint64_t size,
1418 Object **ret, uint64_t *offset) {
1419
1420 uint64_t hash, p;
1421 uint64_t osize;
1422 Object *o;
1423 int r;
1424
1425 assert(f);
1426 assert(field && size > 0);
1427
1428 hash = hash64(field, size);
1429
1430 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1431 if (r < 0)
1432 return r;
1433 else if (r > 0) {
1434
1435 if (ret)
1436 *ret = o;
1437
1438 if (offset)
1439 *offset = p;
1440
1441 return 0;
1442 }
1443
1444 osize = offsetof(Object, field.payload) + size;
1445 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1446 if (r < 0)
1447 return r;
1448
1449 o->field.hash = htole64(hash);
1450 memcpy(o->field.payload, field, size);
1451
1452 r = journal_file_link_field(f, o, p, hash);
1453 if (r < 0)
1454 return r;
1455
1456 /* The linking might have altered the window, so let's
1457 * refresh our pointer */
1458 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1459 if (r < 0)
1460 return r;
1461
1462 #if HAVE_GCRYPT
1463 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1464 if (r < 0)
1465 return r;
1466 #endif
1467
1468 if (ret)
1469 *ret = o;
1470
1471 if (offset)
1472 *offset = p;
1473
1474 return 0;
1475 }
1476
1477 static int journal_file_append_data(
1478 JournalFile *f,
1479 const void *data, uint64_t size,
1480 Object **ret, uint64_t *offset) {
1481
1482 uint64_t hash, p;
1483 uint64_t osize;
1484 Object *o;
1485 int r, compression = 0;
1486 const void *eq;
1487
1488 assert(f);
1489 assert(data || size == 0);
1490
1491 hash = hash64(data, size);
1492
1493 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1494 if (r < 0)
1495 return r;
1496 if (r > 0) {
1497
1498 if (ret)
1499 *ret = o;
1500
1501 if (offset)
1502 *offset = p;
1503
1504 return 0;
1505 }
1506
1507 osize = offsetof(Object, data.payload) + size;
1508 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1509 if (r < 0)
1510 return r;
1511
1512 o->data.hash = htole64(hash);
1513
1514 #if HAVE_XZ || HAVE_LZ4
1515 if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
1516 size_t rsize = 0;
1517
1518 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1519
1520 if (compression >= 0) {
1521 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1522 o->object.flags |= compression;
1523
1524 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1525 size, rsize, object_compressed_to_string(compression));
1526 } else
1527 /* Compression didn't work, we don't really care why, let's continue without compression */
1528 compression = 0;
1529 }
1530 #endif
1531
1532 if (compression == 0)
1533 memcpy_safe(o->data.payload, data, size);
1534
1535 r = journal_file_link_data(f, o, p, hash);
1536 if (r < 0)
1537 return r;
1538
1539 #if HAVE_GCRYPT
1540 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1541 if (r < 0)
1542 return r;
1543 #endif
1544
1545 /* The linking might have altered the window, so let's
1546 * refresh our pointer */
1547 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1548 if (r < 0)
1549 return r;
1550
1551 if (!data)
1552 eq = NULL;
1553 else
1554 eq = memchr(data, '=', size);
1555 if (eq && eq > data) {
1556 Object *fo = NULL;
1557 uint64_t fp;
1558
1559 /* Create field object ... */
1560 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1561 if (r < 0)
1562 return r;
1563
1564 /* ... and link it in. */
1565 o->data.next_field_offset = fo->field.head_data_offset;
1566 fo->field.head_data_offset = le64toh(p);
1567 }
1568
1569 if (ret)
1570 *ret = o;
1571
1572 if (offset)
1573 *offset = p;
1574
1575 return 0;
1576 }
1577
1578 uint64_t journal_file_entry_n_items(Object *o) {
1579 assert(o);
1580
1581 if (o->object.type != OBJECT_ENTRY)
1582 return 0;
1583
1584 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1585 }
1586
1587 uint64_t journal_file_entry_array_n_items(Object *o) {
1588 assert(o);
1589
1590 if (o->object.type != OBJECT_ENTRY_ARRAY)
1591 return 0;
1592
1593 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1594 }
1595
1596 uint64_t journal_file_hash_table_n_items(Object *o) {
1597 assert(o);
1598
1599 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1600 return 0;
1601
1602 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1603 }
1604
1605 static int link_entry_into_array(JournalFile *f,
1606 le64_t *first,
1607 le64_t *idx,
1608 uint64_t p) {
1609 int r;
1610 uint64_t n = 0, ap = 0, q, i, a, hidx;
1611 Object *o;
1612
1613 assert(f);
1614 assert(f->header);
1615 assert(first);
1616 assert(idx);
1617 assert(p > 0);
1618
1619 a = le64toh(*first);
1620 i = hidx = le64toh(*idx);
1621 while (a > 0) {
1622
1623 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1624 if (r < 0)
1625 return r;
1626
1627 n = journal_file_entry_array_n_items(o);
1628 if (i < n) {
1629 o->entry_array.items[i] = htole64(p);
1630 *idx = htole64(hidx + 1);
1631 return 0;
1632 }
1633
1634 i -= n;
1635 ap = a;
1636 a = le64toh(o->entry_array.next_entry_array_offset);
1637 }
1638
1639 if (hidx > n)
1640 n = (hidx+1) * 2;
1641 else
1642 n = n * 2;
1643
1644 if (n < 4)
1645 n = 4;
1646
1647 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1648 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1649 &o, &q);
1650 if (r < 0)
1651 return r;
1652
1653 #if HAVE_GCRYPT
1654 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1655 if (r < 0)
1656 return r;
1657 #endif
1658
1659 o->entry_array.items[i] = htole64(p);
1660
1661 if (ap == 0)
1662 *first = htole64(q);
1663 else {
1664 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1665 if (r < 0)
1666 return r;
1667
1668 o->entry_array.next_entry_array_offset = htole64(q);
1669 }
1670
1671 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1672 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1673
1674 *idx = htole64(hidx + 1);
1675
1676 return 0;
1677 }
1678
1679 static int link_entry_into_array_plus_one(JournalFile *f,
1680 le64_t *extra,
1681 le64_t *first,
1682 le64_t *idx,
1683 uint64_t p) {
1684
1685 int r;
1686
1687 assert(f);
1688 assert(extra);
1689 assert(first);
1690 assert(idx);
1691 assert(p > 0);
1692
1693 if (*idx == 0)
1694 *extra = htole64(p);
1695 else {
1696 le64_t i;
1697
1698 i = htole64(le64toh(*idx) - 1);
1699 r = link_entry_into_array(f, first, &i, p);
1700 if (r < 0)
1701 return r;
1702 }
1703
1704 *idx = htole64(le64toh(*idx) + 1);
1705 return 0;
1706 }
1707
1708 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1709 uint64_t p;
1710 int r;
1711 assert(f);
1712 assert(o);
1713 assert(offset > 0);
1714
1715 p = le64toh(o->entry.items[i].object_offset);
1716 if (p == 0)
1717 return -EINVAL;
1718
1719 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1720 if (r < 0)
1721 return r;
1722
1723 return link_entry_into_array_plus_one(f,
1724 &o->data.entry_offset,
1725 &o->data.entry_array_offset,
1726 &o->data.n_entries,
1727 offset);
1728 }
1729
1730 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1731 uint64_t n, i;
1732 int r;
1733
1734 assert(f);
1735 assert(f->header);
1736 assert(o);
1737 assert(offset > 0);
1738
1739 if (o->object.type != OBJECT_ENTRY)
1740 return -EINVAL;
1741
1742 __sync_synchronize();
1743
1744 /* Link up the entry itself */
1745 r = link_entry_into_array(f,
1746 &f->header->entry_array_offset,
1747 &f->header->n_entries,
1748 offset);
1749 if (r < 0)
1750 return r;
1751
1752 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1753
1754 if (f->header->head_entry_realtime == 0)
1755 f->header->head_entry_realtime = o->entry.realtime;
1756
1757 f->header->tail_entry_realtime = o->entry.realtime;
1758 f->header->tail_entry_monotonic = o->entry.monotonic;
1759
1760 /* Link up the items */
1761 n = journal_file_entry_n_items(o);
1762 for (i = 0; i < n; i++) {
1763 r = journal_file_link_entry_item(f, o, offset, i);
1764 if (r < 0)
1765 return r;
1766 }
1767
1768 return 0;
1769 }
1770
1771 static int journal_file_append_entry_internal(
1772 JournalFile *f,
1773 const dual_timestamp *ts,
1774 const sd_id128_t *boot_id,
1775 uint64_t xor_hash,
1776 const EntryItem items[], unsigned n_items,
1777 uint64_t *seqnum,
1778 Object **ret, uint64_t *offset) {
1779 uint64_t np;
1780 uint64_t osize;
1781 Object *o;
1782 int r;
1783
1784 assert(f);
1785 assert(f->header);
1786 assert(items || n_items == 0);
1787 assert(ts);
1788
1789 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1790
1791 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1792 if (r < 0)
1793 return r;
1794
1795 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1796 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
1797 o->entry.realtime = htole64(ts->realtime);
1798 o->entry.monotonic = htole64(ts->monotonic);
1799 o->entry.xor_hash = htole64(xor_hash);
1800 if (boot_id)
1801 f->header->boot_id = *boot_id;
1802 o->entry.boot_id = f->header->boot_id;
1803
1804 #if HAVE_GCRYPT
1805 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1806 if (r < 0)
1807 return r;
1808 #endif
1809
1810 r = journal_file_link_entry(f, o, np);
1811 if (r < 0)
1812 return r;
1813
1814 if (ret)
1815 *ret = o;
1816
1817 if (offset)
1818 *offset = np;
1819
1820 return 0;
1821 }
1822
1823 void journal_file_post_change(JournalFile *f) {
1824 assert(f);
1825
1826 if (f->fd < 0)
1827 return;
1828
1829 /* inotify() does not receive IN_MODIFY events from file
1830 * accesses done via mmap(). After each access we hence
1831 * trigger IN_MODIFY by truncating the journal file to its
1832 * current size which triggers IN_MODIFY. */
1833
1834 __sync_synchronize();
1835
1836 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1837 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1838 }
1839
1840 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1841 assert(userdata);
1842
1843 journal_file_post_change(userdata);
1844
1845 return 1;
1846 }
1847
1848 static void schedule_post_change(JournalFile *f) {
1849 uint64_t now;
1850 int r;
1851
1852 assert(f);
1853 assert(f->post_change_timer);
1854
1855 r = sd_event_source_get_enabled(f->post_change_timer, NULL);
1856 if (r < 0) {
1857 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1858 goto fail;
1859 }
1860 if (r > 0)
1861 return;
1862
1863 r = sd_event_now(sd_event_source_get_event(f->post_change_timer), CLOCK_MONOTONIC, &now);
1864 if (r < 0) {
1865 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1866 goto fail;
1867 }
1868
1869 r = sd_event_source_set_time(f->post_change_timer, now + f->post_change_timer_period);
1870 if (r < 0) {
1871 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1872 goto fail;
1873 }
1874
1875 r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
1876 if (r < 0) {
1877 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1878 goto fail;
1879 }
1880
1881 return;
1882
1883 fail:
1884 /* On failure, let's simply post the change immediately. */
1885 journal_file_post_change(f);
1886 }
1887
1888 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1889 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1890 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1891 int r;
1892
1893 assert(f);
1894 assert_return(!f->post_change_timer, -EINVAL);
1895 assert(e);
1896 assert(t);
1897
1898 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1899 if (r < 0)
1900 return r;
1901
1902 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1903 if (r < 0)
1904 return r;
1905
1906 f->post_change_timer = TAKE_PTR(timer);
1907 f->post_change_timer_period = t;
1908
1909 return r;
1910 }
1911
1912 static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
1913 return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
1914 }
1915
1916 int journal_file_append_entry(
1917 JournalFile *f,
1918 const dual_timestamp *ts,
1919 const sd_id128_t *boot_id,
1920 const struct iovec iovec[], unsigned n_iovec,
1921 uint64_t *seqnum,
1922 Object **ret, uint64_t *offset) {
1923
1924 unsigned i;
1925 EntryItem *items;
1926 int r;
1927 uint64_t xor_hash = 0;
1928 struct dual_timestamp _ts;
1929
1930 assert(f);
1931 assert(f->header);
1932 assert(iovec || n_iovec == 0);
1933
1934 if (ts) {
1935 if (!VALID_REALTIME(ts->realtime))
1936 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1937 "Invalid realtime timestamp %" PRIu64 ", refusing entry.",
1938 ts->realtime);
1939 if (!VALID_MONOTONIC(ts->monotonic))
1940 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1941 "Invalid monotomic timestamp %" PRIu64 ", refusing entry.",
1942 ts->monotonic);
1943 } else {
1944 dual_timestamp_get(&_ts);
1945 ts = &_ts;
1946 }
1947
1948 #if HAVE_GCRYPT
1949 r = journal_file_maybe_append_tag(f, ts->realtime);
1950 if (r < 0)
1951 return r;
1952 #endif
1953
1954 /* alloca() can't take 0, hence let's allocate at least one */
1955 items = newa(EntryItem, MAX(1u, n_iovec));
1956
1957 for (i = 0; i < n_iovec; i++) {
1958 uint64_t p;
1959 Object *o;
1960
1961 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1962 if (r < 0)
1963 return r;
1964
1965 xor_hash ^= le64toh(o->data.hash);
1966 items[i].object_offset = htole64(p);
1967 items[i].hash = o->data.hash;
1968 }
1969
1970 /* Order by the position on disk, in order to improve seek
1971 * times for rotating media. */
1972 typesafe_qsort(items, n_iovec, entry_item_cmp);
1973
1974 r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, offset);
1975
1976 /* If the memory mapping triggered a SIGBUS then we return an
1977 * IO error and ignore the error code passed down to us, since
1978 * it is very likely just an effect of a nullified replacement
1979 * mapping page */
1980
1981 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
1982 r = -EIO;
1983
1984 if (f->post_change_timer)
1985 schedule_post_change(f);
1986 else
1987 journal_file_post_change(f);
1988
1989 return r;
1990 }
1991
/* One cached position within a chain of entry array objects. Items are kept in the journal file's
 * chain cache, keyed by 'first', and let lookups on that chain resume at a previously visited array
 * instead of walking the chain from its start. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1999
2000 static void chain_cache_put(
2001 OrderedHashmap *h,
2002 ChainCacheItem *ci,
2003 uint64_t first,
2004 uint64_t array,
2005 uint64_t begin,
2006 uint64_t total,
2007 uint64_t last_index) {
2008
2009 if (!ci) {
2010 /* If the chain item to cache for this chain is the
2011 * first one it's not worth caching anything */
2012 if (array == first)
2013 return;
2014
2015 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2016 ci = ordered_hashmap_steal_first(h);
2017 assert(ci);
2018 } else {
2019 ci = new(ChainCacheItem, 1);
2020 if (!ci)
2021 return;
2022 }
2023
2024 ci->first = first;
2025
2026 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2027 free(ci);
2028 return;
2029 }
2030 } else
2031 assert(ci->first == first);
2032
2033 ci->array = array;
2034 ci->begin = begin;
2035 ci->total = total;
2036 ci->last_index = last_index;
2037 }
2038
2039 static int generic_array_get(
2040 JournalFile *f,
2041 uint64_t first,
2042 uint64_t i,
2043 Object **ret, uint64_t *offset) {
2044
2045 Object *o;
2046 uint64_t p = 0, a, t = 0;
2047 int r;
2048 ChainCacheItem *ci;
2049
2050 assert(f);
2051
2052 a = first;
2053
2054 /* Try the chain cache first */
2055 ci = ordered_hashmap_get(f->chain_cache, &first);
2056 if (ci && i > ci->total) {
2057 a = ci->array;
2058 i -= ci->total;
2059 t = ci->total;
2060 }
2061
2062 while (a > 0) {
2063 uint64_t k;
2064
2065 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2066 if (r < 0)
2067 return r;
2068
2069 k = journal_file_entry_array_n_items(o);
2070 if (i < k) {
2071 p = le64toh(o->entry_array.items[i]);
2072 goto found;
2073 }
2074
2075 i -= k;
2076 t += k;
2077 a = le64toh(o->entry_array.next_entry_array_offset);
2078 }
2079
2080 return 0;
2081
2082 found:
2083 /* Let's cache this item for the next invocation */
2084 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
2085
2086 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2087 if (r < 0)
2088 return r;
2089
2090 if (ret)
2091 *ret = o;
2092
2093 if (offset)
2094 *offset = p;
2095
2096 return 1;
2097 }
2098
2099 static int generic_array_get_plus_one(
2100 JournalFile *f,
2101 uint64_t extra,
2102 uint64_t first,
2103 uint64_t i,
2104 Object **ret, uint64_t *offset) {
2105
2106 Object *o;
2107
2108 assert(f);
2109
2110 if (i == 0) {
2111 int r;
2112
2113 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2114 if (r < 0)
2115 return r;
2116
2117 if (ret)
2118 *ret = o;
2119
2120 if (offset)
2121 *offset = extra;
2122
2123 return 1;
2124 }
2125
2126 return generic_array_get(f, first, i-1, ret, offset);
2127 }
2128
/* Results of the test_object callbacks used when bisecting: how the tested object compares to the
 * needle being searched for. */
enum {
        TEST_FOUND, /* the tested object matches the needle */
        TEST_LEFT,  /* the tested object sorts before the needle */
        TEST_RIGHT  /* the tested object sorts after the needle */
};
2134
/* Bisects the first 'n' items of the chain of entry array objects starting at offset 'first',
 * looking for 'needle'. Each candidate item is classified by calling test_object() on the entry
 * offset it holds. An exact match (TEST_FOUND) is treated as TEST_RIGHT when 'direction' is
 * DIRECTION_DOWN and as TEST_LEFT otherwise.
 *
 * Returns 1 if an entry was located, storing the entry object in *ret, its file offset in *offset and
 * its index within the whole chain in *idx (each only if non-NULL); 0 if nothing suitable was found;
 * or a negative error. Items for which the test yields -EBADMSG (or which hold offset 0) cut the
 * searched range short instead of failing the whole lookup, except in the neighbor checks around the
 * cached last index, where offset 0 yields -EBADMSG directly. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *offset,
                uint64_t *idx) {

        /* a: offset of the array currently looked at; t: number of items in the arrays before it;
         * i: index within the current array; last_p: last item of the previous array;
         * last_index: cached index hint, (uint64_t) -1 if there is none. */
        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total && ci->begin != 0) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                /* Only the first n items of the array are in scope. */
                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Test the last in-scope item of this array first: it tells us whether the
                 * position we are looking for is within this array at all. */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        r = -EBADMSG;
                else
                        r = test_object(f, p, needle);
                if (r == -EBADMSG) {
                        /* Drop the bad item and everything after it from the search range, then
                         * look at the same array again. */
                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
                        n = i;
                        continue;
                }
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        /* The position is within this array: bisect the range [left, right]. */
                        left = 0;
                        right -= 1;

                        if (last_index != (uint64_t) -1) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        for (;;) {
                                if (left == right) {
                                        /* Range collapsed to a single position. When searching
                                         * upwards the result is the item right before it. */
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        r = -EBADMSG;
                                else
                                        r = test_object(f, p, needle);
                                if (r == -EBADMSG) {
                                        /* Cut the range (and the overall item count) short at
                                         * the bad item and keep bisecting what is left. */
                                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
                                        right = n = i;
                                        continue;
                                }
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* The last in-scope item of this array tested as left of the needle. If this array
                 * already covers all remaining items there is nothing further right to look at:
                 * when searching upwards the last item is the result, otherwise nothing matches. */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Remember the last item of this array, in case the result turns out to be the item
                 * right before the first one of the next array. */
                last_p = lp;

                n -= k;
                t += k;
                last_index = (uint64_t) -1;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* The item before the very first item of the chain does not exist. */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);

        /* Pick the resulting item; if we have to step back from the first item of this array, that
         * is the last item of the previous array, which we remembered in last_p. */
        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        if (idx)
                *idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
2351
2352 static int generic_array_bisect_plus_one(
2353 JournalFile *f,
2354 uint64_t extra,
2355 uint64_t first,
2356 uint64_t n,
2357 uint64_t needle,
2358 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2359 direction_t direction,
2360 Object **ret,
2361 uint64_t *offset,
2362 uint64_t *idx) {
2363
2364 int r;
2365 bool step_back = false;
2366 Object *o;
2367
2368 assert(f);
2369 assert(test_object);
2370
2371 if (n <= 0)
2372 return 0;
2373
2374 /* This bisects the array in object 'first', but first checks
2375 * an extra */
2376 r = test_object(f, extra, needle);
2377 if (r < 0)
2378 return r;
2379
2380 if (r == TEST_FOUND)
2381 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2382
2383 /* if we are looking with DIRECTION_UP then we need to first
2384 see if in the actual array there is a matching entry, and
2385 return the last one of that. But if there isn't any we need
2386 to return this one. Hence remember this, and return it
2387 below. */
2388 if (r == TEST_LEFT)
2389 step_back = direction == DIRECTION_UP;
2390
2391 if (r == TEST_RIGHT) {
2392 if (direction == DIRECTION_DOWN)
2393 goto found;
2394 else
2395 return 0;
2396 }
2397
2398 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2399
2400 if (r == 0 && step_back)
2401 goto found;
2402
2403 if (r > 0 && idx)
2404 (*idx)++;
2405
2406 return r;
2407
2408 found:
2409 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2410 if (r < 0)
2411 return r;
2412
2413 if (ret)
2414 *ret = o;
2415
2416 if (offset)
2417 *offset = extra;
2418
2419 if (idx)
2420 *idx = 0;
2421
2422 return 1;
2423 }
2424
2425 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2426 assert(f);
2427 assert(p > 0);
2428
2429 if (p == needle)
2430 return TEST_FOUND;
2431 else if (p < needle)
2432 return TEST_LEFT;
2433 else
2434 return TEST_RIGHT;
2435 }
2436
2437 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2438 Object *o;
2439 int r;
2440
2441 assert(f);
2442 assert(p > 0);
2443
2444 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2445 if (r < 0)
2446 return r;
2447
2448 if (le64toh(o->entry.seqnum) == needle)
2449 return TEST_FOUND;
2450 else if (le64toh(o->entry.seqnum) < needle)
2451 return TEST_LEFT;
2452 else
2453 return TEST_RIGHT;
2454 }
2455
2456 int journal_file_move_to_entry_by_seqnum(
2457 JournalFile *f,
2458 uint64_t seqnum,
2459 direction_t direction,
2460 Object **ret,
2461 uint64_t *offset) {
2462 assert(f);
2463 assert(f->header);
2464
2465 return generic_array_bisect(f,
2466 le64toh(f->header->entry_array_offset),
2467 le64toh(f->header->n_entries),
2468 seqnum,
2469 test_object_seqnum,
2470 direction,
2471 ret, offset, NULL);
2472 }
2473
2474 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2475 Object *o;
2476 int r;
2477
2478 assert(f);
2479 assert(p > 0);
2480
2481 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2482 if (r < 0)
2483 return r;
2484
2485 if (le64toh(o->entry.realtime) == needle)
2486 return TEST_FOUND;
2487 else if (le64toh(o->entry.realtime) < needle)
2488 return TEST_LEFT;
2489 else
2490 return TEST_RIGHT;
2491 }
2492
2493 int journal_file_move_to_entry_by_realtime(
2494 JournalFile *f,
2495 uint64_t realtime,
2496 direction_t direction,
2497 Object **ret,
2498 uint64_t *offset) {
2499 assert(f);
2500 assert(f->header);
2501
2502 return generic_array_bisect(f,
2503 le64toh(f->header->entry_array_offset),
2504 le64toh(f->header->n_entries),
2505 realtime,
2506 test_object_realtime,
2507 direction,
2508 ret, offset, NULL);
2509 }
2510
2511 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2512 Object *o;
2513 int r;
2514
2515 assert(f);
2516 assert(p > 0);
2517
2518 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2519 if (r < 0)
2520 return r;
2521
2522 if (le64toh(o->entry.monotonic) == needle)
2523 return TEST_FOUND;
2524 else if (le64toh(o->entry.monotonic) < needle)
2525 return TEST_LEFT;
2526 else
2527 return TEST_RIGHT;
2528 }
2529
2530 static int find_data_object_by_boot_id(
2531 JournalFile *f,
2532 sd_id128_t boot_id,
2533 Object **o,
2534 uint64_t *b) {
2535
2536 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2537
2538 sd_id128_to_string(boot_id, t + 9);
2539 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2540 }
2541
2542 int journal_file_move_to_entry_by_monotonic(
2543 JournalFile *f,
2544 sd_id128_t boot_id,
2545 uint64_t monotonic,
2546 direction_t direction,
2547 Object **ret,
2548 uint64_t *offset) {
2549
2550 Object *o;
2551 int r;
2552
2553 assert(f);
2554
2555 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2556 if (r < 0)
2557 return r;
2558 if (r == 0)
2559 return -ENOENT;
2560
2561 return generic_array_bisect_plus_one(f,
2562 le64toh(o->data.entry_offset),
2563 le64toh(o->data.entry_array_offset),
2564 le64toh(o->data.n_entries),
2565 monotonic,
2566 test_object_monotonic,
2567 direction,
2568 ret, offset, NULL);
2569 }
2570
2571 void journal_file_reset_location(JournalFile *f) {
2572 f->location_type = LOCATION_HEAD;
2573 f->current_offset = 0;
2574 f->current_seqnum = 0;
2575 f->current_realtime = 0;
2576 f->current_monotonic = 0;
2577 zero(f->current_boot_id);
2578 f->current_xor_hash = 0;
2579 }
2580
2581 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2582 f->location_type = LOCATION_SEEK;
2583 f->current_offset = offset;
2584 f->current_seqnum = le64toh(o->entry.seqnum);
2585 f->current_realtime = le64toh(o->entry.realtime);
2586 f->current_monotonic = le64toh(o->entry.monotonic);
2587 f->current_boot_id = o->entry.boot_id;
2588 f->current_xor_hash = le64toh(o->entry.xor_hash);
2589 }
2590
2591 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2592 int r;
2593
2594 assert(af);
2595 assert(af->header);
2596 assert(bf);
2597 assert(bf->header);
2598 assert(af->location_type == LOCATION_SEEK);
2599 assert(bf->location_type == LOCATION_SEEK);
2600
2601 /* If contents and timestamps match, these entries are
2602 * identical, even if the seqnum does not match */
2603 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2604 af->current_monotonic == bf->current_monotonic &&
2605 af->current_realtime == bf->current_realtime &&
2606 af->current_xor_hash == bf->current_xor_hash)
2607 return 0;
2608
2609 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2610
2611 /* If this is from the same seqnum source, compare
2612 * seqnums */
2613 r = CMP(af->current_seqnum, bf->current_seqnum);
2614 if (r != 0)
2615 return r;
2616
2617 /* Wow! This is weird, different data but the same
2618 * seqnums? Something is borked, but let's make the
2619 * best of it and compare by time. */
2620 }
2621
2622 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2623
2624 /* If the boot id matches, compare monotonic time */
2625 r = CMP(af->current_monotonic, bf->current_monotonic);
2626 if (r != 0)
2627 return r;
2628 }
2629
2630 /* Otherwise, compare UTC time */
2631 r = CMP(af->current_realtime, bf->current_realtime);
2632 if (r != 0)
2633 return r;
2634
2635 /* Finally, compare by contents */
2636 return CMP(af->current_xor_hash, bf->current_xor_hash);
2637 }
2638
2639 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2640
2641 /* Increase or decrease the specified index, in the right direction. */
2642
2643 if (direction == DIRECTION_DOWN) {
2644 if (*i >= n - 1)
2645 return 0;
2646
2647 (*i) ++;
2648 } else {
2649 if (*i <= 0)
2650 return 0;
2651
2652 (*i) --;
2653 }
2654
2655 return 1;
2656 }
2657
2658 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2659
2660 /* Consider it an error if any of the two offsets is uninitialized */
2661 if (old_offset == 0 || new_offset == 0)
2662 return false;
2663
2664 /* If we go down, the new offset must be larger than the old one. */
2665 return direction == DIRECTION_DOWN ?
2666 new_offset > old_offset :
2667 new_offset < old_offset;
2668 }
2669
2670 int journal_file_next_entry(
2671 JournalFile *f,
2672 uint64_t p,
2673 direction_t direction,
2674 Object **ret, uint64_t *offset) {
2675
2676 uint64_t i, n, ofs;
2677 int r;
2678
2679 assert(f);
2680 assert(f->header);
2681
2682 n = le64toh(f->header->n_entries);
2683 if (n <= 0)
2684 return 0;
2685
2686 if (p == 0)
2687 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2688 else {
2689 r = generic_array_bisect(f,
2690 le64toh(f->header->entry_array_offset),
2691 le64toh(f->header->n_entries),
2692 p,
2693 test_object_offset,
2694 DIRECTION_DOWN,
2695 NULL, NULL,
2696 &i);
2697 if (r <= 0)
2698 return r;
2699
2700 r = bump_array_index(&i, direction, n);
2701 if (r <= 0)
2702 return r;
2703 }
2704
2705 /* And jump to it */
2706 for (;;) {
2707 r = generic_array_get(f,
2708 le64toh(f->header->entry_array_offset),
2709 i,
2710 ret, &ofs);
2711 if (r > 0)
2712 break;
2713 if (r != -EBADMSG)
2714 return r;
2715
2716 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2717 * the next one might work for us instead. */
2718 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2719
2720 r = bump_array_index(&i, direction, n);
2721 if (r <= 0)
2722 return r;
2723 }
2724
2725 /* Ensure our array is properly ordered. */
2726 if (p > 0 && !check_properly_ordered(ofs, p, direction))
2727 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2728 "%s: entry array not properly ordered at entry %" PRIu64,
2729 f->path, i);
2730
2731 if (offset)
2732 *offset = ofs;
2733
2734 return 1;
2735 }
2736
2737 int journal_file_next_entry_for_data(
2738 JournalFile *f,
2739 Object *o, uint64_t p,
2740 uint64_t data_offset,
2741 direction_t direction,
2742 Object **ret, uint64_t *offset) {
2743
2744 uint64_t i, n, ofs;
2745 Object *d;
2746 int r;
2747
2748 assert(f);
2749 assert(p > 0 || !o);
2750
2751 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2752 if (r < 0)
2753 return r;
2754
2755 n = le64toh(d->data.n_entries);
2756 if (n <= 0)
2757 return n;
2758
2759 if (!o)
2760 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2761 else {
2762 if (o->object.type != OBJECT_ENTRY)
2763 return -EINVAL;
2764
2765 r = generic_array_bisect_plus_one(f,
2766 le64toh(d->data.entry_offset),
2767 le64toh(d->data.entry_array_offset),
2768 le64toh(d->data.n_entries),
2769 p,
2770 test_object_offset,
2771 DIRECTION_DOWN,
2772 NULL, NULL,
2773 &i);
2774
2775 if (r <= 0)
2776 return r;
2777
2778 r = bump_array_index(&i, direction, n);
2779 if (r <= 0)
2780 return r;
2781 }
2782
2783 for (;;) {
2784 r = generic_array_get_plus_one(f,
2785 le64toh(d->data.entry_offset),
2786 le64toh(d->data.entry_array_offset),
2787 i,
2788 ret, &ofs);
2789 if (r > 0)
2790 break;
2791 if (r != -EBADMSG)
2792 return r;
2793
2794 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2795
2796 r = bump_array_index(&i, direction, n);
2797 if (r <= 0)
2798 return r;
2799 }
2800
2801 /* Ensure our array is properly ordered. */
2802 if (p > 0 && check_properly_ordered(ofs, p, direction))
2803 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2804 "%s data entry array not properly ordered at entry %" PRIu64,
2805 f->path, i);
2806
2807 if (offset)
2808 *offset = ofs;
2809
2810 return 1;
2811 }
2812
2813 int journal_file_move_to_entry_by_offset_for_data(
2814 JournalFile *f,
2815 uint64_t data_offset,
2816 uint64_t p,
2817 direction_t direction,
2818 Object **ret, uint64_t *offset) {
2819
2820 int r;
2821 Object *d;
2822
2823 assert(f);
2824
2825 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2826 if (r < 0)
2827 return r;
2828
2829 return generic_array_bisect_plus_one(f,
2830 le64toh(d->data.entry_offset),
2831 le64toh(d->data.entry_array_offset),
2832 le64toh(d->data.n_entries),
2833 p,
2834 test_object_offset,
2835 direction,
2836 ret, offset, NULL);
2837 }
2838
2839 int journal_file_move_to_entry_by_monotonic_for_data(
2840 JournalFile *f,
2841 uint64_t data_offset,
2842 sd_id128_t boot_id,
2843 uint64_t monotonic,
2844 direction_t direction,
2845 Object **ret, uint64_t *offset) {
2846
2847 Object *o, *d;
2848 int r;
2849 uint64_t b, z;
2850
2851 assert(f);
2852
2853 /* First, seek by time */
2854 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2855 if (r < 0)
2856 return r;
2857 if (r == 0)
2858 return -ENOENT;
2859
2860 r = generic_array_bisect_plus_one(f,
2861 le64toh(o->data.entry_offset),
2862 le64toh(o->data.entry_array_offset),
2863 le64toh(o->data.n_entries),
2864 monotonic,
2865 test_object_monotonic,
2866 direction,
2867 NULL, &z, NULL);
2868 if (r <= 0)
2869 return r;
2870
2871 /* And now, continue seeking until we find an entry that
2872 * exists in both bisection arrays */
2873
2874 for (;;) {
2875 Object *qo;
2876 uint64_t p, q;
2877
2878 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2879 if (r < 0)
2880 return r;
2881
2882 r = generic_array_bisect_plus_one(f,
2883 le64toh(d->data.entry_offset),
2884 le64toh(d->data.entry_array_offset),
2885 le64toh(d->data.n_entries),
2886 z,
2887 test_object_offset,
2888 direction,
2889 NULL, &p, NULL);
2890 if (r <= 0)
2891 return r;
2892
2893 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2894 if (r < 0)
2895 return r;
2896
2897 r = generic_array_bisect_plus_one(f,
2898 le64toh(o->data.entry_offset),
2899 le64toh(o->data.entry_array_offset),
2900 le64toh(o->data.n_entries),
2901 p,
2902 test_object_offset,
2903 direction,
2904 &qo, &q, NULL);
2905
2906 if (r <= 0)
2907 return r;
2908
2909 if (p == q) {
2910 if (ret)
2911 *ret = qo;
2912 if (offset)
2913 *offset = q;
2914
2915 return 1;
2916 }
2917
2918 z = q;
2919 }
2920 }
2921
2922 int journal_file_move_to_entry_by_seqnum_for_data(
2923 JournalFile *f,
2924 uint64_t data_offset,
2925 uint64_t seqnum,
2926 direction_t direction,
2927 Object **ret, uint64_t *offset) {
2928
2929 Object *d;
2930 int r;
2931
2932 assert(f);
2933
2934 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2935 if (r < 0)
2936 return r;
2937
2938 return generic_array_bisect_plus_one(f,
2939 le64toh(d->data.entry_offset),
2940 le64toh(d->data.entry_array_offset),
2941 le64toh(d->data.n_entries),
2942 seqnum,
2943 test_object_seqnum,
2944 direction,
2945 ret, offset, NULL);
2946 }
2947
2948 int journal_file_move_to_entry_by_realtime_for_data(
2949 JournalFile *f,
2950 uint64_t data_offset,
2951 uint64_t realtime,
2952 direction_t direction,
2953 Object **ret, uint64_t *offset) {
2954
2955 Object *d;
2956 int r;
2957
2958 assert(f);
2959
2960 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2961 if (r < 0)
2962 return r;
2963
2964 return generic_array_bisect_plus_one(f,
2965 le64toh(d->data.entry_offset),
2966 le64toh(d->data.entry_array_offset),
2967 le64toh(d->data.n_entries),
2968 realtime,
2969 test_object_realtime,
2970 direction,
2971 ret, offset, NULL);
2972 }
2973
2974 void journal_file_dump(JournalFile *f) {
2975 Object *o;
2976 int r;
2977 uint64_t p;
2978
2979 assert(f);
2980 assert(f->header);
2981
2982 journal_file_print_header(f);
2983
2984 p = le64toh(f->header->header_size);
2985 while (p != 0) {
2986 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2987 if (r < 0)
2988 goto fail;
2989
2990 switch (o->object.type) {
2991
2992 case OBJECT_UNUSED:
2993 printf("Type: OBJECT_UNUSED\n");
2994 break;
2995
2996 case OBJECT_DATA:
2997 printf("Type: OBJECT_DATA\n");
2998 break;
2999
3000 case OBJECT_FIELD:
3001 printf("Type: OBJECT_FIELD\n");
3002 break;
3003
3004 case OBJECT_ENTRY:
3005 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3006 le64toh(o->entry.seqnum),
3007 le64toh(o->entry.monotonic),
3008 le64toh(o->entry.realtime));
3009 break;
3010
3011 case OBJECT_FIELD_HASH_TABLE:
3012 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3013 break;
3014
3015 case OBJECT_DATA_HASH_TABLE:
3016 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3017 break;
3018
3019 case OBJECT_ENTRY_ARRAY:
3020 printf("Type: OBJECT_ENTRY_ARRAY\n");
3021 break;
3022
3023 case OBJECT_TAG:
3024 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3025 le64toh(o->tag.seqnum),
3026 le64toh(o->tag.epoch));
3027 break;
3028
3029 default:
3030 printf("Type: unknown (%i)\n", o->object.type);
3031 break;
3032 }
3033
3034 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3035 printf("Flags: %s\n",
3036 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3037
3038 if (p == le64toh(f->header->tail_object_offset))
3039 p = 0;
3040 else
3041 p = p + ALIGN64(le64toh(o->object.size));
3042 }
3043
3044 return;
3045 fail:
3046 log_error("File corrupt");
3047 }
3048
3049 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3050 const char *x;
3051
3052 x = format_timestamp(buf, l, t);
3053 if (x)
3054 return x;
3055 return " --- ";
3056 }
3057
3058 void journal_file_print_header(JournalFile *f) {
3059 char a[33], b[33], c[33], d[33];
3060 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
3061 struct stat st;
3062 char bytes[FORMAT_BYTES_MAX];
3063
3064 assert(f);
3065 assert(f->header);
3066
3067 printf("File path: %s\n"
3068 "File ID: %s\n"
3069 "Machine ID: %s\n"
3070 "Boot ID: %s\n"
3071 "Sequential number ID: %s\n"
3072 "State: %s\n"
3073 "Compatible flags:%s%s\n"
3074 "Incompatible flags:%s%s%s\n"
3075 "Header size: %"PRIu64"\n"
3076 "Arena size: %"PRIu64"\n"
3077 "Data hash table size: %"PRIu64"\n"
3078 "Field hash table size: %"PRIu64"\n"
3079 "Rotate suggested: %s\n"
3080 "Head sequential number: %"PRIu64" (%"PRIx64")\n"
3081 "Tail sequential number: %"PRIu64" (%"PRIx64")\n"
3082 "Head realtime timestamp: %s (%"PRIx64")\n"
3083 "Tail realtime timestamp: %s (%"PRIx64")\n"
3084 "Tail monotonic timestamp: %s (%"PRIx64")\n"
3085 "Objects: %"PRIu64"\n"
3086 "Entry objects: %"PRIu64"\n",
3087 f->path,
3088 sd_id128_to_string(f->header->file_id, a),
3089 sd_id128_to_string(f->header->machine_id, b),
3090 sd_id128_to_string(f->header->boot_id, c),
3091 sd_id128_to_string(f->header->seqnum_id, d),
3092 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3093 f->header->state == STATE_ONLINE ? "ONLINE" :
3094 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3095 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3096 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3097 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3098 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3099 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3100 le64toh(f->header->header_size),
3101 le64toh(f->header->arena_size),
3102 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3103 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3104 yes_no(journal_file_rotate_suggested(f, 0)),
3105 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3106 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3107 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3108 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3109 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3110 le64toh(f->header->n_objects),
3111 le64toh(f->header->n_entries));
3112
3113 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3114 printf("Data objects: %"PRIu64"\n"
3115 "Data hash table fill: %.1f%%\n",
3116 le64toh(f->header->n_data),
3117 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3118
3119 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3120 printf("Field objects: %"PRIu64"\n"
3121 "Field hash table fill: %.1f%%\n",
3122 le64toh(f->header->n_fields),
3123 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3124
3125 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3126 printf("Tag objects: %"PRIu64"\n",
3127 le64toh(f->header->n_tags));
3128 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3129 printf("Entry array objects: %"PRIu64"\n",
3130 le64toh(f->header->n_entry_arrays));
3131
3132 if (fstat(f->fd, &st) >= 0)
3133 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
3134 }
3135
3136 static int journal_file_warn_btrfs(JournalFile *f) {
3137 unsigned attrs;
3138 int r;
3139
3140 assert(f);
3141
3142 /* Before we write anything, check if the COW logic is turned
3143 * off on btrfs. Given our write pattern that is quite
3144 * unfriendly to COW file systems this should greatly improve
3145 * performance on COW file systems, such as btrfs, at the
3146 * expense of data integrity features (which shouldn't be too
3147 * bad, given that we do our own checksumming). */
3148
3149 r = btrfs_is_filesystem(f->fd);
3150 if (r < 0)
3151 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3152 if (!r)
3153 return 0;
3154
3155 r = read_attr_fd(f->fd, &attrs);
3156 if (r < 0)
3157 return log_warning_errno(r, "Failed to read file attributes: %m");
3158
3159 if (attrs & FS_NOCOW_FL) {
3160 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3161 return 0;
3162 }
3163
3164 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3165 "This is likely to slow down journal access substantially, please consider turning "
3166 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3167
3168 return 1;
3169 }
3170
3171 int journal_file_open(
3172 int fd,
3173 const char *fname,
3174 int flags,
3175 mode_t mode,
3176 bool compress,
3177 uint64_t compress_threshold_bytes,
3178 bool seal,
3179 JournalMetrics *metrics,
3180 MMapCache *mmap_cache,
3181 Set *deferred_closes,
3182 JournalFile *template,
3183 JournalFile **ret) {
3184
3185 bool newly_created = false;
3186 JournalFile *f;
3187 void *h;
3188 int r;
3189
3190 assert(ret);
3191 assert(fd >= 0 || fname);
3192
3193 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
3194 return -EINVAL;
3195
3196 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3197 return -EINVAL;
3198
3199 f = new(JournalFile, 1);
3200 if (!f)
3201 return -ENOMEM;
3202
3203 *f = (JournalFile) {
3204 .fd = fd,
3205 .mode = mode,
3206
3207 .flags = flags,
3208 .prot = prot_from_flags(flags),
3209 .writable = (flags & O_ACCMODE) != O_RDONLY,
3210
3211 #if HAVE_LZ4
3212 .compress_lz4 = compress,
3213 #elif HAVE_XZ
3214 .compress_xz = compress,
3215 #endif
3216 .compress_threshold_bytes = compress_threshold_bytes == (uint64_t) -1 ?
3217 DEFAULT_COMPRESS_THRESHOLD :
3218 MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
3219 #if HAVE_GCRYPT
3220 .seal = seal,
3221 #endif
3222 };
3223
3224 if (DEBUG_LOGGING) {
3225 static int last_seal = -1, last_compress = -1;
3226 static uint64_t last_bytes = UINT64_MAX;
3227 char bytes[FORMAT_BYTES_MAX];
3228
3229 if (last_seal != f->seal ||
3230 last_compress != JOURNAL_FILE_COMPRESS(f) ||
3231 last_bytes != f->compress_threshold_bytes) {
3232
3233 log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3234 yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
3235 format_bytes(bytes, sizeof bytes, f->compress_threshold_bytes));
3236 last_seal = f->seal;
3237 last_compress = JOURNAL_FILE_COMPRESS(f);
3238 last_bytes = f->compress_threshold_bytes;
3239 }
3240 }
3241
3242 if (mmap_cache)
3243 f->mmap = mmap_cache_ref(mmap_cache);
3244 else {
3245 f->mmap = mmap_cache_new();
3246 if (!f->mmap) {
3247 r = -ENOMEM;
3248 goto fail;
3249 }
3250 }
3251
3252 if (fname) {
3253 f->path = strdup(fname);
3254 if (!f->path) {
3255 r = -ENOMEM;
3256 goto fail;
3257 }
3258 } else {
3259 assert(fd >= 0);
3260
3261 /* If we don't know the path, fill in something explanatory and vaguely useful */
3262 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3263 r = -ENOMEM;
3264 goto fail;
3265 }
3266 }
3267
3268 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3269 if (!f->chain_cache) {
3270 r = -ENOMEM;
3271 goto fail;
3272 }
3273
3274 if (f->fd < 0) {
3275 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3276 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3277 * it doesn't hurt in that case. */
3278
3279 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
3280 if (f->fd < 0) {
3281 r = -errno;
3282 goto fail;
3283 }
3284
3285 /* fds we opened here by us should also be closed by us. */
3286 f->close_fd = true;
3287
3288 r = fd_nonblock(f->fd, false);
3289 if (r < 0)
3290 goto fail;
3291 }
3292
3293 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3294 if (!f->cache_fd) {
3295 r = -ENOMEM;
3296 goto fail;
3297 }
3298
3299 r = journal_file_fstat(f);
3300 if (r < 0)
3301 goto fail;
3302
3303 if (f->last_stat.st_size == 0 && f->writable) {
3304
3305 (void) journal_file_warn_btrfs(f);
3306
3307 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3308 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3309 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3310 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3311 * solely on mtime/atime/ctime of the file. */
3312 (void) fd_setcrtime(f->fd, 0);
3313
3314 #if HAVE_GCRYPT
3315 /* Try to load the FSPRG state, and if we can't, then
3316 * just don't do sealing */
3317 if (f->seal) {
3318 r = journal_file_fss_load(f);
3319 if (r < 0)
3320 f->seal = false;
3321 }
3322 #endif
3323
3324 r = journal_file_init_header(f, template);
3325 if (r < 0)
3326 goto fail;
3327
3328 r = journal_file_fstat(f);
3329 if (r < 0)
3330 goto fail;
3331
3332 newly_created = true;
3333 }
3334
3335 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3336 r = -ENODATA;
3337 goto fail;
3338 }
3339
3340 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
3341 if (r == -EINVAL) {
3342 /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
3343 * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error
3344 * code. */
3345 r = -EAFNOSUPPORT;
3346 goto fail;
3347 }
3348 if (r < 0)
3349 goto fail;
3350
3351 f->header = h;
3352
3353 if (!newly_created) {
3354 set_clear_with_destructor(deferred_closes, journal_file_close);
3355
3356 r = journal_file_verify_header(f);
3357 if (r < 0)
3358 goto fail;
3359 }
3360
3361 #if HAVE_GCRYPT
3362 if (!newly_created && f->writable) {
3363 r = journal_file_fss_load(f);
3364 if (r < 0)
3365 goto fail;
3366 }
3367 #endif
3368
3369 if (f->writable) {
3370 if (metrics) {
3371 journal_default_metrics(metrics, f->fd);
3372 f->metrics = *metrics;
3373 } else if (template)
3374 f->metrics = template->metrics;
3375
3376 r = journal_file_refresh_header(f);
3377 if (r < 0)
3378 goto fail;
3379 }
3380
3381 #if HAVE_GCRYPT
3382 r = journal_file_hmac_setup(f);
3383 if (r < 0)
3384 goto fail;
3385 #endif
3386
3387 if (newly_created) {
3388 r = journal_file_setup_field_hash_table(f);
3389 if (r < 0)
3390 goto fail;
3391
3392 r = journal_file_setup_data_hash_table(f);
3393 if (r < 0)
3394 goto fail;
3395
3396 #if HAVE_GCRYPT
3397 r = journal_file_append_first_tag(f);
3398 if (r < 0)
3399 goto fail;
3400 #endif
3401 }
3402
3403 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
3404 r = -EIO;
3405 goto fail;
3406 }
3407
3408 if (template && template->post_change_timer) {
3409 r = journal_file_enable_post_change_timer(
3410 f,
3411 sd_event_source_get_event(template->post_change_timer),
3412 template->post_change_timer_period);
3413
3414 if (r < 0)
3415 goto fail;
3416 }
3417
3418 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3419 f->close_fd = true;
3420
3421 *ret = f;
3422 return 0;
3423
3424 fail:
3425 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
3426 r = -EIO;
3427
3428 (void) journal_file_close(f);
3429
3430 return r;
3431 }
3432
3433 int journal_file_archive(JournalFile *f) {
3434 _cleanup_free_ char *p = NULL;
3435
3436 assert(f);
3437
3438 if (!f->writable)
3439 return -EINVAL;
3440
3441 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3442 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3443 if (path_startswith(f->path, "/proc/self/fd"))
3444 return -EINVAL;
3445
3446 if (!endswith(f->path, ".journal"))
3447 return -EINVAL;
3448
3449 if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3450 (int) strlen(f->path) - 8, f->path,
3451 SD_ID128_FORMAT_VAL(f->header->seqnum_id),
3452 le64toh(f->header->head_entry_seqnum),
3453 le64toh(f->header->head_entry_realtime)) < 0)
3454 return -ENOMEM;
3455
3456 /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
3457 * ignore that case. */
3458 if (rename(f->path, p) < 0 && errno != ENOENT)
3459 return -errno;
3460
3461 /* Sync the rename to disk */
3462 (void) fsync_directory_of_file(f->fd);
3463
3464 /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
3465 * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
3466 * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue
3467 * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
3468 * occurs. */
3469 f->archive = true;
3470
3471 /* Currently, btrfs is not very good with out write patterns and fragments heavily. Let's defrag our journal
3472 * files when we archive them */
3473 f->defrag_on_close = true;
3474
3475 return 0;
3476 }
3477
3478 JournalFile* journal_initiate_close(
3479 JournalFile *f,
3480 Set *deferred_closes) {
3481
3482 int r;
3483
3484 assert(f);
3485
3486 if (deferred_closes) {
3487
3488 r = set_put(deferred_closes, f);
3489 if (r < 0)
3490 log_debug_errno(r, "Failed to add file to deferred close set, closing immediately.");
3491 else {
3492 (void) journal_file_set_offline(f, false);
3493 return NULL;
3494 }
3495 }
3496
3497 return journal_file_close(f);
3498 }
3499
3500 int journal_file_rotate(
3501 JournalFile **f,
3502 bool compress,
3503 uint64_t compress_threshold_bytes,
3504 bool seal,
3505 Set *deferred_closes) {
3506
3507 JournalFile *new_file = NULL;
3508 int r;
3509
3510 assert(f);
3511 assert(*f);
3512
3513 r = journal_file_archive(*f);
3514 if (r < 0)
3515 return r;
3516
3517 r = journal_file_open(
3518 -1,
3519 (*f)->path,
3520 (*f)->flags,
3521 (*f)->mode,
3522 compress,
3523 compress_threshold_bytes,
3524 seal,
3525 NULL, /* metrics */
3526 (*f)->mmap,
3527 deferred_closes,
3528 *f, /* template */
3529 &new_file);
3530
3531 journal_initiate_close(*f, deferred_closes);
3532 *f = new_file;
3533
3534 return r;
3535 }
3536
3537 int journal_file_dispose(int dir_fd, const char *fname) {
3538 _cleanup_free_ char *p = NULL;
3539 _cleanup_close_ int fd = -1;
3540
3541 assert(fname);
3542
3543 /* Renames a journal file to *.journal~, i.e. to mark it as corruped or otherwise uncleanly shutdown. Note that
3544 * this is done without looking into the file or changing any of its contents. The idea is that this is called
3545 * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
3546 * for writing anymore. */
3547
3548 if (!endswith(fname, ".journal"))
3549 return -EINVAL;
3550
3551 if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
3552 (int) strlen(fname) - 8, fname,
3553 now(CLOCK_REALTIME),
3554 random_u64()) < 0)
3555 return -ENOMEM;
3556
3557 if (renameat(dir_fd, fname, dir_fd, p) < 0)
3558 return -errno;
3559
3560 /* btrfs doesn't cope well with our write pattern and fragments heavily. Let's defrag all files we rotate */
3561 fd = openat(dir_fd, p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
3562 if (fd < 0)
3563 log_debug_errno(errno, "Failed to open file for defragmentation/FS_NOCOW_FL, ignoring: %m");
3564 else {
3565 (void) chattr_fd(fd, 0, FS_NOCOW_FL, NULL);
3566 (void) btrfs_defrag_fd(fd);
3567 }
3568
3569 return 0;
3570 }
3571
3572 int journal_file_open_reliably(
3573 const char *fname,
3574 int flags,
3575 mode_t mode,
3576 bool compress,
3577 uint64_t compress_threshold_bytes,
3578 bool seal,
3579 JournalMetrics *metrics,
3580 MMapCache *mmap_cache,
3581 Set *deferred_closes,
3582 JournalFile *template,
3583 JournalFile **ret) {
3584
3585 int r;
3586
3587 r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3588 deferred_closes, template, ret);
3589 if (!IN_SET(r,
3590 -EBADMSG, /* Corrupted */
3591 -ENODATA, /* Truncated */
3592 -EHOSTDOWN, /* Other machine */
3593 -EPROTONOSUPPORT, /* Incompatible feature */
3594 -EBUSY, /* Unclean shutdown */
3595 -ESHUTDOWN, /* Already archived */
3596 -EIO, /* IO error, including SIGBUS on mmap */
3597 -EIDRM, /* File has been deleted */
3598 -ETXTBSY)) /* File is from the future */
3599 return r;
3600
3601 if ((flags & O_ACCMODE) == O_RDONLY)
3602 return r;
3603
3604 if (!(flags & O_CREAT))
3605 return r;
3606
3607 if (!endswith(fname, ".journal"))
3608 return r;
3609
3610 /* The file is corrupted. Rotate it away and try it again (but only once) */
3611 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3612
3613 r = journal_file_dispose(AT_FDCWD, fname);
3614 if (r < 0)
3615 return r;
3616
3617 return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3618 deferred_closes, template, ret);
3619 }
3620
3621 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
3622 uint64_t i, n;
3623 uint64_t q, xor_hash = 0;
3624 int r;
3625 EntryItem *items;
3626 dual_timestamp ts;
3627 const sd_id128_t *boot_id;
3628
3629 assert(from);
3630 assert(to);
3631 assert(o);
3632 assert(p);
3633
3634 if (!to->writable)
3635 return -EPERM;
3636
3637 ts.monotonic = le64toh(o->entry.monotonic);
3638 ts.realtime = le64toh(o->entry.realtime);
3639 boot_id = &o->entry.boot_id;
3640
3641 n = journal_file_entry_n_items(o);
3642 /* alloca() can't take 0, hence let's allocate at least one */
3643 items = newa(EntryItem, MAX(1u, n));
3644
3645 for (i = 0; i < n; i++) {
3646 uint64_t l, h;
3647 le64_t le_hash;
3648 size_t t;
3649 void *data;
3650 Object *u;
3651
3652 q = le64toh(o->entry.items[i].object_offset);
3653 le_hash = o->entry.items[i].hash;
3654
3655 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3656 if (r < 0)
3657 return r;
3658
3659 if (le_hash != o->data.hash)
3660 return -EBADMSG;
3661
3662 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3663 t = (size_t) l;
3664
3665 /* We hit the limit on 32bit machines */
3666 if ((uint64_t) t != l)
3667 return -E2BIG;
3668
3669 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3670 #if HAVE_XZ || HAVE_LZ4
3671 size_t rsize = 0;
3672
3673 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3674 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3675 if (r < 0)
3676 return r;
3677
3678 data = from->compress_buffer;
3679 l = rsize;
3680 #else
3681 return -EPROTONOSUPPORT;
3682 #endif
3683 } else
3684 data = o->data.payload;
3685
3686 r = journal_file_append_data(to, data, l, &u, &h);
3687 if (r < 0)
3688 return r;
3689
3690 xor_hash ^= le64toh(u->data.hash);
3691 items[i].object_offset = htole64(h);
3692 items[i].hash = u->data.hash;
3693
3694 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3695 if (r < 0)
3696 return r;
3697 }
3698
3699 r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n,
3700 NULL, NULL, NULL);
3701
3702 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3703 return -EIO;
3704
3705 return r;
3706 }
3707
3708 void journal_reset_metrics(JournalMetrics *m) {
3709 assert(m);
3710
3711 /* Set everything to "pick automatic values". */
3712
3713 *m = (JournalMetrics) {
3714 .min_use = (uint64_t) -1,
3715 .max_use = (uint64_t) -1,
3716 .min_size = (uint64_t) -1,
3717 .max_size = (uint64_t) -1,
3718 .keep_free = (uint64_t) -1,
3719 .n_max_files = (uint64_t) -1,
3720 };
3721 }
3722
3723 void journal_default_metrics(JournalMetrics *m, int fd) {
3724 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3725 struct statvfs ss;
3726 uint64_t fs_size = 0;
3727
3728 assert(m);
3729 assert(fd >= 0);
3730
3731 if (fstatvfs(fd, &ss) >= 0)
3732 fs_size = ss.f_frsize * ss.f_blocks;
3733 else
3734 log_debug_errno(errno, "Failed to determine disk size: %m");
3735
3736 if (m->max_use == (uint64_t) -1) {
3737
3738 if (fs_size > 0)
3739 m->max_use = CLAMP(PAGE_ALIGN(fs_size / 10), /* 10% of file system size */
3740 MAX_USE_LOWER, MAX_USE_UPPER);
3741 else
3742 m->max_use = MAX_USE_LOWER;
3743 } else {
3744 m->max_use = PAGE_ALIGN(m->max_use);
3745
3746 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3747 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3748 }
3749
3750 if (m->min_use == (uint64_t) -1) {
3751 if (fs_size > 0)
3752 m->min_use = CLAMP(PAGE_ALIGN(fs_size / 50), /* 2% of file system size */
3753 MIN_USE_LOW, MIN_USE_HIGH);
3754 else
3755 m->min_use = MIN_USE_LOW;
3756 }
3757
3758 if (m->min_use > m->max_use)
3759 m->min_use = m->max_use;
3760
3761 if (m->max_size == (uint64_t) -1)
3762 m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */
3763 MAX_SIZE_UPPER);
3764 else
3765 m->max_size = PAGE_ALIGN(m->max_size);
3766
3767 if (m->max_size != 0) {
3768 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3769 m->max_size = JOURNAL_FILE_SIZE_MIN;
3770
3771 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3772 m->max_use = m->max_size*2;
3773 }
3774
3775 if (m->min_size == (uint64_t) -1)
3776 m->min_size = JOURNAL_FILE_SIZE_MIN;
3777 else
3778 m->min_size = CLAMP(PAGE_ALIGN(m->min_size),
3779 JOURNAL_FILE_SIZE_MIN,
3780 m->max_size ?: UINT64_MAX);
3781
3782 if (m->keep_free == (uint64_t) -1) {
3783 if (fs_size > 0)
3784 m->keep_free = MIN(PAGE_ALIGN(fs_size / 20), /* 5% of file system size */
3785 KEEP_FREE_UPPER);
3786 else
3787 m->keep_free = DEFAULT_KEEP_FREE;
3788 }
3789
3790 if (m->n_max_files == (uint64_t) -1)
3791 m->n_max_files = DEFAULT_N_MAX_FILES;
3792
3793 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3794 format_bytes(a, sizeof(a), m->min_use),
3795 format_bytes(b, sizeof(b), m->max_use),
3796 format_bytes(c, sizeof(c), m->max_size),
3797 format_bytes(d, sizeof(d), m->min_size),
3798 format_bytes(e, sizeof(e), m->keep_free),
3799 m->n_max_files);
3800 }
3801
3802 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3803 assert(f);
3804 assert(f->header);
3805 assert(from || to);
3806
3807 if (from) {
3808 if (f->header->head_entry_realtime == 0)
3809 return -ENOENT;
3810
3811 *from = le64toh(f->header->head_entry_realtime);
3812 }
3813
3814 if (to) {
3815 if (f->header->tail_entry_realtime == 0)
3816 return -ENOENT;
3817
3818 *to = le64toh(f->header->tail_entry_realtime);
3819 }
3820
3821 return 1;
3822 }
3823
3824 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3825 Object *o;
3826 uint64_t p;
3827 int r;
3828
3829 assert(f);
3830 assert(from || to);
3831
3832 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3833 if (r <= 0)
3834 return r;
3835
3836 if (le64toh(o->data.n_entries) <= 0)
3837 return 0;
3838
3839 if (from) {
3840 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3841 if (r < 0)
3842 return r;
3843
3844 *from = le64toh(o->entry.monotonic);
3845 }
3846
3847 if (to) {
3848 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3849 if (r < 0)
3850 return r;
3851
3852 r = generic_array_get_plus_one(f,
3853 le64toh(o->data.entry_offset),
3854 le64toh(o->data.entry_array_offset),
3855 le64toh(o->data.n_entries)-1,
3856 &o, NULL);
3857 if (r <= 0)
3858 return r;
3859
3860 *to = le64toh(o->entry.monotonic);
3861 }
3862
3863 return 1;
3864 }
3865
3866 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3867 assert(f);
3868 assert(f->header);
3869
3870 /* If we gained new header fields we gained new features,
3871 * hence suggest a rotation */
3872 if (le64toh(f->header->header_size) < sizeof(Header)) {
3873 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3874 return true;
3875 }
3876
3877 /* Let's check if the hash tables grew over a certain fill
3878 * level (75%, borrowing this value from Java's hash table
3879 * implementation), and if so suggest a rotation. To calculate
3880 * the fill level we need the n_data field, which only exists
3881 * in newer versions. */
3882
3883 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3884 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3885 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3886 f->path,
3887 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3888 le64toh(f->header->n_data),
3889 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3890 (unsigned long long) f->last_stat.st_size,
3891 f->last_stat.st_size / le64toh(f->header->n_data));
3892 return true;
3893 }
3894
3895 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3896 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3897 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3898 f->path,
3899 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3900 le64toh(f->header->n_fields),
3901 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3902 return true;
3903 }
3904
3905 /* Are the data objects properly indexed by field objects? */
3906 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3907 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3908 le64toh(f->header->n_data) > 0 &&
3909 le64toh(f->header->n_fields) == 0)
3910 return true;
3911
3912 if (max_file_usec > 0) {
3913 usec_t t, h;
3914
3915 h = le64toh(f->header->head_entry_realtime);
3916 t = now(CLOCK_REALTIME);
3917
3918 if (h > 0 && t > h + max_file_usec)
3919 return true;
3920 }
3921
3922 return false;
3923 }