/* Source: systemd, src/journal/journal-file.c
 * (mirrored via git.ipfire.org gitweb; tree state at commit
 * "util: split out sorting related calls to new sort-util.[ch]") */
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/fs.h>
6 #include <pthread.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/statvfs.h>
10 #include <sys/uio.h>
11 #include <unistd.h>
12
13 #include "sd-event.h"
14
15 #include "alloc-util.h"
16 #include "btrfs-util.h"
17 #include "chattr-util.h"
18 #include "compress.h"
19 #include "fd-util.h"
20 #include "fs-util.h"
21 #include "journal-authenticate.h"
22 #include "journal-def.h"
23 #include "journal-file.h"
24 #include "lookup3.h"
25 #include "memory-util.h"
26 #include "parse-util.h"
27 #include "path-util.h"
28 #include "random-util.h"
29 #include "set.h"
30 #include "sort-util.h"
31 #include "stat-util.h"
32 #include "string-util.h"
33 #include "strv.h"
34 #include "xattr-util.h"
35
/* Default hash table sizes for data objects and field objects, respectively. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Payloads larger than the default threshold are compressed; the threshold may be
 * configured, but never below the minimum. */
#define DEFAULT_COMPRESS_THRESHOLD (512ULL)
#define MIN_COMPRESS_THRESHOLD (8ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header we pick as one above the last defined type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX

#ifdef __clang__
/* The on-disk header structures are packed; clang warns about taking the address
 * of their members, which we do all over this file. */
#  pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
85
/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
 * As a result we use atomic operations on f->offline_state for inter-thread communications with
 * journal_file_set_offline() and journal_file_set_online(). */
static void journal_file_set_offline_internal(JournalFile *f) {
        assert(f);
        assert(f->fd >= 0);
        assert(f->header);

        /* Drive the offline state machine until we reach OFFLINE_DONE (or get
         * cancelled). Every transition is a CAS; a failed CAS means another
         * thread changed the state underneath us, so re-read and retry. */
        for (;;) {
                switch (f->offline_state) {
                case OFFLINE_CANCEL:
                        /* journal_file_set_online() cancelled us before we synced;
                         * finish without touching the header state. */
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
                                continue;
                        return;

                case OFFLINE_AGAIN_FROM_SYNCING:
                        /* A restart was requested while we were syncing: start the sync over. */
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
                                continue;
                        break;

                case OFFLINE_AGAIN_FROM_OFFLINING:
                        /* A restart was requested while we were offlining: go back to syncing. */
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
                                continue;
                        break;

                case OFFLINE_SYNCING:
                        /* Flush the data first, then update the header state and flush
                         * again, so the file only claims to be offline/archived once
                         * its contents actually reached disk. */
                        (void) fsync(f->fd);

                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
                                continue;

                        f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
                        (void) fsync(f->fd);
                        break;

                case OFFLINE_OFFLINING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
                                continue;
                        _fallthrough_;
                case OFFLINE_DONE:
                        return;

                case OFFLINE_JOINED:
                        /* Shouldn't happen: JOINED means no offline thread is active. */
                        log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
                        return;
                }
        }
}
134
135 static void * journal_file_set_offline_thread(void *arg) {
136 JournalFile *f = arg;
137
138 (void) pthread_setname_np(pthread_self(), "journal-offline");
139
140 journal_file_set_offline_internal(f);
141
142 return NULL;
143 }
144
145 static int journal_file_set_offline_thread_join(JournalFile *f) {
146 int r;
147
148 assert(f);
149
150 if (f->offline_state == OFFLINE_JOINED)
151 return 0;
152
153 r = pthread_join(f->offline_thread, NULL);
154 if (r)
155 return -r;
156
157 f->offline_state = OFFLINE_JOINED;
158
159 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
160 return -EIO;
161
162 return 0;
163 }
164
/* Trigger a restart if the offline thread is mid-flight in a restartable state.
 * Returns true if an in-flight thread will (re)run the offline sequence, false
 * if no thread is in a restartable state. CAS failures mean the state changed
 * concurrently, so we loop and re-examine. */
static bool journal_file_set_offline_try_restart(JournalFile *f) {
        for (;;) {
                switch (f->offline_state) {
                case OFFLINE_AGAIN_FROM_SYNCING:
                case OFFLINE_AGAIN_FROM_OFFLINING:
                        /* A restart is already queued up, nothing more to do. */
                        return true;

                case OFFLINE_CANCEL:
                        /* Turn a pending cancellation back into a restart. */
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
                                continue;
                        return true;

                case OFFLINE_SYNCING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
                                continue;
                        return true;

                case OFFLINE_OFFLINING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
                                continue;
                        return true;

                default:
                        /* OFFLINE_DONE/OFFLINE_JOINED: nothing running that could restart. */
                        return false;
                }
        }
}
193
/* Sets a journal offline.
 *
 * If wait is false then an offline is dispatched in a separate thread for a
 * subsequent journal_file_set_offline() or journal_file_set_online() of the
 * same journal to synchronize with.
 *
 * If wait is true, then either an existing offline thread will be restarted
 * and joined, or if none exists the offline is simply performed in this
 * context without involving another thread.
 *
 * Returns 0 on success, negative errno on failure (-EPERM if not writable,
 * -EINVAL if not open). */
int journal_file_set_offline(JournalFile *f, bool wait) {
        bool restarted;
        int r;

        assert(f);

        if (!f->writable)
                return -EPERM;

        if (f->fd < 0 || !f->header)
                return -EINVAL;

        /* An offlining journal is implicitly online and may modify f->header->state,
         * we must also join any potentially lingering offline thread when not online. */
        if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
                return journal_file_set_offline_thread_join(f);

        /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
        restarted = journal_file_set_offline_try_restart(f);
        if ((restarted && wait) || !restarted) {
                r = journal_file_set_offline_thread_join(f);
                if (r < 0)
                        return r;
        }

        if (restarted)
                return 0;

        /* Initiate a new offline. */
        f->offline_state = OFFLINE_SYNCING;

        if (wait) /* Without using a thread if waiting. */
                journal_file_set_offline_internal(f);
        else {
                sigset_t ss, saved_ss;
                int k;

                /* Block all signals around pthread_create() so the offline thread
                 * inherits a full signal mask and never handles process signals. */
                assert_se(sigfillset(&ss) >= 0);

                /* Note: pthread_sigmask()/pthread_create() return positive
                 * errno-style codes, hence the "r > 0" checks below. */
                r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
                if (r > 0)
                        return -r;

                r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);

                /* Restore the original mask before acting on a create failure. */
                k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
                if (r > 0) {
                        /* Thread was never started, so nothing to join later. */
                        f->offline_state = OFFLINE_JOINED;
                        return -r;
                }
                if (k > 0)
                        return -k;
        }

        return 0;
}
260
/* Sets a journal online: cancels or waits out any offline thread, then flips
 * the header state from OFFLINE to ONLINE on disk. Returns 0 on success,
 * -EPERM if not writable, -EINVAL if not open or in a state (e.g. archived)
 * that cannot go online, -EIO after a SIGBUS on the mapping. */
static int journal_file_set_online(JournalFile *f) {
        bool wait = true;

        assert(f);

        if (!f->writable)
                return -EPERM;

        if (f->fd < 0 || !f->header)
                return -EINVAL;

        /* First get the offline thread out of the way so nobody races with us
         * on f->header->state below. */
        while (wait) {
                switch (f->offline_state) {
                case OFFLINE_JOINED:
                        /* No offline thread, no need to wait. */
                        wait = false;
                        break;

                case OFFLINE_SYNCING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
                                continue;
                        /* Canceled syncing prior to offlining, no need to wait. */
                        wait = false;
                        break;

                case OFFLINE_AGAIN_FROM_SYNCING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
                                continue;
                        /* Canceled restart from syncing, no need to wait. */
                        wait = false;
                        break;

                case OFFLINE_AGAIN_FROM_OFFLINING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
                                continue;
                        /* Canceled restart from offlining, must wait for offlining to complete however. */
                        _fallthrough_;
                default: {
                        int r;

                        /* OFFLINE_OFFLINING/OFFLINE_DONE (or just-cancelled): join the thread. */
                        r = journal_file_set_offline_thread_join(f);
                        if (r < 0)
                                return r;

                        wait = false;
                        break;
                }
                }
        }

        if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
                return -EIO;

        /* Now flip the on-disk state to online, if it isn't already. */
        switch (f->header->state) {
        case STATE_ONLINE:
                return 0;

        case STATE_OFFLINE:
                f->header->state = STATE_ONLINE;
                (void) fsync(f->fd);
                return 0;

        default:
                /* STATE_ARCHIVED or unknown: refuse to bring online. */
                return -EINVAL;
        }
}
327
328 bool journal_file_is_offlining(JournalFile *f) {
329 assert(f);
330
331 __sync_synchronize();
332
333 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
334 return false;
335
336 return true;
337 }
338
/* Offlines the file, flushes pending work, releases all associated resources
 * and frees the JournalFile. Always returns NULL, for convenient assignment
 * back into the caller's pointer. */
JournalFile* journal_file_close(JournalFile *f) {
        assert(f);

#if HAVE_GCRYPT
        /* Write the final tag */
        if (f->seal && f->writable) {
                int r;

                r = journal_file_append_tag(f);
                if (r < 0)
                        log_error_errno(r, "Failed to append tag when closing journal: %m");
        }
#endif

        /* If a deferred post-change event is still pending, run it now, then
         * disable and release the timer source. */
        if (f->post_change_timer) {
                if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 0)
                        journal_file_post_change(f);

                (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
                sd_event_source_unref(f->post_change_timer);
        }

        /* Mark the file offline on disk, synchronously (joins any offline thread). */
        journal_file_set_offline(f, true);

        if (f->mmap && f->cache_fd)
                mmap_cache_free_fd(f->mmap, f->cache_fd);

        if (f->fd >= 0 && f->defrag_on_close) {

                /* Be friendly to btrfs: turn COW back on again now,
                 * and defragment the file. We won't write to the file
                 * ever again, hence remove all fragmentation, and
                 * reenable all the good bits COW usually provides
                 * (such as data checksumming). */

                (void) chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL);
                (void) btrfs_defrag_fd(f->fd);
        }

        if (f->close_fd)
                safe_close(f->fd);
        free(f->path);

        mmap_cache_unref(f->mmap);

        ordered_hashmap_free_free(f->chain_cache);

#if HAVE_XZ || HAVE_LZ4
        free(f->compress_buffer);
#endif

#if HAVE_GCRYPT
        /* fss_file is mmap()ed while fsprg_state is malloc()ed -- at most one
         * of the two is in use at a time. */
        if (f->fss_file)
                munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
        else
                free(f->fsprg_state);

        free(f->fsprg_seed);

        if (f->hmac)
                gcry_md_close(f->hmac);
#endif

        return mfree(f);
}
404
405 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
406 Header h = {};
407 ssize_t k;
408 int r;
409
410 assert(f);
411
412 memcpy(h.signature, HEADER_SIGNATURE, 8);
413 h.header_size = htole64(ALIGN64(sizeof(h)));
414
415 h.incompatible_flags |= htole32(
416 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
417 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
418
419 h.compatible_flags = htole32(
420 f->seal * HEADER_COMPATIBLE_SEALED);
421
422 r = sd_id128_randomize(&h.file_id);
423 if (r < 0)
424 return r;
425
426 if (template) {
427 h.seqnum_id = template->header->seqnum_id;
428 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
429 } else
430 h.seqnum_id = h.file_id;
431
432 k = pwrite(f->fd, &h, sizeof(h), 0);
433 if (k < 0)
434 return -errno;
435
436 if (k != sizeof(h))
437 return -EIO;
438
439 return 0;
440 }
441
/* Fills in machine and boot IDs, flips the file online and syncs both the
 * file and its directory. Returns the result of journal_file_set_online(). */
static int journal_file_refresh_header(JournalFile *f) {
        sd_id128_t boot_id;
        int r;

        assert(f);
        assert(f->header);

        r = sd_id128_get_machine(&f->header->machine_id);
        if (IN_SET(r, -ENOENT, -ENOMEDIUM))
                /* We don't have a machine-id, let's continue without */
                zero(f->header->machine_id);
        else if (r < 0)
                return r;

        r = sd_id128_get_boot(&boot_id);
        if (r < 0)
                return r;

        f->header->boot_id = boot_id;

        /* Keep the set_online() result as our return value, but still run the
         * fsyncs below regardless of it. */
        r = journal_file_set_online(f);

        /* Sync the online state to disk */
        (void) fsync(f->fd);

        /* We likely just created a new file, also sync the directory this file is located in. */
        (void) fsync_directory_of_file(f->fd);

        return r;
}
472
/* Checks one class of header flags (compatible when 'compatible' is true,
 * incompatible otherwise) against what this build supports. Logs a debug
 * message for anything unsupported and returns true if such flags are set. */
static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
        const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
                supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
        const char *type = compatible ? "compatible" : "incompatible";
        uint32_t flags;

        flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);

        if (flags & ~supported) {
                /* Flags not even defined by the format as we know it. */
                if (flags & ~any)
                        log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
                                  f->path, type, flags & ~any);
                /* Flags the format defines but that were disabled in this build. */
                flags = (flags & any) & ~supported;
                if (flags) {
                        /* At most two names apply per class (sealed, or xz+lz4),
                         * so 3 slots suffice including the NULL terminator. */
                        const char* strv[3];
                        unsigned n = 0;
                        _cleanup_free_ char *t = NULL;

                        if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
                                strv[n++] = "sealed";
                        if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
                                strv[n++] = "xz-compressed";
                        if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
                                strv[n++] = "lz4-compressed";
                        strv[n] = NULL;
                        assert(n < ELEMENTSOF(strv));

                        t = strv_join((char**) strv, ", ");
                        log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
                                  f->path, type, n > 1 ? "flags" : "flag", strnull(t));
                }
                return true;
        }

        return false;
}
509
/* Validates a freshly mapped header before the file is used: signature,
 * feature flags, state, size bookkeeping and (for writable files) machine
 * identity and timestamps. Returns 0 if the file is usable, negative errno
 * otherwise. */
static int journal_file_verify_header(JournalFile *f) {
        uint64_t arena_size, header_size;

        assert(f);
        assert(f->header);

        if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
                return -EBADMSG;

        /* In both read and write mode we refuse to open files with incompatible
         * flags we don't know. */
        if (warn_wrong_flags(f, false))
                return -EPROTONOSUPPORT;

        /* When open for writing we refuse to open files with compatible flags, too. */
        if (f->writable && warn_wrong_flags(f, true))
                return -EPROTONOSUPPORT;

        if (f->header->state >= _STATE_MAX)
                return -EBADMSG;

        header_size = le64toh(f->header->header_size);

        /* The first addition was n_data, so check that we are at least this large */
        if (header_size < HEADER_SIZE_MIN)
                return -EBADMSG;

        /* A sealed file must carry the full set of counters up to n_entry_arrays. */
        if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                return -EBADMSG;

        arena_size = le64toh(f->header->arena_size);

        /* header + arena must fit within the file, and must not overflow uint64_t. */
        if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
                return -ENODATA;

        if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
                return -ENODATA;

        /* All object offsets must be 64-bit aligned. */
        if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
            !VALID64(le64toh(f->header->field_hash_table_offset)) ||
            !VALID64(le64toh(f->header->tail_object_offset)) ||
            !VALID64(le64toh(f->header->entry_array_offset)))
                return -ENODATA;

        if (f->writable) {
                sd_id128_t machine_id;
                uint8_t state;
                int r;

                /* Only append to files that were created on this very machine. */
                r = sd_id128_get_machine(&machine_id);
                if (r < 0)
                        return r;

                if (!sd_id128_equal(machine_id, f->header->machine_id))
                        return -EHOSTDOWN;

                state = f->header->state;

                if (state == STATE_ARCHIVED)
                        return -ESHUTDOWN; /* Already archived */
                else if (state == STATE_ONLINE)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
                                               "Journal file %s is already online. Assuming unclean closing.",
                                               f->path);
                else if (state != STATE_OFFLINE)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
                                               "Journal file %s has unknown state %i.",
                                               f->path, state);

                if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
                        return -EBADMSG;

                /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
                 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
                 * bisection. */
                if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
                        return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
                                               "Journal file %s is from the future, refusing to append new data to it that'd be older.",
                                               f->path);
        }

        /* Cache the feature flags for quick access during writing. */
        f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
        f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);

        f->seal = JOURNAL_HEADER_SEALED(f->header);

        return 0;
}
598
/* Refreshes f->last_stat/f->last_stat_usec from the kernel and checks the
 * file is still a regular, non-deleted file. */
static int journal_file_fstat(JournalFile *f) {
        int r;

        assert(f);
        assert(f->fd >= 0);

        if (fstat(f->fd, &f->last_stat) < 0)
                return -errno;

        f->last_stat_usec = now(CLOCK_MONOTONIC);

        /* Refuse dealing with files that aren't regular */
        r = stat_verify_regular(&f->last_stat);
        if (r < 0)
                return r;

        /* Refuse appending to files that are already deleted */
        if (f->last_stat.st_nlink <= 0)
                return -EIDRM;

        return 0;
}
621
/* Ensures the file has allocated space covering [offset, offset+size), growing
 * it in FILE_SIZE_INCREASE steps while honoring the max_size and keep_free
 * limits, and updating the header's arena_size accordingly. */
static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
        uint64_t old_size, new_size;
        int r;

        assert(f);
        assert(f->header);

        /* We assume that this file is not sparse, and we know that
         * for sure, since we always call posix_fallocate()
         * ourselves */

        if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
                return -EIO;

        old_size =
                le64toh(f->header->header_size) +
                le64toh(f->header->arena_size);

        new_size = PAGE_ALIGN(offset + size);
        if (new_size < le64toh(f->header->header_size))
                new_size = le64toh(f->header->header_size);

        if (new_size <= old_size) {

                /* We already pre-allocated enough space, but before
                 * we write to it, let's check with fstat() if the
                 * file got deleted, in order make sure we don't throw
                 * away the data immediately. Don't check fstat() for
                 * all writes though, but only once every
                 * LAST_STAT_REFRESH_USEC (5s). */

                if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
                        return 0;

                return journal_file_fstat(f);
        }

        /* Allocate more space. */

        if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
                return -E2BIG;

        /* Refuse to grow past the point where the file system's keep-free
         * headroom would be eaten into (once we're above min_size). */
        if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
                struct statvfs svfs;

                if (fstatvfs(f->fd, &svfs) >= 0) {
                        uint64_t available;

                        available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);

                        if (new_size - old_size > available)
                                return -E2BIG;
                }
        }

        /* Increase by larger blocks at once */
        new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
        if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
                new_size = f->metrics.max_size;

        /* Note that the glibc fallocate() fallback is very
           inefficient, hence we try to minimize the allocation area
           as we can. */
        r = posix_fallocate(f->fd, old_size, new_size - old_size);
        if (r != 0)
                return -r; /* posix_fallocate() returns positive errno values */

        f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));

        return journal_file_fstat(f);
}
692
693 static unsigned type_to_context(ObjectType type) {
694 /* One context for each type, plus one catch-all for the rest */
695 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
696 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
697 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
698 }
699
700 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
701 int r;
702
703 assert(f);
704 assert(ret);
705
706 if (size <= 0)
707 return -EINVAL;
708
709 /* Avoid SIGBUS on invalid accesses */
710 if (offset + size > (uint64_t) f->last_stat.st_size) {
711 /* Hmm, out of range? Let's refresh the fstat() data
712 * first, before we trust that check. */
713
714 r = journal_file_fstat(f);
715 if (r < 0)
716 return r;
717
718 if (offset + size > (uint64_t) f->last_stat.st_size)
719 return -EADDRNOTAVAIL;
720 }
721
722 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
723 }
724
725 static uint64_t minimum_header_size(Object *o) {
726
727 static const uint64_t table[] = {
728 [OBJECT_DATA] = sizeof(DataObject),
729 [OBJECT_FIELD] = sizeof(FieldObject),
730 [OBJECT_ENTRY] = sizeof(EntryObject),
731 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
732 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
733 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
734 [OBJECT_TAG] = sizeof(TagObject),
735 };
736
737 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
738 return sizeof(ObjectHeader);
739
740 return table[o->object.type];
741 }
742
743 /* Lightweight object checks. We want this to be fast, so that we won't
744 * slowdown every journal_file_move_to_object() call too much. */
745 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
746 assert(f);
747 assert(o);
748
749 switch (o->object.type) {
750
751 case OBJECT_DATA: {
752 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
753 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
754 "Bad n_entries: %" PRIu64 ": %" PRIu64,
755 le64toh(o->data.n_entries),
756 offset);
757
758 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0)
759 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
760 "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
761 offsetof(DataObject, payload),
762 le64toh(o->object.size),
763 offset);
764
765 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
766 !VALID64(le64toh(o->data.next_field_offset)) ||
767 !VALID64(le64toh(o->data.entry_offset)) ||
768 !VALID64(le64toh(o->data.entry_array_offset)))
769 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
770 "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
771 le64toh(o->data.next_hash_offset),
772 le64toh(o->data.next_field_offset),
773 le64toh(o->data.entry_offset),
774 le64toh(o->data.entry_array_offset),
775 offset);
776
777 break;
778 }
779
780 case OBJECT_FIELD:
781 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0)
782 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
783 "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
784 offsetof(FieldObject, payload),
785 le64toh(o->object.size),
786 offset);
787
788 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
789 !VALID64(le64toh(o->field.head_data_offset)))
790 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
791 "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
792 le64toh(o->field.next_hash_offset),
793 le64toh(o->field.head_data_offset),
794 offset);
795 break;
796
797 case OBJECT_ENTRY:
798 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0)
799 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
800 "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
801 offsetof(EntryObject, items),
802 le64toh(o->object.size),
803 offset);
804
805 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0)
806 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
807 "Invalid number items in entry: %" PRIu64 ": %" PRIu64,
808 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
809 offset);
810
811 if (le64toh(o->entry.seqnum) <= 0)
812 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
813 "Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
814 le64toh(o->entry.seqnum),
815 offset);
816
817 if (!VALID_REALTIME(le64toh(o->entry.realtime)))
818 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
819 "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
820 le64toh(o->entry.realtime),
821 offset);
822
823 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
824 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
825 "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
826 le64toh(o->entry.monotonic),
827 offset);
828
829 break;
830
831 case OBJECT_DATA_HASH_TABLE:
832 case OBJECT_FIELD_HASH_TABLE:
833 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
834 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0)
835 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
836 "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
837 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
838 le64toh(o->object.size),
839 offset);
840
841 break;
842
843 case OBJECT_ENTRY_ARRAY:
844 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
845 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0)
846 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
847 "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
848 le64toh(o->object.size),
849 offset);
850
851 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
852 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
853 "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
854 le64toh(o->entry_array.next_entry_array_offset),
855 offset);
856
857 break;
858
859 case OBJECT_TAG:
860 if (le64toh(o->object.size) != sizeof(TagObject))
861 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
862 "Invalid object tag size: %" PRIu64 ": %" PRIu64,
863 le64toh(o->object.size),
864 offset);
865
866 if (!VALID_EPOCH(le64toh(o->tag.epoch)))
867 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
868 "Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
869 le64toh(o->tag.epoch), offset);
870
871 break;
872 }
873
874 return 0;
875 }
876
/* Maps the object at the given offset, validates it (alignment, placement,
 * size, type, then journal_file_check_object()) and returns a pointer into
 * the mapping via *ret. If type is not OBJECT_UNUSED the object must be of
 * exactly that type. */
int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
        int r;
        void *t;
        size_t tsize;
        Object *o;
        uint64_t s;

        assert(f);
        assert(ret);

        /* Objects may only be located at multiple of 64 bit */
        if (!VALID64(offset))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object at non-64bit boundary: %" PRIu64,
                                       offset);

        /* Object may not be located in the file header */
        if (offset < le64toh(f->header->header_size))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object located in file header: %" PRIu64,
                                       offset);

        /* Map just the generic object header first, to learn the full size. */
        r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
        if (r < 0)
                return r;

        o = (Object*) t;
        s = le64toh(o->object.size);

        if (s == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to uninitialized object: %" PRIu64,
                                       offset);
        if (s < sizeof(ObjectHeader))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to overly short object: %" PRIu64,
                                       offset);

        if (o->object.type <= OBJECT_UNUSED)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object with invalid type: %" PRIu64,
                                       offset);

        if (s < minimum_header_size(o))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to truncated object: %" PRIu64,
                                       offset);

        if (type > OBJECT_UNUSED && o->object.type != type)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object of unexpected type: %" PRIu64,
                                       offset);

        /* The initial header-sized window may be too small; remap with the
         * object's full size if so. */
        if (s > tsize) {
                r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
                if (r < 0)
                        return r;

                o = (Object*) t;
        }

        /* Finally run the per-type sanity checks. */
        r = journal_file_check_object(f, offset, o);
        if (r < 0)
                return r;

        *ret = o;
        return 0;
}
945
946 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
947 uint64_t r;
948
949 assert(f);
950 assert(f->header);
951
952 r = le64toh(f->header->tail_entry_seqnum) + 1;
953
954 if (seqnum) {
955 /* If an external seqnum counter was passed, we update
956 * both the local and the external one, and set it to
957 * the maximum of both */
958
959 if (*seqnum + 1 > r)
960 r = *seqnum + 1;
961
962 *seqnum = r;
963 }
964
965 f->header->tail_entry_seqnum = htole64(r);
966
967 if (f->header->head_entry_seqnum == 0)
968 f->header->head_entry_seqnum = htole64(r);
969
970 return r;
971 }
972
/* Appends a new object of the given type and size at the end of the file:
 * allocates space, maps it, zeroes its header and fills in type/size. Returns
 * the mapped object in *ret and its file offset in *offset. */
int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
        int r;
        uint64_t p;
        Object *tail, *o;
        void *t;

        assert(f);
        assert(f->header);
        assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
        assert(size >= sizeof(ObjectHeader));
        assert(offset);
        assert(ret);

        /* Appending implies writing; make sure the file is marked online. */
        r = journal_file_set_online(f);
        if (r < 0)
                return r;

        /* Insertion point: right after the current tail object, or directly
         * after the header if the file holds no objects yet. */
        p = le64toh(f->header->tail_object_offset);
        if (p == 0)
                p = le64toh(f->header->header_size);
        else {
                r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
                if (r < 0)
                        return r;

                p += ALIGN64(le64toh(tail->object.size));
        }

        /* Grow the file if necessary, then map the fresh region. */
        r = journal_file_allocate(f, p, size);
        if (r < 0)
                return r;

        r = journal_file_move_to(f, type, false, p, size, &t, NULL);
        if (r < 0)
                return r;

        o = (Object*) t;

        /* Initialize the object header; the payload is left for the caller. */
        zero(o->object);
        o->object.type = type;
        o->object.size = htole64(size);

        /* Account for the new tail object in the file header. */
        f->header->tail_object_offset = htole64(p);
        f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);

        *ret = o;
        *offset = p;

        return 0;
}
1023
/* Appends and zero-initializes the data hash table object, sized for the
 * file's configured maximum size, and records its location in the header. */
static int journal_file_setup_data_hash_table(JournalFile *f) {
        uint64_t s, p;
        Object *o;
        int r;

        assert(f);
        assert(f->header);

        /* We estimate that we need 1 hash table entry per 768 bytes
           of journal file and we want to make sure we never get
           beyond 75% fill level. Calculate the hash table size for
           the maximum file size based on these metrics. */

        s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem); /* *4/3 = 1/(75%) */
        if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
                s = DEFAULT_DATA_HASH_TABLE_SIZE;

        log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));

        r = journal_file_append_object(f,
                                       OBJECT_DATA_HASH_TABLE,
                                       offsetof(Object, hash_table.items) + s,
                                       &o, &p);
        if (r < 0)
                return r;

        /* An all-zero table means all buckets are empty. */
        memzero(o->hash_table.items, s);

        /* Note: the header records the offset of the items array itself, not
         * of the enclosing object. */
        f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
        f->header->data_hash_table_size = htole64(s);

        return 0;
}
1057
1058 static int journal_file_setup_field_hash_table(JournalFile *f) {
1059 uint64_t s, p;
1060 Object *o;
1061 int r;
1062
1063 assert(f);
1064 assert(f->header);
1065
1066 /* We use a fixed size hash table for the fields as this
1067 * number should grow very slowly only */
1068
1069 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1070 r = journal_file_append_object(f,
1071 OBJECT_FIELD_HASH_TABLE,
1072 offsetof(Object, hash_table.items) + s,
1073 &o, &p);
1074 if (r < 0)
1075 return r;
1076
1077 memzero(o->hash_table.items, s);
1078
1079 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1080 f->header->field_hash_table_size = htole64(s);
1081
1082 return 0;
1083 }
1084
1085 int journal_file_map_data_hash_table(JournalFile *f) {
1086 uint64_t s, p;
1087 void *t;
1088 int r;
1089
1090 assert(f);
1091 assert(f->header);
1092
1093 if (f->data_hash_table)
1094 return 0;
1095
1096 p = le64toh(f->header->data_hash_table_offset);
1097 s = le64toh(f->header->data_hash_table_size);
1098
1099 r = journal_file_move_to(f,
1100 OBJECT_DATA_HASH_TABLE,
1101 true,
1102 p, s,
1103 &t, NULL);
1104 if (r < 0)
1105 return r;
1106
1107 f->data_hash_table = t;
1108 return 0;
1109 }
1110
1111 int journal_file_map_field_hash_table(JournalFile *f) {
1112 uint64_t s, p;
1113 void *t;
1114 int r;
1115
1116 assert(f);
1117 assert(f->header);
1118
1119 if (f->field_hash_table)
1120 return 0;
1121
1122 p = le64toh(f->header->field_hash_table_offset);
1123 s = le64toh(f->header->field_hash_table_size);
1124
1125 r = journal_file_move_to(f,
1126 OBJECT_FIELD_HASH_TABLE,
1127 true,
1128 p, s,
1129 &t, NULL);
1130 if (r < 0)
1131 return r;
1132
1133 f->field_hash_table = t;
1134 return 0;
1135 }
1136
/* Insert a freshly appended OBJECT_FIELD at 'offset' into the field hash
 * table bucket for 'hash', appending it at the tail of the bucket's
 * collision chain. */
static int journal_file_link_field(
                JournalFile *f,
                Object *o,
                uint64_t offset,
                uint64_t hash) {

        uint64_t p, h, m;
        int r;

        assert(f);
        assert(f->header);
        assert(f->field_hash_table);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_FIELD)
                return -EINVAL;

        /* Number of buckets; zero means the table is corrupt/absent. */
        m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        /* This might alter the window we are looking at */
        o->field.next_hash_offset = o->field.head_data_offset = 0;

        h = hash % m;
        p = le64toh(f->field_hash_table[h].tail_hash_offset);
        if (p == 0)
                /* Empty bucket: the new object becomes its head. */
                f->field_hash_table[h].head_hash_offset = htole64(offset);
        else {
                /* Non-empty bucket: patch the current tail's next pointer.
                 * Note that this re-points 'o' at the previous tail. */
                r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
                if (r < 0)
                        return r;

                o->field.next_hash_offset = htole64(offset);
        }

        f->field_hash_table[h].tail_hash_offset = htole64(offset);

        /* Keep the header's field counter current if this format version has one. */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);

        return 0;
}
1181
/* Insert a freshly appended OBJECT_DATA at 'offset' into the data hash
 * table bucket for 'hash', appending it at the tail of the bucket's
 * collision chain. */
static int journal_file_link_data(
                JournalFile *f,
                Object *o,
                uint64_t offset,
                uint64_t hash) {

        uint64_t p, h, m;
        int r;

        assert(f);
        assert(f->header);
        assert(f->data_hash_table);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_DATA)
                return -EINVAL;

        /* Number of buckets; zero means the table is corrupt/absent. */
        m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        /* This might alter the window we are looking at */
        o->data.next_hash_offset = o->data.next_field_offset = 0;
        o->data.entry_offset = o->data.entry_array_offset = 0;
        o->data.n_entries = 0;

        h = hash % m;
        p = le64toh(f->data_hash_table[h].tail_hash_offset);
        if (p == 0)
                /* Only entry in the hash table is easy */
                f->data_hash_table[h].head_hash_offset = htole64(offset);
        else {
                /* Move back to the previous data object, to patch in
                 * pointer (note: this re-points 'o' at the previous tail) */

                r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
                if (r < 0)
                        return r;

                o->data.next_hash_offset = htole64(offset);
        }

        f->data_hash_table[h].tail_hash_offset = htole64(offset);

        /* Keep the header's data object counter current if this format version has one. */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                f->header->n_data = htole64(le64toh(f->header->n_data) + 1);

        return 0;
}
1232
/* Look up the OBJECT_FIELD whose payload equals 'field' (of 'size' bytes),
 * with the 64-bit hash precomputed by the caller. Returns 1 and fills
 * *ret/*offset on a match, 0 if absent, negative errno on error. */
int journal_file_find_field_object_with_hash(
                JournalFile *f,
                const void *field, uint64_t size, uint64_t hash,
                Object **ret, uint64_t *offset) {

        uint64_t p, osize, h, m;
        int r;

        assert(f);
        assert(f->header);
        assert(field && size > 0);

        /* If the field hash table is empty, we can't find anything */
        if (le64toh(f->header->field_hash_table_size) <= 0)
                return 0;

        /* Map the field hash table, if it isn't mapped yet. */
        r = journal_file_map_field_hash_table(f);
        if (r < 0)
                return r;

        /* Full on-disk size a matching object must have. */
        osize = offsetof(Object, field.payload) + size;

        m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        h = hash % m;
        p = le64toh(f->field_hash_table[h].head_hash_offset);

        /* Walk the bucket's collision chain. */
        while (p > 0) {
                Object *o;

                r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
                if (r < 0)
                        return r;

                /* Match hash first (cheap), then size, then payload bytes. */
                if (le64toh(o->field.hash) == hash &&
                    le64toh(o->object.size) == osize &&
                    memcmp(o->field.payload, field, size) == 0) {

                        if (ret)
                                *ret = o;
                        if (offset)
                                *offset = p;

                        return 1;
                }

                p = le64toh(o->field.next_hash_offset);
        }

        return 0;
}
1287
1288 int journal_file_find_field_object(
1289 JournalFile *f,
1290 const void *field, uint64_t size,
1291 Object **ret, uint64_t *offset) {
1292
1293 uint64_t hash;
1294
1295 assert(f);
1296 assert(field && size > 0);
1297
1298 hash = hash64(field, size);
1299
1300 return journal_file_find_field_object_with_hash(f,
1301 field, size, hash,
1302 ret, offset);
1303 }
1304
/* Look up the OBJECT_DATA whose (uncompressed) payload equals 'data' of
 * 'size' bytes, given the precomputed hash. Compressed candidates are
 * decompressed into f->compress_buffer for comparison. Returns 1 and fills
 * *ret/*offset on a match, 0 if absent, negative errno on error. */
int journal_file_find_data_object_with_hash(
                JournalFile *f,
                const void *data, uint64_t size, uint64_t hash,
                Object **ret, uint64_t *offset) {

        uint64_t p, osize, h, m;
        int r;

        assert(f);
        assert(f->header);
        assert(data || size == 0);

        /* If there's no data hash table, then there's no entry. */
        if (le64toh(f->header->data_hash_table_size) <= 0)
                return 0;

        /* Map the data hash table, if it isn't mapped yet. */
        r = journal_file_map_data_hash_table(f);
        if (r < 0)
                return r;

        /* Full on-disk size an *uncompressed* matching object must have. */
        osize = offsetof(Object, data.payload) + size;

        m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        h = hash % m;
        p = le64toh(f->data_hash_table[h].head_hash_offset);

        /* Walk the bucket's collision chain. */
        while (p > 0) {
                Object *o;

                r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
                if (r < 0)
                        return r;

                if (le64toh(o->data.hash) != hash)
                        goto next;

                if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if HAVE_XZ || HAVE_LZ4
                        uint64_t l;
                        size_t rsize = 0;

                        l = le64toh(o->object.size);
                        if (l <= offsetof(Object, data.payload))
                                return -EBADMSG;

                        l -= offsetof(Object, data.payload);

                        /* Decompress into the (reused) scratch buffer, then compare bytes. */
                        r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
                                            o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
                        if (r < 0)
                                return r;

                        if (rsize == size &&
                            memcmp(f->compress_buffer, data, size) == 0) {

                                if (ret)
                                        *ret = o;

                                if (offset)
                                        *offset = p;

                                return 1;
                        }
#else
                        /* Compressed object but built without compression support. */
                        return -EPROTONOSUPPORT;
#endif
                } else if (le64toh(o->object.size) == osize &&
                           memcmp(o->data.payload, data, size) == 0) {

                        if (ret)
                                *ret = o;

                        if (offset)
                                *offset = p;

                        return 1;
                }

        next:
                p = le64toh(o->data.next_hash_offset);
        }

        return 0;
}
1393
1394 int journal_file_find_data_object(
1395 JournalFile *f,
1396 const void *data, uint64_t size,
1397 Object **ret, uint64_t *offset) {
1398
1399 uint64_t hash;
1400
1401 assert(f);
1402 assert(data || size == 0);
1403
1404 hash = hash64(data, size);
1405
1406 return journal_file_find_data_object_with_hash(f,
1407 data, size, hash,
1408 ret, offset);
1409 }
1410
/* Return the OBJECT_FIELD for 'field', appending a new one (and linking it
 * into the field hash table) if no identical one exists yet. */
static int journal_file_append_field(
                JournalFile *f,
                const void *field, uint64_t size,
                Object **ret, uint64_t *offset) {

        uint64_t hash, p;
        uint64_t osize;
        Object *o;
        int r;

        assert(f);
        assert(field && size > 0);

        hash = hash64(field, size);

        /* Deduplicate: reuse an existing identical field object if present. */
        r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
        if (r < 0)
                return r;
        else if (r > 0) {

                if (ret)
                        *ret = o;

                if (offset)
                        *offset = p;

                return 0;
        }

        osize = offsetof(Object, field.payload) + size;
        r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
        if (r < 0)
                return r;

        o->field.hash = htole64(hash);
        memcpy(o->field.payload, field, size);

        r = journal_file_link_field(f, o, p, hash);
        if (r < 0)
                return r;

        /* The linking might have altered the window, so let's
         * refresh our pointer */
        r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
        if (r < 0)
                return r;

#if HAVE_GCRYPT
        /* Fold the new object into the sealing HMAC, if sealing is enabled. */
        r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
        if (r < 0)
                return r;
#endif

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        return 0;
}
1472
1473 static int journal_file_append_data(
1474 JournalFile *f,
1475 const void *data, uint64_t size,
1476 Object **ret, uint64_t *offset) {
1477
1478 uint64_t hash, p;
1479 uint64_t osize;
1480 Object *o;
1481 int r, compression = 0;
1482 const void *eq;
1483
1484 assert(f);
1485 assert(data || size == 0);
1486
1487 hash = hash64(data, size);
1488
1489 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1490 if (r < 0)
1491 return r;
1492 if (r > 0) {
1493
1494 if (ret)
1495 *ret = o;
1496
1497 if (offset)
1498 *offset = p;
1499
1500 return 0;
1501 }
1502
1503 osize = offsetof(Object, data.payload) + size;
1504 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1505 if (r < 0)
1506 return r;
1507
1508 o->data.hash = htole64(hash);
1509
1510 #if HAVE_XZ || HAVE_LZ4
1511 if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
1512 size_t rsize = 0;
1513
1514 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1515
1516 if (compression >= 0) {
1517 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1518 o->object.flags |= compression;
1519
1520 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1521 size, rsize, object_compressed_to_string(compression));
1522 } else
1523 /* Compression didn't work, we don't really care why, let's continue without compression */
1524 compression = 0;
1525 }
1526 #endif
1527
1528 if (compression == 0)
1529 memcpy_safe(o->data.payload, data, size);
1530
1531 r = journal_file_link_data(f, o, p, hash);
1532 if (r < 0)
1533 return r;
1534
1535 #if HAVE_GCRYPT
1536 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1537 if (r < 0)
1538 return r;
1539 #endif
1540
1541 /* The linking might have altered the window, so let's
1542 * refresh our pointer */
1543 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1544 if (r < 0)
1545 return r;
1546
1547 if (!data)
1548 eq = NULL;
1549 else
1550 eq = memchr(data, '=', size);
1551 if (eq && eq > data) {
1552 Object *fo = NULL;
1553 uint64_t fp;
1554
1555 /* Create field object ... */
1556 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1557 if (r < 0)
1558 return r;
1559
1560 /* ... and link it in. */
1561 o->data.next_field_offset = fo->field.head_data_offset;
1562 fo->field.head_data_offset = le64toh(p);
1563 }
1564
1565 if (ret)
1566 *ret = o;
1567
1568 if (offset)
1569 *offset = p;
1570
1571 return 0;
1572 }
1573
1574 uint64_t journal_file_entry_n_items(Object *o) {
1575 assert(o);
1576
1577 if (o->object.type != OBJECT_ENTRY)
1578 return 0;
1579
1580 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1581 }
1582
1583 uint64_t journal_file_entry_array_n_items(Object *o) {
1584 assert(o);
1585
1586 if (o->object.type != OBJECT_ENTRY_ARRAY)
1587 return 0;
1588
1589 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1590 }
1591
1592 uint64_t journal_file_hash_table_n_items(Object *o) {
1593 assert(o);
1594
1595 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1596 return 0;
1597
1598 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1599 }
1600
/* Store entry offset 'p' at index *idx of the entry array chain headed at
 * *first, appending a new (roughly doubled) array to the chain when it is
 * full. On success *idx is incremented. All le64_t parameters are on-disk
 * little-endian values. */
static int link_entry_into_array(JournalFile *f,
                                 le64_t *first,
                                 le64_t *idx,
                                 uint64_t p) {
        int r;
        uint64_t n = 0, ap = 0, q, i, a, hidx;
        Object *o;

        assert(f);
        assert(f->header);
        assert(first);
        assert(idx);
        assert(p > 0);

        a = le64toh(*first);
        i = hidx = le64toh(*idx);
        /* Walk the chain looking for the array that contains slot i;
         * 'i' becomes relative to the current array as we go. */
        while (a > 0) {

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                if (r < 0)
                        return r;

                n = journal_file_entry_array_n_items(o);
                if (i < n) {
                        /* Free slot found: store and bump the global index. */
                        o->entry_array.items[i] = htole64(p);
                        *idx = htole64(hidx + 1);
                        return 0;
                }

                i -= n;
                ap = a;
                a = le64toh(o->entry_array.next_entry_array_offset);
        }

        /* No free slot: append a new array. 'n' still holds the size of the
         * last array in the chain, so this doubles capacity each time
         * (minimum 4 items). */
        if (hidx > n)
                n = (hidx+1) * 2;
        else
                n = n * 2;

        if (n < 4)
                n = 4;

        r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
                                       offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
                                       &o, &q);
        if (r < 0)
                return r;

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
        if (r < 0)
                return r;
#endif

        /* 'i' is now the index within the freshly appended array. */
        o->entry_array.items[i] = htole64(p);

        if (ap == 0)
                /* The chain was empty: the new array becomes its head. */
                *first = htole64(q);
        else {
                /* Hook the new array after the previous tail. */
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
                if (r < 0)
                        return r;

                o->entry_array.next_entry_array_offset = htole64(q);
        }

        if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);

        *idx = htole64(hidx + 1);

        return 0;
}
1674
/* Like link_entry_into_array(), but the first item of the conceptual list is
 * stored inline in *extra rather than in the array chain; *idx counts the
 * inline item too. */
static int link_entry_into_array_plus_one(JournalFile *f,
                                          le64_t *extra,
                                          le64_t *first,
                                          le64_t *idx,
                                          uint64_t p) {

        int r;

        assert(f);
        assert(extra);
        assert(first);
        assert(idx);
        assert(p > 0);

        if (*idx == 0)
                /* Very first item goes into the inline slot. */
                *extra = htole64(p);
        else {
                le64_t i;

                /* Chain indices are shifted down by one because of the inline slot. */
                i = htole64(le64toh(*idx) - 1);
                r = link_entry_into_array(f, first, &i, p);
                if (r < 0)
                        return r;
        }

        *idx = htole64(le64toh(*idx) + 1);
        return 0;
}
1703
/* Register entry 'offset' with the data object referenced by item 'i' of
 * entry object 'o', i.e. append it to that data object's entry list. */
static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
        uint64_t p;
        int r;
        assert(f);
        assert(o);
        assert(offset > 0);

        p = le64toh(o->entry.items[i].object_offset);
        if (p == 0)
                return -EINVAL;

        /* Note: this re-points 'o' at the referenced data object. */
        r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
        if (r < 0)
                return r;

        return link_entry_into_array_plus_one(f,
                                              &o->data.entry_offset,
                                              &o->data.entry_array_offset,
                                              &o->data.n_entries,
                                              offset);
}
1725
/* Hook a fully written entry object into the global entry list, refresh the
 * header's head/tail timestamps and link each of the entry's items into the
 * respective data object's entry list. */
static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
        uint64_t n, i;
        int r;

        assert(f);
        assert(f->header);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_ENTRY)
                return -EINVAL;

        /* Full barrier: make sure the entry payload is written out before we
         * publish any links pointing at it. */
        __sync_synchronize();

        /* Link up the entry itself */
        r = link_entry_into_array(f,
                                  &f->header->entry_array_offset,
                                  &f->header->n_entries,
                                  offset);
        if (r < 0)
                return r;

        /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */

        /* First entry ever? Then it also defines the head realtime stamp. */
        if (f->header->head_entry_realtime == 0)
                f->header->head_entry_realtime = o->entry.realtime;

        f->header->tail_entry_realtime = o->entry.realtime;
        f->header->tail_entry_monotonic = o->entry.monotonic;

        /* Link up the items */
        n = journal_file_entry_n_items(o);
        for (i = 0; i < n; i++) {
                r = journal_file_link_entry_item(f, o, offset, i);
                if (r < 0)
                        return r;
        }

        return 0;
}
1766
/* Write an OBJECT_ENTRY from pre-resolved items and link it into the file.
 * 'items' must already reference appended data objects; 'xor_hash' is the
 * XOR of their payload hashes. Returns the new object/offset via ret/offset
 * if non-NULL. */
static int journal_file_append_entry_internal(
                JournalFile *f,
                const dual_timestamp *ts,
                const sd_id128_t *boot_id,
                uint64_t xor_hash,
                const EntryItem items[], unsigned n_items,
                uint64_t *seqnum,
                Object **ret, uint64_t *offset) {
        uint64_t np;
        uint64_t osize;
        Object *o;
        int r;

        assert(f);
        assert(f->header);
        assert(items || n_items == 0);
        assert(ts);

        osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));

        r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
        if (r < 0)
                return r;

        o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
        memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
        o->entry.realtime = htole64(ts->realtime);
        o->entry.monotonic = htole64(ts->monotonic);
        o->entry.xor_hash = htole64(xor_hash);
        /* Fall back to this file's own boot ID if the caller supplied none. */
        o->entry.boot_id = boot_id ? *boot_id : f->header->boot_id;

#if HAVE_GCRYPT
        /* Fold the new object into the sealing HMAC, if sealing is enabled. */
        r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
        if (r < 0)
                return r;
#endif

        r = journal_file_link_entry(f, o, np);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = np;

        return 0;
}
1816
/* Make a change to the journal file visible to inotify watchers. */
void journal_file_post_change(JournalFile *f) {
        assert(f);

        if (f->fd < 0)
                return;

        /* inotify() does not receive IN_MODIFY events from file
         * accesses done via mmap(). After each access we hence
         * trigger IN_MODIFY by truncating the journal file to its
         * current size which triggers IN_MODIFY. */

        __sync_synchronize();

        /* Best effort only — a failure here is logged, not propagated. */
        if (ftruncate(f->fd, f->last_stat.st_size) < 0)
                log_debug_errno(errno, "Failed to truncate file to its own size: %m");
}
1833
1834 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1835 assert(userdata);
1836
1837 journal_file_post_change(userdata);
1838
1839 return 1;
1840 }
1841
/* Arm the coalescing ftruncate timer so that many writes within
 * post_change_timer_period produce only one inotify notification. If any
 * timer-setup step fails, fall back to notifying immediately. */
static void schedule_post_change(JournalFile *f) {
        uint64_t now;
        int r;

        assert(f);
        assert(f->post_change_timer);

        r = sd_event_source_get_enabled(f->post_change_timer, NULL);
        if (r < 0) {
                log_debug_errno(r, "Failed to get ftruncate timer state: %m");
                goto fail;
        }
        /* Already armed? The pending timer covers this change too. */
        if (r > 0)
                return;

        r = sd_event_now(sd_event_source_get_event(f->post_change_timer), CLOCK_MONOTONIC, &now);
        if (r < 0) {
                log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
                goto fail;
        }

        r = sd_event_source_set_time(f->post_change_timer, now + f->post_change_timer_period);
        if (r < 0) {
                log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
                goto fail;
        }

        /* One-shot: it disarms itself after firing. */
        r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
        if (r < 0) {
                log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
                goto fail;
        }

        return;

fail:
        /* On failure, let's simply post the change immediately. */
        journal_file_post_change(f);
}
1881
/* Enable coalesced change posting in a timer on the provided sd_event instance.
 * 't' is the coalescing period; the timer starts out disabled and is armed by
 * schedule_post_change() on demand. Must only be called once per file. */
int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
        _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
        int r;

        assert(f);
        assert_return(!f->post_change_timer, -EINVAL);
        assert(e);
        assert(t);

        r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
        if (r < 0)
                return r;

        r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
        if (r < 0)
                return r;

        /* Ownership of the timer source moves to the JournalFile. */
        f->post_change_timer = TAKE_PTR(timer);
        f->post_change_timer_period = t;

        /* r is the (non-negative) result of the last sd-event call. */
        return r;
}
1905
1906 static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
1907 return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
1908 }
1909
1910 int journal_file_append_entry(
1911 JournalFile *f,
1912 const dual_timestamp *ts,
1913 const sd_id128_t *boot_id,
1914 const struct iovec iovec[], unsigned n_iovec,
1915 uint64_t *seqnum,
1916 Object **ret, uint64_t *offset) {
1917
1918 unsigned i;
1919 EntryItem *items;
1920 int r;
1921 uint64_t xor_hash = 0;
1922 struct dual_timestamp _ts;
1923
1924 assert(f);
1925 assert(f->header);
1926 assert(iovec || n_iovec == 0);
1927
1928 if (ts) {
1929 if (!VALID_REALTIME(ts->realtime))
1930 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1931 "Invalid realtime timestamp %" PRIu64 ", refusing entry.",
1932 ts->realtime);
1933 if (!VALID_MONOTONIC(ts->monotonic))
1934 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1935 "Invalid monotomic timestamp %" PRIu64 ", refusing entry.",
1936 ts->monotonic);
1937 } else {
1938 dual_timestamp_get(&_ts);
1939 ts = &_ts;
1940 }
1941
1942 #if HAVE_GCRYPT
1943 r = journal_file_maybe_append_tag(f, ts->realtime);
1944 if (r < 0)
1945 return r;
1946 #endif
1947
1948 /* alloca() can't take 0, hence let's allocate at least one */
1949 items = newa(EntryItem, MAX(1u, n_iovec));
1950
1951 for (i = 0; i < n_iovec; i++) {
1952 uint64_t p;
1953 Object *o;
1954
1955 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1956 if (r < 0)
1957 return r;
1958
1959 xor_hash ^= le64toh(o->data.hash);
1960 items[i].object_offset = htole64(p);
1961 items[i].hash = o->data.hash;
1962 }
1963
1964 /* Order by the position on disk, in order to improve seek
1965 * times for rotating media. */
1966 typesafe_qsort(items, n_iovec, entry_item_cmp);
1967
1968 r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, offset);
1969
1970 /* If the memory mapping triggered a SIGBUS then we return an
1971 * IO error and ignore the error code passed down to us, since
1972 * it is very likely just an effect of a nullified replacement
1973 * mapping page */
1974
1975 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
1976 r = -EIO;
1977
1978 if (f->post_change_timer)
1979 schedule_post_change(f);
1980 else
1981 journal_file_post_change(f);
1982
1983 return r;
1984 }
1985
/* Cached position within an entry-array chain, so repeated lookups on the
 * same chain need not re-walk it from the start. Keyed by 'first'. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1993
1994 static void chain_cache_put(
1995 OrderedHashmap *h,
1996 ChainCacheItem *ci,
1997 uint64_t first,
1998 uint64_t array,
1999 uint64_t begin,
2000 uint64_t total,
2001 uint64_t last_index) {
2002
2003 if (!ci) {
2004 /* If the chain item to cache for this chain is the
2005 * first one it's not worth caching anything */
2006 if (array == first)
2007 return;
2008
2009 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2010 ci = ordered_hashmap_steal_first(h);
2011 assert(ci);
2012 } else {
2013 ci = new(ChainCacheItem, 1);
2014 if (!ci)
2015 return;
2016 }
2017
2018 ci->first = first;
2019
2020 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2021 free(ci);
2022 return;
2023 }
2024 } else
2025 assert(ci->first == first);
2026
2027 ci->array = array;
2028 ci->begin = begin;
2029 ci->total = total;
2030 ci->last_index = last_index;
2031 }
2032
/* Return the i-th entry of the entry-array chain headed at 'first'.
 * Returns 1 and fills *ret/*offset on success, 0 if the index is out of
 * range, negative errno on error. */
static int generic_array_get(
                JournalFile *f,
                uint64_t first,
                uint64_t i,
                Object **ret, uint64_t *offset) {

        Object *o;
        uint64_t p = 0, a, t = 0;
        int r;
        ChainCacheItem *ci;

        assert(f);

        a = first;

        /* Try the chain cache first */
        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && i > ci->total) {
                /* Skip straight to the cached array; from here on 'i' is
                 * relative to it and 't' counts the items skipped. */
                a = ci->array;
                i -= ci->total;
                t = ci->total;
        }

        /* Walk the chain until the array containing index i is found. */
        while (a > 0) {
                uint64_t k;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                if (r < 0)
                        return r;

                k = journal_file_entry_array_n_items(o);
                if (i < k) {
                        p = le64toh(o->entry_array.items[i]);
                        goto found;
                }

                i -= k;
                t += k;
                a = le64toh(o->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        return 1;
}
2092
2093 static int generic_array_get_plus_one(
2094 JournalFile *f,
2095 uint64_t extra,
2096 uint64_t first,
2097 uint64_t i,
2098 Object **ret, uint64_t *offset) {
2099
2100 Object *o;
2101
2102 assert(f);
2103
2104 if (i == 0) {
2105 int r;
2106
2107 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2108 if (r < 0)
2109 return r;
2110
2111 if (ret)
2112 *ret = o;
2113
2114 if (offset)
2115 *offset = extra;
2116
2117 return 1;
2118 }
2119
2120 return generic_array_get(f, first, i-1, ret, offset);
2121 }
2122
/* Three-way result of a bisection comparator (see test_object_offset() for
 * the reference semantics). */
enum {
        TEST_FOUND,  /* tested object matches the needle exactly */
        TEST_LEFT,   /* tested object sorts before the needle */
        TEST_RIGHT   /* tested object sorts after the needle */
};
2128
/* Binary-search the entry-array chain headed at 'first' (containing 'n'
 * items) for the item matching 'needle' under 'test_object'; 'direction'
 * selects which neighbor to return when there is no exact match. Returns 1
 * and fills *ret/*offset/*idx on success, 0 if nothing qualifies, negative
 * errno on error. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *offset,
                uint64_t *idx) {

        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total && ci->begin != 0) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Probe the last item first: if the needle lies beyond it,
                 * the whole array can be skipped. */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        r = -EBADMSG;
                else
                        r = test_object(f, p, needle);
                if (r == -EBADMSG) {
                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
                        n = i;
                        continue;
                }
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        /* The needle is inside this array — bisect it. */
                        left = 0;
                        right -= 1;

                        if (last_index != (uint64_t) -1) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        /* Classic bisection over [left, right]. */
                        for (;;) {
                                if (left == right) {
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        r = -EBADMSG;
                                else
                                        r = test_object(f, p, needle);
                                if (r == -EBADMSG) {
                                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
                                        right = n = i;
                                        continue;
                                }
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                if (k >= n) {
                        /* This was the last array of the chain. */
                        if (direction == DIRECTION_UP) {
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Remember the last item of this array in case the match
                 * ends up being just before the next array. */
                last_p = lp;

                n -= k;
                t += k;
                last_index = (uint64_t) -1;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);

        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        if (idx)
                *idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
2345
/* Like generic_array_bisect(), but for lists whose first item is stored
 * inline in 'extra' (cf. link_entry_into_array_plus_one()). 'n' counts the
 * inline item too. */
static int generic_array_bisect_plus_one(
                JournalFile *f,
                uint64_t extra,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *offset,
                uint64_t *idx) {

        int r;
        bool step_back = false;
        Object *o;

        assert(f);
        assert(test_object);

        if (n <= 0)
                return 0;

        /* This bisects the array in object 'first', but first checks
         * an extra */
        r = test_object(f, extra, needle);
        if (r < 0)
                return r;

        if (r == TEST_FOUND)
                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

        /* if we are looking with DIRECTION_UP then we need to first
           see if in the actual array there is a matching entry, and
           return the last one of that. But if there isn't any we need
           to return this one. Hence remember this, and return it
           below. */
        if (r == TEST_LEFT)
                step_back = direction == DIRECTION_UP;

        if (r == TEST_RIGHT) {
                /* The extra entry already sorts after the needle. */
                if (direction == DIRECTION_DOWN)
                        goto found;
                else
                        return 0;
        }

        /* Bisect the chain proper; indices are shifted by the inline slot. */
        r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);

        if (r == 0 && step_back)
                goto found;

        /* Shift the chain-relative index back into list coordinates. */
        if (r > 0 && idx)
                (*idx)++;

        return r;

found:
        r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = extra;

        if (idx)
                *idx = 0;

        return 1;
}
2418
2419 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2420 assert(f);
2421 assert(p > 0);
2422
2423 if (p == needle)
2424 return TEST_FOUND;
2425 else if (p < needle)
2426 return TEST_LEFT;
2427 else
2428 return TEST_RIGHT;
2429 }
2430
2431 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2432 Object *o;
2433 int r;
2434
2435 assert(f);
2436 assert(p > 0);
2437
2438 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2439 if (r < 0)
2440 return r;
2441
2442 if (le64toh(o->entry.seqnum) == needle)
2443 return TEST_FOUND;
2444 else if (le64toh(o->entry.seqnum) < needle)
2445 return TEST_LEFT;
2446 else
2447 return TEST_RIGHT;
2448 }
2449
/* Seeks to the entry with the given sequence number (or the closest one in the
 * given direction), bisecting the file's global entry array. Returns > 0 on
 * success, 0 if no suitable entry exists, negative errno on error. */
int journal_file_move_to_entry_by_seqnum(
                JournalFile *f,
                uint64_t seqnum,
                direction_t direction,
                Object **ret,       /* optional: found entry object */
                uint64_t *offset) { /* optional: its file offset */
        assert(f);
        assert(f->header);

        return generic_array_bisect(f,
                                    le64toh(f->header->entry_array_offset),
                                    le64toh(f->header->n_entries),
                                    seqnum,
                                    test_object_seqnum,
                                    direction,
                                    ret, offset, NULL);
}
2467
2468 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2469 Object *o;
2470 int r;
2471
2472 assert(f);
2473 assert(p > 0);
2474
2475 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2476 if (r < 0)
2477 return r;
2478
2479 if (le64toh(o->entry.realtime) == needle)
2480 return TEST_FOUND;
2481 else if (le64toh(o->entry.realtime) < needle)
2482 return TEST_LEFT;
2483 else
2484 return TEST_RIGHT;
2485 }
2486
/* Seeks to the entry with the given wallclock (CLOCK_REALTIME) timestamp, or the
 * closest one in the given direction, bisecting the file's global entry array.
 * Returns > 0 on success, 0 if no suitable entry exists, negative errno on error. */
int journal_file_move_to_entry_by_realtime(
                JournalFile *f,
                uint64_t realtime,
                direction_t direction,
                Object **ret,       /* optional: found entry object */
                uint64_t *offset) { /* optional: its file offset */
        assert(f);
        assert(f->header);

        return generic_array_bisect(f,
                                    le64toh(f->header->entry_array_offset),
                                    le64toh(f->header->n_entries),
                                    realtime,
                                    test_object_realtime,
                                    direction,
                                    ret, offset, NULL);
}
2504
2505 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2506 Object *o;
2507 int r;
2508
2509 assert(f);
2510 assert(p > 0);
2511
2512 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2513 if (r < 0)
2514 return r;
2515
2516 if (le64toh(o->entry.monotonic) == needle)
2517 return TEST_FOUND;
2518 else if (le64toh(o->entry.monotonic) < needle)
2519 return TEST_LEFT;
2520 else
2521 return TEST_RIGHT;
2522 }
2523
2524 static int find_data_object_by_boot_id(
2525 JournalFile *f,
2526 sd_id128_t boot_id,
2527 Object **o,
2528 uint64_t *b) {
2529
2530 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2531
2532 sd_id128_to_string(boot_id, t + 9);
2533 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2534 }
2535
/* Seeks to the entry with the given monotonic timestamp within the given boot,
 * bisecting the per-boot "_BOOT_ID=" data object's entry list. Returns > 0 on
 * success, 0 if no suitable entry exists, -ENOENT if the boot ID is not known
 * in this file, other negative errno on error. */
int journal_file_move_to_entry_by_monotonic(
                JournalFile *f,
                sd_id128_t boot_id,
                uint64_t monotonic,
                direction_t direction,
                Object **ret,       /* optional: found entry object */
                uint64_t *offset) { /* optional: its file offset */

        Object *o;
        int r;

        assert(f);

        /* Monotonic timestamps are only comparable within one boot, hence scope the
         * bisection to the entries referencing this boot's _BOOT_ID= data object */
        r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
        if (r < 0)
                return r;
        if (r == 0)
                return -ENOENT;

        return generic_array_bisect_plus_one(f,
                                             le64toh(o->data.entry_offset),
                                             le64toh(o->data.entry_array_offset),
                                             le64toh(o->data.n_entries),
                                             monotonic,
                                             test_object_monotonic,
                                             direction,
                                             ret, offset, NULL);
}
2564
2565 void journal_file_reset_location(JournalFile *f) {
2566 f->location_type = LOCATION_HEAD;
2567 f->current_offset = 0;
2568 f->current_seqnum = 0;
2569 f->current_realtime = 0;
2570 f->current_monotonic = 0;
2571 zero(f->current_boot_id);
2572 f->current_xor_hash = 0;
2573 }
2574
2575 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2576 f->location_type = LOCATION_SEEK;
2577 f->current_offset = offset;
2578 f->current_seqnum = le64toh(o->entry.seqnum);
2579 f->current_realtime = le64toh(o->entry.realtime);
2580 f->current_monotonic = le64toh(o->entry.monotonic);
2581 f->current_boot_id = o->entry.boot_id;
2582 f->current_xor_hash = le64toh(o->entry.xor_hash);
2583 }
2584
/* Orders the current cursor positions of two journal files, used to interleave
 * the entries of several files into one globally sorted stream. Both files must
 * have a valid cursor (LOCATION_SEEK). Returns < 0, 0 or > 0, qsort()-style.
 * Comparison cascade: identity (boot id + timestamps + hash), then seqnum (only
 * if both files share a seqnum source), then monotonic time (only within the
 * same boot), then realtime, and finally the content hash as tie breaker. */
int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
        int r;

        assert(af);
        assert(af->header);
        assert(bf);
        assert(bf->header);
        assert(af->location_type == LOCATION_SEEK);
        assert(bf->location_type == LOCATION_SEEK);

        /* If contents and timestamps match, these entries are
         * identical, even if the seqnum does not match */
        if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
            af->current_monotonic == bf->current_monotonic &&
            af->current_realtime == bf->current_realtime &&
            af->current_xor_hash == bf->current_xor_hash)
                return 0;

        if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {

                /* If this is from the same seqnum source, compare
                 * seqnums */
                r = CMP(af->current_seqnum, bf->current_seqnum);
                if (r != 0)
                        return r;

                /* Wow! This is weird, different data but the same
                 * seqnums? Something is borked, but let's make the
                 * best of it and compare by time. */
        }

        if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {

                /* If the boot id matches, compare monotonic time */
                r = CMP(af->current_monotonic, bf->current_monotonic);
                if (r != 0)
                        return r;
        }

        /* Otherwise, compare UTC time */
        r = CMP(af->current_realtime, bf->current_realtime);
        if (r != 0)
                return r;

        /* Finally, compare by contents */
        return CMP(af->current_xor_hash, bf->current_xor_hash);
}
2632
2633 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2634
2635 /* Increase or decrease the specified index, in the right direction. */
2636
2637 if (direction == DIRECTION_DOWN) {
2638 if (*i >= n - 1)
2639 return 0;
2640
2641 (*i) ++;
2642 } else {
2643 if (*i <= 0)
2644 return 0;
2645
2646 (*i) --;
2647 }
2648
2649 return 1;
2650 }
2651
2652 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2653
2654 /* Consider it an error if any of the two offsets is uninitialized */
2655 if (old_offset == 0 || new_offset == 0)
2656 return false;
2657
2658 /* If we go down, the new offset must be larger than the old one. */
2659 return direction == DIRECTION_DOWN ?
2660 new_offset > old_offset :
2661 new_offset < old_offset;
2662 }
2663
/* Finds the entry following (DIRECTION_DOWN) or preceding (DIRECTION_UP) the
 * entry at file offset 'p', or the very first/last entry if p is 0. Entries
 * failing validation (-EBADMSG) are skipped. Returns > 0 on success, 0 when the
 * beginning/end of the file is reached, negative errno on error. */
int journal_file_next_entry(
                JournalFile *f,
                uint64_t p,
                direction_t direction,
                Object **ret, uint64_t *offset) {

        uint64_t i, n, ofs;
        int r;

        assert(f);
        assert(f->header);

        n = le64toh(f->header->n_entries);
        if (n <= 0)
                return 0;

        if (p == 0)
                /* No reference position: start at the head or tail of the file */
                i = direction == DIRECTION_DOWN ? 0 : n - 1;
        else {
                /* Translate the reference offset into an index in the global entry array... */
                r = generic_array_bisect(f,
                                         le64toh(f->header->entry_array_offset),
                                         le64toh(f->header->n_entries),
                                         p,
                                         test_object_offset,
                                         DIRECTION_DOWN,
                                         NULL, NULL,
                                         &i);
                if (r <= 0)
                        return r;

                /* ... and step one entry in the requested direction */
                r = bump_array_index(&i, direction, n);
                if (r <= 0)
                        return r;
        }

        /* And jump to it */
        for (;;) {
                r = generic_array_get(f,
                                      le64toh(f->header->entry_array_offset),
                                      i,
                                      ret, &ofs);
                if (r > 0)
                        break;
                if (r != -EBADMSG)
                        return r;

                /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
                 * the next one might work for us instead. */
                log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);

                r = bump_array_index(&i, direction, n);
                if (r <= 0)
                        return r;
        }

        /* Ensure our array is properly ordered. */
        if (p > 0 && !check_properly_ordered(ofs, p, direction))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "%s: entry array not properly ordered at entry %" PRIu64,
                                       f->path, i);

        if (offset)
                *offset = ofs;

        return 1;
}
2730
2731 int journal_file_next_entry_for_data(
2732 JournalFile *f,
2733 Object *o, uint64_t p,
2734 uint64_t data_offset,
2735 direction_t direction,
2736 Object **ret, uint64_t *offset) {
2737
2738 uint64_t i, n, ofs;
2739 Object *d;
2740 int r;
2741
2742 assert(f);
2743 assert(p > 0 || !o);
2744
2745 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2746 if (r < 0)
2747 return r;
2748
2749 n = le64toh(d->data.n_entries);
2750 if (n <= 0)
2751 return n;
2752
2753 if (!o)
2754 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2755 else {
2756 if (o->object.type != OBJECT_ENTRY)
2757 return -EINVAL;
2758
2759 r = generic_array_bisect_plus_one(f,
2760 le64toh(d->data.entry_offset),
2761 le64toh(d->data.entry_array_offset),
2762 le64toh(d->data.n_entries),
2763 p,
2764 test_object_offset,
2765 DIRECTION_DOWN,
2766 NULL, NULL,
2767 &i);
2768
2769 if (r <= 0)
2770 return r;
2771
2772 r = bump_array_index(&i, direction, n);
2773 if (r <= 0)
2774 return r;
2775 }
2776
2777 for (;;) {
2778 r = generic_array_get_plus_one(f,
2779 le64toh(d->data.entry_offset),
2780 le64toh(d->data.entry_array_offset),
2781 i,
2782 ret, &ofs);
2783 if (r > 0)
2784 break;
2785 if (r != -EBADMSG)
2786 return r;
2787
2788 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2789
2790 r = bump_array_index(&i, direction, n);
2791 if (r <= 0)
2792 return r;
2793 }
2794
2795 /* Ensure our array is properly ordered. */
2796 if (p > 0 && check_properly_ordered(ofs, p, direction))
2797 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2798 "%s data entry array not properly ordered at entry %" PRIu64,
2799 f->path, i);
2800
2801 if (offset)
2802 *offset = ofs;
2803
2804 return 1;
2805 }
2806
/* Within the set of entries referencing the data object at 'data_offset', seeks
 * to the entry at file offset 'p', or the closest one in the given direction.
 * Returns > 0 on success, 0 if no suitable entry exists, negative errno on
 * error. */
int journal_file_move_to_entry_by_offset_for_data(
                JournalFile *f,
                uint64_t data_offset,
                uint64_t p,
                direction_t direction,
                Object **ret, uint64_t *offset) {

        int r;
        Object *d;

        assert(f);

        r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
        if (r < 0)
                return r;

        return generic_array_bisect_plus_one(f,
                                             le64toh(d->data.entry_offset),
                                             le64toh(d->data.entry_array_offset),
                                             le64toh(d->data.n_entries),
                                             p,
                                             test_object_offset,
                                             direction,
                                             ret, offset, NULL);
}
2832
/* Seeks to the entry that both belongs to boot 'boot_id' at (or near, in the
 * given direction) monotonic time 'monotonic' AND references the data object at
 * 'data_offset'. Works by alternately bisecting the boot's entry list and the
 * data object's entry list until both converge on the same entry offset.
 * Returns > 0 on success, 0 if there is no such entry, -ENOENT if the boot ID
 * is not known in this file, other negative errno on error. */
int journal_file_move_to_entry_by_monotonic_for_data(
                JournalFile *f,
                uint64_t data_offset,
                sd_id128_t boot_id,
                uint64_t monotonic,
                direction_t direction,
                Object **ret, uint64_t *offset) {

        Object *o, *d;
        int r;
        uint64_t b, z;

        assert(f);

        /* First, seek by time */
        r = find_data_object_by_boot_id(f, boot_id, &o, &b);
        if (r < 0)
                return r;
        if (r == 0)
                return -ENOENT;

        /* z: candidate entry offset, initially the time match within the boot */
        r = generic_array_bisect_plus_one(f,
                                          le64toh(o->data.entry_offset),
                                          le64toh(o->data.entry_array_offset),
                                          le64toh(o->data.n_entries),
                                          monotonic,
                                          test_object_monotonic,
                                          direction,
                                          NULL, &z, NULL);
        if (r <= 0)
                return r;

        /* And now, continue seeking until we find an entry that
         * exists in both bisection arrays */

        for (;;) {
                Object *qo;
                uint64_t p, q;

                /* Re-resolve the data object each iteration — presumably because the
                 * bisect calls above may have invalidated the mapping the Object
                 * pointer refers to; NOTE(review): confirm against mmap-cache semantics */
                r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
                if (r < 0)
                        return r;

                /* p: closest entry to z (in 'direction') that references our data object */
                r = generic_array_bisect_plus_one(f,
                                                  le64toh(d->data.entry_offset),
                                                  le64toh(d->data.entry_array_offset),
                                                  le64toh(d->data.n_entries),
                                                  z,
                                                  test_object_offset,
                                                  direction,
                                                  NULL, &p, NULL);
                if (r <= 0)
                        return r;

                r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
                if (r < 0)
                        return r;

                /* q: closest entry to p (in 'direction') within the boot's entry list */
                r = generic_array_bisect_plus_one(f,
                                                  le64toh(o->data.entry_offset),
                                                  le64toh(o->data.entry_array_offset),
                                                  le64toh(o->data.n_entries),
                                                  p,
                                                  test_object_offset,
                                                  direction,
                                                  &qo, &q, NULL);

                if (r <= 0)
                        return r;

                /* Both lists agree on the same entry: done */
                if (p == q) {
                        if (ret)
                                *ret = qo;
                        if (offset)
                                *offset = q;

                        return 1;
                }

                z = q;
        }
}
2915
/* Within the set of entries referencing the data object at 'data_offset', seeks
 * to the entry with the given sequence number, or the closest one in the given
 * direction. Returns > 0 on success, 0 if no suitable entry exists, negative
 * errno on error. */
int journal_file_move_to_entry_by_seqnum_for_data(
                JournalFile *f,
                uint64_t data_offset,
                uint64_t seqnum,
                direction_t direction,
                Object **ret, uint64_t *offset) {

        Object *d;
        int r;

        assert(f);

        r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
        if (r < 0)
                return r;

        return generic_array_bisect_plus_one(f,
                                             le64toh(d->data.entry_offset),
                                             le64toh(d->data.entry_array_offset),
                                             le64toh(d->data.n_entries),
                                             seqnum,
                                             test_object_seqnum,
                                             direction,
                                             ret, offset, NULL);
}
2941
/* Within the set of entries referencing the data object at 'data_offset', seeks
 * to the entry with the given wallclock timestamp, or the closest one in the
 * given direction. Returns > 0 on success, 0 if no suitable entry exists,
 * negative errno on error. */
int journal_file_move_to_entry_by_realtime_for_data(
                JournalFile *f,
                uint64_t data_offset,
                uint64_t realtime,
                direction_t direction,
                Object **ret, uint64_t *offset) {

        Object *d;
        int r;

        assert(f);

        r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
        if (r < 0)
                return r;

        return generic_array_bisect_plus_one(f,
                                             le64toh(d->data.entry_offset),
                                             le64toh(d->data.entry_array_offset),
                                             le64toh(d->data.n_entries),
                                             realtime,
                                             test_object_realtime,
                                             direction,
                                             ret, offset, NULL);
}
2967
/* Debugging aid: prints the header and then the type (plus key fields) of every
 * object in the file to stdout, walking the object list sequentially from the
 * end of the header to the tail object. */
void journal_file_dump(JournalFile *f) {
        Object *o;
        int r;
        uint64_t p;

        assert(f);
        assert(f->header);

        journal_file_print_header(f);

        /* Objects start right after the header and are laid out back to back */
        p = le64toh(f->header->header_size);
        while (p != 0) {
                r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
                if (r < 0)
                        goto fail;

                switch (o->object.type) {

                case OBJECT_UNUSED:
                        printf("Type: OBJECT_UNUSED\n");
                        break;

                case OBJECT_DATA:
                        printf("Type: OBJECT_DATA\n");
                        break;

                case OBJECT_FIELD:
                        printf("Type: OBJECT_FIELD\n");
                        break;

                case OBJECT_ENTRY:
                        printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
                               le64toh(o->entry.seqnum),
                               le64toh(o->entry.monotonic),
                               le64toh(o->entry.realtime));
                        break;

                case OBJECT_FIELD_HASH_TABLE:
                        printf("Type: OBJECT_FIELD_HASH_TABLE\n");
                        break;

                case OBJECT_DATA_HASH_TABLE:
                        printf("Type: OBJECT_DATA_HASH_TABLE\n");
                        break;

                case OBJECT_ENTRY_ARRAY:
                        printf("Type: OBJECT_ENTRY_ARRAY\n");
                        break;

                case OBJECT_TAG:
                        printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
                               le64toh(o->tag.seqnum),
                               le64toh(o->tag.epoch));
                        break;

                default:
                        printf("Type: unknown (%i)\n", o->object.type);
                        break;
                }

                if (o->object.flags & OBJECT_COMPRESSION_MASK)
                        printf("Flags: %s\n",
                               object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));

                /* Stop after the tail object; otherwise advance to the next 64bit-aligned object */
                if (p == le64toh(f->header->tail_object_offset))
                        p = 0;
                else
                        p = p + ALIGN64(le64toh(o->object.size));
        }

        return;
fail:
        log_error("File corrupt");
}
3042
3043 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3044 const char *x;
3045
3046 x = format_timestamp(buf, l, t);
3047 if (x)
3048 return x;
3049 return " --- ";
3050 }
3051
/* Prints a human-readable summary of the journal file's header to stdout:
 * identity IDs, flags, sizes, seqnum/timestamp ranges, object counts, hash
 * table fill levels and on-disk size. Used by journal_file_dump() and by
 * "journalctl --header". */
void journal_file_print_header(JournalFile *f) {
        char a[33], b[33], c[33], d[33]; /* sd_id128_to_string() needs 33 bytes each */
        char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
        struct stat st;
        char bytes[FORMAT_BYTES_MAX];

        assert(f);
        assert(f->header);

        /* Note: seqnums and timestamps are printed twice, in decimal/formatted form and
         * in raw hex (the %"PRIx64" duplicates) */
        printf("File Path: %s\n"
               "File ID: %s\n"
               "Machine ID: %s\n"
               "Boot ID: %s\n"
               "Sequential Number ID: %s\n"
               "State: %s\n"
               "Compatible Flags:%s%s\n"
               "Incompatible Flags:%s%s%s\n"
               "Header size: %"PRIu64"\n"
               "Arena size: %"PRIu64"\n"
               "Data Hash Table Size: %"PRIu64"\n"
               "Field Hash Table Size: %"PRIu64"\n"
               "Rotate Suggested: %s\n"
               "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
               "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
               "Head Realtime Timestamp: %s (%"PRIx64")\n"
               "Tail Realtime Timestamp: %s (%"PRIx64")\n"
               "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
               "Objects: %"PRIu64"\n"
               "Entry Objects: %"PRIu64"\n",
               f->path,
               sd_id128_to_string(f->header->file_id, a),
               sd_id128_to_string(f->header->machine_id, b),
               sd_id128_to_string(f->header->boot_id, c),
               sd_id128_to_string(f->header->seqnum_id, d),
               f->header->state == STATE_OFFLINE ? "OFFLINE" :
               f->header->state == STATE_ONLINE ? "ONLINE" :
               f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
               JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
               (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
               JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
               JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
               (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
               le64toh(f->header->header_size),
               le64toh(f->header->arena_size),
               le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
               le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
               yes_no(journal_file_rotate_suggested(f, 0)),
               le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
               le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
               format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
               format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
               format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
               le64toh(f->header->n_objects),
               le64toh(f->header->n_entries));

        /* The following counters only exist in newer header revisions; print them
         * only if the header is actually large enough to contain them */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                printf("Data Objects: %"PRIu64"\n"
                       "Data Hash Table Fill: %.1f%%\n",
                       le64toh(f->header->n_data),
                       100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                printf("Field Objects: %"PRIu64"\n"
                       "Field Hash Table Fill: %.1f%%\n",
                       le64toh(f->header->n_fields),
                       100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
                printf("Tag Objects: %"PRIu64"\n",
                       le64toh(f->header->n_tags));
        if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                printf("Entry Array Objects: %"PRIu64"\n",
                       le64toh(f->header->n_entry_arrays));

        if (fstat(f->fd, &st) >= 0)
                printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
}
3129
3130 static int journal_file_warn_btrfs(JournalFile *f) {
3131 unsigned attrs;
3132 int r;
3133
3134 assert(f);
3135
3136 /* Before we write anything, check if the COW logic is turned
3137 * off on btrfs. Given our write pattern that is quite
3138 * unfriendly to COW file systems this should greatly improve
3139 * performance on COW file systems, such as btrfs, at the
3140 * expense of data integrity features (which shouldn't be too
3141 * bad, given that we do our own checksumming). */
3142
3143 r = btrfs_is_filesystem(f->fd);
3144 if (r < 0)
3145 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3146 if (!r)
3147 return 0;
3148
3149 r = read_attr_fd(f->fd, &attrs);
3150 if (r < 0)
3151 return log_warning_errno(r, "Failed to read file attributes: %m");
3152
3153 if (attrs & FS_NOCOW_FL) {
3154 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3155 return 0;
3156 }
3157
3158 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3159 "This is likely to slow down journal access substantially, please consider turning "
3160 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3161
3162 return 1;
3163 }
3164
3165 int journal_file_open(
3166 int fd,
3167 const char *fname,
3168 int flags,
3169 mode_t mode,
3170 bool compress,
3171 uint64_t compress_threshold_bytes,
3172 bool seal,
3173 JournalMetrics *metrics,
3174 MMapCache *mmap_cache,
3175 Set *deferred_closes,
3176 JournalFile *template,
3177 JournalFile **ret) {
3178
3179 bool newly_created = false;
3180 JournalFile *f;
3181 void *h;
3182 int r;
3183 char bytes[FORMAT_BYTES_MAX];
3184
3185 assert(ret);
3186 assert(fd >= 0 || fname);
3187
3188 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
3189 return -EINVAL;
3190
3191 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3192 return -EINVAL;
3193
3194 f = new(JournalFile, 1);
3195 if (!f)
3196 return -ENOMEM;
3197
3198 *f = (JournalFile) {
3199 .fd = fd,
3200 .mode = mode,
3201
3202 .flags = flags,
3203 .prot = prot_from_flags(flags),
3204 .writable = (flags & O_ACCMODE) != O_RDONLY,
3205
3206 #if HAVE_LZ4
3207 .compress_lz4 = compress,
3208 #elif HAVE_XZ
3209 .compress_xz = compress,
3210 #endif
3211 .compress_threshold_bytes = compress_threshold_bytes == (uint64_t) -1 ?
3212 DEFAULT_COMPRESS_THRESHOLD :
3213 MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
3214 #if HAVE_GCRYPT
3215 .seal = seal,
3216 #endif
3217 };
3218
3219 log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3220 yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
3221 format_bytes(bytes, sizeof(bytes), f->compress_threshold_bytes));
3222
3223 if (mmap_cache)
3224 f->mmap = mmap_cache_ref(mmap_cache);
3225 else {
3226 f->mmap = mmap_cache_new();
3227 if (!f->mmap) {
3228 r = -ENOMEM;
3229 goto fail;
3230 }
3231 }
3232
3233 if (fname) {
3234 f->path = strdup(fname);
3235 if (!f->path) {
3236 r = -ENOMEM;
3237 goto fail;
3238 }
3239 } else {
3240 assert(fd >= 0);
3241
3242 /* If we don't know the path, fill in something explanatory and vaguely useful */
3243 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3244 r = -ENOMEM;
3245 goto fail;
3246 }
3247 }
3248
3249 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3250 if (!f->chain_cache) {
3251 r = -ENOMEM;
3252 goto fail;
3253 }
3254
3255 if (f->fd < 0) {
3256 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3257 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3258 * it doesn't hurt in that case. */
3259
3260 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
3261 if (f->fd < 0) {
3262 r = -errno;
3263 goto fail;
3264 }
3265
3266 /* fds we opened here by us should also be closed by us. */
3267 f->close_fd = true;
3268
3269 r = fd_nonblock(f->fd, false);
3270 if (r < 0)
3271 goto fail;
3272 }
3273
3274 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3275 if (!f->cache_fd) {
3276 r = -ENOMEM;
3277 goto fail;
3278 }
3279
3280 r = journal_file_fstat(f);
3281 if (r < 0)
3282 goto fail;
3283
3284 if (f->last_stat.st_size == 0 && f->writable) {
3285
3286 (void) journal_file_warn_btrfs(f);
3287
3288 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3289 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3290 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3291 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3292 * solely on mtime/atime/ctime of the file. */
3293 (void) fd_setcrtime(f->fd, 0);
3294
3295 #if HAVE_GCRYPT
3296 /* Try to load the FSPRG state, and if we can't, then
3297 * just don't do sealing */
3298 if (f->seal) {
3299 r = journal_file_fss_load(f);
3300 if (r < 0)
3301 f->seal = false;
3302 }
3303 #endif
3304
3305 r = journal_file_init_header(f, template);
3306 if (r < 0)
3307 goto fail;
3308
3309 r = journal_file_fstat(f);
3310 if (r < 0)
3311 goto fail;
3312
3313 newly_created = true;
3314 }
3315
3316 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3317 r = -ENODATA;
3318 goto fail;
3319 }
3320
3321 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
3322 if (r < 0)
3323 goto fail;
3324
3325 f->header = h;
3326
3327 if (!newly_created) {
3328 set_clear_with_destructor(deferred_closes, journal_file_close);
3329
3330 r = journal_file_verify_header(f);
3331 if (r < 0)
3332 goto fail;
3333 }
3334
3335 #if HAVE_GCRYPT
3336 if (!newly_created && f->writable) {
3337 r = journal_file_fss_load(f);
3338 if (r < 0)
3339 goto fail;
3340 }
3341 #endif
3342
3343 if (f->writable) {
3344 if (metrics) {
3345 journal_default_metrics(metrics, f->fd);
3346 f->metrics = *metrics;
3347 } else if (template)
3348 f->metrics = template->metrics;
3349
3350 r = journal_file_refresh_header(f);
3351 if (r < 0)
3352 goto fail;
3353 }
3354
3355 #if HAVE_GCRYPT
3356 r = journal_file_hmac_setup(f);
3357 if (r < 0)
3358 goto fail;
3359 #endif
3360
3361 if (newly_created) {
3362 r = journal_file_setup_field_hash_table(f);
3363 if (r < 0)
3364 goto fail;
3365
3366 r = journal_file_setup_data_hash_table(f);
3367 if (r < 0)
3368 goto fail;
3369
3370 #if HAVE_GCRYPT
3371 r = journal_file_append_first_tag(f);
3372 if (r < 0)
3373 goto fail;
3374 #endif
3375 }
3376
3377 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
3378 r = -EIO;
3379 goto fail;
3380 }
3381
3382 if (template && template->post_change_timer) {
3383 r = journal_file_enable_post_change_timer(
3384 f,
3385 sd_event_source_get_event(template->post_change_timer),
3386 template->post_change_timer_period);
3387
3388 if (r < 0)
3389 goto fail;
3390 }
3391
3392 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3393 f->close_fd = true;
3394
3395 *ret = f;
3396 return 0;
3397
3398 fail:
3399 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
3400 r = -EIO;
3401
3402 (void) journal_file_close(f);
3403
3404 return r;
3405 }
3406
/* Renames the journal file to its archived name
 * ("<prefix>@<seqnum-id>-<head-seqnum>-<head-realtime>.journal") and marks the
 * in-memory object so that offlining commits STATE_ARCHIVED and the file is
 * defragmented on close. The JournalFile keeps referring to the renamed file
 * via its open fd. Returns 0 on success, negative errno on error. */
int journal_file_archive(JournalFile *f) {
        _cleanup_free_ char *p = NULL;

        assert(f);

        if (!f->writable)
                return -EINVAL;

        /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
         * rotation, since we don't know the actual path, and couldn't rename the file hence. */
        if (path_startswith(f->path, "/proc/self/fd"))
                return -EINVAL;

        if (!endswith(f->path, ".journal"))
                return -EINVAL;

        /* The "%.*s" strips the 8-character ".journal" suffix off the current path */
        if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
                     (int) strlen(f->path) - 8, f->path,
                     SD_ID128_FORMAT_VAL(f->header->seqnum_id),
                     le64toh(f->header->head_entry_seqnum),
                     le64toh(f->header->head_entry_realtime)) < 0)
                return -ENOMEM;

        /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
         * ignore that case. */
        if (rename(f->path, p) < 0 && errno != ENOENT)
                return -errno;

        /* Sync the rename to disk */
        (void) fsync_directory_of_file(f->fd);

        /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
         * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
         * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue
         * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
         * occurs. */
        f->archive = true;

        /* Currently, btrfs is not very good with our write patterns and fragments heavily. Let's defrag our journal
         * files when we archive them */
        f->defrag_on_close = true;

        return 0;
}
3451
3452 JournalFile* journal_initiate_close(
3453 JournalFile *f,
3454 Set *deferred_closes) {
3455
3456 int r;
3457
3458 assert(f);
3459
3460 if (deferred_closes) {
3461
3462 r = set_put(deferred_closes, f);
3463 if (r < 0)
3464 log_debug_errno(r, "Failed to add file to deferred close set, closing immediately.");
3465 else {
3466 (void) journal_file_set_offline(f, false);
3467 return NULL;
3468 }
3469 }
3470
3471 return journal_file_close(f);
3472 }
3473
/* Rotates the journal file *f: archives (renames) the current file and opens a
 * fresh one under the original path, using *f as template. On return *f points
 * to the new file, or to NULL if opening it failed — note that the old file is
 * closed (or queued in 'deferred_closes') even in the failure case. Returns 0
 * on success, negative errno on error. */
int journal_file_rotate(
                JournalFile **f,
                bool compress,
                uint64_t compress_threshold_bytes,
                bool seal,
                Set *deferred_closes) {

        JournalFile *new_file = NULL;
        int r;

        assert(f);
        assert(*f);

        r = journal_file_archive(*f);
        if (r < 0)
                return r;

        r = journal_file_open(
                        -1,
                        (*f)->path,
                        (*f)->flags,
                        (*f)->mode,
                        compress,
                        compress_threshold_bytes,
                        seal,
                        NULL, /* metrics */
                        (*f)->mmap,
                        deferred_closes,
                        *f, /* template */
                        &new_file);

        /* Close (or defer-close) the old file regardless of whether opening the new one worked */
        journal_initiate_close(*f, deferred_closes);
        *f = new_file;

        return r;
}
3510
/* Renames the journal file 'fname' (relative to 'dir_fd') to
 * "<prefix>@<realtime>-<random>.journal~", marking it as corrupted/uncleanly
 * shut down, then best-effort disables COW and defragments it. Returns 0 on
 * success, negative errno on error (defragmentation failures are only logged). */
int journal_file_dispose(int dir_fd, const char *fname) {
        _cleanup_free_ char *p = NULL;
        _cleanup_close_ int fd = -1;

        assert(fname);

        /* Renames a journal file to *.journal~, i.e. to mark it as corrupted or otherwise uncleanly shutdown. Note
         * that this is done without looking into the file or changing any of its contents. The idea is that this is
         * called whenever something is suspicious and we want to move the file away and make clear that it is not
         * accessed for writing anymore. */

        if (!endswith(fname, ".journal"))
                return -EINVAL;

        /* The "%.*s" strips the 8-character ".journal" suffix off the file name */
        if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
                     (int) strlen(fname) - 8, fname,
                     now(CLOCK_REALTIME),
                     random_u64()) < 0)
                return -ENOMEM;

        if (renameat(dir_fd, fname, dir_fd, p) < 0)
                return -errno;

        /* btrfs doesn't cope well with our write pattern and fragments heavily. Let's defrag all files we rotate */
        fd = openat(dir_fd, p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
        if (fd < 0)
                log_debug_errno(errno, "Failed to open file for defragmentation/FS_NOCOW_FL, ignoring: %m");
        else {
                (void) chattr_fd(fd, 0, FS_NOCOW_FL, NULL);
                (void) btrfs_defrag_fd(fd);
        }

        return 0;
}
3545
/* Like journal_file_open(), but if the file turns out to be corrupted or was
 * shut down uncleanly, moves it out of the way (to *.journal~ via
 * journal_file_dispose()) and retries the open exactly once — only when opening
 * writable with O_CREAT on a *.journal path. Parameters and return convention
 * are those of journal_file_open(). */
int journal_file_open_reliably(
                const char *fname,
                int flags,
                mode_t mode,
                bool compress,
                uint64_t compress_threshold_bytes,
                bool seal,
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                Set *deferred_closes,
                JournalFile *template,
                JournalFile **ret) {

        int r;

        r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
                              deferred_closes, template, ret);
        /* Only the following errors indicate a (possibly recoverable) bad file; anything
         * else is passed straight through */
        if (!IN_SET(r,
                    -EBADMSG,           /* Corrupted */
                    -ENODATA,           /* Truncated */
                    -EHOSTDOWN,         /* Other machine */
                    -EPROTONOSUPPORT,   /* Incompatible feature */
                    -EBUSY,             /* Unclean shutdown */
                    -ESHUTDOWN,         /* Already archived */
                    -EIO,               /* IO error, including SIGBUS on mmap */
                    -EIDRM,             /* File has been deleted */
                    -ETXTBSY))          /* File is from the future */
                return r;

        if ((flags & O_ACCMODE) == O_RDONLY)
                return r;

        if (!(flags & O_CREAT))
                return r;

        if (!endswith(fname, ".journal"))
                return r;

        /* The file is corrupted. Rotate it away and try it again (but only once) */
        log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);

        r = journal_file_dispose(AT_FDCWD, fname);
        if (r < 0)
                return r;

        return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
                                 deferred_closes, template, ret);
}
3594
/* Copies the entry object 'o' (located at offset 'p' in 'from') into the journal file 'to', duplicating
 * each of its data objects and then appending a new entry referencing the copies. Returns 0 or a positive
 * value on success, -EPERM if 'to' is not writable, -EBADMSG on hash mismatch (corruption), -E2BIG if a
 * data payload does not fit in a size_t on this architecture, or another negative errno from the helpers. */
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
        uint64_t i, n;
        uint64_t q, xor_hash = 0;
        int r;
        EntryItem *items;
        dual_timestamp ts;
        const sd_id128_t *boot_id;

        assert(from);
        assert(to);
        assert(o);
        assert(p); /* offset of the entry object 'o' inside 'from'; 0 is never a valid object offset */

        if (!to->writable)
                return -EPERM;

        /* Preserve the source entry's timestamps and boot ID in the copy. */
        ts.monotonic = le64toh(o->entry.monotonic);
        ts.realtime = le64toh(o->entry.realtime);
        boot_id = &o->entry.boot_id;

        n = journal_file_entry_n_items(o);
        /* alloca() can't take 0, hence let's allocate at least one */
        /* NOTE(review): 'n' is read from the source file; an attacker-sized value would make this stack
         * allocation arbitrarily large — TODO confirm callers only pass verified files here. */
        items = newa(EntryItem, MAX(1u, n));

        for (i = 0; i < n; i++) {
                uint64_t l, h;
                le64_t le_hash;
                size_t t;
                void *data;
                Object *u;

                /* Resolve the i-th data object the entry references. Note that this re-points 'o' at the
                 * data object; it is re-resolved as the entry again at the bottom of the loop. */
                q = le64toh(o->entry.items[i].object_offset);
                le_hash = o->entry.items[i].hash;

                r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
                if (r < 0)
                        return r;

                /* The hash stored in the entry item must match the data object's own hash. */
                if (le_hash != o->data.hash)
                        return -EBADMSG;

                l = le64toh(o->object.size) - offsetof(Object, data.payload);
                t = (size_t) l;

                /* We hit the limit on 32bit machines */
                if ((uint64_t) t != l)
                        return -E2BIG;

                if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if HAVE_XZ || HAVE_LZ4
                        size_t rsize = 0;

                        /* Decompress into the source file's scratch buffer; append the plain payload. */
                        r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
                                            o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
                        if (r < 0)
                                return r;

                        data = from->compress_buffer;
                        l = rsize;
#else
                        return -EPROTONOSUPPORT;
#endif
                } else
                        data = o->data.payload;

                r = journal_file_append_data(to, data, l, &u, &h);
                if (r < 0)
                        return r;

                /* Accumulate the per-entry XOR hash and record the new object's offset/hash. */
                xor_hash ^= le64toh(u->data.hash);
                items[i].object_offset = htole64(h);
                items[i].hash = u->data.hash;

                /* Re-resolve the entry object at 'p' before the next iteration — the append above may have
                 * recycled the mmap window 'o' pointed into (presumably; the caches are shared — confirm). */
                r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
                if (r < 0)
                        return r;
        }

        r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n,
                                               NULL, NULL, NULL);

        /* If we took a SIGBUS while touching the target's mapping, report it as an I/O error. */
        if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
                return -EIO;

        return r;
}
3681
3682 void journal_reset_metrics(JournalMetrics *m) {
3683 assert(m);
3684
3685 /* Set everything to "pick automatic values". */
3686
3687 *m = (JournalMetrics) {
3688 .min_use = (uint64_t) -1,
3689 .max_use = (uint64_t) -1,
3690 .min_size = (uint64_t) -1,
3691 .max_size = (uint64_t) -1,
3692 .keep_free = (uint64_t) -1,
3693 .n_max_files = (uint64_t) -1,
3694 };
3695 }
3696
/* Replaces every "automatic" field in *m (marked (uint64_t) -1, see journal_reset_metrics()) with a
 * concrete value derived from the size of the file system backing 'fd', and clamps explicitly configured
 * values to sane bounds. The clamp order below is deliberate: max_use is fixed first, max_size is derived
 * from it but may in turn raise max_use, and min_size may raise max_size — do not reorder. */
void journal_default_metrics(JournalMetrics *m, int fd) {
        char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
        struct statvfs ss;
        uint64_t fs_size;

        assert(m);
        assert(fd >= 0);

        /* Total size of the backing file system; 0 means "unknown", triggering fixed defaults below. */
        if (fstatvfs(fd, &ss) >= 0)
                fs_size = ss.f_frsize * ss.f_blocks;
        else {
                log_debug_errno(errno, "Failed to determine disk size: %m");
                fs_size = 0;
        }

        if (m->max_use == (uint64_t) -1) {

                if (fs_size > 0) {
                        m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */

                        if (m->max_use > DEFAULT_MAX_USE_UPPER)
                                m->max_use = DEFAULT_MAX_USE_UPPER;

                        if (m->max_use < DEFAULT_MAX_USE_LOWER)
                                m->max_use = DEFAULT_MAX_USE_LOWER;
                } else
                        m->max_use = DEFAULT_MAX_USE_LOWER;
        } else {
                /* Explicitly configured: page-align, and (unless 0 = unlimited) ensure room for at least
                 * two minimum-sized journal files. */
                m->max_use = PAGE_ALIGN(m->max_use);

                if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
                        m->max_use = JOURNAL_FILE_SIZE_MIN*2;
        }

        if (m->min_use == (uint64_t) -1)
                m->min_use = DEFAULT_MIN_USE;

        /* min_use may never exceed max_use. */
        if (m->min_use > m->max_use)
                m->min_use = m->max_use;

        if (m->max_size == (uint64_t) -1) {
                m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */

                if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
                        m->max_size = DEFAULT_MAX_SIZE_UPPER;
        } else
                m->max_size = PAGE_ALIGN(m->max_size);

        /* max_size == 0 means "no per-file limit"; otherwise enforce the minimum file size and make sure
         * max_use can hold at least two files of max_size (note: this may raise max_use again). */
        if (m->max_size != 0) {
                if (m->max_size < JOURNAL_FILE_SIZE_MIN)
                        m->max_size = JOURNAL_FILE_SIZE_MIN;

                if (m->max_use != 0 && m->max_size*2 > m->max_use)
                        m->max_use = m->max_size*2;
        }

        if (m->min_size == (uint64_t) -1)
                m->min_size = JOURNAL_FILE_SIZE_MIN;
        else {
                m->min_size = PAGE_ALIGN(m->min_size);

                if (m->min_size < JOURNAL_FILE_SIZE_MIN)
                        m->min_size = JOURNAL_FILE_SIZE_MIN;

                /* min_size may push max_size up, but never above an unlimited (0) max_size. */
                if (m->max_size != 0 && m->min_size > m->max_size)
                        m->max_size = m->min_size;
        }

        if (m->keep_free == (uint64_t) -1) {

                if (fs_size > 0) {
                        m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */

                        if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
                                m->keep_free = DEFAULT_KEEP_FREE_UPPER;

                } else
                        m->keep_free = DEFAULT_KEEP_FREE;
        }

        if (m->n_max_files == (uint64_t) -1)
                m->n_max_files = DEFAULT_N_MAX_FILES;

        log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
                  format_bytes(a, sizeof(a), m->min_use),
                  format_bytes(b, sizeof(b), m->max_use),
                  format_bytes(c, sizeof(c), m->max_size),
                  format_bytes(d, sizeof(d), m->min_size),
                  format_bytes(e, sizeof(e), m->keep_free),
                  m->n_max_files);
}
3788
3789 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3790 assert(f);
3791 assert(f->header);
3792 assert(from || to);
3793
3794 if (from) {
3795 if (f->header->head_entry_realtime == 0)
3796 return -ENOENT;
3797
3798 *from = le64toh(f->header->head_entry_realtime);
3799 }
3800
3801 if (to) {
3802 if (f->header->tail_entry_realtime == 0)
3803 return -ENOENT;
3804
3805 *to = le64toh(f->header->tail_entry_realtime);
3806 }
3807
3808 return 1;
3809 }
3810
/* Reports the monotonic timestamps of the first and/or last entry recorded for the given boot ID (either
 * output pointer may be NULL, but not both). Returns 1 on success, 0 if the boot ID is unknown to this
 * file or has no entries, a negative errno on error. */
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
        Object *o;
        uint64_t p;
        int r;

        assert(f);
        assert(from || to);

        /* Locate the data object indexing this boot ID; 'p' is its offset, used below to re-resolve it. */
        r = find_data_object_by_boot_id(f, boot_id, &o, &p);
        if (r <= 0)
                return r;

        if (le64toh(o->data.n_entries) <= 0)
                return 0;

        if (from) {
                /* The data object's entry_offset points at the first entry referencing it. Note that this
                 * re-points 'o' at that entry object. */
                r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
                if (r < 0)
                        return r;

                *from = le64toh(o->entry.monotonic);
        }

        if (to) {
                /* 'o' may have been re-pointed above; resolve the data object at 'p' again before reading
                 * its entry array. */
                r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
                if (r < 0)
                        return r;

                /* Fetch the last (n_entries-1) entry referencing this data object. */
                r = generic_array_get_plus_one(f,
                                               le64toh(o->data.entry_offset),
                                               le64toh(o->data.entry_array_offset),
                                               le64toh(o->data.n_entries)-1,
                                               &o, NULL);
                if (r <= 0)
                        return r;

                *to = le64toh(o->entry.monotonic);
        }

        return 1;
}
3852
3853 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3854 assert(f);
3855 assert(f->header);
3856
3857 /* If we gained new header fields we gained new features,
3858 * hence suggest a rotation */
3859 if (le64toh(f->header->header_size) < sizeof(Header)) {
3860 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3861 return true;
3862 }
3863
3864 /* Let's check if the hash tables grew over a certain fill
3865 * level (75%, borrowing this value from Java's hash table
3866 * implementation), and if so suggest a rotation. To calculate
3867 * the fill level we need the n_data field, which only exists
3868 * in newer versions. */
3869
3870 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3871 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3872 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3873 f->path,
3874 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3875 le64toh(f->header->n_data),
3876 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3877 (unsigned long long) f->last_stat.st_size,
3878 f->last_stat.st_size / le64toh(f->header->n_data));
3879 return true;
3880 }
3881
3882 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3883 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3884 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3885 f->path,
3886 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3887 le64toh(f->header->n_fields),
3888 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3889 return true;
3890 }
3891
3892 /* Are the data objects properly indexed by field objects? */
3893 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3894 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3895 le64toh(f->header->n_data) > 0 &&
3896 le64toh(f->header->n_fields) == 0)
3897 return true;
3898
3899 if (max_file_usec > 0) {
3900 usec_t t, h;
3901
3902 h = le64toh(f->header->head_entry_realtime);
3903 t = now(CLOCK_REALTIME);
3904
3905 if (h > 0 && t > h + max_file_usec)
3906 return true;
3907 }
3908
3909 return false;
3910 }