git.ipfire.org Git mirror of thirdparty/systemd.git, blob: src/journal/journal-file.c
journal: allow boot_id to be passed to journal_append_entry()
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
6 ***/
7
8 #include <errno.h>
9 #include <fcntl.h>
10 #include <linux/fs.h>
11 #include <pthread.h>
12 #include <stddef.h>
13 #include <sys/mman.h>
14 #include <sys/statvfs.h>
15 #include <sys/uio.h>
16 #include <unistd.h>
17
18 #include "alloc-util.h"
19 #include "btrfs-util.h"
20 #include "chattr-util.h"
21 #include "compress.h"
22 #include "fd-util.h"
23 #include "fs-util.h"
24 #include "journal-authenticate.h"
25 #include "journal-def.h"
26 #include "journal-file.h"
27 #include "lookup3.h"
28 #include "parse-util.h"
29 #include "path-util.h"
30 #include "random-util.h"
31 #include "sd-event.h"
32 #include "set.h"
33 #include "stat-util.h"
34 #include "string-util.h"
35 #include "strv.h"
36 #include "xattr-util.h"
37
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
40
41 #define DEFAULT_COMPRESS_THRESHOLD (512ULL)
42 #define MIN_COMPRESS_THRESHOLD (8ULL)
43
44 /* This is the minimum journal file size */
45 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
46
47 /* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51
52 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
53 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
54
55 /* This is the upper bound if we deduce max_size from max_use */
56 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57
58 /* This is the upper bound if we deduce the keep_free value from the
59 * file system size */
60 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61
62 /* This is the keep_free value when we can't determine the system
63 * size */
64 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
65
66 /* This is the default maximum number of journal files to keep around. */
67 #define DEFAULT_N_MAX_FILES (100)
68
69 /* n_data was the first entry we added after the initial file format design */
70 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
71
72 /* How many entries to keep in the entry array chain cache at max */
73 #define CHAIN_CACHE_MAX 20
74
75 /* How much to increase the journal file size at once each time we allocate something new. */
76 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
77
78 /* Reread fstat() of the file for detecting deletions at least this often */
79 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
80
81 /* The mmap context to use for the header: we pick the one right above the last defined object type */
82 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
83
84 #ifdef __clang__
85 # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
86 #endif
87
88 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
89 * As a result we use atomic operations on f->offline_state for inter-thread communications with
90 * journal_file_set_offline() and journal_file_set_online(). */
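/* Overview of the state machine implemented below: a regular offline pass walks
 * OFFLINE_SYNCING -> fsync() -> OFFLINE_OFFLINING -> header state set to
 * STATE_ARCHIVED/STATE_OFFLINE and fsync()'ed again -> OFFLINE_DONE. OFFLINE_CANCEL
 * short-circuits to OFFLINE_DONE without touching the header, and the two
 * OFFLINE_AGAIN_* states restart the pass from OFFLINE_SYNCING. */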
91 static void journal_file_set_offline_internal(JournalFile *f) {
92 assert(f);
93 assert(f->fd >= 0);
94 assert(f->header);
95
96 for (;;) {
97 switch (f->offline_state) {
98 case OFFLINE_CANCEL:
99 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
100 continue;
101 return;
102
103 case OFFLINE_AGAIN_FROM_SYNCING:
104 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
105 continue;
106 break;
107
108 case OFFLINE_AGAIN_FROM_OFFLINING:
109 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
110 continue;
111 break;
112
113 case OFFLINE_SYNCING:
114 (void) fsync(f->fd);
115
116 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
117 continue;
118
119 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
120 (void) fsync(f->fd);
121 break;
122
123 case OFFLINE_OFFLINING:
124 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
125 continue;
126 _fallthrough_;
127 case OFFLINE_DONE:
128 return;
129
130 case OFFLINE_JOINED:
131 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
132 return;
133 }
134 }
135 }
136
137 static void * journal_file_set_offline_thread(void *arg) {
138 JournalFile *f = arg;
139
140 (void) pthread_setname_np(pthread_self(), "journal-offline");
141
142 journal_file_set_offline_internal(f);
143
144 return NULL;
145 }
146
147 static int journal_file_set_offline_thread_join(JournalFile *f) {
148 int r;
149
150 assert(f);
151
152 if (f->offline_state == OFFLINE_JOINED)
153 return 0;
154
155 r = pthread_join(f->offline_thread, NULL);
156 if (r)
157 return -r;
158
159 f->offline_state = OFFLINE_JOINED;
160
161 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
162 return -EIO;
163
164 return 0;
165 }
166
167 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
168 static bool journal_file_set_offline_try_restart(JournalFile *f) {
169 for (;;) {
170 switch (f->offline_state) {
171 case OFFLINE_AGAIN_FROM_SYNCING:
172 case OFFLINE_AGAIN_FROM_OFFLINING:
173 return true;
174
175 case OFFLINE_CANCEL:
176 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
177 continue;
178 return true;
179
180 case OFFLINE_SYNCING:
181 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
182 continue;
183 return true;
184
185 case OFFLINE_OFFLINING:
186 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
187 continue;
188 return true;
189
190 default:
191 return false;
192 }
193 }
194 }
195
196 /* Sets a journal offline.
197 *
198 * If wait is false then an offline is dispatched in a separate thread for a
199 * subsequent journal_file_set_offline() or journal_file_set_online() of the
200 * same journal to synchronize with.
201 *
202 * If wait is true, then either an existing offline thread will be restarted
203 * and joined, or if none exists the offline is simply performed in this
204 * context without involving another thread.
205 */
206 int journal_file_set_offline(JournalFile *f, bool wait) {
207 bool restarted;
208 int r;
209
210 assert(f);
211
212 if (!f->writable)
213 return -EPERM;
214
215 if (!(f->fd >= 0 && f->header))
216 return -EINVAL;
217
218 /* An offlining journal is implicitly online and may modify f->header->state;
219 * we must also join any potentially lingering offline thread when not online. */
220 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
221 return journal_file_set_offline_thread_join(f);
222
223 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
224 restarted = journal_file_set_offline_try_restart(f);
225 if ((restarted && wait) || !restarted) {
226 r = journal_file_set_offline_thread_join(f);
227 if (r < 0)
228 return r;
229 }
230
231 if (restarted)
232 return 0;
233
234 /* Initiate a new offline. */
235 f->offline_state = OFFLINE_SYNCING;
236
237 if (wait) /* Without using a thread if waiting. */
238 journal_file_set_offline_internal(f);
239 else {
240 sigset_t ss, saved_ss;
241 int k;
242
243 if (sigfillset(&ss) < 0)
244 return -errno;
245
246 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
247 if (r > 0)
248 return -r;
249
250 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
251
252 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
253 if (r > 0) {
254 f->offline_state = OFFLINE_JOINED;
255 return -r;
256 }
257 if (k > 0)
258 return -k;
259 }
260
261 return 0;
262 }
263
264 static int journal_file_set_online(JournalFile *f) {
265 bool wait = true;
266
267 assert(f);
268
269 if (!f->writable)
270 return -EPERM;
271
272 if (!(f->fd >= 0 && f->header))
273 return -EINVAL;
274
275 while (wait) {
276 switch (f->offline_state) {
277 case OFFLINE_JOINED:
278 /* No offline thread, no need to wait. */
279 wait = false;
280 break;
281
282 case OFFLINE_SYNCING:
283 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
284 continue;
285 /* Canceled syncing prior to offlining, no need to wait. */
286 wait = false;
287 break;
288
289 case OFFLINE_AGAIN_FROM_SYNCING:
290 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
291 continue;
292 /* Canceled restart from syncing, no need to wait. */
293 wait = false;
294 break;
295
296 case OFFLINE_AGAIN_FROM_OFFLINING:
297 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
298 continue;
299 /* Canceled restart from offlining, must wait for offlining to complete however. */
300 _fallthrough_;
301 default: {
302 int r;
303
304 r = journal_file_set_offline_thread_join(f);
305 if (r < 0)
306 return r;
307
308 wait = false;
309 break;
310 }
311 }
312 }
313
314 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
315 return -EIO;
316
317 switch (f->header->state) {
318 case STATE_ONLINE:
319 return 0;
320
321 case STATE_OFFLINE:
322 f->header->state = STATE_ONLINE;
323 (void) fsync(f->fd);
324 return 0;
325
326 default:
327 return -EINVAL;
328 }
329 }
330
331 bool journal_file_is_offlining(JournalFile *f) {
332 assert(f);
333
334 __sync_synchronize();
335
336 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
337 return false;
338
339 return true;
340 }
341
342 JournalFile* journal_file_close(JournalFile *f) {
343 assert(f);
344
345 #if HAVE_GCRYPT
346 /* Write the final tag */
347 if (f->seal && f->writable) {
348 int r;
349
350 r = journal_file_append_tag(f);
351 if (r < 0)
352 log_error_errno(r, "Failed to append tag when closing journal: %m");
353 }
354 #endif
355
356 if (f->post_change_timer) {
357 int enabled;
358
359 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
360 if (enabled == SD_EVENT_ONESHOT)
361 journal_file_post_change(f);
362
363 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
364 sd_event_source_unref(f->post_change_timer);
365 }
366
367 journal_file_set_offline(f, true);
368
369 if (f->mmap && f->cache_fd)
370 mmap_cache_free_fd(f->mmap, f->cache_fd);
371
372 if (f->fd >= 0 && f->defrag_on_close) {
373
374 /* Be friendly to btrfs: turn COW back on again now,
375 * and defragment the file. We won't write to the file
376 * ever again, hence remove all fragmentation, and
377 * reenable all the good bits COW usually provides
378 * (such as data checksumming). */
379
380 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
381 (void) btrfs_defrag_fd(f->fd);
382 }
383
384 if (f->close_fd)
385 safe_close(f->fd);
386 free(f->path);
387
388 mmap_cache_unref(f->mmap);
389
390 ordered_hashmap_free_free(f->chain_cache);
391
392 #if HAVE_XZ || HAVE_LZ4
393 free(f->compress_buffer);
394 #endif
395
396 #if HAVE_GCRYPT
397 if (f->fss_file)
398 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
399 else
400 free(f->fsprg_state);
401
402 free(f->fsprg_seed);
403
404 if (f->hmac)
405 gcry_md_close(f->hmac);
406 #endif
407
408 return mfree(f);
409 }
410
411 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
412 Header h = {};
413 ssize_t k;
414 int r;
415
416 assert(f);
417
418 memcpy(h.signature, HEADER_SIGNATURE, 8);
419 h.header_size = htole64(ALIGN64(sizeof(h)));
420
421 h.incompatible_flags |= htole32(
422 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
423 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
424
425 h.compatible_flags = htole32(
426 f->seal * HEADER_COMPATIBLE_SEALED);
427
428 r = sd_id128_randomize(&h.file_id);
429 if (r < 0)
430 return r;
431
432 if (template) {
433 h.seqnum_id = template->header->seqnum_id;
434 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
435 } else
436 h.seqnum_id = h.file_id;
437
438 k = pwrite(f->fd, &h, sizeof(h), 0);
439 if (k < 0)
440 return -errno;
441
442 if (k != sizeof(h))
443 return -EIO;
444
445 return 0;
446 }
447
448 static int journal_file_refresh_header(JournalFile *f) {
449 sd_id128_t boot_id;
450 int r;
451
452 assert(f);
453 assert(f->header);
454
455 r = sd_id128_get_machine(&f->header->machine_id);
456 if (IN_SET(r, -ENOENT, -ENOMEDIUM))
457 /* We don't have a machine-id, let's continue without */
458 zero(f->header->machine_id);
459 else if (r < 0)
460 return r;
461
462 r = sd_id128_get_boot(&boot_id);
463 if (r < 0)
464 return r;
465
466 f->header->boot_id = boot_id;
467
468 r = journal_file_set_online(f);
469
470 /* Sync the online state to disk */
471 (void) fsync(f->fd);
472
473 /* We likely just created a new file, also sync the directory this file is located in. */
474 (void) fsync_directory_of_file(f->fd);
475
476 return r;
477 }
478
479 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
480 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
481 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
482 const char *type = compatible ? "compatible" : "incompatible";
483 uint32_t flags;
484
485 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
486
487 if (flags & ~supported) {
488 if (flags & ~any)
489 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
490 f->path, type, flags & ~any);
491 flags = (flags & any) & ~supported;
492 if (flags) {
493 const char* strv[3];
494 unsigned n = 0;
495 _cleanup_free_ char *t = NULL;
496
497 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
498 strv[n++] = "sealed";
499 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
500 strv[n++] = "xz-compressed";
501 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
502 strv[n++] = "lz4-compressed";
503 strv[n] = NULL;
504 assert(n < ELEMENTSOF(strv));
505
506 t = strv_join((char**) strv, ", ");
507 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
508 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
509 }
510 return true;
511 }
512
513 return false;
514 }
515
516 static int journal_file_verify_header(JournalFile *f) {
517 uint64_t arena_size, header_size;
518
519 assert(f);
520 assert(f->header);
521
522 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
523 return -EBADMSG;
524
525 /* In both read and write mode we refuse to open files with incompatible
526 * flags we don't support. */
527 if (warn_wrong_flags(f, false))
528 return -EPROTONOSUPPORT;
529
530 /* When opened for writing we refuse files with unsupported compatible flags, too. */
531 if (f->writable && warn_wrong_flags(f, true))
532 return -EPROTONOSUPPORT;
533
534 if (f->header->state >= _STATE_MAX)
535 return -EBADMSG;
536
537 header_size = le64toh(f->header->header_size);
538
539 /* The first addition was n_data, so check that we are at least this large */
540 if (header_size < HEADER_SIZE_MIN)
541 return -EBADMSG;
542
543 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
544 return -EBADMSG;
545
546 arena_size = le64toh(f->header->arena_size);
547
548 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
549 return -ENODATA;
550
551 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
552 return -ENODATA;
553
554 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
555 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
556 !VALID64(le64toh(f->header->tail_object_offset)) ||
557 !VALID64(le64toh(f->header->entry_array_offset)))
558 return -ENODATA;
559
560 if (f->writable) {
561 sd_id128_t machine_id;
562 uint8_t state;
563 int r;
564
565 r = sd_id128_get_machine(&machine_id);
566 if (r < 0)
567 return r;
568
569 if (!sd_id128_equal(machine_id, f->header->machine_id))
570 return -EHOSTDOWN;
571
572 state = f->header->state;
573
574 if (state == STATE_ARCHIVED)
575 return -ESHUTDOWN; /* Already archived */
576 else if (state == STATE_ONLINE) {
577 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
578 return -EBUSY;
579 } else if (state != STATE_OFFLINE) {
580 log_debug("Journal file %s has unknown state %i.", f->path, state);
581 return -EBUSY;
582 }
583
584 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
585 return -EBADMSG;
586
587 /* Don't permit appending to files from the future, since otherwise the realtime timestamps wouldn't
588 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
589 * bisection. */
590 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
591 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
592 return -ETXTBSY;
593 }
594 }
595
596 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
597 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
598
599 f->seal = JOURNAL_HEADER_SEALED(f->header);
600
601 return 0;
602 }
603
604 static int journal_file_fstat(JournalFile *f) {
605 int r;
606
607 assert(f);
608 assert(f->fd >= 0);
609
610 if (fstat(f->fd, &f->last_stat) < 0)
611 return -errno;
612
613 f->last_stat_usec = now(CLOCK_MONOTONIC);
614
615 /* Refuse dealing with files that aren't regular */
616 r = stat_verify_regular(&f->last_stat);
617 if (r < 0)
618 return r;
619
620 /* Refuse appending to files that are already deleted */
621 if (f->last_stat.st_nlink <= 0)
622 return -EIDRM;
623
624 return 0;
625 }
626
627 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
628 uint64_t old_size, new_size;
629 int r;
630
631 assert(f);
632 assert(f->header);
633
634 /* We assume that this file is not sparse, and we know that
635 * for sure, since we always call posix_fallocate()
636 * ourselves */
637
638 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
639 return -EIO;
640
641 old_size =
642 le64toh(f->header->header_size) +
643 le64toh(f->header->arena_size);
644
645 new_size = PAGE_ALIGN(offset + size);
646 if (new_size < le64toh(f->header->header_size))
647 new_size = le64toh(f->header->header_size);
648
649 if (new_size <= old_size) {
650
651 /* We already pre-allocated enough space, but before
652 * we write to it, let's check with fstat() if the
653 * file got deleted, in order to make sure we don't throw
654 * away the data immediately. Don't check fstat() for
655 * all writes though, but at most once every 5s (LAST_STAT_REFRESH_USEC). */
656
657 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
658 return 0;
659
660 return journal_file_fstat(f);
661 }
662
663 /* Allocate more space. */
664
665 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
666 return -E2BIG;
667
668 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
669 struct statvfs svfs;
670
671 if (fstatvfs(f->fd, &svfs) >= 0) {
672 uint64_t available;
673
674 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
675
676 if (new_size - old_size > available)
677 return -E2BIG;
678 }
679 }
680
681 /* Increase by larger blocks at once */
682 new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
683 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
684 new_size = f->metrics.max_size;
685
686 /* Note that the glibc fallocate() fallback is very
687 inefficient, hence we try to minimize the allocation area
688 as much as we can. */
689 r = posix_fallocate(f->fd, old_size, new_size - old_size);
690 if (r != 0)
691 return -r;
692
693 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
694
695 return journal_file_fstat(f);
696 }
697
698 static unsigned type_to_context(ObjectType type) {
699 /* One context for each type, plus one catch-all for the rest */
700 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
701 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
702 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
703 }
704
705 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
706 int r;
707
708 assert(f);
709 assert(ret);
710
711 if (size <= 0)
712 return -EINVAL;
713
714 /* Avoid SIGBUS on invalid accesses */
715 if (offset + size > (uint64_t) f->last_stat.st_size) {
716 /* Hmm, out of range? Let's refresh the fstat() data
717 * first, before we trust that check. */
718
719 r = journal_file_fstat(f);
720 if (r < 0)
721 return r;
722
723 if (offset + size > (uint64_t) f->last_stat.st_size)
724 return -EADDRNOTAVAIL;
725 }
726
727 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
728 }
729
730 static uint64_t minimum_header_size(Object *o) {
731
732 static const uint64_t table[] = {
733 [OBJECT_DATA] = sizeof(DataObject),
734 [OBJECT_FIELD] = sizeof(FieldObject),
735 [OBJECT_ENTRY] = sizeof(EntryObject),
736 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
737 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
738 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
739 [OBJECT_TAG] = sizeof(TagObject),
740 };
741
742 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
743 return sizeof(ObjectHeader);
744
745 return table[o->object.type];
746 }
747
748 /* Lightweight object checks. We want this to be fast, so that we won't
749 * slow down every journal_file_move_to_object() call too much. */
750 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
751 assert(f);
752 assert(o);
753
754 switch (o->object.type) {
755
756 case OBJECT_DATA: {
757 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
758 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
759 le64toh(o->data.n_entries), offset);
760 return -EBADMSG;
761 }
762
763 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
764 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
765 offsetof(DataObject, payload),
766 le64toh(o->object.size),
767 offset);
768 return -EBADMSG;
769 }
770
771 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
772 !VALID64(le64toh(o->data.next_field_offset)) ||
773 !VALID64(le64toh(o->data.entry_offset)) ||
774 !VALID64(le64toh(o->data.entry_array_offset))) {
775 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
776 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
777 le64toh(o->data.next_hash_offset),
778 le64toh(o->data.next_field_offset),
779 le64toh(o->data.entry_offset),
780 le64toh(o->data.entry_array_offset),
781 offset);
782 return -EBADMSG;
783 }
784
785 break;
786 }
787
788 case OBJECT_FIELD:
789 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
790 log_debug(
791 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
792 offsetof(FieldObject, payload),
793 le64toh(o->object.size),
794 offset);
795 return -EBADMSG;
796 }
797
798 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
799 !VALID64(le64toh(o->field.head_data_offset))) {
800 log_debug(
801 "Invalid offset, next_hash_offset="OFSfmt
802 ", head_data_offset="OFSfmt": %"PRIu64,
803 le64toh(o->field.next_hash_offset),
804 le64toh(o->field.head_data_offset),
805 offset);
806 return -EBADMSG;
807 }
808 break;
809
810 case OBJECT_ENTRY:
811 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
812 log_debug(
813 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
814 offsetof(EntryObject, items),
815 le64toh(o->object.size),
816 offset);
817 return -EBADMSG;
818 }
819
820 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
821 log_debug(
822 "Invalid number items in entry: %"PRIu64": %"PRIu64,
823 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
824 offset);
825 return -EBADMSG;
826 }
827
828 if (le64toh(o->entry.seqnum) <= 0) {
829 log_debug(
830 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
831 le64toh(o->entry.seqnum),
832 offset);
833 return -EBADMSG;
834 }
835
836 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
837 log_debug(
838 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
839 le64toh(o->entry.realtime),
840 offset);
841 return -EBADMSG;
842 }
843
844 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
845 log_debug(
846 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
847 le64toh(o->entry.monotonic),
848 offset);
849 return -EBADMSG;
850 }
851
852 break;
853
854 case OBJECT_DATA_HASH_TABLE:
855 case OBJECT_FIELD_HASH_TABLE:
856 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
857 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
858 log_debug(
859 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
860 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
861 le64toh(o->object.size),
862 offset);
863 return -EBADMSG;
864 }
865
866 break;
867
868 case OBJECT_ENTRY_ARRAY:
869 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
870 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
871 log_debug(
872 "Invalid object entry array size: %"PRIu64": %"PRIu64,
873 le64toh(o->object.size),
874 offset);
875 return -EBADMSG;
876 }
877
878 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
879 log_debug(
880 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
881 le64toh(o->entry_array.next_entry_array_offset),
882 offset);
883 return -EBADMSG;
884 }
885
886 break;
887
888 case OBJECT_TAG:
889 if (le64toh(o->object.size) != sizeof(TagObject)) {
890 log_debug(
891 "Invalid object tag size: %"PRIu64": %"PRIu64,
892 le64toh(o->object.size),
893 offset);
894 return -EBADMSG;
895 }
896
897 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
898 log_debug(
899 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
900 le64toh(o->tag.epoch),
901 offset);
902 return -EBADMSG;
903 }
904
905 break;
906 }
907
908 return 0;
909 }
910
911 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
912 int r;
913 void *t;
914 size_t tsize;
915 Object *o;
916 uint64_t s;
917
918 assert(f);
919 assert(ret);
920
921 /* Objects may only be located at multiples of 64 bits */
922 if (!VALID64(offset)) {
923 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
924 return -EBADMSG;
925 }
926
927 /* Object may not be located in the file header */
928 if (offset < le64toh(f->header->header_size)) {
929 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
930 return -EBADMSG;
931 }
932
933 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
934 if (r < 0)
935 return r;
936
937 o = (Object*) t;
938 s = le64toh(o->object.size);
939
940 if (s == 0) {
941 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
942 return -EBADMSG;
943 }
944 if (s < sizeof(ObjectHeader)) {
945 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
946 return -EBADMSG;
947 }
948
949 if (o->object.type <= OBJECT_UNUSED) {
950 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
951 return -EBADMSG;
952 }
953
954 if (s < minimum_header_size(o)) {
955 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
956 return -EBADMSG;
957 }
958
959 if (type > OBJECT_UNUSED && o->object.type != type) {
960 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
961 return -EBADMSG;
962 }
963
964 if (s > tsize) {
965 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
966 if (r < 0)
967 return r;
968
969 o = (Object*) t;
970 }
971
972 r = journal_file_check_object(f, offset, o);
973 if (r < 0)
974 return r;
975
976 *ret = o;
977 return 0;
978 }
979
980 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
981 uint64_t r;
982
983 assert(f);
984 assert(f->header);
985
986 r = le64toh(f->header->tail_entry_seqnum) + 1;
987
988 if (seqnum) {
989 /* If an external seqnum counter was passed, we update
990 * both the local and the external one, and set it to
991 * the maximum of both */
992
993 if (*seqnum + 1 > r)
994 r = *seqnum + 1;
995
996 *seqnum = r;
997 }
998
999 f->header->tail_entry_seqnum = htole64(r);
1000
1001 if (f->header->head_entry_seqnum == 0)
1002 f->header->head_entry_seqnum = htole64(r);
1003
1004 return r;
1005 }
1006
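/* Appends a new object of the given type and size at the end of the file: the new
 * offset is the current tail object's offset plus its 64-bit-aligned size (or the end
 * of the header for an empty file), the file is grown as necessary, the object header
 * is initialized, and tail_object_offset/n_objects in the file header are updated. */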
1007 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
1008 int r;
1009 uint64_t p;
1010 Object *tail, *o;
1011 void *t;
1012
1013 assert(f);
1014 assert(f->header);
1015 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
1016 assert(size >= sizeof(ObjectHeader));
1017 assert(offset);
1018 assert(ret);
1019
1020 r = journal_file_set_online(f);
1021 if (r < 0)
1022 return r;
1023
1024 p = le64toh(f->header->tail_object_offset);
1025 if (p == 0)
1026 p = le64toh(f->header->header_size);
1027 else {
1028 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
1029 if (r < 0)
1030 return r;
1031
1032 p += ALIGN64(le64toh(tail->object.size));
1033 }
1034
1035 r = journal_file_allocate(f, p, size);
1036 if (r < 0)
1037 return r;
1038
1039 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
1040 if (r < 0)
1041 return r;
1042
1043 o = (Object*) t;
1044
1045 zero(o->object);
1046 o->object.type = type;
1047 o->object.size = htole64(size);
1048
1049 f->header->tail_object_offset = htole64(p);
1050 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1051
1052 *ret = o;
1053 *offset = p;
1054
1055 return 0;
1056 }
1057
1058 static int journal_file_setup_data_hash_table(JournalFile *f) {
1059 uint64_t s, p;
1060 Object *o;
1061 int r;
1062
1063 assert(f);
1064 assert(f->header);
1065
1066 /* We estimate that we need 1 hash table entry per 768 bytes
1067 of journal file and we want to make sure we never get
1068 beyond 75% fill level. Calculate the hash table size for
1069 the maximum file size based on these metrics. */
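/* Illustrative example (assuming sizeof(HashItem) == 16): with max_size = 128 MiB
   this reserves 128 MiB * 4 / 768 / 3 ~= 233k buckets, i.e. roughly 3.6 MiB of
   hash table, which keeps the expected fill level at or below 75%. */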
1070
1071 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
1072 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1073 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1074
1075 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
1076
1077 r = journal_file_append_object(f,
1078 OBJECT_DATA_HASH_TABLE,
1079 offsetof(Object, hash_table.items) + s,
1080 &o, &p);
1081 if (r < 0)
1082 return r;
1083
1084 memzero(o->hash_table.items, s);
1085
1086 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1087 f->header->data_hash_table_size = htole64(s);
1088
1089 return 0;
1090 }
1091
1092 static int journal_file_setup_field_hash_table(JournalFile *f) {
1093 uint64_t s, p;
1094 Object *o;
1095 int r;
1096
1097 assert(f);
1098 assert(f->header);
1099
1100 /* We use a fixed-size hash table for the fields, as this
1101 * number should only grow very slowly */
1102
1103 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1104 r = journal_file_append_object(f,
1105 OBJECT_FIELD_HASH_TABLE,
1106 offsetof(Object, hash_table.items) + s,
1107 &o, &p);
1108 if (r < 0)
1109 return r;
1110
1111 memzero(o->hash_table.items, s);
1112
1113 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1114 f->header->field_hash_table_size = htole64(s);
1115
1116 return 0;
1117 }
1118
1119 int journal_file_map_data_hash_table(JournalFile *f) {
1120 uint64_t s, p;
1121 void *t;
1122 int r;
1123
1124 assert(f);
1125 assert(f->header);
1126
1127 if (f->data_hash_table)
1128 return 0;
1129
1130 p = le64toh(f->header->data_hash_table_offset);
1131 s = le64toh(f->header->data_hash_table_size);
1132
1133 r = journal_file_move_to(f,
1134 OBJECT_DATA_HASH_TABLE,
1135 true,
1136 p, s,
1137 &t, NULL);
1138 if (r < 0)
1139 return r;
1140
1141 f->data_hash_table = t;
1142 return 0;
1143 }
1144
1145 int journal_file_map_field_hash_table(JournalFile *f) {
1146 uint64_t s, p;
1147 void *t;
1148 int r;
1149
1150 assert(f);
1151 assert(f->header);
1152
1153 if (f->field_hash_table)
1154 return 0;
1155
1156 p = le64toh(f->header->field_hash_table_offset);
1157 s = le64toh(f->header->field_hash_table_size);
1158
1159 r = journal_file_move_to(f,
1160 OBJECT_FIELD_HASH_TABLE,
1161 true,
1162 p, s,
1163 &t, NULL);
1164 if (r < 0)
1165 return r;
1166
1167 f->field_hash_table = t;
1168 return 0;
1169 }
1170
1171 static int journal_file_link_field(
1172 JournalFile *f,
1173 Object *o,
1174 uint64_t offset,
1175 uint64_t hash) {
1176
1177 uint64_t p, h, m;
1178 int r;
1179
1180 assert(f);
1181 assert(f->header);
1182 assert(f->field_hash_table);
1183 assert(o);
1184 assert(offset > 0);
1185
1186 if (o->object.type != OBJECT_FIELD)
1187 return -EINVAL;
1188
1189 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1190 if (m <= 0)
1191 return -EBADMSG;
1192
1193 /* This might alter the window we are looking at */
1194 o->field.next_hash_offset = o->field.head_data_offset = 0;
1195
1196 h = hash % m;
1197 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1198 if (p == 0)
1199 f->field_hash_table[h].head_hash_offset = htole64(offset);
1200 else {
1201 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1202 if (r < 0)
1203 return r;
1204
1205 o->field.next_hash_offset = htole64(offset);
1206 }
1207
1208 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1209
1210 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1211 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1212
1213 return 0;
1214 }
1215
1216 static int journal_file_link_data(
1217 JournalFile *f,
1218 Object *o,
1219 uint64_t offset,
1220 uint64_t hash) {
1221
1222 uint64_t p, h, m;
1223 int r;
1224
1225 assert(f);
1226 assert(f->header);
1227 assert(f->data_hash_table);
1228 assert(o);
1229 assert(offset > 0);
1230
1231 if (o->object.type != OBJECT_DATA)
1232 return -EINVAL;
1233
1234 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1235 if (m <= 0)
1236 return -EBADMSG;
1237
1238 /* This might alter the window we are looking at */
1239 o->data.next_hash_offset = o->data.next_field_offset = 0;
1240 o->data.entry_offset = o->data.entry_array_offset = 0;
1241 o->data.n_entries = 0;
1242
1243 h = hash % m;
1244 p = le64toh(f->data_hash_table[h].tail_hash_offset);
1245 if (p == 0)
1246 /* Only entry in the hash table is easy */
1247 f->data_hash_table[h].head_hash_offset = htole64(offset);
1248 else {
1249 /* Move back to the previous data object, to patch in
1250 * pointer */
1251
1252 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1253 if (r < 0)
1254 return r;
1255
1256 o->data.next_hash_offset = htole64(offset);
1257 }
1258
1259 f->data_hash_table[h].tail_hash_offset = htole64(offset);
1260
1261 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1262 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1263
1264 return 0;
1265 }
1266
1267 int journal_file_find_field_object_with_hash(
1268 JournalFile *f,
1269 const void *field, uint64_t size, uint64_t hash,
1270 Object **ret, uint64_t *offset) {
1271
1272 uint64_t p, osize, h, m;
1273 int r;
1274
1275 assert(f);
1276 assert(f->header);
1277 assert(field && size > 0);
1278
1279 /* If the field hash table is empty, we can't find anything */
1280 if (le64toh(f->header->field_hash_table_size) <= 0)
1281 return 0;
1282
1283 /* Map the field hash table, if it isn't mapped yet. */
1284 r = journal_file_map_field_hash_table(f);
1285 if (r < 0)
1286 return r;
1287
1288 osize = offsetof(Object, field.payload) + size;
1289
1290 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1291 if (m <= 0)
1292 return -EBADMSG;
1293
1294 h = hash % m;
1295 p = le64toh(f->field_hash_table[h].head_hash_offset);
1296
1297 while (p > 0) {
1298 Object *o;
1299
1300 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1301 if (r < 0)
1302 return r;
1303
1304 if (le64toh(o->field.hash) == hash &&
1305 le64toh(o->object.size) == osize &&
1306 memcmp(o->field.payload, field, size) == 0) {
1307
1308 if (ret)
1309 *ret = o;
1310 if (offset)
1311 *offset = p;
1312
1313 return 1;
1314 }
1315
1316 p = le64toh(o->field.next_hash_offset);
1317 }
1318
1319 return 0;
1320 }
1321
1322 int journal_file_find_field_object(
1323 JournalFile *f,
1324 const void *field, uint64_t size,
1325 Object **ret, uint64_t *offset) {
1326
1327 uint64_t hash;
1328
1329 assert(f);
1330 assert(field && size > 0);
1331
1332 hash = hash64(field, size);
1333
1334 return journal_file_find_field_object_with_hash(f,
1335 field, size, hash,
1336 ret, offset);
1337 }
1338
1339 int journal_file_find_data_object_with_hash(
1340 JournalFile *f,
1341 const void *data, uint64_t size, uint64_t hash,
1342 Object **ret, uint64_t *offset) {
1343
1344 uint64_t p, osize, h, m;
1345 int r;
1346
1347 assert(f);
1348 assert(f->header);
1349 assert(data || size == 0);
1350
1351 /* If there's no data hash table, then there's no entry. */
1352 if (le64toh(f->header->data_hash_table_size) <= 0)
1353 return 0;
1354
1355 /* Map the data hash table, if it isn't mapped yet. */
1356 r = journal_file_map_data_hash_table(f);
1357 if (r < 0)
1358 return r;
1359
1360 osize = offsetof(Object, data.payload) + size;
1361
1362 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1363 if (m <= 0)
1364 return -EBADMSG;
1365
1366 h = hash % m;
1367 p = le64toh(f->data_hash_table[h].head_hash_offset);
1368
1369 while (p > 0) {
1370 Object *o;
1371
1372 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1373 if (r < 0)
1374 return r;
1375
1376 if (le64toh(o->data.hash) != hash)
1377 goto next;
1378
1379 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
1380 #if HAVE_XZ || HAVE_LZ4
1381 uint64_t l;
1382 size_t rsize = 0;
1383
1384 l = le64toh(o->object.size);
1385 if (l <= offsetof(Object, data.payload))
1386 return -EBADMSG;
1387
1388 l -= offsetof(Object, data.payload);
1389
1390 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1391 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1392 if (r < 0)
1393 return r;
1394
1395 if (rsize == size &&
1396 memcmp(f->compress_buffer, data, size) == 0) {
1397
1398 if (ret)
1399 *ret = o;
1400
1401 if (offset)
1402 *offset = p;
1403
1404 return 1;
1405 }
1406 #else
1407 return -EPROTONOSUPPORT;
1408 #endif
1409 } else if (le64toh(o->object.size) == osize &&
1410 memcmp(o->data.payload, data, size) == 0) {
1411
1412 if (ret)
1413 *ret = o;
1414
1415 if (offset)
1416 *offset = p;
1417
1418 return 1;
1419 }
1420
1421 next:
1422 p = le64toh(o->data.next_hash_offset);
1423 }
1424
1425 return 0;
1426 }
1427
1428 int journal_file_find_data_object(
1429 JournalFile *f,
1430 const void *data, uint64_t size,
1431 Object **ret, uint64_t *offset) {
1432
1433 uint64_t hash;
1434
1435 assert(f);
1436 assert(data || size == 0);
1437
1438 hash = hash64(data, size);
1439
1440 return journal_file_find_data_object_with_hash(f,
1441 data, size, hash,
1442 ret, offset);
1443 }
1444
1445 static int journal_file_append_field(
1446 JournalFile *f,
1447 const void *field, uint64_t size,
1448 Object **ret, uint64_t *offset) {
1449
1450 uint64_t hash, p;
1451 uint64_t osize;
1452 Object *o;
1453 int r;
1454
1455 assert(f);
1456 assert(field && size > 0);
1457
1458 hash = hash64(field, size);
1459
1460 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1461 if (r < 0)
1462 return r;
1463 else if (r > 0) {
1464
1465 if (ret)
1466 *ret = o;
1467
1468 if (offset)
1469 *offset = p;
1470
1471 return 0;
1472 }
1473
1474 osize = offsetof(Object, field.payload) + size;
1475 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1476 if (r < 0)
1477 return r;
1478
1479 o->field.hash = htole64(hash);
1480 memcpy(o->field.payload, field, size);
1481
1482 r = journal_file_link_field(f, o, p, hash);
1483 if (r < 0)
1484 return r;
1485
1486 /* The linking might have altered the window, so let's
1487 * refresh our pointer */
1488 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1489 if (r < 0)
1490 return r;
1491
1492 #if HAVE_GCRYPT
1493 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1494 if (r < 0)
1495 return r;
1496 #endif
1497
1498 if (ret)
1499 *ret = o;
1500
1501 if (offset)
1502 *offset = p;
1503
1504 return 0;
1505 }
1506
1507 static int journal_file_append_data(
1508 JournalFile *f,
1509 const void *data, uint64_t size,
1510 Object **ret, uint64_t *offset) {
1511
1512 uint64_t hash, p;
1513 uint64_t osize;
1514 Object *o;
1515 int r, compression = 0;
1516 const void *eq;
1517
1518 assert(f);
1519 assert(data || size == 0);
1520
1521 hash = hash64(data, size);
1522
1523 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1524 if (r < 0)
1525 return r;
1526 if (r > 0) {
1527
1528 if (ret)
1529 *ret = o;
1530
1531 if (offset)
1532 *offset = p;
1533
1534 return 0;
1535 }
1536
1537 osize = offsetof(Object, data.payload) + size;
1538 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1539 if (r < 0)
1540 return r;
1541
1542 o->data.hash = htole64(hash);
1543
1544 #if HAVE_XZ || HAVE_LZ4
1545 if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
1546 size_t rsize = 0;
1547
1548 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1549
1550 if (compression >= 0) {
1551 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1552 o->object.flags |= compression;
1553
1554 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1555 size, rsize, object_compressed_to_string(compression));
1556 } else
1557 /* Compression didn't work, we don't really care why, let's continue without compression */
1558 compression = 0;
1559 }
1560 #endif
1561
1562 if (compression == 0)
1563 memcpy_safe(o->data.payload, data, size);
1564
1565 r = journal_file_link_data(f, o, p, hash);
1566 if (r < 0)
1567 return r;
1568
1569 #if HAVE_GCRYPT
1570 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1571 if (r < 0)
1572 return r;
1573 #endif
1574
1575 /* The linking might have altered the window, so let's
1576 * refresh our pointer */
1577 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1578 if (r < 0)
1579 return r;
1580
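/* If the payload has the form "FIELD=value", also make sure a field object for
 * "FIELD" exists, and chain this data object into that field's list of data
 * objects via head_data_offset/next_field_offset. */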
1581 if (!data)
1582 eq = NULL;
1583 else
1584 eq = memchr(data, '=', size);
1585 if (eq && eq > data) {
1586 Object *fo = NULL;
1587 uint64_t fp;
1588
1589 /* Create field object ... */
1590 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1591 if (r < 0)
1592 return r;
1593
1594 /* ... and link it in. */
1595 o->data.next_field_offset = fo->field.head_data_offset;
1596 fo->field.head_data_offset = le64toh(p);
1597 }
1598
1599 if (ret)
1600 *ret = o;
1601
1602 if (offset)
1603 *offset = p;
1604
1605 return 0;
1606 }
1607
1608 uint64_t journal_file_entry_n_items(Object *o) {
1609 assert(o);
1610
1611 if (o->object.type != OBJECT_ENTRY)
1612 return 0;
1613
1614 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1615 }
1616
1617 uint64_t journal_file_entry_array_n_items(Object *o) {
1618 assert(o);
1619
1620 if (o->object.type != OBJECT_ENTRY_ARRAY)
1621 return 0;
1622
1623 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1624 }
1625
1626 uint64_t journal_file_hash_table_n_items(Object *o) {
1627 assert(o);
1628
1629 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1630 return 0;
1631
1632 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1633 }
1634
1635 static int link_entry_into_array(JournalFile *f,
1636 le64_t *first,
1637 le64_t *idx,
1638 uint64_t p) {
1639 int r;
1640 uint64_t n = 0, ap = 0, q, i, a, hidx;
1641 Object *o;
1642
1643 assert(f);
1644 assert(f->header);
1645 assert(first);
1646 assert(idx);
1647 assert(p > 0);
1648
1649 a = le64toh(*first);
1650 i = hidx = le64toh(*idx);
1651 while (a > 0) {
1652
1653 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1654 if (r < 0)
1655 return r;
1656
1657 n = journal_file_entry_array_n_items(o);
1658 if (i < n) {
1659 o->entry_array.items[i] = htole64(p);
1660 *idx = htole64(hidx + 1);
1661 return 0;
1662 }
1663
1664 i -= n;
1665 ap = a;
1666 a = le64toh(o->entry_array.next_entry_array_offset);
1667 }
1668
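/* No array in the chain had a free slot: append a new entry array, roughly doubling
 * the item count each time (with a minimum of 4), and hook it up as the new tail of
 * the chain (or as its head, if the chain was empty). */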
1669 if (hidx > n)
1670 n = (hidx+1) * 2;
1671 else
1672 n = n * 2;
1673
1674 if (n < 4)
1675 n = 4;
1676
1677 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1678 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1679 &o, &q);
1680 if (r < 0)
1681 return r;
1682
1683 #if HAVE_GCRYPT
1684 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1685 if (r < 0)
1686 return r;
1687 #endif
1688
1689 o->entry_array.items[i] = htole64(p);
1690
1691 if (ap == 0)
1692 *first = htole64(q);
1693 else {
1694 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1695 if (r < 0)
1696 return r;
1697
1698 o->entry_array.next_entry_array_offset = htole64(q);
1699 }
1700
1701 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1702 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1703
1704 *idx = htole64(hidx + 1);
1705
1706 return 0;
1707 }
1708
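/* Like link_entry_into_array(), but keeps the very first reference inline in *extra
 * and only pushes subsequent references into the entry array chain rooted at *first;
 * *idx counts all references, including the inline one. */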
1709 static int link_entry_into_array_plus_one(JournalFile *f,
1710 le64_t *extra,
1711 le64_t *first,
1712 le64_t *idx,
1713 uint64_t p) {
1714
1715 int r;
1716
1717 assert(f);
1718 assert(extra);
1719 assert(first);
1720 assert(idx);
1721 assert(p > 0);
1722
1723 if (*idx == 0)
1724 *extra = htole64(p);
1725 else {
1726 le64_t i;
1727
1728 i = htole64(le64toh(*idx) - 1);
1729 r = link_entry_into_array(f, first, &i, p);
1730 if (r < 0)
1731 return r;
1732 }
1733
1734 *idx = htole64(le64toh(*idx) + 1);
1735 return 0;
1736 }
1737
1738 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1739 uint64_t p;
1740 int r;
1741 assert(f);
1742 assert(o);
1743 assert(offset > 0);
1744
1745 p = le64toh(o->entry.items[i].object_offset);
1746 if (p == 0)
1747 return -EINVAL;
1748
1749 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1750 if (r < 0)
1751 return r;
1752
1753 return link_entry_into_array_plus_one(f,
1754 &o->data.entry_offset,
1755 &o->data.entry_array_offset,
1756 &o->data.n_entries,
1757 offset);
1758 }
1759
1760 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1761 uint64_t n, i;
1762 int r;
1763
1764 assert(f);
1765 assert(f->header);
1766 assert(o);
1767 assert(offset > 0);
1768
1769 if (o->object.type != OBJECT_ENTRY)
1770 return -EINVAL;
1771
1772 __sync_synchronize();
1773
1774 /* Link up the entry itself */
1775 r = link_entry_into_array(f,
1776 &f->header->entry_array_offset,
1777 &f->header->n_entries,
1778 offset);
1779 if (r < 0)
1780 return r;
1781
1782 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1783
1784 if (f->header->head_entry_realtime == 0)
1785 f->header->head_entry_realtime = o->entry.realtime;
1786
1787 f->header->tail_entry_realtime = o->entry.realtime;
1788 f->header->tail_entry_monotonic = o->entry.monotonic;
1789
1790 /* Link up the items */
1791 n = journal_file_entry_n_items(o);
1792 for (i = 0; i < n; i++) {
1793 r = journal_file_link_entry_item(f, o, offset, i);
1794 if (r < 0)
1795 return r;
1796 }
1797
1798 return 0;
1799 }
1800
1801 static int journal_file_append_entry_internal(
1802 JournalFile *f,
1803 const dual_timestamp *ts,
1804 const sd_id128_t *boot_id,
1805 uint64_t xor_hash,
1806 const EntryItem items[], unsigned n_items,
1807 uint64_t *seqnum,
1808 Object **ret, uint64_t *offset) {
1809 uint64_t np;
1810 uint64_t osize;
1811 Object *o;
1812 int r;
1813
1814 assert(f);
1815 assert(f->header);
1816 assert(items || n_items == 0);
1817 assert(ts);
1818
1819 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1820
1821 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1822 if (r < 0)
1823 return r;
1824
1825 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1826 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
1827 o->entry.realtime = htole64(ts->realtime);
1828 o->entry.monotonic = htole64(ts->monotonic);
1829 o->entry.xor_hash = htole64(xor_hash);
1830 o->entry.boot_id = boot_id ? *boot_id : f->header->boot_id;
1831
1832 #if HAVE_GCRYPT
1833 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1834 if (r < 0)
1835 return r;
1836 #endif
1837
1838 r = journal_file_link_entry(f, o, np);
1839 if (r < 0)
1840 return r;
1841
1842 if (ret)
1843 *ret = o;
1844
1845 if (offset)
1846 *offset = np;
1847
1848 return 0;
1849 }
1850
1851 void journal_file_post_change(JournalFile *f) {
1852 assert(f);
1853
1854 /* inotify() does not receive IN_MODIFY events from file
1855 * accesses done via mmap(). After each access we hence
1856 * trigger IN_MODIFY by truncating the journal file to its
1857 * current size. */
1858
1859 __sync_synchronize();
1860
1861 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1862 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1863 }
1864
1865 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1866 assert(userdata);
1867
1868 journal_file_post_change(userdata);
1869
1870 return 1;
1871 }
1872
1873 static void schedule_post_change(JournalFile *f) {
1874 sd_event_source *timer;
1875 int enabled, r;
1876 uint64_t now;
1877
1878 assert(f);
1879 assert(f->post_change_timer);
1880
1881 timer = f->post_change_timer;
1882
1883 r = sd_event_source_get_enabled(timer, &enabled);
1884 if (r < 0) {
1885 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1886 goto fail;
1887 }
1888
1889 if (enabled == SD_EVENT_ONESHOT)
1890 return;
1891
1892 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1893 if (r < 0) {
1894 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1895 goto fail;
1896 }
1897
1898 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1899 if (r < 0) {
1900 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1901 goto fail;
1902 }
1903
1904 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1905 if (r < 0) {
1906 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1907 goto fail;
1908 }
1909
1910 return;
1911
1912 fail:
1913 /* On failure, let's simply post the change immediately. */
1914 journal_file_post_change(f);
1915 }
1916
1917 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1918 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1919 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1920 int r;
1921
1922 assert(f);
1923 assert_return(!f->post_change_timer, -EINVAL);
1924 assert(e);
1925 assert(t);
1926
1927 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1928 if (r < 0)
1929 return r;
1930
1931 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1932 if (r < 0)
1933 return r;
1934
1935 f->post_change_timer = TAKE_PTR(timer);
1936 f->post_change_timer_period = t;
1937
1938 return r;
1939 }
1940
1941 static int entry_item_cmp(const void *_a, const void *_b) {
1942 const EntryItem *a = _a, *b = _b;
1943
1944 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1945 return -1;
1946 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1947 return 1;
1948 return 0;
1949 }
1950
1951 int journal_file_append_entry(
1952 JournalFile *f,
1953 const dual_timestamp *ts,
1954 const sd_id128_t *boot_id,
1955 const struct iovec iovec[], unsigned n_iovec,
1956 uint64_t *seqnum,
1957 Object **ret, uint64_t *offset) {
1958
1959 unsigned i;
1960 EntryItem *items;
1961 int r;
1962 uint64_t xor_hash = 0;
1963 struct dual_timestamp _ts;
1964
1965 assert(f);
1966 assert(f->header);
1967 assert(iovec || n_iovec == 0);
1968
1969 if (ts) {
1970 if (!VALID_REALTIME(ts->realtime)) {
1971 log_debug("Invalid realtime timestamp %"PRIu64", refusing entry.", ts->realtime);
1972 return -EBADMSG;
1973 }
1974 if (!VALID_MONOTONIC(ts->monotonic)) {
1975 log_debug("Invalid monotomic timestamp %"PRIu64", refusing entry.", ts->monotonic);
1976 return -EBADMSG;
1977 }
1978 } else {
1979 dual_timestamp_get(&_ts);
1980 ts = &_ts;
1981 }
1982
1983 #if HAVE_GCRYPT
1984 r = journal_file_maybe_append_tag(f, ts->realtime);
1985 if (r < 0)
1986 return r;
1987 #endif
1988
1989 /* alloca() can't take 0, hence let's allocate at least one */
1990 items = newa(EntryItem, MAX(1u, n_iovec));
1991
1992 for (i = 0; i < n_iovec; i++) {
1993 uint64_t p;
1994 Object *o;
1995
1996 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1997 if (r < 0)
1998 return r;
1999
2000 xor_hash ^= le64toh(o->data.hash);
2001 items[i].object_offset = htole64(p);
2002 items[i].hash = o->data.hash;
2003 }
2004
2005 /* Order by the position on disk, in order to improve seek
2006 * times for rotating media. */
2007 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
2008
2009 r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, offset);
2010
2011 /* If the memory mapping triggered a SIGBUS then we return an
2012 * IO error and ignore the error code passed down to us, since
2013 * it is very likely just an effect of a nullified replacement
2014 * mapping page */
2015
2016 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
2017 r = -EIO;
2018
2019 if (f->post_change_timer)
2020 schedule_post_change(f);
2021 else
2022 journal_file_post_change(f);
2023
2024 return r;
2025 }
2026
2027 typedef struct ChainCacheItem {
2028 uint64_t first; /* the array at the beginning of the chain */
2029 uint64_t array; /* the cached array */
2030 uint64_t begin; /* the first item in the cached array */
2031 uint64_t total; /* the total number of items in all arrays before this one in the chain */
2032 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
2033 } ChainCacheItem;
2034
2035 static void chain_cache_put(
2036 OrderedHashmap *h,
2037 ChainCacheItem *ci,
2038 uint64_t first,
2039 uint64_t array,
2040 uint64_t begin,
2041 uint64_t total,
2042 uint64_t last_index) {
2043
2044 if (!ci) {
2045 /* If the chain item to cache for this chain is the
2046 * first one, it's not worth caching anything */
2047 if (array == first)
2048 return;
2049
2050 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2051 ci = ordered_hashmap_steal_first(h);
2052 assert(ci);
2053 } else {
2054 ci = new(ChainCacheItem, 1);
2055 if (!ci)
2056 return;
2057 }
2058
2059 ci->first = first;
2060
2061 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2062 free(ci);
2063 return;
2064 }
2065 } else
2066 assert(ci->first == first);
2067
2068 ci->array = array;
2069 ci->begin = begin;
2070 ci->total = total;
2071 ci->last_index = last_index;
2072 }
2073
2074 static int generic_array_get(
2075 JournalFile *f,
2076 uint64_t first,
2077 uint64_t i,
2078 Object **ret, uint64_t *offset) {
2079
2080 Object *o;
2081 uint64_t p = 0, a, t = 0;
2082 int r;
2083 ChainCacheItem *ci;
2084
2085 assert(f);
2086
2087 a = first;
2088
2089 /* Try the chain cache first */
2090 ci = ordered_hashmap_get(f->chain_cache, &first);
2091 if (ci && i > ci->total) {
2092 a = ci->array;
2093 i -= ci->total;
2094 t = ci->total;
2095 }
2096
2097 while (a > 0) {
2098 uint64_t k;
2099
2100 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2101 if (r < 0)
2102 return r;
2103
2104 k = journal_file_entry_array_n_items(o);
2105 if (i < k) {
2106 p = le64toh(o->entry_array.items[i]);
2107 goto found;
2108 }
2109
2110 i -= k;
2111 t += k;
2112 a = le64toh(o->entry_array.next_entry_array_offset);
2113 }
2114
2115 return 0;
2116
2117 found:
2118 /* Let's cache this item for the next invocation */
2119 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
2120
2121 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2122 if (r < 0)
2123 return r;
2124
2125 if (ret)
2126 *ret = o;
2127
2128 if (offset)
2129 *offset = p;
2130
2131 return 1;
2132 }
2133
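/* Read-side counterpart of link_entry_into_array_plus_one(): index 0 refers to the
 * inline "extra" entry reference, indices >= 1 are resolved via the entry array
 * chain starting at "first". */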
2134 static int generic_array_get_plus_one(
2135 JournalFile *f,
2136 uint64_t extra,
2137 uint64_t first,
2138 uint64_t i,
2139 Object **ret, uint64_t *offset) {
2140
2141 Object *o;
2142
2143 assert(f);
2144
2145 if (i == 0) {
2146 int r;
2147
2148 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2149 if (r < 0)
2150 return r;
2151
2152 if (ret)
2153 *ret = o;
2154
2155 if (offset)
2156 *offset = extra;
2157
2158 return 1;
2159 }
2160
2161 return generic_array_get(f, first, i-1, ret, offset);
2162 }
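/* Editor's note: the "plus one" variant above treats the data object's inline entry_offset as a virtual
 * index 0, so index i of the combined view maps to the extra entry for i == 0 and to chain index i-1
 * otherwise. A one-line sketch of that mapping, reusing the MiniArray model from the previous illustration
 * (hypothetical names, not upstream code): */
#if 0 /* illustration only, never compiled */
static uint64_t mini_chain_get_plus_one(uint64_t extra, const MiniArray *chain, uint64_t i) {
        return i == 0 ? extra : mini_chain_get(chain, i - 1);
}
#endif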
2163
2164 enum {
2165 TEST_FOUND,
2166 TEST_LEFT,
2167 TEST_RIGHT
2168 };
2169
2170 static int generic_array_bisect(
2171 JournalFile *f,
2172 uint64_t first,
2173 uint64_t n,
2174 uint64_t needle,
2175 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2176 direction_t direction,
2177 Object **ret,
2178 uint64_t *offset,
2179 uint64_t *idx) {
2180
2181 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
2182 bool subtract_one = false;
2183 Object *o, *array = NULL;
2184 int r;
2185 ChainCacheItem *ci;
2186
2187 assert(f);
2188 assert(test_object);
2189
2190 /* Start with the first array in the chain */
2191 a = first;
2192
2193 ci = ordered_hashmap_get(f->chain_cache, &first);
2194 if (ci && n > ci->total && ci->begin != 0) {
2195                 /* Ah, we have iterated this bisection array chain
2196                  * previously! Let's see if we can skip ahead in the
2197                  * chain, as far as we got last time. But we can't jump
2198                  * backwards in the chain, so let's check that
2199                  * first. */
2200
2201 r = test_object(f, ci->begin, needle);
2202 if (r < 0)
2203 return r;
2204
2205 if (r == TEST_LEFT) {
2206                         /* OK, what we are looking for lies to the right of
2207                          * the beginning of this EntryArray, so let's jump
2208                          * straight to the previously cached array in the
2209                          * chain */
2210
2211 a = ci->array;
2212 n -= ci->total;
2213 t = ci->total;
2214 last_index = ci->last_index;
2215 }
2216 }
2217
2218 while (a > 0) {
2219 uint64_t left, right, k, lp;
2220
2221 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
2222 if (r < 0)
2223 return r;
2224
2225 k = journal_file_entry_array_n_items(array);
2226 right = MIN(k, n);
2227 if (right <= 0)
2228 return 0;
2229
2230 i = right - 1;
2231 lp = p = le64toh(array->entry_array.items[i]);
2232 if (p <= 0)
2233 r = -EBADMSG;
2234 else
2235 r = test_object(f, p, needle);
2236 if (r == -EBADMSG) {
2237 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2238 n = i;
2239 continue;
2240 }
2241 if (r < 0)
2242 return r;
2243
2244 if (r == TEST_FOUND)
2245 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2246
2247 if (r == TEST_RIGHT) {
2248 left = 0;
2249 right -= 1;
2250
2251 if (last_index != (uint64_t) -1) {
2252 assert(last_index <= right);
2253
2254                                 /* If we cached the last index we
2255                                  * looked at, let's try not to jump
2256                                  * around too wildly and see if we can
2257                                  * limit the range to look at early on
2258                                  * to the immediate neighbors of the
2259                                  * last index we looked at. */
2260
2261 if (last_index > 0) {
2262 uint64_t x = last_index - 1;
2263
2264 p = le64toh(array->entry_array.items[x]);
2265 if (p <= 0)
2266 return -EBADMSG;
2267
2268 r = test_object(f, p, needle);
2269 if (r < 0)
2270 return r;
2271
2272 if (r == TEST_FOUND)
2273 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2274
2275 if (r == TEST_RIGHT)
2276 right = x;
2277 else
2278 left = x + 1;
2279 }
2280
2281 if (last_index < right) {
2282 uint64_t y = last_index + 1;
2283
2284 p = le64toh(array->entry_array.items[y]);
2285 if (p <= 0)
2286 return -EBADMSG;
2287
2288 r = test_object(f, p, needle);
2289 if (r < 0)
2290 return r;
2291
2292 if (r == TEST_FOUND)
2293 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2294
2295 if (r == TEST_RIGHT)
2296 right = y;
2297 else
2298 left = y + 1;
2299 }
2300 }
2301
2302 for (;;) {
2303 if (left == right) {
2304 if (direction == DIRECTION_UP)
2305 subtract_one = true;
2306
2307 i = left;
2308 goto found;
2309 }
2310
2311 assert(left < right);
2312 i = (left + right) / 2;
2313
2314 p = le64toh(array->entry_array.items[i]);
2315 if (p <= 0)
2316 r = -EBADMSG;
2317 else
2318 r = test_object(f, p, needle);
2319 if (r == -EBADMSG) {
2320 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2321 right = n = i;
2322 continue;
2323 }
2324 if (r < 0)
2325 return r;
2326
2327 if (r == TEST_FOUND)
2328 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2329
2330 if (r == TEST_RIGHT)
2331 right = i;
2332 else
2333 left = i + 1;
2334 }
2335 }
2336
2337 if (k >= n) {
2338 if (direction == DIRECTION_UP) {
2339 i = n;
2340 subtract_one = true;
2341 goto found;
2342 }
2343
2344 return 0;
2345 }
2346
2347 last_p = lp;
2348
2349 n -= k;
2350 t += k;
2351 last_index = (uint64_t) -1;
2352 a = le64toh(array->entry_array.next_entry_array_offset);
2353 }
2354
2355 return 0;
2356
2357 found:
2358 if (subtract_one && t == 0 && i == 0)
2359 return 0;
2360
2361 /* Let's cache this item for the next invocation */
2362 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
2363
2364 if (subtract_one && i == 0)
2365 p = last_p;
2366 else if (subtract_one)
2367 p = le64toh(array->entry_array.items[i-1]);
2368 else
2369 p = le64toh(array->entry_array.items[i]);
2370
2371 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2372 if (r < 0)
2373 return r;
2374
2375 if (ret)
2376 *ret = o;
2377
2378 if (offset)
2379 *offset = p;
2380
2381 if (idx)
2382 *idx = t + i + (subtract_one ? -1 : 0);
2383
2384 return 1;
2385 }
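/* Editor's note: stripped of the chain handling, caching and corruption recovery, the binary search above
 * computes, for DIRECTION_DOWN, the first entry that tests TEST_RIGHT (i.e. the first entry at or past the
 * needle) and, for DIRECTION_UP, the last entry that tests TEST_LEFT (via subtract_one). A self-contained
 * sketch of the DIRECTION_DOWN case over a plain sorted array, with invented names, not upstream code: */
#if 0 /* illustration only, never compiled */
#include <stddef.h>
#include <stdint.h>

/* Returns the smallest index whose value is >= needle, or n if there is none. */
static size_t mini_lower_bound(const uint64_t *values, size_t n, uint64_t needle) {
        size_t left = 0, right = n;

        while (left < right) {
                size_t i = (left + right) / 2;

                if (values[i] >= needle)   /* test_object() would return TEST_RIGHT or TEST_FOUND */
                        right = i;
                else                       /* TEST_LEFT: the probed entry lies before the needle */
                        left = i + 1;
        }

        return left;
}
#endif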
2386
2387 static int generic_array_bisect_plus_one(
2388 JournalFile *f,
2389 uint64_t extra,
2390 uint64_t first,
2391 uint64_t n,
2392 uint64_t needle,
2393 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2394 direction_t direction,
2395 Object **ret,
2396 uint64_t *offset,
2397 uint64_t *idx) {
2398
2399 int r;
2400 bool step_back = false;
2401 Object *o;
2402
2403 assert(f);
2404 assert(test_object);
2405
2406 if (n <= 0)
2407 return 0;
2408
2409         /* This bisects the array chain starting at 'first', but first
2410          * checks the extra entry pointed to by 'extra' */
2411 r = test_object(f, extra, needle);
2412 if (r < 0)
2413 return r;
2414
2415 if (r == TEST_FOUND)
2416 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2417
2418         /* If we are looking with DIRECTION_UP then we need to first
2419            check whether the actual array contains a matching entry, and
2420            return the last of those. But if there isn't any, we need to
2421            return this extra entry instead. Hence remember this, and
2422            return it below. */
2423 if (r == TEST_LEFT)
2424 step_back = direction == DIRECTION_UP;
2425
2426 if (r == TEST_RIGHT) {
2427 if (direction == DIRECTION_DOWN)
2428 goto found;
2429 else
2430 return 0;
2431 }
2432
2433 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2434
2435 if (r == 0 && step_back)
2436 goto found;
2437
2438 if (r > 0 && idx)
2439 (*idx)++;
2440
2441 return r;
2442
2443 found:
2444 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2445 if (r < 0)
2446 return r;
2447
2448 if (ret)
2449 *ret = o;
2450
2451 if (offset)
2452 *offset = extra;
2453
2454 if (idx)
2455 *idx = 0;
2456
2457 return 1;
2458 }
2459
2460 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2461 assert(f);
2462 assert(p > 0);
2463
2464 if (p == needle)
2465 return TEST_FOUND;
2466 else if (p < needle)
2467 return TEST_LEFT;
2468 else
2469 return TEST_RIGHT;
2470 }
2471
2472 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2473 Object *o;
2474 int r;
2475
2476 assert(f);
2477 assert(p > 0);
2478
2479 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2480 if (r < 0)
2481 return r;
2482
2483 if (le64toh(o->entry.seqnum) == needle)
2484 return TEST_FOUND;
2485 else if (le64toh(o->entry.seqnum) < needle)
2486 return TEST_LEFT;
2487 else
2488 return TEST_RIGHT;
2489 }
2490
2491 int journal_file_move_to_entry_by_seqnum(
2492 JournalFile *f,
2493 uint64_t seqnum,
2494 direction_t direction,
2495 Object **ret,
2496 uint64_t *offset) {
2497 assert(f);
2498 assert(f->header);
2499
2500 return generic_array_bisect(f,
2501 le64toh(f->header->entry_array_offset),
2502 le64toh(f->header->n_entries),
2503 seqnum,
2504 test_object_seqnum,
2505 direction,
2506 ret, offset, NULL);
2507 }
2508
2509 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2510 Object *o;
2511 int r;
2512
2513 assert(f);
2514 assert(p > 0);
2515
2516 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2517 if (r < 0)
2518 return r;
2519
2520 if (le64toh(o->entry.realtime) == needle)
2521 return TEST_FOUND;
2522 else if (le64toh(o->entry.realtime) < needle)
2523 return TEST_LEFT;
2524 else
2525 return TEST_RIGHT;
2526 }
2527
2528 int journal_file_move_to_entry_by_realtime(
2529 JournalFile *f,
2530 uint64_t realtime,
2531 direction_t direction,
2532 Object **ret,
2533 uint64_t *offset) {
2534 assert(f);
2535 assert(f->header);
2536
2537 return generic_array_bisect(f,
2538 le64toh(f->header->entry_array_offset),
2539 le64toh(f->header->n_entries),
2540 realtime,
2541 test_object_realtime,
2542 direction,
2543 ret, offset, NULL);
2544 }
2545
2546 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2547 Object *o;
2548 int r;
2549
2550 assert(f);
2551 assert(p > 0);
2552
2553 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2554 if (r < 0)
2555 return r;
2556
2557 if (le64toh(o->entry.monotonic) == needle)
2558 return TEST_FOUND;
2559 else if (le64toh(o->entry.monotonic) < needle)
2560 return TEST_LEFT;
2561 else
2562 return TEST_RIGHT;
2563 }
2564
2565 static int find_data_object_by_boot_id(
2566 JournalFile *f,
2567 sd_id128_t boot_id,
2568 Object **o,
2569 uint64_t *b) {
2570
2571 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2572
2573 sd_id128_to_string(boot_id, t + 9);
2574 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2575 }
2576
2577 int journal_file_move_to_entry_by_monotonic(
2578 JournalFile *f,
2579 sd_id128_t boot_id,
2580 uint64_t monotonic,
2581 direction_t direction,
2582 Object **ret,
2583 uint64_t *offset) {
2584
2585 Object *o;
2586 int r;
2587
2588 assert(f);
2589
2590 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2591 if (r < 0)
2592 return r;
2593 if (r == 0)
2594 return -ENOENT;
2595
2596 return generic_array_bisect_plus_one(f,
2597 le64toh(o->data.entry_offset),
2598 le64toh(o->data.entry_array_offset),
2599 le64toh(o->data.n_entries),
2600 monotonic,
2601 test_object_monotonic,
2602 direction,
2603 ret, offset, NULL);
2604 }
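/* Editor's note: a hedged usage sketch (not upstream code) of the two-step lookup above. Monotonic
 * timestamps are only meaningful within one boot, so the seek first resolves the data object for the
 * matching "_BOOT_ID=" field and then bisects that object's entry list. The function name is invented. */
#if 0 /* illustration only, never compiled */
static int example_seek_monotonic(JournalFile *f, sd_id128_t boot_id, usec_t t) {
        Object *o;
        uint64_t offset;
        int r;

        r = journal_file_move_to_entry_by_monotonic(f, boot_id, t, DIRECTION_DOWN, &o, &offset);
        if (r < 0)
                return r;   /* e.g. -ENOENT if this boot ID never logged into this file */
        if (r == 0)
                return 0;   /* no entry at or after 't' for this boot */

        /* 'o' now refers to the first entry of that boot with a monotonic timestamp >= t. */
        return 1;
}
#endif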
2605
2606 void journal_file_reset_location(JournalFile *f) {
2607 f->location_type = LOCATION_HEAD;
2608 f->current_offset = 0;
2609 f->current_seqnum = 0;
2610 f->current_realtime = 0;
2611 f->current_monotonic = 0;
2612 zero(f->current_boot_id);
2613 f->current_xor_hash = 0;
2614 }
2615
2616 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2617 f->location_type = LOCATION_SEEK;
2618 f->current_offset = offset;
2619 f->current_seqnum = le64toh(o->entry.seqnum);
2620 f->current_realtime = le64toh(o->entry.realtime);
2621 f->current_monotonic = le64toh(o->entry.monotonic);
2622 f->current_boot_id = o->entry.boot_id;
2623 f->current_xor_hash = le64toh(o->entry.xor_hash);
2624 }
2625
2626 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2627 assert(af);
2628 assert(af->header);
2629 assert(bf);
2630 assert(bf->header);
2631 assert(af->location_type == LOCATION_SEEK);
2632 assert(bf->location_type == LOCATION_SEEK);
2633
2634 /* If contents and timestamps match, these entries are
2635 * identical, even if the seqnum does not match */
2636 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2637 af->current_monotonic == bf->current_monotonic &&
2638 af->current_realtime == bf->current_realtime &&
2639 af->current_xor_hash == bf->current_xor_hash)
2640 return 0;
2641
2642 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2643
2644 /* If this is from the same seqnum source, compare
2645 * seqnums */
2646 if (af->current_seqnum < bf->current_seqnum)
2647 return -1;
2648 if (af->current_seqnum > bf->current_seqnum)
2649 return 1;
2650
2651 /* Wow! This is weird, different data but the same
2652 * seqnums? Something is borked, but let's make the
2653 * best of it and compare by time. */
2654 }
2655
2656 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2657
2658 /* If the boot id matches, compare monotonic time */
2659 if (af->current_monotonic < bf->current_monotonic)
2660 return -1;
2661 if (af->current_monotonic > bf->current_monotonic)
2662 return 1;
2663 }
2664
2665 /* Otherwise, compare UTC time */
2666 if (af->current_realtime < bf->current_realtime)
2667 return -1;
2668 if (af->current_realtime > bf->current_realtime)
2669 return 1;
2670
2671 /* Finally, compare by contents */
2672 if (af->current_xor_hash < bf->current_xor_hash)
2673 return -1;
2674 if (af->current_xor_hash > bf->current_xor_hash)
2675 return 1;
2676
2677 return 0;
2678 }
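/* Editor's note: an illustrative sketch (not upstream code) of how the total order defined above is
 * typically used: when several journal files are read in parallel, the reader repeatedly picks the file
 * whose current entry sorts earliest, which is the basis of sd-journal's interleaving. The function name
 * is invented. */
#if 0 /* illustration only, never compiled */
static JournalFile* example_pick_earliest(JournalFile **files, size_t n_files) {
        JournalFile *best = NULL;
        size_t i;

        for (i = 0; i < n_files; i++) {
                if (!files[i] || files[i]->location_type != LOCATION_SEEK)
                        continue;

                if (!best || journal_file_compare_locations(files[i], best) < 0)
                        best = files[i];
        }

        return best;
}
#endif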
2679
2680 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2681
2682 /* Increase or decrease the specified index, in the right direction. */
2683
2684 if (direction == DIRECTION_DOWN) {
2685 if (*i >= n - 1)
2686 return 0;
2687
2688 (*i) ++;
2689 } else {
2690 if (*i <= 0)
2691 return 0;
2692
2693 (*i) --;
2694 }
2695
2696 return 1;
2697 }
2698
2699 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2700
2701         /* Consider it an error if either of the two offsets is uninitialized */
2702 if (old_offset == 0 || new_offset == 0)
2703 return false;
2704
2705 /* If we go down, the new offset must be larger than the old one. */
2706 return direction == DIRECTION_DOWN ?
2707 new_offset > old_offset :
2708 new_offset < old_offset;
2709 }
2710
2711 int journal_file_next_entry(
2712 JournalFile *f,
2713 uint64_t p,
2714 direction_t direction,
2715 Object **ret, uint64_t *offset) {
2716
2717 uint64_t i, n, ofs;
2718 int r;
2719
2720 assert(f);
2721 assert(f->header);
2722
2723 n = le64toh(f->header->n_entries);
2724 if (n <= 0)
2725 return 0;
2726
2727 if (p == 0)
2728 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2729 else {
2730 r = generic_array_bisect(f,
2731 le64toh(f->header->entry_array_offset),
2732 le64toh(f->header->n_entries),
2733 p,
2734 test_object_offset,
2735 DIRECTION_DOWN,
2736 NULL, NULL,
2737 &i);
2738 if (r <= 0)
2739 return r;
2740
2741 r = bump_array_index(&i, direction, n);
2742 if (r <= 0)
2743 return r;
2744 }
2745
2746 /* And jump to it */
2747 for (;;) {
2748 r = generic_array_get(f,
2749 le64toh(f->header->entry_array_offset),
2750 i,
2751 ret, &ofs);
2752 if (r > 0)
2753 break;
2754 if (r != -EBADMSG)
2755 return r;
2756
2757 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2758 * the next one might work for us instead. */
2759 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2760
2761 r = bump_array_index(&i, direction, n);
2762 if (r <= 0)
2763 return r;
2764 }
2765
2766 /* Ensure our array is properly ordered. */
2767 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2768 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
2769 return -EBADMSG;
2770 }
2771
2772 if (offset)
2773 *offset = ofs;
2774
2775 return 1;
2776 }
2777
2778 int journal_file_next_entry_for_data(
2779 JournalFile *f,
2780 Object *o, uint64_t p,
2781 uint64_t data_offset,
2782 direction_t direction,
2783 Object **ret, uint64_t *offset) {
2784
2785 uint64_t i, n, ofs;
2786 Object *d;
2787 int r;
2788
2789 assert(f);
2790 assert(p > 0 || !o);
2791
2792 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2793 if (r < 0)
2794 return r;
2795
2796 n = le64toh(d->data.n_entries);
2797 if (n <= 0)
2798 return n;
2799
2800 if (!o)
2801 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2802 else {
2803 if (o->object.type != OBJECT_ENTRY)
2804 return -EINVAL;
2805
2806 r = generic_array_bisect_plus_one(f,
2807 le64toh(d->data.entry_offset),
2808 le64toh(d->data.entry_array_offset),
2809 le64toh(d->data.n_entries),
2810 p,
2811 test_object_offset,
2812 DIRECTION_DOWN,
2813 NULL, NULL,
2814 &i);
2815
2816 if (r <= 0)
2817 return r;
2818
2819 r = bump_array_index(&i, direction, n);
2820 if (r <= 0)
2821 return r;
2822 }
2823
2824 for (;;) {
2825 r = generic_array_get_plus_one(f,
2826 le64toh(d->data.entry_offset),
2827 le64toh(d->data.entry_array_offset),
2828 i,
2829 ret, &ofs);
2830 if (r > 0)
2831 break;
2832 if (r != -EBADMSG)
2833 return r;
2834
2835 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2836
2837 r = bump_array_index(&i, direction, n);
2838 if (r <= 0)
2839 return r;
2840 }
2841
2842 /* Ensure our array is properly ordered. */
2843         if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2844                 log_debug("%s: data entry array not properly ordered at entry %" PRIu64, f->path, i);
2845 return -EBADMSG;
2846 }
2847
2848 if (offset)
2849 *offset = ofs;
2850
2851 return 1;
2852 }
2853
2854 int journal_file_move_to_entry_by_offset_for_data(
2855 JournalFile *f,
2856 uint64_t data_offset,
2857 uint64_t p,
2858 direction_t direction,
2859 Object **ret, uint64_t *offset) {
2860
2861 int r;
2862 Object *d;
2863
2864 assert(f);
2865
2866 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2867 if (r < 0)
2868 return r;
2869
2870 return generic_array_bisect_plus_one(f,
2871 le64toh(d->data.entry_offset),
2872 le64toh(d->data.entry_array_offset),
2873 le64toh(d->data.n_entries),
2874 p,
2875 test_object_offset,
2876 direction,
2877 ret, offset, NULL);
2878 }
2879
2880 int journal_file_move_to_entry_by_monotonic_for_data(
2881 JournalFile *f,
2882 uint64_t data_offset,
2883 sd_id128_t boot_id,
2884 uint64_t monotonic,
2885 direction_t direction,
2886 Object **ret, uint64_t *offset) {
2887
2888 Object *o, *d;
2889 int r;
2890 uint64_t b, z;
2891
2892 assert(f);
2893
2894 /* First, seek by time */
2895 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2896 if (r < 0)
2897 return r;
2898 if (r == 0)
2899 return -ENOENT;
2900
2901 r = generic_array_bisect_plus_one(f,
2902 le64toh(o->data.entry_offset),
2903 le64toh(o->data.entry_array_offset),
2904 le64toh(o->data.n_entries),
2905 monotonic,
2906 test_object_monotonic,
2907 direction,
2908 NULL, &z, NULL);
2909 if (r <= 0)
2910 return r;
2911
2912 /* And now, continue seeking until we find an entry that
2913 * exists in both bisection arrays */
2914
2915 for (;;) {
2916 Object *qo;
2917 uint64_t p, q;
2918
2919 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2920 if (r < 0)
2921 return r;
2922
2923 r = generic_array_bisect_plus_one(f,
2924 le64toh(d->data.entry_offset),
2925 le64toh(d->data.entry_array_offset),
2926 le64toh(d->data.n_entries),
2927 z,
2928 test_object_offset,
2929 direction,
2930 NULL, &p, NULL);
2931 if (r <= 0)
2932 return r;
2933
2934 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2935 if (r < 0)
2936 return r;
2937
2938 r = generic_array_bisect_plus_one(f,
2939 le64toh(o->data.entry_offset),
2940 le64toh(o->data.entry_array_offset),
2941 le64toh(o->data.n_entries),
2942 p,
2943 test_object_offset,
2944 direction,
2945 &qo, &q, NULL);
2946
2947 if (r <= 0)
2948 return r;
2949
2950 if (p == q) {
2951 if (ret)
2952 *ret = qo;
2953 if (offset)
2954 *offset = q;
2955
2956 return 1;
2957 }
2958
2959 z = q;
2960 }
2961 }
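/* Editor's note: a self-contained model (not upstream code) of the alternating bisection above: given two
 * sorted offset lists, keep seeking the current candidate in the other list until both lookups return the
 * same offset, which is then the first entry present in both. mini_lower_bound() refers to the sketch
 * shown after generic_array_bisect() above; all names are invented. */
#if 0 /* illustration only, never compiled */
#include <stddef.h>
#include <stdint.h>

/* Returns the first offset >= start contained in both sorted lists, or 0 if there is none
 * (journal offsets are never 0). */
static uint64_t mini_first_common(const uint64_t *a, size_t n_a,
                                  const uint64_t *b, size_t n_b,
                                  uint64_t start) {
        uint64_t z = start;

        for (;;) {
                size_t i = mini_lower_bound(a, n_a, z);
                if (i >= n_a)
                        return 0;

                size_t j = mini_lower_bound(b, n_b, a[i]);
                if (j >= n_b)
                        return 0;

                if (b[j] == a[i])
                        return a[i];

                z = b[j];   /* the candidate moved forward in 'b', retry from there in 'a' */
        }
}
#endif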
2962
2963 int journal_file_move_to_entry_by_seqnum_for_data(
2964 JournalFile *f,
2965 uint64_t data_offset,
2966 uint64_t seqnum,
2967 direction_t direction,
2968 Object **ret, uint64_t *offset) {
2969
2970 Object *d;
2971 int r;
2972
2973 assert(f);
2974
2975 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2976 if (r < 0)
2977 return r;
2978
2979 return generic_array_bisect_plus_one(f,
2980 le64toh(d->data.entry_offset),
2981 le64toh(d->data.entry_array_offset),
2982 le64toh(d->data.n_entries),
2983 seqnum,
2984 test_object_seqnum,
2985 direction,
2986 ret, offset, NULL);
2987 }
2988
2989 int journal_file_move_to_entry_by_realtime_for_data(
2990 JournalFile *f,
2991 uint64_t data_offset,
2992 uint64_t realtime,
2993 direction_t direction,
2994 Object **ret, uint64_t *offset) {
2995
2996 Object *d;
2997 int r;
2998
2999 assert(f);
3000
3001 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
3002 if (r < 0)
3003 return r;
3004
3005 return generic_array_bisect_plus_one(f,
3006 le64toh(d->data.entry_offset),
3007 le64toh(d->data.entry_array_offset),
3008 le64toh(d->data.n_entries),
3009 realtime,
3010 test_object_realtime,
3011 direction,
3012 ret, offset, NULL);
3013 }
3014
3015 void journal_file_dump(JournalFile *f) {
3016 Object *o;
3017 int r;
3018 uint64_t p;
3019
3020 assert(f);
3021 assert(f->header);
3022
3023 journal_file_print_header(f);
3024
3025 p = le64toh(f->header->header_size);
3026 while (p != 0) {
3027 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
3028 if (r < 0)
3029 goto fail;
3030
3031 switch (o->object.type) {
3032
3033 case OBJECT_UNUSED:
3034 printf("Type: OBJECT_UNUSED\n");
3035 break;
3036
3037 case OBJECT_DATA:
3038 printf("Type: OBJECT_DATA\n");
3039 break;
3040
3041 case OBJECT_FIELD:
3042 printf("Type: OBJECT_FIELD\n");
3043 break;
3044
3045 case OBJECT_ENTRY:
3046 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3047 le64toh(o->entry.seqnum),
3048 le64toh(o->entry.monotonic),
3049 le64toh(o->entry.realtime));
3050 break;
3051
3052 case OBJECT_FIELD_HASH_TABLE:
3053 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3054 break;
3055
3056 case OBJECT_DATA_HASH_TABLE:
3057 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3058 break;
3059
3060 case OBJECT_ENTRY_ARRAY:
3061 printf("Type: OBJECT_ENTRY_ARRAY\n");
3062 break;
3063
3064 case OBJECT_TAG:
3065 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3066 le64toh(o->tag.seqnum),
3067 le64toh(o->tag.epoch));
3068 break;
3069
3070 default:
3071 printf("Type: unknown (%i)\n", o->object.type);
3072 break;
3073 }
3074
3075 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3076 printf("Flags: %s\n",
3077 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3078
3079 if (p == le64toh(f->header->tail_object_offset))
3080 p = 0;
3081 else
3082 p = p + ALIGN64(le64toh(o->object.size));
3083 }
3084
3085 return;
3086 fail:
3087 log_error("File corrupt");
3088 }
3089
3090 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3091 const char *x;
3092
3093 x = format_timestamp(buf, l, t);
3094 if (x)
3095 return x;
3096 return " --- ";
3097 }
3098
3099 void journal_file_print_header(JournalFile *f) {
3100 char a[33], b[33], c[33], d[33];
3101 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
3102 struct stat st;
3103 char bytes[FORMAT_BYTES_MAX];
3104
3105 assert(f);
3106 assert(f->header);
3107
3108 printf("File Path: %s\n"
3109 "File ID: %s\n"
3110 "Machine ID: %s\n"
3111 "Boot ID: %s\n"
3112 "Sequential Number ID: %s\n"
3113 "State: %s\n"
3114 "Compatible Flags:%s%s\n"
3115 "Incompatible Flags:%s%s%s\n"
3116 "Header size: %"PRIu64"\n"
3117 "Arena size: %"PRIu64"\n"
3118 "Data Hash Table Size: %"PRIu64"\n"
3119 "Field Hash Table Size: %"PRIu64"\n"
3120 "Rotate Suggested: %s\n"
3121 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3122 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3123 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3124 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3125 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
3126 "Objects: %"PRIu64"\n"
3127 "Entry Objects: %"PRIu64"\n",
3128 f->path,
3129 sd_id128_to_string(f->header->file_id, a),
3130 sd_id128_to_string(f->header->machine_id, b),
3131 sd_id128_to_string(f->header->boot_id, c),
3132 sd_id128_to_string(f->header->seqnum_id, d),
3133 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3134 f->header->state == STATE_ONLINE ? "ONLINE" :
3135 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3136 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3137 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3138 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3139 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3140 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3141 le64toh(f->header->header_size),
3142 le64toh(f->header->arena_size),
3143 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3144 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3145 yes_no(journal_file_rotate_suggested(f, 0)),
3146 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3147 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3148 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3149 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3150 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3151 le64toh(f->header->n_objects),
3152 le64toh(f->header->n_entries));
3153
3154 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3155 printf("Data Objects: %"PRIu64"\n"
3156 "Data Hash Table Fill: %.1f%%\n",
3157 le64toh(f->header->n_data),
3158 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3159
3160 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3161 printf("Field Objects: %"PRIu64"\n"
3162 "Field Hash Table Fill: %.1f%%\n",
3163 le64toh(f->header->n_fields),
3164 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3165
3166 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3167 printf("Tag Objects: %"PRIu64"\n",
3168 le64toh(f->header->n_tags));
3169 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3170 printf("Entry Array Objects: %"PRIu64"\n",
3171 le64toh(f->header->n_entry_arrays));
3172
3173 if (fstat(f->fd, &st) >= 0)
3174 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
3175 }
3176
3177 static int journal_file_warn_btrfs(JournalFile *f) {
3178 unsigned attrs;
3179 int r;
3180
3181 assert(f);
3182
3183         /* Before we write anything, check if the COW logic is turned
3184          * off on btrfs. Given our write pattern, which is quite
3185          * unfriendly to COW file systems, turning COW off should
3186          * greatly improve performance on such file systems, e.g.
3187          * btrfs, at the expense of data integrity features (which
3188          * shouldn't be too bad, given that we do our own checksumming). */
3189
3190 r = btrfs_is_filesystem(f->fd);
3191 if (r < 0)
3192 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3193 if (!r)
3194 return 0;
3195
3196 r = read_attr_fd(f->fd, &attrs);
3197 if (r < 0)
3198 return log_warning_errno(r, "Failed to read file attributes: %m");
3199
3200 if (attrs & FS_NOCOW_FL) {
3201 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3202 return 0;
3203 }
3204
3205 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3206 "This is likely to slow down journal access substantially, please consider turning "
3207 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3208
3209 return 1;
3210 }
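/* Editor's note: a hedged sketch (not upstream code) of what "chattr +C" does under the hood, i.e. how the
 * NOCOW attribute checked above can be set programmatically via the FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls.
 * The error handling and the function name are illustrative only, and on btrfs the flag only takes effect
 * on empty files or newly created directories. */
#if 0 /* illustration only, never compiled */
#include <errno.h>
#include <linux/fs.h>
#include <sys/ioctl.h>

static int example_set_nocow(int fd) {
        unsigned attrs;

        if (ioctl(fd, FS_IOC_GETFLAGS, &attrs) < 0)
                return -errno;

        if (attrs & FS_NOCOW_FL)
                return 0;   /* already set, nothing to do */

        attrs |= FS_NOCOW_FL;
        if (ioctl(fd, FS_IOC_SETFLAGS, &attrs) < 0)
                return -errno;

        return 1;
}
#endif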
3211
3212 int journal_file_open(
3213 int fd,
3214 const char *fname,
3215 int flags,
3216 mode_t mode,
3217 bool compress,
3218 uint64_t compress_threshold_bytes,
3219 bool seal,
3220 JournalMetrics *metrics,
3221 MMapCache *mmap_cache,
3222 Set *deferred_closes,
3223 JournalFile *template,
3224 JournalFile **ret) {
3225
3226 bool newly_created = false;
3227 JournalFile *f;
3228 void *h;
3229 int r;
3230 char bytes[FORMAT_BYTES_MAX];
3231
3232 assert(ret);
3233 assert(fd >= 0 || fname);
3234
3235 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
3236 return -EINVAL;
3237
3238 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3239 return -EINVAL;
3240
3241 f = new0(JournalFile, 1);
3242 if (!f)
3243 return -ENOMEM;
3244
3245 f->fd = fd;
3246 f->mode = mode;
3247
3248 f->flags = flags;
3249 f->prot = prot_from_flags(flags);
3250 f->writable = (flags & O_ACCMODE) != O_RDONLY;
3251 #if HAVE_LZ4
3252 f->compress_lz4 = compress;
3253 #elif HAVE_XZ
3254 f->compress_xz = compress;
3255 #endif
3256
3257 if (compress_threshold_bytes == (uint64_t) -1)
3258 f->compress_threshold_bytes = DEFAULT_COMPRESS_THRESHOLD;
3259 else
3260 f->compress_threshold_bytes = MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes);
3261
3262 #if HAVE_GCRYPT
3263 f->seal = seal;
3264 #endif
3265
3266 log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3267 yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
3268 format_bytes(bytes, sizeof(bytes), f->compress_threshold_bytes));
3269
3270 if (mmap_cache)
3271 f->mmap = mmap_cache_ref(mmap_cache);
3272 else {
3273 f->mmap = mmap_cache_new();
3274 if (!f->mmap) {
3275 r = -ENOMEM;
3276 goto fail;
3277 }
3278 }
3279
3280 if (fname) {
3281 f->path = strdup(fname);
3282 if (!f->path) {
3283 r = -ENOMEM;
3284 goto fail;
3285 }
3286 } else {
3287 assert(fd >= 0);
3288
3289 /* If we don't know the path, fill in something explanatory and vaguely useful */
3290                 if (asprintf(&f->path, "/proc/self/fd/%i", fd) < 0) {
3291 r = -ENOMEM;
3292 goto fail;
3293 }
3294 }
3295
3296 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3297 if (!f->chain_cache) {
3298 r = -ENOMEM;
3299 goto fail;
3300 }
3301
3302 if (f->fd < 0) {
3303                 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3304                  * or so, we likely fail quickly rather than block for long. For regular files O_NONBLOCK has no effect,
3305                  * hence it doesn't hurt in that case. */
3306
3307 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
3308 if (f->fd < 0) {
3309 r = -errno;
3310 goto fail;
3311 }
3312
3313 /* fds we opened here by us should also be closed by us. */
3314 f->close_fd = true;
3315
3316 r = fd_nonblock(f->fd, false);
3317 if (r < 0)
3318 goto fail;
3319 }
3320
3321 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3322 if (!f->cache_fd) {
3323 r = -ENOMEM;
3324 goto fail;
3325 }
3326
3327 r = journal_file_fstat(f);
3328 if (r < 0)
3329 goto fail;
3330
3331 if (f->last_stat.st_size == 0 && f->writable) {
3332
3333 (void) journal_file_warn_btrfs(f);
3334
3335 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3336 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3337 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3338 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3339 * solely on mtime/atime/ctime of the file. */
3340 (void) fd_setcrtime(f->fd, 0);
3341
3342 #if HAVE_GCRYPT
3343 /* Try to load the FSPRG state, and if we can't, then
3344 * just don't do sealing */
3345 if (f->seal) {
3346 r = journal_file_fss_load(f);
3347 if (r < 0)
3348 f->seal = false;
3349 }
3350 #endif
3351
3352 r = journal_file_init_header(f, template);
3353 if (r < 0)
3354 goto fail;
3355
3356 r = journal_file_fstat(f);
3357 if (r < 0)
3358 goto fail;
3359
3360 newly_created = true;
3361 }
3362
3363 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3364 r = -ENODATA;
3365 goto fail;
3366 }
3367
3368 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
3369 if (r < 0)
3370 goto fail;
3371
3372 f->header = h;
3373
3374 if (!newly_created) {
3375 set_clear_with_destructor(deferred_closes, journal_file_close);
3376
3377 r = journal_file_verify_header(f);
3378 if (r < 0)
3379 goto fail;
3380 }
3381
3382 #if HAVE_GCRYPT
3383 if (!newly_created && f->writable) {
3384 r = journal_file_fss_load(f);
3385 if (r < 0)
3386 goto fail;
3387 }
3388 #endif
3389
3390 if (f->writable) {
3391 if (metrics) {
3392 journal_default_metrics(metrics, f->fd);
3393 f->metrics = *metrics;
3394 } else if (template)
3395 f->metrics = template->metrics;
3396
3397 r = journal_file_refresh_header(f);
3398 if (r < 0)
3399 goto fail;
3400 }
3401
3402 #if HAVE_GCRYPT
3403 r = journal_file_hmac_setup(f);
3404 if (r < 0)
3405 goto fail;
3406 #endif
3407
3408 if (newly_created) {
3409 r = journal_file_setup_field_hash_table(f);
3410 if (r < 0)
3411 goto fail;
3412
3413 r = journal_file_setup_data_hash_table(f);
3414 if (r < 0)
3415 goto fail;
3416
3417 #if HAVE_GCRYPT
3418 r = journal_file_append_first_tag(f);
3419 if (r < 0)
3420 goto fail;
3421 #endif
3422 }
3423
3424 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
3425 r = -EIO;
3426 goto fail;
3427 }
3428
3429 if (template && template->post_change_timer) {
3430 r = journal_file_enable_post_change_timer(
3431 f,
3432 sd_event_source_get_event(template->post_change_timer),
3433 template->post_change_timer_period);
3434
3435 if (r < 0)
3436 goto fail;
3437 }
3438
3439 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3440 f->close_fd = true;
3441
3442 *ret = f;
3443 return 0;
3444
3445 fail:
3446 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
3447 r = -EIO;
3448
3449 (void) journal_file_close(f);
3450
3451 return r;
3452 }
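/* Editor's note: a hedged usage sketch (not upstream code) showing a typical read-only open of an existing
 * journal file through the function above. Passing (uint64_t) -1 requests the default compression
 * threshold; metrics, mmap cache, deferred closes and template are simply left unset. The wrapper name is
 * invented. */
#if 0 /* illustration only, never compiled */
static int example_open_readonly(const char *path, JournalFile **ret) {
        return journal_file_open(
                        /* fd= */ -1,
                        path,
                        O_RDONLY,
                        0640,
                        /* compress= */ false,
                        /* compress_threshold_bytes= */ (uint64_t) -1,
                        /* seal= */ false,
                        /* metrics= */ NULL,
                        /* mmap_cache= */ NULL,
                        /* deferred_closes= */ NULL,
                        /* template= */ NULL,
                        ret);
}
#endif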
3453
3454 int journal_file_rotate(JournalFile **f, bool compress, uint64_t compress_threshold_bytes, bool seal, Set *deferred_closes) {
3455 _cleanup_free_ char *p = NULL;
3456 size_t l;
3457 JournalFile *old_file, *new_file = NULL;
3458 int r;
3459
3460 assert(f);
3461 assert(*f);
3462
3463 old_file = *f;
3464
3465 if (!old_file->writable)
3466 return -EINVAL;
3467
3468         /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3469          * rotation, since we don't know the actual path and hence couldn't rename the file. */
3470 if (path_startswith(old_file->path, "/proc/self/fd"))
3471 return -EINVAL;
3472
3473 if (!endswith(old_file->path, ".journal"))
3474 return -EINVAL;
3475
3476 l = strlen(old_file->path);
3477 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3478 (int) l - 8, old_file->path,
3479 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3480 le64toh((*f)->header->head_entry_seqnum),
3481 le64toh((*f)->header->head_entry_realtime));
3482 if (r < 0)
3483 return -ENOMEM;
3484
3485 /* Try to rename the file to the archived version. If the file
3486 * already was deleted, we'll get ENOENT, let's ignore that
3487 * case. */
3488 r = rename(old_file->path, p);
3489 if (r < 0 && errno != ENOENT)
3490 return -errno;
3491
3492 /* Sync the rename to disk */
3493 (void) fsync_directory_of_file(old_file->fd);
3494
3495 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3496 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3497 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3498 * would result in the rotated journal never getting fsync() called before closing.
3499 * Now we simply queue the archive state by setting an archive bit, leaving the state
3500 * as STATE_ONLINE so proper offlining occurs. */
3501 old_file->archive = true;
3502
3503         /* Currently, btrfs is not very good with our write patterns
3504          * and fragments heavily. Let's defrag our journal files when
3505          * we archive them. */
3506 old_file->defrag_on_close = true;
3507
3508 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress,
3509 compress_threshold_bytes, seal, NULL, old_file->mmap, deferred_closes,
3510 old_file, &new_file);
3511
3512 if (deferred_closes &&
3513 set_put(deferred_closes, old_file) >= 0)
3514 (void) journal_file_set_offline(old_file, false);
3515 else
3516 (void) journal_file_close(old_file);
3517
3518 *f = new_file;
3519 return r;
3520 }
3521
3522 int journal_file_open_reliably(
3523 const char *fname,
3524 int flags,
3525 mode_t mode,
3526 bool compress,
3527 uint64_t compress_threshold_bytes,
3528 bool seal,
3529 JournalMetrics *metrics,
3530 MMapCache *mmap_cache,
3531 Set *deferred_closes,
3532 JournalFile *template,
3533 JournalFile **ret) {
3534
3535 int r;
3536 size_t l;
3537 _cleanup_free_ char *p = NULL;
3538
3539 r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3540 deferred_closes, template, ret);
3541 if (!IN_SET(r,
3542 -EBADMSG, /* Corrupted */
3543 -ENODATA, /* Truncated */
3544 -EHOSTDOWN, /* Other machine */
3545 -EPROTONOSUPPORT, /* Incompatible feature */
3546 -EBUSY, /* Unclean shutdown */
3547 -ESHUTDOWN, /* Already archived */
3548 -EIO, /* IO error, including SIGBUS on mmap */
3549 -EIDRM, /* File has been deleted */
3550 -ETXTBSY)) /* File is from the future */
3551 return r;
3552
3553 if ((flags & O_ACCMODE) == O_RDONLY)
3554 return r;
3555
3556 if (!(flags & O_CREAT))
3557 return r;
3558
3559 if (!endswith(fname, ".journal"))
3560 return r;
3561
3562 /* The file is corrupted. Rotate it away and try it again (but only once) */
3563
3564 l = strlen(fname);
3565 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
3566 (int) l - 8, fname,
3567 now(CLOCK_REALTIME),
3568 random_u64()) < 0)
3569 return -ENOMEM;
3570
3571 if (rename(fname, p) < 0)
3572 return -errno;
3573
3574 /* btrfs doesn't cope well with our write pattern and
3575 * fragments heavily. Let's defrag all files we rotate */
3576
3577 (void) chattr_path(p, 0, FS_NOCOW_FL);
3578 (void) btrfs_defrag(p);
3579
3580 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3581
3582 return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3583 deferred_closes, template, ret);
3584 }
3585
3586 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
3587 uint64_t i, n;
3588 uint64_t q, xor_hash = 0;
3589 int r;
3590 EntryItem *items;
3591 dual_timestamp ts;
3592 const sd_id128_t *boot_id;
3593
3594 assert(from);
3595 assert(to);
3596 assert(o);
3597 assert(p);
3598
3599 if (!to->writable)
3600 return -EPERM;
3601
3602 ts.monotonic = le64toh(o->entry.monotonic);
3603 ts.realtime = le64toh(o->entry.realtime);
3604 boot_id = &o->entry.boot_id;
3605
3606 n = journal_file_entry_n_items(o);
3607 /* alloca() can't take 0, hence let's allocate at least one */
3608 items = newa(EntryItem, MAX(1u, n));
3609
3610 for (i = 0; i < n; i++) {
3611 uint64_t l, h;
3612 le64_t le_hash;
3613 size_t t;
3614 void *data;
3615 Object *u;
3616
3617 q = le64toh(o->entry.items[i].object_offset);
3618 le_hash = o->entry.items[i].hash;
3619
3620 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3621 if (r < 0)
3622 return r;
3623
3624 if (le_hash != o->data.hash)
3625 return -EBADMSG;
3626
3627 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3628 t = (size_t) l;
3629
3630 /* We hit the limit on 32bit machines */
3631 if ((uint64_t) t != l)
3632 return -E2BIG;
3633
3634 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3635 #if HAVE_XZ || HAVE_LZ4
3636 size_t rsize = 0;
3637
3638 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3639 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3640 if (r < 0)
3641 return r;
3642
3643 data = from->compress_buffer;
3644 l = rsize;
3645 #else
3646 return -EPROTONOSUPPORT;
3647 #endif
3648 } else
3649 data = o->data.payload;
3650
3651 r = journal_file_append_data(to, data, l, &u, &h);
3652 if (r < 0)
3653 return r;
3654
3655 xor_hash ^= le64toh(u->data.hash);
3656 items[i].object_offset = htole64(h);
3657 items[i].hash = u->data.hash;
3658
3659 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3660 if (r < 0)
3661 return r;
3662 }
3663
3664 r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n,
3665 NULL, NULL, NULL);
3666
3667 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3668 return -EIO;
3669
3670 return r;
3671 }
3672
3673 void journal_reset_metrics(JournalMetrics *m) {
3674 assert(m);
3675
3676 /* Set everything to "pick automatic values". */
3677
3678 *m = (JournalMetrics) {
3679 .min_use = (uint64_t) -1,
3680 .max_use = (uint64_t) -1,
3681 .min_size = (uint64_t) -1,
3682 .max_size = (uint64_t) -1,
3683 .keep_free = (uint64_t) -1,
3684 .n_max_files = (uint64_t) -1,
3685 };
3686 }
3687
3688 void journal_default_metrics(JournalMetrics *m, int fd) {
3689 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3690 struct statvfs ss;
3691 uint64_t fs_size;
3692
3693 assert(m);
3694 assert(fd >= 0);
3695
3696 if (fstatvfs(fd, &ss) >= 0)
3697 fs_size = ss.f_frsize * ss.f_blocks;
3698 else {
3699 log_debug_errno(errno, "Failed to determine disk size: %m");
3700 fs_size = 0;
3701 }
3702
3703 if (m->max_use == (uint64_t) -1) {
3704
3705 if (fs_size > 0) {
3706 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3707
3708 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3709 m->max_use = DEFAULT_MAX_USE_UPPER;
3710
3711 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3712 m->max_use = DEFAULT_MAX_USE_LOWER;
3713 } else
3714 m->max_use = DEFAULT_MAX_USE_LOWER;
3715 } else {
3716 m->max_use = PAGE_ALIGN(m->max_use);
3717
3718 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3719 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3720 }
3721
3722 if (m->min_use == (uint64_t) -1)
3723 m->min_use = DEFAULT_MIN_USE;
3724
3725 if (m->min_use > m->max_use)
3726 m->min_use = m->max_use;
3727
3728 if (m->max_size == (uint64_t) -1) {
3729 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3730
3731 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3732 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3733 } else
3734 m->max_size = PAGE_ALIGN(m->max_size);
3735
3736 if (m->max_size != 0) {
3737 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3738 m->max_size = JOURNAL_FILE_SIZE_MIN;
3739
3740 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3741 m->max_use = m->max_size*2;
3742 }
3743
3744 if (m->min_size == (uint64_t) -1)
3745 m->min_size = JOURNAL_FILE_SIZE_MIN;
3746 else {
3747 m->min_size = PAGE_ALIGN(m->min_size);
3748
3749 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3750 m->min_size = JOURNAL_FILE_SIZE_MIN;
3751
3752 if (m->max_size != 0 && m->min_size > m->max_size)
3753 m->max_size = m->min_size;
3754 }
3755
3756 if (m->keep_free == (uint64_t) -1) {
3757
3758 if (fs_size > 0) {
3759 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3760
3761 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3762 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3763
3764 } else
3765 m->keep_free = DEFAULT_KEEP_FREE;
3766 }
3767
3768 if (m->n_max_files == (uint64_t) -1)
3769 m->n_max_files = DEFAULT_N_MAX_FILES;
3770
3771 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3772 format_bytes(a, sizeof(a), m->min_use),
3773 format_bytes(b, sizeof(b), m->max_use),
3774 format_bytes(c, sizeof(c), m->max_size),
3775 format_bytes(d, sizeof(d), m->min_size),
3776 format_bytes(e, sizeof(e), m->keep_free),
3777 m->n_max_files);
3778 }
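/* Editor's note: a worked example of the defaults above, assuming a 100 GiB file system and all metrics
 * left at "automatic" ((uint64_t) -1):
 *   max_use   = 10% of 100 GiB = 10 GiB  -> clamped to DEFAULT_MAX_USE_UPPER   = 4 GiB
 *   max_size  = max_use / 8    = 512 MiB -> clamped to DEFAULT_MAX_SIZE_UPPER  = 128 MiB
 *   min_size  = JOURNAL_FILE_SIZE_MIN    = 512 KiB
 *   min_use   = DEFAULT_MIN_USE          = 1 MiB
 *   keep_free = 15% of 100 GiB = 15 GiB  -> clamped to DEFAULT_KEEP_FREE_UPPER = 4 GiB
 *   n_max_files = DEFAULT_N_MAX_FILES    = 100 */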
3779
3780 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3781 assert(f);
3782 assert(f->header);
3783 assert(from || to);
3784
3785 if (from) {
3786 if (f->header->head_entry_realtime == 0)
3787 return -ENOENT;
3788
3789 *from = le64toh(f->header->head_entry_realtime);
3790 }
3791
3792 if (to) {
3793 if (f->header->tail_entry_realtime == 0)
3794 return -ENOENT;
3795
3796 *to = le64toh(f->header->tail_entry_realtime);
3797 }
3798
3799 return 1;
3800 }
3801
3802 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3803 Object *o;
3804 uint64_t p;
3805 int r;
3806
3807 assert(f);
3808 assert(from || to);
3809
3810 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3811 if (r <= 0)
3812 return r;
3813
3814 if (le64toh(o->data.n_entries) <= 0)
3815 return 0;
3816
3817 if (from) {
3818 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3819 if (r < 0)
3820 return r;
3821
3822 *from = le64toh(o->entry.monotonic);
3823 }
3824
3825 if (to) {
3826 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3827 if (r < 0)
3828 return r;
3829
3830 r = generic_array_get_plus_one(f,
3831 le64toh(o->data.entry_offset),
3832 le64toh(o->data.entry_array_offset),
3833 le64toh(o->data.n_entries)-1,
3834 &o, NULL);
3835 if (r <= 0)
3836 return r;
3837
3838 *to = le64toh(o->entry.monotonic);
3839 }
3840
3841 return 1;
3842 }
3843
3844 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3845 assert(f);
3846 assert(f->header);
3847
3848 /* If we gained new header fields we gained new features,
3849 * hence suggest a rotation */
3850 if (le64toh(f->header->header_size) < sizeof(Header)) {
3851 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3852 return true;
3853 }
3854
3855 /* Let's check if the hash tables grew over a certain fill
3856 * level (75%, borrowing this value from Java's hash table
3857 * implementation), and if so suggest a rotation. To calculate
3858 * the fill level we need the n_data field, which only exists
3859 * in newer versions. */
3860
3861 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3862 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3863 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3864 f->path,
3865 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3866 le64toh(f->header->n_data),
3867 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3868 (unsigned long long) f->last_stat.st_size,
3869 f->last_stat.st_size / le64toh(f->header->n_data));
3870 return true;
3871 }
3872
3873 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3874 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3875 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3876 f->path,
3877 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3878 le64toh(f->header->n_fields),
3879 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3880 return true;
3881 }
3882
3883 /* Are the data objects properly indexed by field objects? */
3884 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3885 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3886 le64toh(f->header->n_data) > 0 &&
3887 le64toh(f->header->n_fields) == 0)
3888 return true;
3889
3890 if (max_file_usec > 0) {
3891 usec_t t, h;
3892
3893 h = le64toh(f->header->head_entry_realtime);
3894 t = now(CLOCK_REALTIME);
3895
3896 if (h > 0 && t > h + max_file_usec)
3897 return true;
3898 }
3899
3900 return false;
3901 }
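/* Editor's note: the 4:3 comparisons above are an integer-only way of expressing the 75% fill-level check;
 * a minimal restatement (illustrative only, invented name): */
#if 0 /* illustration only, never compiled */
#include <stdbool.h>
#include <stdint.h>

/* True if n_items/n_buckets exceeds 3/4, without using floating point. */
static bool mini_hash_table_over_75_percent(uint64_t n_items, uint64_t n_buckets) {
        return n_buckets > 0 && n_items * 4 > n_buckets * 3;
}
#endif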