]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
pkgconfig: define variables relative to ${prefix}/${rootprefix}/${sysconfdir}
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/fs.h>
6 #include <pthread.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/statvfs.h>
10 #include <sys/uio.h>
11 #include <unistd.h>
12
13 #include "alloc-util.h"
14 #include "btrfs-util.h"
15 #include "chattr-util.h"
16 #include "compress.h"
17 #include "fd-util.h"
18 #include "fs-util.h"
19 #include "journal-authenticate.h"
20 #include "journal-def.h"
21 #include "journal-file.h"
22 #include "lookup3.h"
23 #include "parse-util.h"
24 #include "path-util.h"
25 #include "random-util.h"
26 #include "sd-event.h"
27 #include "set.h"
28 #include "stat-util.h"
29 #include "string-util.h"
30 #include "strv.h"
31 #include "xattr-util.h"
32
33 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem)) /* room for 2047 hash items */
34 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem)) /* room for 333 hash items */
35
36 #define DEFAULT_COMPRESS_THRESHOLD (512ULL)
37 #define MIN_COMPRESS_THRESHOLD (8ULL)
38
39 /* This is the minimum journal file size */
40 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
41
42 /* These are the lower and upper bounds if we deduce the max_use value
43 * from the file system size */
44 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
45 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
46
47 /* This is the default minimal use limit, i.e. how much we'll use even if keep_free suggests otherwise. */
48 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
49
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
52
53 /* This is the upper bound if we deduce the keep_free value from the
54 * file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
57 /* This is the keep_free value when we can't determine the file system
58 * size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
60
61 /* This is the default maximum number of journal files to keep around. */
62 #define DEFAULT_N_MAX_FILES (100)
63
64 /* n_data was the first entry we added after the initial file format design */
65 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
66
67 /* How many entries to keep in the entry array chain cache at max */
68 #define CHAIN_CACHE_MAX 20
69
70 /* How much to increase the journal file size at once each time we allocate something new. */
71 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
72
73 /* Reread fstat() of the file for detecting deletions at least this often */
74 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
75
76 /* The mmap context to use for the header; we pick one above the last defined object type */
77 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
78
79 #ifdef __clang__
80 # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
81 #endif
82
83 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
84 * As a result we use atomic operations on f->offline_state for inter-thread communications with
85 * journal_file_set_offline() and journal_file_set_online(). */
86 static void journal_file_set_offline_internal(JournalFile *f) {
87 assert(f);
88 assert(f->fd >= 0);
89 assert(f->header);
90
91 for (;;) {
92 switch (f->offline_state) {
93 case OFFLINE_CANCEL:
94 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
95 continue;
96 return;
97
98 case OFFLINE_AGAIN_FROM_SYNCING:
99 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
100 continue;
101 break;
102
103 case OFFLINE_AGAIN_FROM_OFFLINING:
104 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
105 continue;
106 break;
107
108 case OFFLINE_SYNCING:
109 (void) fsync(f->fd);
110
111 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
112 continue;
113
114 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
115 (void) fsync(f->fd);
116 break;
117
118 case OFFLINE_OFFLINING:
119 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
120 continue;
121 _fallthrough_;
122 case OFFLINE_DONE:
123 return;
124
125 case OFFLINE_JOINED:
126 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
127 return;
128 }
129 }
130 }
131
/* pthread start routine: names the calling thread "journal-offline" and runs the offline state machine on the
 * JournalFile handed in as the thread argument. Always returns NULL. */
static void * journal_file_set_offline_thread(void *arg) {
        (void) pthread_setname_np(pthread_self(), "journal-offline");
        journal_file_set_offline_internal(arg);
        return NULL;
}
141
142 static int journal_file_set_offline_thread_join(JournalFile *f) {
143 int r;
144
145 assert(f);
146
147 if (f->offline_state == OFFLINE_JOINED)
148 return 0;
149
150 r = pthread_join(f->offline_thread, NULL);
151 if (r)
152 return -r;
153
154 f->offline_state = OFFLINE_JOINED;
155
156 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
157 return -EIO;
158
159 return 0;
160 }
161
162 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
163 static bool journal_file_set_offline_try_restart(JournalFile *f) {
164 for (;;) {
165 switch (f->offline_state) {
166 case OFFLINE_AGAIN_FROM_SYNCING:
167 case OFFLINE_AGAIN_FROM_OFFLINING:
168 return true;
169
170 case OFFLINE_CANCEL:
171 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
172 continue;
173 return true;
174
175 case OFFLINE_SYNCING:
176 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
177 continue;
178 return true;
179
180 case OFFLINE_OFFLINING:
181 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
182 continue;
183 return true;
184
185 default:
186 return false;
187 }
188 }
189 }
190
191 /* Sets a journal offline.
192 *
193 * If wait is false then an offline is dispatched in a separate thread for a
194 * subsequent journal_file_set_offline() or journal_file_set_online() of the
195 * same journal to synchronize with.
196 *
197 * If wait is true, then either an existing offline thread will be restarted
198 * and joined, or if none exists the offline is simply performed in this
199 * context without involving another thread.
200 */
201 int journal_file_set_offline(JournalFile *f, bool wait) {
202 bool restarted;
203 int r;
204
205 assert(f);
206
207 if (!f->writable)
208 return -EPERM;
209
210 if (f->fd < 0 || !f->header)
211 return -EINVAL;
212
213 /* An offlining journal is implicitly online and may modify f->header->state,
214 * we must also join any potentially lingering offline thread when not online. */
215 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
216 return journal_file_set_offline_thread_join(f);
217
218 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
219 restarted = journal_file_set_offline_try_restart(f);
220 if ((restarted && wait) || !restarted) {
221 r = journal_file_set_offline_thread_join(f);
222 if (r < 0)
223 return r;
224 }
225
226 if (restarted)
227 return 0;
228
229 /* Initiate a new offline. */
230 f->offline_state = OFFLINE_SYNCING;
231
232 if (wait) /* Without using a thread if waiting. */
233 journal_file_set_offline_internal(f);
234 else {
235 sigset_t ss, saved_ss;
236 int k;
237
238 if (sigfillset(&ss) < 0)
239 return -errno;
240
241 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
242 if (r > 0)
243 return -r;
244
245 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
246
247 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
248 if (r > 0) {
249 f->offline_state = OFFLINE_JOINED;
250 return -r;
251 }
252 if (k > 0)
253 return -k;
254 }
255
256 return 0;
257 }
258
259 static int journal_file_set_online(JournalFile *f) {
260 bool wait = true;
261
262 assert(f);
263
264 if (!f->writable)
265 return -EPERM;
266
267 if (f->fd < 0 || !f->header)
268 return -EINVAL;
269
270 while (wait) {
271 switch (f->offline_state) {
272 case OFFLINE_JOINED:
273 /* No offline thread, no need to wait. */
274 wait = false;
275 break;
276
277 case OFFLINE_SYNCING:
278 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
279 continue;
280 /* Canceled syncing prior to offlining, no need to wait. */
281 wait = false;
282 break;
283
284 case OFFLINE_AGAIN_FROM_SYNCING:
285 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
286 continue;
287 /* Canceled restart from syncing, no need to wait. */
288 wait = false;
289 break;
290
291 case OFFLINE_AGAIN_FROM_OFFLINING:
292 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
293 continue;
294 /* Canceled restart from offlining, must wait for offlining to complete however. */
295 _fallthrough_;
296 default: {
297 int r;
298
299 r = journal_file_set_offline_thread_join(f);
300 if (r < 0)
301 return r;
302
303 wait = false;
304 break;
305 }
306 }
307 }
308
309 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
310 return -EIO;
311
312 switch (f->header->state) {
313 case STATE_ONLINE:
314 return 0;
315
316 case STATE_OFFLINE:
317 f->header->state = STATE_ONLINE;
318 (void) fsync(f->fd);
319 return 0;
320
321 default:
322 return -EINVAL;
323 }
324 }
325
326 bool journal_file_is_offlining(JournalFile *f) {
327 assert(f);
328
329 __sync_synchronize();
330
331 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
332 return false;
333
334 return true;
335 }
336
337 JournalFile* journal_file_close(JournalFile *f) {
338 assert(f);
339
340 #if HAVE_GCRYPT
341 /* Write the final tag */
342 if (f->seal && f->writable) {
343 int r;
344
345 r = journal_file_append_tag(f);
346 if (r < 0)
347 log_error_errno(r, "Failed to append tag when closing journal: %m");
348 }
349 #endif
350
351 if (f->post_change_timer) {
352 if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 0)
353 journal_file_post_change(f);
354
355 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
356 sd_event_source_unref(f->post_change_timer);
357 }
358
359 journal_file_set_offline(f, true);
360
361 if (f->mmap && f->cache_fd)
362 mmap_cache_free_fd(f->mmap, f->cache_fd);
363
364 if (f->fd >= 0 && f->defrag_on_close) {
365
366 /* Be friendly to btrfs: turn COW back on again now,
367 * and defragment the file. We won't write to the file
368 * ever again, hence remove all fragmentation, and
369 * reenable all the good bits COW usually provides
370 * (such as data checksumming). */
371
372 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL);
373 (void) btrfs_defrag_fd(f->fd);
374 }
375
376 if (f->close_fd)
377 safe_close(f->fd);
378 free(f->path);
379
380 mmap_cache_unref(f->mmap);
381
382 ordered_hashmap_free_free(f->chain_cache);
383
384 #if HAVE_XZ || HAVE_LZ4
385 free(f->compress_buffer);
386 #endif
387
388 #if HAVE_GCRYPT
389 if (f->fss_file)
390 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
391 else
392 free(f->fsprg_state);
393
394 free(f->fsprg_seed);
395
396 if (f->hmac)
397 gcry_md_close(f->hmac);
398 #endif
399
400 return mfree(f);
401 }
402
403 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
404 Header h = {};
405 ssize_t k;
406 int r;
407
408 assert(f);
409
410 memcpy(h.signature, HEADER_SIGNATURE, 8);
411 h.header_size = htole64(ALIGN64(sizeof(h)));
412
413 h.incompatible_flags |= htole32(
414 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
415 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
416
417 h.compatible_flags = htole32(
418 f->seal * HEADER_COMPATIBLE_SEALED);
419
420 r = sd_id128_randomize(&h.file_id);
421 if (r < 0)
422 return r;
423
424 if (template) {
425 h.seqnum_id = template->header->seqnum_id;
426 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
427 } else
428 h.seqnum_id = h.file_id;
429
430 k = pwrite(f->fd, &h, sizeof(h), 0);
431 if (k < 0)
432 return -errno;
433
434 if (k != sizeof(h))
435 return -EIO;
436
437 return 0;
438 }
439
440 static int journal_file_refresh_header(JournalFile *f) {
441 sd_id128_t boot_id;
442 int r;
443
444 assert(f);
445 assert(f->header);
446
447 r = sd_id128_get_machine(&f->header->machine_id);
448 if (IN_SET(r, -ENOENT, -ENOMEDIUM))
449 /* We don't have a machine-id, let's continue without */
450 zero(f->header->machine_id);
451 else if (r < 0)
452 return r;
453
454 r = sd_id128_get_boot(&boot_id);
455 if (r < 0)
456 return r;
457
458 f->header->boot_id = boot_id;
459
460 r = journal_file_set_online(f);
461
462 /* Sync the online state to disk */
463 (void) fsync(f->fd);
464
465 /* We likely just created a new file, also sync the directory this file is located in. */
466 (void) fsync_directory_of_file(f->fd);
467
468 return r;
469 }
470
471 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
472 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
473 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
474 const char *type = compatible ? "compatible" : "incompatible";
475 uint32_t flags;
476
477 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
478
479 if (flags & ~supported) {
480 if (flags & ~any)
481 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
482 f->path, type, flags & ~any);
483 flags = (flags & any) & ~supported;
484 if (flags) {
485 const char* strv[3];
486 unsigned n = 0;
487 _cleanup_free_ char *t = NULL;
488
489 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
490 strv[n++] = "sealed";
491 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
492 strv[n++] = "xz-compressed";
493 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
494 strv[n++] = "lz4-compressed";
495 strv[n] = NULL;
496 assert(n < ELEMENTSOF(strv));
497
498 t = strv_join((char**) strv, ", ");
499 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
500 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
501 }
502 return true;
503 }
504
505 return false;
506 }
507
508 static int journal_file_verify_header(JournalFile *f) {
509 uint64_t arena_size, header_size;
510
511 assert(f);
512 assert(f->header);
513
514 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
515 return -EBADMSG;
516
517 /* In both read and write mode we refuse to open files with incompatible
518 * flags we don't know. */
519 if (warn_wrong_flags(f, false))
520 return -EPROTONOSUPPORT;
521
522 /* When open for writing we refuse to open files with compatible flags, too. */
523 if (f->writable && warn_wrong_flags(f, true))
524 return -EPROTONOSUPPORT;
525
526 if (f->header->state >= _STATE_MAX)
527 return -EBADMSG;
528
529 header_size = le64toh(f->header->header_size);
530
531 /* The first addition was n_data, so check that we are at least this large */
532 if (header_size < HEADER_SIZE_MIN)
533 return -EBADMSG;
534
535 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
536 return -EBADMSG;
537
538 arena_size = le64toh(f->header->arena_size);
539
540 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
541 return -ENODATA;
542
543 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
544 return -ENODATA;
545
546 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
547 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
548 !VALID64(le64toh(f->header->tail_object_offset)) ||
549 !VALID64(le64toh(f->header->entry_array_offset)))
550 return -ENODATA;
551
552 if (f->writable) {
553 sd_id128_t machine_id;
554 uint8_t state;
555 int r;
556
557 r = sd_id128_get_machine(&machine_id);
558 if (r < 0)
559 return r;
560
561 if (!sd_id128_equal(machine_id, f->header->machine_id))
562 return -EHOSTDOWN;
563
564 state = f->header->state;
565
566 if (state == STATE_ARCHIVED)
567 return -ESHUTDOWN; /* Already archived */
568 else if (state == STATE_ONLINE) {
569 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
570 return -EBUSY;
571 } else if (state != STATE_OFFLINE) {
572 log_debug("Journal file %s has unknown state %i.", f->path, state);
573 return -EBUSY;
574 }
575
576 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
577 return -EBADMSG;
578
579 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
580 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
581 * bisection. */
582 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
583 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
584 return -ETXTBSY;
585 }
586 }
587
588 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
589 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
590
591 f->seal = JOURNAL_HEADER_SEALED(f->header);
592
593 return 0;
594 }
595
596 static int journal_file_fstat(JournalFile *f) {
597 int r;
598
599 assert(f);
600 assert(f->fd >= 0);
601
602 if (fstat(f->fd, &f->last_stat) < 0)
603 return -errno;
604
605 f->last_stat_usec = now(CLOCK_MONOTONIC);
606
607 /* Refuse dealing with with files that aren't regular */
608 r = stat_verify_regular(&f->last_stat);
609 if (r < 0)
610 return r;
611
612 /* Refuse appending to files that are already deleted */
613 if (f->last_stat.st_nlink <= 0)
614 return -EIDRM;
615
616 return 0;
617 }
618
619 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
620 uint64_t old_size, new_size;
621 int r;
622
623 assert(f);
624 assert(f->header);
625
626 /* We assume that this file is not sparse, and we know that
627 * for sure, since we always call posix_fallocate()
628 * ourselves */
629
630 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
631 return -EIO;
632
633 old_size =
634 le64toh(f->header->header_size) +
635 le64toh(f->header->arena_size);
636
637 new_size = PAGE_ALIGN(offset + size);
638 if (new_size < le64toh(f->header->header_size))
639 new_size = le64toh(f->header->header_size);
640
641 if (new_size <= old_size) {
642
643 /* We already pre-allocated enough space, but before
644 * we write to it, let's check with fstat() if the
645 * file got deleted, in order make sure we don't throw
646 * away the data immediately. Don't check fstat() for
647 * all writes though, but only once ever 10s. */
648
649 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
650 return 0;
651
652 return journal_file_fstat(f);
653 }
654
655 /* Allocate more space. */
656
657 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
658 return -E2BIG;
659
660 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
661 struct statvfs svfs;
662
663 if (fstatvfs(f->fd, &svfs) >= 0) {
664 uint64_t available;
665
666 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
667
668 if (new_size - old_size > available)
669 return -E2BIG;
670 }
671 }
672
673 /* Increase by larger blocks at once */
674 new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
675 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
676 new_size = f->metrics.max_size;
677
678 /* Note that the glibc fallocate() fallback is very
679 inefficient, hence we try to minimize the allocation area
680 as we can. */
681 r = posix_fallocate(f->fd, old_size, new_size - old_size);
682 if (r != 0)
683 return -r;
684
685 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
686
687 return journal_file_fstat(f);
688 }
689
690 static unsigned type_to_context(ObjectType type) {
691 /* One context for each type, plus one catch-all for the rest */
692 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
693 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
694 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
695 }
696
697 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
698 int r;
699
700 assert(f);
701 assert(ret);
702
703 if (size <= 0)
704 return -EINVAL;
705
706 /* Avoid SIGBUS on invalid accesses */
707 if (offset + size > (uint64_t) f->last_stat.st_size) {
708 /* Hmm, out of range? Let's refresh the fstat() data
709 * first, before we trust that check. */
710
711 r = journal_file_fstat(f);
712 if (r < 0)
713 return r;
714
715 if (offset + size > (uint64_t) f->last_stat.st_size)
716 return -EADDRNOTAVAIL;
717 }
718
719 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
720 }
721
722 static uint64_t minimum_header_size(Object *o) {
723
724 static const uint64_t table[] = {
725 [OBJECT_DATA] = sizeof(DataObject),
726 [OBJECT_FIELD] = sizeof(FieldObject),
727 [OBJECT_ENTRY] = sizeof(EntryObject),
728 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
729 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
730 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
731 [OBJECT_TAG] = sizeof(TagObject),
732 };
733
734 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
735 return sizeof(ObjectHeader);
736
737 return table[o->object.type];
738 }
739
740 /* Lightweight object checks. We want this to be fast, so that we won't
741 * slowdown every journal_file_move_to_object() call too much. */
742 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
743 assert(f);
744 assert(o);
745
746 switch (o->object.type) {
747
748 case OBJECT_DATA: {
749 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
750 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
751 le64toh(o->data.n_entries), offset);
752 return -EBADMSG;
753 }
754
755 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
756 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
757 offsetof(DataObject, payload),
758 le64toh(o->object.size),
759 offset);
760 return -EBADMSG;
761 }
762
763 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
764 !VALID64(le64toh(o->data.next_field_offset)) ||
765 !VALID64(le64toh(o->data.entry_offset)) ||
766 !VALID64(le64toh(o->data.entry_array_offset))) {
767 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
768 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
769 le64toh(o->data.next_hash_offset),
770 le64toh(o->data.next_field_offset),
771 le64toh(o->data.entry_offset),
772 le64toh(o->data.entry_array_offset),
773 offset);
774 return -EBADMSG;
775 }
776
777 break;
778 }
779
780 case OBJECT_FIELD:
781 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
782 log_debug(
783 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
784 offsetof(FieldObject, payload),
785 le64toh(o->object.size),
786 offset);
787 return -EBADMSG;
788 }
789
790 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
791 !VALID64(le64toh(o->field.head_data_offset))) {
792 log_debug(
793 "Invalid offset, next_hash_offset="OFSfmt
794 ", head_data_offset="OFSfmt": %"PRIu64,
795 le64toh(o->field.next_hash_offset),
796 le64toh(o->field.head_data_offset),
797 offset);
798 return -EBADMSG;
799 }
800 break;
801
802 case OBJECT_ENTRY:
803 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
804 log_debug(
805 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
806 offsetof(EntryObject, items),
807 le64toh(o->object.size),
808 offset);
809 return -EBADMSG;
810 }
811
812 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
813 log_debug(
814 "Invalid number items in entry: %"PRIu64": %"PRIu64,
815 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
816 offset);
817 return -EBADMSG;
818 }
819
820 if (le64toh(o->entry.seqnum) <= 0) {
821 log_debug(
822 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
823 le64toh(o->entry.seqnum),
824 offset);
825 return -EBADMSG;
826 }
827
828 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
829 log_debug(
830 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
831 le64toh(o->entry.realtime),
832 offset);
833 return -EBADMSG;
834 }
835
836 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
837 log_debug(
838 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
839 le64toh(o->entry.monotonic),
840 offset);
841 return -EBADMSG;
842 }
843
844 break;
845
846 case OBJECT_DATA_HASH_TABLE:
847 case OBJECT_FIELD_HASH_TABLE:
848 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
849 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
850 log_debug(
851 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
852 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
853 le64toh(o->object.size),
854 offset);
855 return -EBADMSG;
856 }
857
858 break;
859
860 case OBJECT_ENTRY_ARRAY:
861 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
862 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
863 log_debug(
864 "Invalid object entry array size: %"PRIu64": %"PRIu64,
865 le64toh(o->object.size),
866 offset);
867 return -EBADMSG;
868 }
869
870 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
871 log_debug(
872 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
873 le64toh(o->entry_array.next_entry_array_offset),
874 offset);
875 return -EBADMSG;
876 }
877
878 break;
879
880 case OBJECT_TAG:
881 if (le64toh(o->object.size) != sizeof(TagObject)) {
882 log_debug(
883 "Invalid object tag size: %"PRIu64": %"PRIu64,
884 le64toh(o->object.size),
885 offset);
886 return -EBADMSG;
887 }
888
889 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
890 log_debug(
891 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
892 le64toh(o->tag.epoch),
893 offset);
894 return -EBADMSG;
895 }
896
897 break;
898 }
899
900 return 0;
901 }
902
903 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
904 int r;
905 void *t;
906 size_t tsize;
907 Object *o;
908 uint64_t s;
909
910 assert(f);
911 assert(ret);
912
913 /* Objects may only be located at multiple of 64 bit */
914 if (!VALID64(offset)) {
915 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
916 return -EBADMSG;
917 }
918
919 /* Object may not be located in the file header */
920 if (offset < le64toh(f->header->header_size)) {
921 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
922 return -EBADMSG;
923 }
924
925 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
926 if (r < 0)
927 return r;
928
929 o = (Object*) t;
930 s = le64toh(o->object.size);
931
932 if (s == 0) {
933 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
934 return -EBADMSG;
935 }
936 if (s < sizeof(ObjectHeader)) {
937 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
938 return -EBADMSG;
939 }
940
941 if (o->object.type <= OBJECT_UNUSED) {
942 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
943 return -EBADMSG;
944 }
945
946 if (s < minimum_header_size(o)) {
947 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
948 return -EBADMSG;
949 }
950
951 if (type > OBJECT_UNUSED && o->object.type != type) {
952 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
953 return -EBADMSG;
954 }
955
956 if (s > tsize) {
957 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
958 if (r < 0)
959 return r;
960
961 o = (Object*) t;
962 }
963
964 r = journal_file_check_object(f, offset, o);
965 if (r < 0)
966 return r;
967
968 *ret = o;
969 return 0;
970 }
971
972 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
973 uint64_t r;
974
975 assert(f);
976 assert(f->header);
977
978 r = le64toh(f->header->tail_entry_seqnum) + 1;
979
980 if (seqnum) {
981 /* If an external seqnum counter was passed, we update
982 * both the local and the external one, and set it to
983 * the maximum of both */
984
985 if (*seqnum + 1 > r)
986 r = *seqnum + 1;
987
988 *seqnum = r;
989 }
990
991 f->header->tail_entry_seqnum = htole64(r);
992
993 if (f->header->head_entry_seqnum == 0)
994 f->header->head_entry_seqnum = htole64(r);
995
996 return r;
997 }
998
999 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
1000 int r;
1001 uint64_t p;
1002 Object *tail, *o;
1003 void *t;
1004
1005 assert(f);
1006 assert(f->header);
1007 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
1008 assert(size >= sizeof(ObjectHeader));
1009 assert(offset);
1010 assert(ret);
1011
1012 r = journal_file_set_online(f);
1013 if (r < 0)
1014 return r;
1015
1016 p = le64toh(f->header->tail_object_offset);
1017 if (p == 0)
1018 p = le64toh(f->header->header_size);
1019 else {
1020 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
1021 if (r < 0)
1022 return r;
1023
1024 p += ALIGN64(le64toh(tail->object.size));
1025 }
1026
1027 r = journal_file_allocate(f, p, size);
1028 if (r < 0)
1029 return r;
1030
1031 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
1032 if (r < 0)
1033 return r;
1034
1035 o = (Object*) t;
1036
1037 zero(o->object);
1038 o->object.type = type;
1039 o->object.size = htole64(size);
1040
1041 f->header->tail_object_offset = htole64(p);
1042 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1043
1044 *ret = o;
1045 *offset = p;
1046
1047 return 0;
1048 }
1049
1050 static int journal_file_setup_data_hash_table(JournalFile *f) {
1051 uint64_t s, p;
1052 Object *o;
1053 int r;
1054
1055 assert(f);
1056 assert(f->header);
1057
1058 /* We estimate that we need 1 hash table entry per 768 bytes
1059 of journal file and we want to make sure we never get
1060 beyond 75% fill level. Calculate the hash table size for
1061 the maximum file size based on these metrics. */
1062
1063 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
1064 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1065 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1066
1067 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
1068
1069 r = journal_file_append_object(f,
1070 OBJECT_DATA_HASH_TABLE,
1071 offsetof(Object, hash_table.items) + s,
1072 &o, &p);
1073 if (r < 0)
1074 return r;
1075
1076 memzero(o->hash_table.items, s);
1077
1078 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1079 f->header->data_hash_table_size = htole64(s);
1080
1081 return 0;
1082 }
1083
1084 static int journal_file_setup_field_hash_table(JournalFile *f) {
1085 uint64_t s, p;
1086 Object *o;
1087 int r;
1088
1089 assert(f);
1090 assert(f->header);
1091
1092 /* We use a fixed size hash table for the fields as this
1093 * number should grow very slowly only */
1094
1095 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1096 r = journal_file_append_object(f,
1097 OBJECT_FIELD_HASH_TABLE,
1098 offsetof(Object, hash_table.items) + s,
1099 &o, &p);
1100 if (r < 0)
1101 return r;
1102
1103 memzero(o->hash_table.items, s);
1104
1105 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1106 f->header->field_hash_table_size = htole64(s);
1107
1108 return 0;
1109 }
1110
1111 int journal_file_map_data_hash_table(JournalFile *f) {
1112 uint64_t s, p;
1113 void *t;
1114 int r;
1115
1116 assert(f);
1117 assert(f->header);
1118
1119 if (f->data_hash_table)
1120 return 0;
1121
1122 p = le64toh(f->header->data_hash_table_offset);
1123 s = le64toh(f->header->data_hash_table_size);
1124
1125 r = journal_file_move_to(f,
1126 OBJECT_DATA_HASH_TABLE,
1127 true,
1128 p, s,
1129 &t, NULL);
1130 if (r < 0)
1131 return r;
1132
1133 f->data_hash_table = t;
1134 return 0;
1135 }
1136
1137 int journal_file_map_field_hash_table(JournalFile *f) {
1138 uint64_t s, p;
1139 void *t;
1140 int r;
1141
1142 assert(f);
1143 assert(f->header);
1144
1145 if (f->field_hash_table)
1146 return 0;
1147
1148 p = le64toh(f->header->field_hash_table_offset);
1149 s = le64toh(f->header->field_hash_table_size);
1150
1151 r = journal_file_move_to(f,
1152 OBJECT_FIELD_HASH_TABLE,
1153 true,
1154 p, s,
1155 &t, NULL);
1156 if (r < 0)
1157 return r;
1158
1159 f->field_hash_table = t;
1160 return 0;
1161 }
1162
1163 static int journal_file_link_field(
1164 JournalFile *f,
1165 Object *o,
1166 uint64_t offset,
1167 uint64_t hash) {
1168
1169 uint64_t p, h, m;
1170 int r;
1171
1172 assert(f);
1173 assert(f->header);
1174 assert(f->field_hash_table);
1175 assert(o);
1176 assert(offset > 0);
1177
1178 if (o->object.type != OBJECT_FIELD)
1179 return -EINVAL;
1180
1181 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1182 if (m <= 0)
1183 return -EBADMSG;
1184
1185 /* This might alter the window we are looking at */
1186 o->field.next_hash_offset = o->field.head_data_offset = 0;
1187
1188 h = hash % m;
1189 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1190 if (p == 0)
1191 f->field_hash_table[h].head_hash_offset = htole64(offset);
1192 else {
1193 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1194 if (r < 0)
1195 return r;
1196
1197 o->field.next_hash_offset = htole64(offset);
1198 }
1199
1200 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1201
1202 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1203 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1204
1205 return 0;
1206 }
1207
1208 static int journal_file_link_data(
1209 JournalFile *f,
1210 Object *o,
1211 uint64_t offset,
1212 uint64_t hash) {
1213
1214 uint64_t p, h, m;
1215 int r;
1216
1217 assert(f);
1218 assert(f->header);
1219 assert(f->data_hash_table);
1220 assert(o);
1221 assert(offset > 0);
1222
1223 if (o->object.type != OBJECT_DATA)
1224 return -EINVAL;
1225
1226 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1227 if (m <= 0)
1228 return -EBADMSG;
1229
1230 /* This might alter the window we are looking at */
1231 o->data.next_hash_offset = o->data.next_field_offset = 0;
1232 o->data.entry_offset = o->data.entry_array_offset = 0;
1233 o->data.n_entries = 0;
1234
1235 h = hash % m;
1236 p = le64toh(f->data_hash_table[h].tail_hash_offset);
1237 if (p == 0)
1238 /* Only entry in the hash table is easy */
1239 f->data_hash_table[h].head_hash_offset = htole64(offset);
1240 else {
1241 /* Move back to the previous data object, to patch in
1242 * pointer */
1243
1244 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1245 if (r < 0)
1246 return r;
1247
1248 o->data.next_hash_offset = htole64(offset);
1249 }
1250
1251 f->data_hash_table[h].tail_hash_offset = htole64(offset);
1252
1253 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1254 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1255
1256 return 0;
1257 }
1258
1259 int journal_file_find_field_object_with_hash(
1260 JournalFile *f,
1261 const void *field, uint64_t size, uint64_t hash,
1262 Object **ret, uint64_t *offset) {
1263
1264 uint64_t p, osize, h, m;
1265 int r;
1266
1267 assert(f);
1268 assert(f->header);
1269 assert(field && size > 0);
1270
1271 /* If the field hash table is empty, we can't find anything */
1272 if (le64toh(f->header->field_hash_table_size) <= 0)
1273 return 0;
1274
1275 /* Map the field hash table, if it isn't mapped yet. */
1276 r = journal_file_map_field_hash_table(f);
1277 if (r < 0)
1278 return r;
1279
1280 osize = offsetof(Object, field.payload) + size;
1281
1282 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1283 if (m <= 0)
1284 return -EBADMSG;
1285
1286 h = hash % m;
1287 p = le64toh(f->field_hash_table[h].head_hash_offset);
1288
1289 while (p > 0) {
1290 Object *o;
1291
1292 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1293 if (r < 0)
1294 return r;
1295
1296 if (le64toh(o->field.hash) == hash &&
1297 le64toh(o->object.size) == osize &&
1298 memcmp(o->field.payload, field, size) == 0) {
1299
1300 if (ret)
1301 *ret = o;
1302 if (offset)
1303 *offset = p;
1304
1305 return 1;
1306 }
1307
1308 p = le64toh(o->field.next_hash_offset);
1309 }
1310
1311 return 0;
1312 }
1313
1314 int journal_file_find_field_object(
1315 JournalFile *f,
1316 const void *field, uint64_t size,
1317 Object **ret, uint64_t *offset) {
1318
1319 uint64_t hash;
1320
1321 assert(f);
1322 assert(field && size > 0);
1323
1324 hash = hash64(field, size);
1325
1326 return journal_file_find_field_object_with_hash(f,
1327 field, size, hash,
1328 ret, offset);
1329 }
1330
1331 int journal_file_find_data_object_with_hash(
1332 JournalFile *f,
1333 const void *data, uint64_t size, uint64_t hash,
1334 Object **ret, uint64_t *offset) {
1335
1336 uint64_t p, osize, h, m;
1337 int r;
1338
1339 assert(f);
1340 assert(f->header);
1341 assert(data || size == 0);
1342
1343 /* If there's no data hash table, then there's no entry. */
1344 if (le64toh(f->header->data_hash_table_size) <= 0)
1345 return 0;
1346
1347 /* Map the data hash table, if it isn't mapped yet. */
1348 r = journal_file_map_data_hash_table(f);
1349 if (r < 0)
1350 return r;
1351
1352 osize = offsetof(Object, data.payload) + size;
1353
1354 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1355 if (m <= 0)
1356 return -EBADMSG;
1357
1358 h = hash % m;
1359 p = le64toh(f->data_hash_table[h].head_hash_offset);
1360
1361 while (p > 0) {
1362 Object *o;
1363
1364 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1365 if (r < 0)
1366 return r;
1367
1368 if (le64toh(o->data.hash) != hash)
1369 goto next;
1370
1371 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
1372 #if HAVE_XZ || HAVE_LZ4
1373 uint64_t l;
1374 size_t rsize = 0;
1375
1376 l = le64toh(o->object.size);
1377 if (l <= offsetof(Object, data.payload))
1378 return -EBADMSG;
1379
1380 l -= offsetof(Object, data.payload);
1381
1382 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1383 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1384 if (r < 0)
1385 return r;
1386
1387 if (rsize == size &&
1388 memcmp(f->compress_buffer, data, size) == 0) {
1389
1390 if (ret)
1391 *ret = o;
1392
1393 if (offset)
1394 *offset = p;
1395
1396 return 1;
1397 }
1398 #else
1399 return -EPROTONOSUPPORT;
1400 #endif
1401 } else if (le64toh(o->object.size) == osize &&
1402 memcmp(o->data.payload, data, size) == 0) {
1403
1404 if (ret)
1405 *ret = o;
1406
1407 if (offset)
1408 *offset = p;
1409
1410 return 1;
1411 }
1412
1413 next:
1414 p = le64toh(o->data.next_hash_offset);
1415 }
1416
1417 return 0;
1418 }
1419
1420 int journal_file_find_data_object(
1421 JournalFile *f,
1422 const void *data, uint64_t size,
1423 Object **ret, uint64_t *offset) {
1424
1425 uint64_t hash;
1426
1427 assert(f);
1428 assert(data || size == 0);
1429
1430 hash = hash64(data, size);
1431
1432 return journal_file_find_data_object_with_hash(f,
1433 data, size, hash,
1434 ret, offset);
1435 }
1436
1437 static int journal_file_append_field(
1438 JournalFile *f,
1439 const void *field, uint64_t size,
1440 Object **ret, uint64_t *offset) {
1441
1442 uint64_t hash, p;
1443 uint64_t osize;
1444 Object *o;
1445 int r;
1446
1447 assert(f);
1448 assert(field && size > 0);
1449
1450 hash = hash64(field, size);
1451
1452 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1453 if (r < 0)
1454 return r;
1455 else if (r > 0) {
1456
1457 if (ret)
1458 *ret = o;
1459
1460 if (offset)
1461 *offset = p;
1462
1463 return 0;
1464 }
1465
1466 osize = offsetof(Object, field.payload) + size;
1467 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1468 if (r < 0)
1469 return r;
1470
1471 o->field.hash = htole64(hash);
1472 memcpy(o->field.payload, field, size);
1473
1474 r = journal_file_link_field(f, o, p, hash);
1475 if (r < 0)
1476 return r;
1477
1478 /* The linking might have altered the window, so let's
1479 * refresh our pointer */
1480 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1481 if (r < 0)
1482 return r;
1483
1484 #if HAVE_GCRYPT
1485 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1486 if (r < 0)
1487 return r;
1488 #endif
1489
1490 if (ret)
1491 *ret = o;
1492
1493 if (offset)
1494 *offset = p;
1495
1496 return 0;
1497 }
1498
1499 static int journal_file_append_data(
1500 JournalFile *f,
1501 const void *data, uint64_t size,
1502 Object **ret, uint64_t *offset) {
1503
1504 uint64_t hash, p;
1505 uint64_t osize;
1506 Object *o;
1507 int r, compression = 0;
1508 const void *eq;
1509
1510 assert(f);
1511 assert(data || size == 0);
1512
1513 hash = hash64(data, size);
1514
1515 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1516 if (r < 0)
1517 return r;
1518 if (r > 0) {
1519
1520 if (ret)
1521 *ret = o;
1522
1523 if (offset)
1524 *offset = p;
1525
1526 return 0;
1527 }
1528
1529 osize = offsetof(Object, data.payload) + size;
1530 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1531 if (r < 0)
1532 return r;
1533
1534 o->data.hash = htole64(hash);
1535
1536 #if HAVE_XZ || HAVE_LZ4
1537 if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
1538 size_t rsize = 0;
1539
1540 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1541
1542 if (compression >= 0) {
1543 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1544 o->object.flags |= compression;
1545
1546 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1547 size, rsize, object_compressed_to_string(compression));
1548 } else
1549 /* Compression didn't work, we don't really care why, let's continue without compression */
1550 compression = 0;
1551 }
1552 #endif
1553
1554 if (compression == 0)
1555 memcpy_safe(o->data.payload, data, size);
1556
1557 r = journal_file_link_data(f, o, p, hash);
1558 if (r < 0)
1559 return r;
1560
1561 #if HAVE_GCRYPT
1562 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1563 if (r < 0)
1564 return r;
1565 #endif
1566
1567 /* The linking might have altered the window, so let's
1568 * refresh our pointer */
1569 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1570 if (r < 0)
1571 return r;
1572
1573 if (!data)
1574 eq = NULL;
1575 else
1576 eq = memchr(data, '=', size);
1577 if (eq && eq > data) {
1578 Object *fo = NULL;
1579 uint64_t fp;
1580
1581 /* Create field object ... */
1582 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1583 if (r < 0)
1584 return r;
1585
1586 /* ... and link it in. */
1587 o->data.next_field_offset = fo->field.head_data_offset;
1588 fo->field.head_data_offset = le64toh(p);
1589 }
1590
1591 if (ret)
1592 *ret = o;
1593
1594 if (offset)
1595 *offset = p;
1596
1597 return 0;
1598 }
1599
1600 uint64_t journal_file_entry_n_items(Object *o) {
1601 assert(o);
1602
1603 if (o->object.type != OBJECT_ENTRY)
1604 return 0;
1605
1606 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1607 }
1608
1609 uint64_t journal_file_entry_array_n_items(Object *o) {
1610 assert(o);
1611
1612 if (o->object.type != OBJECT_ENTRY_ARRAY)
1613 return 0;
1614
1615 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1616 }
1617
1618 uint64_t journal_file_hash_table_n_items(Object *o) {
1619 assert(o);
1620
1621 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1622 return 0;
1623
1624 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1625 }
1626
1627 static int link_entry_into_array(JournalFile *f,
1628 le64_t *first,
1629 le64_t *idx,
1630 uint64_t p) {
1631 int r;
1632 uint64_t n = 0, ap = 0, q, i, a, hidx;
1633 Object *o;
1634
1635 assert(f);
1636 assert(f->header);
1637 assert(first);
1638 assert(idx);
1639 assert(p > 0);
1640
1641 a = le64toh(*first);
1642 i = hidx = le64toh(*idx);
1643 while (a > 0) {
1644
1645 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1646 if (r < 0)
1647 return r;
1648
1649 n = journal_file_entry_array_n_items(o);
1650 if (i < n) {
1651 o->entry_array.items[i] = htole64(p);
1652 *idx = htole64(hidx + 1);
1653 return 0;
1654 }
1655
1656 i -= n;
1657 ap = a;
1658 a = le64toh(o->entry_array.next_entry_array_offset);
1659 }
1660
1661 if (hidx > n)
1662 n = (hidx+1) * 2;
1663 else
1664 n = n * 2;
1665
1666 if (n < 4)
1667 n = 4;
1668
1669 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1670 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1671 &o, &q);
1672 if (r < 0)
1673 return r;
1674
1675 #if HAVE_GCRYPT
1676 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1677 if (r < 0)
1678 return r;
1679 #endif
1680
1681 o->entry_array.items[i] = htole64(p);
1682
1683 if (ap == 0)
1684 *first = htole64(q);
1685 else {
1686 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1687 if (r < 0)
1688 return r;
1689
1690 o->entry_array.next_entry_array_offset = htole64(q);
1691 }
1692
1693 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1694 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1695
1696 *idx = htole64(hidx + 1);
1697
1698 return 0;
1699 }
1700
1701 static int link_entry_into_array_plus_one(JournalFile *f,
1702 le64_t *extra,
1703 le64_t *first,
1704 le64_t *idx,
1705 uint64_t p) {
1706
1707 int r;
1708
1709 assert(f);
1710 assert(extra);
1711 assert(first);
1712 assert(idx);
1713 assert(p > 0);
1714
1715 if (*idx == 0)
1716 *extra = htole64(p);
1717 else {
1718 le64_t i;
1719
1720 i = htole64(le64toh(*idx) - 1);
1721 r = link_entry_into_array(f, first, &i, p);
1722 if (r < 0)
1723 return r;
1724 }
1725
1726 *idx = htole64(le64toh(*idx) + 1);
1727 return 0;
1728 }
1729
1730 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1731 uint64_t p;
1732 int r;
1733 assert(f);
1734 assert(o);
1735 assert(offset > 0);
1736
1737 p = le64toh(o->entry.items[i].object_offset);
1738 if (p == 0)
1739 return -EINVAL;
1740
1741 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1742 if (r < 0)
1743 return r;
1744
1745 return link_entry_into_array_plus_one(f,
1746 &o->data.entry_offset,
1747 &o->data.entry_array_offset,
1748 &o->data.n_entries,
1749 offset);
1750 }
1751
1752 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1753 uint64_t n, i;
1754 int r;
1755
1756 assert(f);
1757 assert(f->header);
1758 assert(o);
1759 assert(offset > 0);
1760
1761 if (o->object.type != OBJECT_ENTRY)
1762 return -EINVAL;
1763
1764 __sync_synchronize();
1765
1766 /* Link up the entry itself */
1767 r = link_entry_into_array(f,
1768 &f->header->entry_array_offset,
1769 &f->header->n_entries,
1770 offset);
1771 if (r < 0)
1772 return r;
1773
1774 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1775
1776 if (f->header->head_entry_realtime == 0)
1777 f->header->head_entry_realtime = o->entry.realtime;
1778
1779 f->header->tail_entry_realtime = o->entry.realtime;
1780 f->header->tail_entry_monotonic = o->entry.monotonic;
1781
1782 /* Link up the items */
1783 n = journal_file_entry_n_items(o);
1784 for (i = 0; i < n; i++) {
1785 r = journal_file_link_entry_item(f, o, offset, i);
1786 if (r < 0)
1787 return r;
1788 }
1789
1790 return 0;
1791 }
1792
1793 static int journal_file_append_entry_internal(
1794 JournalFile *f,
1795 const dual_timestamp *ts,
1796 const sd_id128_t *boot_id,
1797 uint64_t xor_hash,
1798 const EntryItem items[], unsigned n_items,
1799 uint64_t *seqnum,
1800 Object **ret, uint64_t *offset) {
1801 uint64_t np;
1802 uint64_t osize;
1803 Object *o;
1804 int r;
1805
1806 assert(f);
1807 assert(f->header);
1808 assert(items || n_items == 0);
1809 assert(ts);
1810
1811 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1812
1813 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1814 if (r < 0)
1815 return r;
1816
1817 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1818 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
1819 o->entry.realtime = htole64(ts->realtime);
1820 o->entry.monotonic = htole64(ts->monotonic);
1821 o->entry.xor_hash = htole64(xor_hash);
1822 o->entry.boot_id = boot_id ? *boot_id : f->header->boot_id;
1823
1824 #if HAVE_GCRYPT
1825 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1826 if (r < 0)
1827 return r;
1828 #endif
1829
1830 r = journal_file_link_entry(f, o, np);
1831 if (r < 0)
1832 return r;
1833
1834 if (ret)
1835 *ret = o;
1836
1837 if (offset)
1838 *offset = np;
1839
1840 return 0;
1841 }
1842
1843 void journal_file_post_change(JournalFile *f) {
1844 assert(f);
1845
1846 if (f->fd < 0)
1847 return;
1848
1849 /* inotify() does not receive IN_MODIFY events from file
1850 * accesses done via mmap(). After each access we hence
1851 * trigger IN_MODIFY by truncating the journal file to its
1852 * current size which triggers IN_MODIFY. */
1853
1854 __sync_synchronize();
1855
1856 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1857 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1858 }
1859
1860 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1861 assert(userdata);
1862
1863 journal_file_post_change(userdata);
1864
1865 return 1;
1866 }
1867
1868 static void schedule_post_change(JournalFile *f) {
1869 uint64_t now;
1870 int r;
1871
1872 assert(f);
1873 assert(f->post_change_timer);
1874
1875 r = sd_event_source_get_enabled(f->post_change_timer, NULL);
1876 if (r < 0) {
1877 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1878 goto fail;
1879 }
1880 if (r > 0)
1881 return;
1882
1883 r = sd_event_now(sd_event_source_get_event(f->post_change_timer), CLOCK_MONOTONIC, &now);
1884 if (r < 0) {
1885 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1886 goto fail;
1887 }
1888
1889 r = sd_event_source_set_time(f->post_change_timer, now + f->post_change_timer_period);
1890 if (r < 0) {
1891 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1892 goto fail;
1893 }
1894
1895 r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
1896 if (r < 0) {
1897 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1898 goto fail;
1899 }
1900
1901 return;
1902
1903 fail:
1904 /* On failure, let's simply post the change immediately. */
1905 journal_file_post_change(f);
1906 }
1907
1908 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1909 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1910 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1911 int r;
1912
1913 assert(f);
1914 assert_return(!f->post_change_timer, -EINVAL);
1915 assert(e);
1916 assert(t);
1917
1918 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1919 if (r < 0)
1920 return r;
1921
1922 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1923 if (r < 0)
1924 return r;
1925
1926 f->post_change_timer = TAKE_PTR(timer);
1927 f->post_change_timer_period = t;
1928
1929 return r;
1930 }
1931
1932 static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
1933 return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
1934 }
1935
1936 int journal_file_append_entry(
1937 JournalFile *f,
1938 const dual_timestamp *ts,
1939 const sd_id128_t *boot_id,
1940 const struct iovec iovec[], unsigned n_iovec,
1941 uint64_t *seqnum,
1942 Object **ret, uint64_t *offset) {
1943
1944 unsigned i;
1945 EntryItem *items;
1946 int r;
1947 uint64_t xor_hash = 0;
1948 struct dual_timestamp _ts;
1949
1950 assert(f);
1951 assert(f->header);
1952 assert(iovec || n_iovec == 0);
1953
1954 if (ts) {
1955 if (!VALID_REALTIME(ts->realtime)) {
1956 log_debug("Invalid realtime timestamp %"PRIu64", refusing entry.", ts->realtime);
1957 return -EBADMSG;
1958 }
1959 if (!VALID_MONOTONIC(ts->monotonic)) {
1960 log_debug("Invalid monotomic timestamp %"PRIu64", refusing entry.", ts->monotonic);
1961 return -EBADMSG;
1962 }
1963 } else {
1964 dual_timestamp_get(&_ts);
1965 ts = &_ts;
1966 }
1967
1968 #if HAVE_GCRYPT
1969 r = journal_file_maybe_append_tag(f, ts->realtime);
1970 if (r < 0)
1971 return r;
1972 #endif
1973
1974 /* alloca() can't take 0, hence let's allocate at least one */
1975 items = newa(EntryItem, MAX(1u, n_iovec));
1976
1977 for (i = 0; i < n_iovec; i++) {
1978 uint64_t p;
1979 Object *o;
1980
1981 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1982 if (r < 0)
1983 return r;
1984
1985 xor_hash ^= le64toh(o->data.hash);
1986 items[i].object_offset = htole64(p);
1987 items[i].hash = o->data.hash;
1988 }
1989
1990 /* Order by the position on disk, in order to improve seek
1991 * times for rotating media. */
1992 typesafe_qsort(items, n_iovec, entry_item_cmp);
1993
1994 r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, offset);
1995
1996 /* If the memory mapping triggered a SIGBUS then we return an
1997 * IO error and ignore the error code passed down to us, since
1998 * it is very likely just an effect of a nullified replacement
1999 * mapping page */
2000
2001 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
2002 r = -EIO;
2003
2004 if (f->post_change_timer)
2005 schedule_post_change(f);
2006 else
2007 journal_file_post_change(f);
2008
2009 return r;
2010 }
2011
/* One cached position inside a chain of entry arrays, so that repeated lookups in the same
 * chain do not have to walk it from the start every time. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
2019
2020 static void chain_cache_put(
2021 OrderedHashmap *h,
2022 ChainCacheItem *ci,
2023 uint64_t first,
2024 uint64_t array,
2025 uint64_t begin,
2026 uint64_t total,
2027 uint64_t last_index) {
2028
2029 if (!ci) {
2030 /* If the chain item to cache for this chain is the
2031 * first one it's not worth caching anything */
2032 if (array == first)
2033 return;
2034
2035 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2036 ci = ordered_hashmap_steal_first(h);
2037 assert(ci);
2038 } else {
2039 ci = new(ChainCacheItem, 1);
2040 if (!ci)
2041 return;
2042 }
2043
2044 ci->first = first;
2045
2046 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2047 free(ci);
2048 return;
2049 }
2050 } else
2051 assert(ci->first == first);
2052
2053 ci->array = array;
2054 ci->begin = begin;
2055 ci->total = total;
2056 ci->last_index = last_index;
2057 }
2058
2059 static int generic_array_get(
2060 JournalFile *f,
2061 uint64_t first,
2062 uint64_t i,
2063 Object **ret, uint64_t *offset) {
2064
2065 Object *o;
2066 uint64_t p = 0, a, t = 0;
2067 int r;
2068 ChainCacheItem *ci;
2069
2070 assert(f);
2071
2072 a = first;
2073
2074 /* Try the chain cache first */
2075 ci = ordered_hashmap_get(f->chain_cache, &first);
2076 if (ci && i > ci->total) {
2077 a = ci->array;
2078 i -= ci->total;
2079 t = ci->total;
2080 }
2081
2082 while (a > 0) {
2083 uint64_t k;
2084
2085 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2086 if (r < 0)
2087 return r;
2088
2089 k = journal_file_entry_array_n_items(o);
2090 if (i < k) {
2091 p = le64toh(o->entry_array.items[i]);
2092 goto found;
2093 }
2094
2095 i -= k;
2096 t += k;
2097 a = le64toh(o->entry_array.next_entry_array_offset);
2098 }
2099
2100 return 0;
2101
2102 found:
2103 /* Let's cache this item for the next invocation */
2104 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
2105
2106 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2107 if (r < 0)
2108 return r;
2109
2110 if (ret)
2111 *ret = o;
2112
2113 if (offset)
2114 *offset = p;
2115
2116 return 1;
2117 }
2118
2119 static int generic_array_get_plus_one(
2120 JournalFile *f,
2121 uint64_t extra,
2122 uint64_t first,
2123 uint64_t i,
2124 Object **ret, uint64_t *offset) {
2125
2126 Object *o;
2127
2128 assert(f);
2129
2130 if (i == 0) {
2131 int r;
2132
2133 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2134 if (r < 0)
2135 return r;
2136
2137 if (ret)
2138 *ret = o;
2139
2140 if (offset)
2141 *offset = extra;
2142
2143 return 1;
2144 }
2145
2146 return generic_array_get(f, first, i-1, ret, offset);
2147 }
2148
/* Results of the test_object() callbacks used for bisection */
enum {
        TEST_FOUND,     /* the tested object matches the needle */
        TEST_LEFT,      /* the tested object is smaller than the needle */
        TEST_RIGHT,     /* the tested object is larger than the needle */
};
2154
2155 static int generic_array_bisect(
2156 JournalFile *f,
2157 uint64_t first,
2158 uint64_t n,
2159 uint64_t needle,
2160 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2161 direction_t direction,
2162 Object **ret,
2163 uint64_t *offset,
2164 uint64_t *idx) {
2165
2166 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
2167 bool subtract_one = false;
2168 Object *o, *array = NULL;
2169 int r;
2170 ChainCacheItem *ci;
2171
2172 assert(f);
2173 assert(test_object);
2174
2175 /* Start with the first array in the chain */
2176 a = first;
2177
2178 ci = ordered_hashmap_get(f->chain_cache, &first);
2179 if (ci && n > ci->total && ci->begin != 0) {
2180 /* Ah, we have iterated this bisection array chain
2181 * previously! Let's see if we can skip ahead in the
2182 * chain, as far as the last time. But we can't jump
2183 * backwards in the chain, so let's check that
2184 * first. */
2185
2186 r = test_object(f, ci->begin, needle);
2187 if (r < 0)
2188 return r;
2189
2190 if (r == TEST_LEFT) {
2191 /* OK, what we are looking for is right of the
2192 * begin of this EntryArray, so let's jump
2193 * straight to previously cached array in the
2194 * chain */
2195
2196 a = ci->array;
2197 n -= ci->total;
2198 t = ci->total;
2199 last_index = ci->last_index;
2200 }
2201 }
2202
2203 while (a > 0) {
2204 uint64_t left, right, k, lp;
2205
2206 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
2207 if (r < 0)
2208 return r;
2209
2210 k = journal_file_entry_array_n_items(array);
2211 right = MIN(k, n);
2212 if (right <= 0)
2213 return 0;
2214
2215 i = right - 1;
2216 lp = p = le64toh(array->entry_array.items[i]);
2217 if (p <= 0)
2218 r = -EBADMSG;
2219 else
2220 r = test_object(f, p, needle);
2221 if (r == -EBADMSG) {
2222 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2223 n = i;
2224 continue;
2225 }
2226 if (r < 0)
2227 return r;
2228
2229 if (r == TEST_FOUND)
2230 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2231
2232 if (r == TEST_RIGHT) {
2233 left = 0;
2234 right -= 1;
2235
2236 if (last_index != (uint64_t) -1) {
2237 assert(last_index <= right);
2238
2239 /* If we cached the last index we
2240 * looked at, let's try to not to jump
2241 * too wildly around and see if we can
2242 * limit the range to look at early to
2243 * the immediate neighbors of the last
2244 * index we looked at. */
2245
2246 if (last_index > 0) {
2247 uint64_t x = last_index - 1;
2248
2249 p = le64toh(array->entry_array.items[x]);
2250 if (p <= 0)
2251 return -EBADMSG;
2252
2253 r = test_object(f, p, needle);
2254 if (r < 0)
2255 return r;
2256
2257 if (r == TEST_FOUND)
2258 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2259
2260 if (r == TEST_RIGHT)
2261 right = x;
2262 else
2263 left = x + 1;
2264 }
2265
2266 if (last_index < right) {
2267 uint64_t y = last_index + 1;
2268
2269 p = le64toh(array->entry_array.items[y]);
2270 if (p <= 0)
2271 return -EBADMSG;
2272
2273 r = test_object(f, p, needle);
2274 if (r < 0)
2275 return r;
2276
2277 if (r == TEST_FOUND)
2278 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2279
2280 if (r == TEST_RIGHT)
2281 right = y;
2282 else
2283 left = y + 1;
2284 }
2285 }
2286
2287 for (;;) {
2288 if (left == right) {
2289 if (direction == DIRECTION_UP)
2290 subtract_one = true;
2291
2292 i = left;
2293 goto found;
2294 }
2295
2296 assert(left < right);
2297 i = (left + right) / 2;
2298
2299 p = le64toh(array->entry_array.items[i]);
2300 if (p <= 0)
2301 r = -EBADMSG;
2302 else
2303 r = test_object(f, p, needle);
2304 if (r == -EBADMSG) {
2305 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2306 right = n = i;
2307 continue;
2308 }
2309 if (r < 0)
2310 return r;
2311
2312 if (r == TEST_FOUND)
2313 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2314
2315 if (r == TEST_RIGHT)
2316 right = i;
2317 else
2318 left = i + 1;
2319 }
2320 }
2321
2322 if (k >= n) {
2323 if (direction == DIRECTION_UP) {
2324 i = n;
2325 subtract_one = true;
2326 goto found;
2327 }
2328
2329 return 0;
2330 }
2331
2332 last_p = lp;
2333
2334 n -= k;
2335 t += k;
2336 last_index = (uint64_t) -1;
2337 a = le64toh(array->entry_array.next_entry_array_offset);
2338 }
2339
2340 return 0;
2341
2342 found:
2343 if (subtract_one && t == 0 && i == 0)
2344 return 0;
2345
2346 /* Let's cache this item for the next invocation */
2347 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
2348
2349 if (subtract_one && i == 0)
2350 p = last_p;
2351 else if (subtract_one)
2352 p = le64toh(array->entry_array.items[i-1]);
2353 else
2354 p = le64toh(array->entry_array.items[i]);
2355
2356 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2357 if (r < 0)
2358 return r;
2359
2360 if (ret)
2361 *ret = o;
2362
2363 if (offset)
2364 *offset = p;
2365
2366 if (idx)
2367 *idx = t + i + (subtract_one ? -1 : 0);
2368
2369 return 1;
2370 }
2371
2372 static int generic_array_bisect_plus_one(
2373 JournalFile *f,
2374 uint64_t extra,
2375 uint64_t first,
2376 uint64_t n,
2377 uint64_t needle,
2378 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2379 direction_t direction,
2380 Object **ret,
2381 uint64_t *offset,
2382 uint64_t *idx) {
2383
2384 int r;
2385 bool step_back = false;
2386 Object *o;
2387
2388 assert(f);
2389 assert(test_object);
2390
2391 if (n <= 0)
2392 return 0;
2393
2394 /* This bisects the array in object 'first', but first checks
2395 * an extra */
2396 r = test_object(f, extra, needle);
2397 if (r < 0)
2398 return r;
2399
2400 if (r == TEST_FOUND)
2401 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2402
2403 /* if we are looking with DIRECTION_UP then we need to first
2404 see if in the actual array there is a matching entry, and
2405 return the last one of that. But if there isn't any we need
2406 to return this one. Hence remember this, and return it
2407 below. */
2408 if (r == TEST_LEFT)
2409 step_back = direction == DIRECTION_UP;
2410
2411 if (r == TEST_RIGHT) {
2412 if (direction == DIRECTION_DOWN)
2413 goto found;
2414 else
2415 return 0;
2416 }
2417
2418 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2419
2420 if (r == 0 && step_back)
2421 goto found;
2422
2423 if (r > 0 && idx)
2424 (*idx)++;
2425
2426 return r;
2427
2428 found:
2429 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2430 if (r < 0)
2431 return r;
2432
2433 if (ret)
2434 *ret = o;
2435
2436 if (offset)
2437 *offset = extra;
2438
2439 if (idx)
2440 *idx = 0;
2441
2442 return 1;
2443 }
2444
2445 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2446 assert(f);
2447 assert(p > 0);
2448
2449 if (p == needle)
2450 return TEST_FOUND;
2451 else if (p < needle)
2452 return TEST_LEFT;
2453 else
2454 return TEST_RIGHT;
2455 }
2456
2457 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2458 Object *o;
2459 int r;
2460
2461 assert(f);
2462 assert(p > 0);
2463
2464 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2465 if (r < 0)
2466 return r;
2467
2468 if (le64toh(o->entry.seqnum) == needle)
2469 return TEST_FOUND;
2470 else if (le64toh(o->entry.seqnum) < needle)
2471 return TEST_LEFT;
2472 else
2473 return TEST_RIGHT;
2474 }
2475
2476 int journal_file_move_to_entry_by_seqnum(
2477 JournalFile *f,
2478 uint64_t seqnum,
2479 direction_t direction,
2480 Object **ret,
2481 uint64_t *offset) {
2482 assert(f);
2483 assert(f->header);
2484
2485 return generic_array_bisect(f,
2486 le64toh(f->header->entry_array_offset),
2487 le64toh(f->header->n_entries),
2488 seqnum,
2489 test_object_seqnum,
2490 direction,
2491 ret, offset, NULL);
2492 }
2493
2494 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2495 Object *o;
2496 int r;
2497
2498 assert(f);
2499 assert(p > 0);
2500
2501 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2502 if (r < 0)
2503 return r;
2504
2505 if (le64toh(o->entry.realtime) == needle)
2506 return TEST_FOUND;
2507 else if (le64toh(o->entry.realtime) < needle)
2508 return TEST_LEFT;
2509 else
2510 return TEST_RIGHT;
2511 }
2512
2513 int journal_file_move_to_entry_by_realtime(
2514 JournalFile *f,
2515 uint64_t realtime,
2516 direction_t direction,
2517 Object **ret,
2518 uint64_t *offset) {
2519 assert(f);
2520 assert(f->header);
2521
2522 return generic_array_bisect(f,
2523 le64toh(f->header->entry_array_offset),
2524 le64toh(f->header->n_entries),
2525 realtime,
2526 test_object_realtime,
2527 direction,
2528 ret, offset, NULL);
2529 }
2530
2531 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2532 Object *o;
2533 int r;
2534
2535 assert(f);
2536 assert(p > 0);
2537
2538 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2539 if (r < 0)
2540 return r;
2541
2542 if (le64toh(o->entry.monotonic) == needle)
2543 return TEST_FOUND;
2544 else if (le64toh(o->entry.monotonic) < needle)
2545 return TEST_LEFT;
2546 else
2547 return TEST_RIGHT;
2548 }
2549
2550 static int find_data_object_by_boot_id(
2551 JournalFile *f,
2552 sd_id128_t boot_id,
2553 Object **o,
2554 uint64_t *b) {
2555
2556 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2557
2558 sd_id128_to_string(boot_id, t + 9);
2559 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2560 }
2561
2562 int journal_file_move_to_entry_by_monotonic(
2563 JournalFile *f,
2564 sd_id128_t boot_id,
2565 uint64_t monotonic,
2566 direction_t direction,
2567 Object **ret,
2568 uint64_t *offset) {
2569
2570 Object *o;
2571 int r;
2572
2573 assert(f);
2574
2575 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2576 if (r < 0)
2577 return r;
2578 if (r == 0)
2579 return -ENOENT;
2580
2581 return generic_array_bisect_plus_one(f,
2582 le64toh(o->data.entry_offset),
2583 le64toh(o->data.entry_array_offset),
2584 le64toh(o->data.n_entries),
2585 monotonic,
2586 test_object_monotonic,
2587 direction,
2588 ret, offset, NULL);
2589 }
2590
2591 void journal_file_reset_location(JournalFile *f) {
2592 f->location_type = LOCATION_HEAD;
2593 f->current_offset = 0;
2594 f->current_seqnum = 0;
2595 f->current_realtime = 0;
2596 f->current_monotonic = 0;
2597 zero(f->current_boot_id);
2598 f->current_xor_hash = 0;
2599 }
2600
2601 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2602 f->location_type = LOCATION_SEEK;
2603 f->current_offset = offset;
2604 f->current_seqnum = le64toh(o->entry.seqnum);
2605 f->current_realtime = le64toh(o->entry.realtime);
2606 f->current_monotonic = le64toh(o->entry.monotonic);
2607 f->current_boot_id = o->entry.boot_id;
2608 f->current_xor_hash = le64toh(o->entry.xor_hash);
2609 }
2610
2611 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2612 int r;
2613
2614 assert(af);
2615 assert(af->header);
2616 assert(bf);
2617 assert(bf->header);
2618 assert(af->location_type == LOCATION_SEEK);
2619 assert(bf->location_type == LOCATION_SEEK);
2620
2621 /* If contents and timestamps match, these entries are
2622 * identical, even if the seqnum does not match */
2623 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2624 af->current_monotonic == bf->current_monotonic &&
2625 af->current_realtime == bf->current_realtime &&
2626 af->current_xor_hash == bf->current_xor_hash)
2627 return 0;
2628
2629 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2630
2631 /* If this is from the same seqnum source, compare
2632 * seqnums */
2633 r = CMP(af->current_seqnum, bf->current_seqnum);
2634 if (r != 0)
2635 return r;
2636
2637 /* Wow! This is weird, different data but the same
2638 * seqnums? Something is borked, but let's make the
2639 * best of it and compare by time. */
2640 }
2641
2642 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2643
2644 /* If the boot id matches, compare monotonic time */
2645 r = CMP(af->current_monotonic, bf->current_monotonic);
2646 if (r != 0)
2647 return r;
2648 }
2649
2650 /* Otherwise, compare UTC time */
2651 r = CMP(af->current_realtime, bf->current_realtime);
2652 if (r != 0)
2653 return r;
2654
2655 /* Finally, compare by contents */
2656 return CMP(af->current_xor_hash, bf->current_xor_hash);
2657 }
2658
2659 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2660
2661 /* Increase or decrease the specified index, in the right direction. */
2662
2663 if (direction == DIRECTION_DOWN) {
2664 if (*i >= n - 1)
2665 return 0;
2666
2667 (*i) ++;
2668 } else {
2669 if (*i <= 0)
2670 return 0;
2671
2672 (*i) --;
2673 }
2674
2675 return 1;
2676 }
2677
2678 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2679
2680 /* Consider it an error if any of the two offsets is uninitialized */
2681 if (old_offset == 0 || new_offset == 0)
2682 return false;
2683
2684 /* If we go down, the new offset must be larger than the old one. */
2685 return direction == DIRECTION_DOWN ?
2686 new_offset > old_offset :
2687 new_offset < old_offset;
2688 }
2689
2690 int journal_file_next_entry(
2691 JournalFile *f,
2692 uint64_t p,
2693 direction_t direction,
2694 Object **ret, uint64_t *offset) {
2695
2696 uint64_t i, n, ofs;
2697 int r;
2698
2699 assert(f);
2700 assert(f->header);
2701
2702 n = le64toh(f->header->n_entries);
2703 if (n <= 0)
2704 return 0;
2705
2706 if (p == 0)
2707 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2708 else {
2709 r = generic_array_bisect(f,
2710 le64toh(f->header->entry_array_offset),
2711 le64toh(f->header->n_entries),
2712 p,
2713 test_object_offset,
2714 DIRECTION_DOWN,
2715 NULL, NULL,
2716 &i);
2717 if (r <= 0)
2718 return r;
2719
2720 r = bump_array_index(&i, direction, n);
2721 if (r <= 0)
2722 return r;
2723 }
2724
2725 /* And jump to it */
2726 for (;;) {
2727 r = generic_array_get(f,
2728 le64toh(f->header->entry_array_offset),
2729 i,
2730 ret, &ofs);
2731 if (r > 0)
2732 break;
2733 if (r != -EBADMSG)
2734 return r;
2735
2736 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2737 * the next one might work for us instead. */
2738 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2739
2740 r = bump_array_index(&i, direction, n);
2741 if (r <= 0)
2742 return r;
2743 }
2744
2745 /* Ensure our array is properly ordered. */
2746 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2747 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
2748 return -EBADMSG;
2749 }
2750
2751 if (offset)
2752 *offset = ofs;
2753
2754 return 1;
2755 }
2756
2757 int journal_file_next_entry_for_data(
2758 JournalFile *f,
2759 Object *o, uint64_t p,
2760 uint64_t data_offset,
2761 direction_t direction,
2762 Object **ret, uint64_t *offset) {
2763
2764 uint64_t i, n, ofs;
2765 Object *d;
2766 int r;
2767
2768 assert(f);
2769 assert(p > 0 || !o);
2770
2771 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2772 if (r < 0)
2773 return r;
2774
2775 n = le64toh(d->data.n_entries);
2776 if (n <= 0)
2777 return n;
2778
2779 if (!o)
2780 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2781 else {
2782 if (o->object.type != OBJECT_ENTRY)
2783 return -EINVAL;
2784
2785 r = generic_array_bisect_plus_one(f,
2786 le64toh(d->data.entry_offset),
2787 le64toh(d->data.entry_array_offset),
2788 le64toh(d->data.n_entries),
2789 p,
2790 test_object_offset,
2791 DIRECTION_DOWN,
2792 NULL, NULL,
2793 &i);
2794
2795 if (r <= 0)
2796 return r;
2797
2798 r = bump_array_index(&i, direction, n);
2799 if (r <= 0)
2800 return r;
2801 }
2802
2803 for (;;) {
2804 r = generic_array_get_plus_one(f,
2805 le64toh(d->data.entry_offset),
2806 le64toh(d->data.entry_array_offset),
2807 i,
2808 ret, &ofs);
2809 if (r > 0)
2810 break;
2811 if (r != -EBADMSG)
2812 return r;
2813
2814 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2815
2816 r = bump_array_index(&i, direction, n);
2817 if (r <= 0)
2818 return r;
2819 }
2820
2821 /* Ensure our array is properly ordered. */
2822 if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2823 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2824 return -EBADMSG;
2825 }
2826
2827 if (offset)
2828 *offset = ofs;
2829
2830 return 1;
2831 }
2832
2833 int journal_file_move_to_entry_by_offset_for_data(
2834 JournalFile *f,
2835 uint64_t data_offset,
2836 uint64_t p,
2837 direction_t direction,
2838 Object **ret, uint64_t *offset) {
2839
2840 int r;
2841 Object *d;
2842
2843 assert(f);
2844
2845 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2846 if (r < 0)
2847 return r;
2848
2849 return generic_array_bisect_plus_one(f,
2850 le64toh(d->data.entry_offset),
2851 le64toh(d->data.entry_array_offset),
2852 le64toh(d->data.n_entries),
2853 p,
2854 test_object_offset,
2855 direction,
2856 ret, offset, NULL);
2857 }
2858
2859 int journal_file_move_to_entry_by_monotonic_for_data(
2860 JournalFile *f,
2861 uint64_t data_offset,
2862 sd_id128_t boot_id,
2863 uint64_t monotonic,
2864 direction_t direction,
2865 Object **ret, uint64_t *offset) {
2866
2867 Object *o, *d;
2868 int r;
2869 uint64_t b, z;
2870
2871 assert(f);
2872
2873 /* First, seek by time */
2874 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2875 if (r < 0)
2876 return r;
2877 if (r == 0)
2878 return -ENOENT;
2879
2880 r = generic_array_bisect_plus_one(f,
2881 le64toh(o->data.entry_offset),
2882 le64toh(o->data.entry_array_offset),
2883 le64toh(o->data.n_entries),
2884 monotonic,
2885 test_object_monotonic,
2886 direction,
2887 NULL, &z, NULL);
2888 if (r <= 0)
2889 return r;
2890
2891 /* And now, continue seeking until we find an entry that
2892 * exists in both bisection arrays */
2893
2894 for (;;) {
2895 Object *qo;
2896 uint64_t p, q;
2897
2898 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2899 if (r < 0)
2900 return r;
2901
2902 r = generic_array_bisect_plus_one(f,
2903 le64toh(d->data.entry_offset),
2904 le64toh(d->data.entry_array_offset),
2905 le64toh(d->data.n_entries),
2906 z,
2907 test_object_offset,
2908 direction,
2909 NULL, &p, NULL);
2910 if (r <= 0)
2911 return r;
2912
2913 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2914 if (r < 0)
2915 return r;
2916
2917 r = generic_array_bisect_plus_one(f,
2918 le64toh(o->data.entry_offset),
2919 le64toh(o->data.entry_array_offset),
2920 le64toh(o->data.n_entries),
2921 p,
2922 test_object_offset,
2923 direction,
2924 &qo, &q, NULL);
2925
2926 if (r <= 0)
2927 return r;
2928
2929 if (p == q) {
2930 if (ret)
2931 *ret = qo;
2932 if (offset)
2933 *offset = q;
2934
2935 return 1;
2936 }
2937
2938 z = q;
2939 }
2940 }
2941
2942 int journal_file_move_to_entry_by_seqnum_for_data(
2943 JournalFile *f,
2944 uint64_t data_offset,
2945 uint64_t seqnum,
2946 direction_t direction,
2947 Object **ret, uint64_t *offset) {
2948
2949 Object *d;
2950 int r;
2951
2952 assert(f);
2953
2954 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2955 if (r < 0)
2956 return r;
2957
2958 return generic_array_bisect_plus_one(f,
2959 le64toh(d->data.entry_offset),
2960 le64toh(d->data.entry_array_offset),
2961 le64toh(d->data.n_entries),
2962 seqnum,
2963 test_object_seqnum,
2964 direction,
2965 ret, offset, NULL);
2966 }
2967
2968 int journal_file_move_to_entry_by_realtime_for_data(
2969 JournalFile *f,
2970 uint64_t data_offset,
2971 uint64_t realtime,
2972 direction_t direction,
2973 Object **ret, uint64_t *offset) {
2974
2975 Object *d;
2976 int r;
2977
2978 assert(f);
2979
2980 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2981 if (r < 0)
2982 return r;
2983
2984 return generic_array_bisect_plus_one(f,
2985 le64toh(d->data.entry_offset),
2986 le64toh(d->data.entry_array_offset),
2987 le64toh(d->data.n_entries),
2988 realtime,
2989 test_object_realtime,
2990 direction,
2991 ret, offset, NULL);
2992 }
2993
2994 void journal_file_dump(JournalFile *f) {
2995 Object *o;
2996 int r;
2997 uint64_t p;
2998
2999 assert(f);
3000 assert(f->header);
3001
3002 journal_file_print_header(f);
3003
3004 p = le64toh(f->header->header_size);
3005 while (p != 0) {
3006 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
3007 if (r < 0)
3008 goto fail;
3009
3010 switch (o->object.type) {
3011
3012 case OBJECT_UNUSED:
3013 printf("Type: OBJECT_UNUSED\n");
3014 break;
3015
3016 case OBJECT_DATA:
3017 printf("Type: OBJECT_DATA\n");
3018 break;
3019
3020 case OBJECT_FIELD:
3021 printf("Type: OBJECT_FIELD\n");
3022 break;
3023
3024 case OBJECT_ENTRY:
3025 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3026 le64toh(o->entry.seqnum),
3027 le64toh(o->entry.monotonic),
3028 le64toh(o->entry.realtime));
3029 break;
3030
3031 case OBJECT_FIELD_HASH_TABLE:
3032 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3033 break;
3034
3035 case OBJECT_DATA_HASH_TABLE:
3036 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3037 break;
3038
3039 case OBJECT_ENTRY_ARRAY:
3040 printf("Type: OBJECT_ENTRY_ARRAY\n");
3041 break;
3042
3043 case OBJECT_TAG:
3044 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3045 le64toh(o->tag.seqnum),
3046 le64toh(o->tag.epoch));
3047 break;
3048
3049 default:
3050 printf("Type: unknown (%i)\n", o->object.type);
3051 break;
3052 }
3053
3054 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3055 printf("Flags: %s\n",
3056 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3057
3058 if (p == le64toh(f->header->tail_object_offset))
3059 p = 0;
3060 else
3061 p = p + ALIGN64(le64toh(o->object.size));
3062 }
3063
3064 return;
3065 fail:
3066 log_error("File corrupt");
3067 }
3068
3069 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3070 const char *x;
3071
3072 x = format_timestamp(buf, l, t);
3073 if (x)
3074 return x;
3075 return " --- ";
3076 }
3077
3078 void journal_file_print_header(JournalFile *f) {
3079 char a[33], b[33], c[33], d[33];
3080 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
3081 struct stat st;
3082 char bytes[FORMAT_BYTES_MAX];
3083
3084 assert(f);
3085 assert(f->header);
3086
3087 printf("File Path: %s\n"
3088 "File ID: %s\n"
3089 "Machine ID: %s\n"
3090 "Boot ID: %s\n"
3091 "Sequential Number ID: %s\n"
3092 "State: %s\n"
3093 "Compatible Flags:%s%s\n"
3094 "Incompatible Flags:%s%s%s\n"
3095 "Header size: %"PRIu64"\n"
3096 "Arena size: %"PRIu64"\n"
3097 "Data Hash Table Size: %"PRIu64"\n"
3098 "Field Hash Table Size: %"PRIu64"\n"
3099 "Rotate Suggested: %s\n"
3100 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3101 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3102 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3103 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3104 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
3105 "Objects: %"PRIu64"\n"
3106 "Entry Objects: %"PRIu64"\n",
3107 f->path,
3108 sd_id128_to_string(f->header->file_id, a),
3109 sd_id128_to_string(f->header->machine_id, b),
3110 sd_id128_to_string(f->header->boot_id, c),
3111 sd_id128_to_string(f->header->seqnum_id, d),
3112 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3113 f->header->state == STATE_ONLINE ? "ONLINE" :
3114 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3115 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3116 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3117 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3118 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3119 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3120 le64toh(f->header->header_size),
3121 le64toh(f->header->arena_size),
3122 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3123 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3124 yes_no(journal_file_rotate_suggested(f, 0)),
3125 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3126 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3127 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3128 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3129 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3130 le64toh(f->header->n_objects),
3131 le64toh(f->header->n_entries));
3132
3133 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3134 printf("Data Objects: %"PRIu64"\n"
3135 "Data Hash Table Fill: %.1f%%\n",
3136 le64toh(f->header->n_data),
3137 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3138
3139 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3140 printf("Field Objects: %"PRIu64"\n"
3141 "Field Hash Table Fill: %.1f%%\n",
3142 le64toh(f->header->n_fields),
3143 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3144
3145 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3146 printf("Tag Objects: %"PRIu64"\n",
3147 le64toh(f->header->n_tags));
3148 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3149 printf("Entry Array Objects: %"PRIu64"\n",
3150 le64toh(f->header->n_entry_arrays));
3151
3152 if (fstat(f->fd, &st) >= 0)
3153 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
3154 }
3155
3156 static int journal_file_warn_btrfs(JournalFile *f) {
3157 unsigned attrs;
3158 int r;
3159
3160 assert(f);
3161
3162 /* Before we write anything, check if the COW logic is turned
3163 * off on btrfs. Given our write pattern that is quite
3164 * unfriendly to COW file systems this should greatly improve
3165 * performance on COW file systems, such as btrfs, at the
3166 * expense of data integrity features (which shouldn't be too
3167 * bad, given that we do our own checksumming). */
3168
3169 r = btrfs_is_filesystem(f->fd);
3170 if (r < 0)
3171 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3172 if (!r)
3173 return 0;
3174
3175 r = read_attr_fd(f->fd, &attrs);
3176 if (r < 0)
3177 return log_warning_errno(r, "Failed to read file attributes: %m");
3178
3179 if (attrs & FS_NOCOW_FL) {
3180 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3181 return 0;
3182 }
3183
3184 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3185 "This is likely to slow down journal access substantially, please consider turning "
3186 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3187
3188 return 1;
3189 }
3190
3191 int journal_file_open(
3192 int fd,
3193 const char *fname,
3194 int flags,
3195 mode_t mode,
3196 bool compress,
3197 uint64_t compress_threshold_bytes,
3198 bool seal,
3199 JournalMetrics *metrics,
3200 MMapCache *mmap_cache,
3201 Set *deferred_closes,
3202 JournalFile *template,
3203 JournalFile **ret) {
3204
3205 bool newly_created = false;
3206 JournalFile *f;
3207 void *h;
3208 int r;
3209 char bytes[FORMAT_BYTES_MAX];
3210
3211 assert(ret);
3212 assert(fd >= 0 || fname);
3213
3214 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
3215 return -EINVAL;
3216
3217 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3218 return -EINVAL;
3219
3220 f = new(JournalFile, 1);
3221 if (!f)
3222 return -ENOMEM;
3223
3224 *f = (JournalFile) {
3225 .fd = fd,
3226 .mode = mode,
3227
3228 .flags = flags,
3229 .prot = prot_from_flags(flags),
3230 .writable = (flags & O_ACCMODE) != O_RDONLY,
3231
3232 #if HAVE_LZ4
3233 .compress_lz4 = compress,
3234 #elif HAVE_XZ
3235 .compress_xz = compress,
3236 #endif
3237 .compress_threshold_bytes = compress_threshold_bytes == (uint64_t) -1 ?
3238 DEFAULT_COMPRESS_THRESHOLD :
3239 MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
3240 #if HAVE_GCRYPT
3241 .seal = seal,
3242 #endif
3243 };
3244
3245 log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3246 yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
3247 format_bytes(bytes, sizeof(bytes), f->compress_threshold_bytes));
3248
3249 if (mmap_cache)
3250 f->mmap = mmap_cache_ref(mmap_cache);
3251 else {
3252 f->mmap = mmap_cache_new();
3253 if (!f->mmap) {
3254 r = -ENOMEM;
3255 goto fail;
3256 }
3257 }
3258
3259 if (fname) {
3260 f->path = strdup(fname);
3261 if (!f->path) {
3262 r = -ENOMEM;
3263 goto fail;
3264 }
3265 } else {
3266 assert(fd >= 0);
3267
3268 /* If we don't know the path, fill in something explanatory and vaguely useful */
3269 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3270 r = -ENOMEM;
3271 goto fail;
3272 }
3273 }
3274
3275 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3276 if (!f->chain_cache) {
3277 r = -ENOMEM;
3278 goto fail;
3279 }
3280
3281 if (f->fd < 0) {
3282 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3283 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3284 * it doesn't hurt in that case. */
3285
3286 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
3287 if (f->fd < 0) {
3288 r = -errno;
3289 goto fail;
3290 }
3291
3292 /* fds we opened here by us should also be closed by us. */
3293 f->close_fd = true;
3294
3295 r = fd_nonblock(f->fd, false);
3296 if (r < 0)
3297 goto fail;
3298 }
3299
3300 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3301 if (!f->cache_fd) {
3302 r = -ENOMEM;
3303 goto fail;
3304 }
3305
3306 r = journal_file_fstat(f);
3307 if (r < 0)
3308 goto fail;
3309
3310 if (f->last_stat.st_size == 0 && f->writable) {
3311
3312 (void) journal_file_warn_btrfs(f);
3313
3314 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3315 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3316 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3317 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3318 * solely on mtime/atime/ctime of the file. */
3319 (void) fd_setcrtime(f->fd, 0);
3320
3321 #if HAVE_GCRYPT
3322 /* Try to load the FSPRG state, and if we can't, then
3323 * just don't do sealing */
3324 if (f->seal) {
3325 r = journal_file_fss_load(f);
3326 if (r < 0)
3327 f->seal = false;
3328 }
3329 #endif
3330
3331 r = journal_file_init_header(f, template);
3332 if (r < 0)
3333 goto fail;
3334
3335 r = journal_file_fstat(f);
3336 if (r < 0)
3337 goto fail;
3338
3339 newly_created = true;
3340 }
3341
3342 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3343 r = -ENODATA;
3344 goto fail;
3345 }
3346
3347 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
3348 if (r < 0)
3349 goto fail;
3350
3351 f->header = h;
3352
3353 if (!newly_created) {
3354 set_clear_with_destructor(deferred_closes, journal_file_close);
3355
3356 r = journal_file_verify_header(f);
3357 if (r < 0)
3358 goto fail;
3359 }
3360
3361 #if HAVE_GCRYPT
3362 if (!newly_created && f->writable) {
3363 r = journal_file_fss_load(f);
3364 if (r < 0)
3365 goto fail;
3366 }
3367 #endif
3368
3369 if (f->writable) {
3370 if (metrics) {
3371 journal_default_metrics(metrics, f->fd);
3372 f->metrics = *metrics;
3373 } else if (template)
3374 f->metrics = template->metrics;
3375
3376 r = journal_file_refresh_header(f);
3377 if (r < 0)
3378 goto fail;
3379 }
3380
3381 #if HAVE_GCRYPT
3382 r = journal_file_hmac_setup(f);
3383 if (r < 0)
3384 goto fail;
3385 #endif
3386
3387 if (newly_created) {
3388 r = journal_file_setup_field_hash_table(f);
3389 if (r < 0)
3390 goto fail;
3391
3392 r = journal_file_setup_data_hash_table(f);
3393 if (r < 0)
3394 goto fail;
3395
3396 #if HAVE_GCRYPT
3397 r = journal_file_append_first_tag(f);
3398 if (r < 0)
3399 goto fail;
3400 #endif
3401 }
3402
3403 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
3404 r = -EIO;
3405 goto fail;
3406 }
3407
3408 if (template && template->post_change_timer) {
3409 r = journal_file_enable_post_change_timer(
3410 f,
3411 sd_event_source_get_event(template->post_change_timer),
3412 template->post_change_timer_period);
3413
3414 if (r < 0)
3415 goto fail;
3416 }
3417
3418 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3419 f->close_fd = true;
3420
3421 *ret = f;
3422 return 0;
3423
3424 fail:
3425 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
3426 r = -EIO;
3427
3428 (void) journal_file_close(f);
3429
3430 return r;
3431 }
3432
3433 int journal_file_archive(JournalFile *f) {
3434 _cleanup_free_ char *p = NULL;
3435
3436 assert(f);
3437
3438 if (!f->writable)
3439 return -EINVAL;
3440
3441 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3442 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3443 if (path_startswith(f->path, "/proc/self/fd"))
3444 return -EINVAL;
3445
3446 if (!endswith(f->path, ".journal"))
3447 return -EINVAL;
3448
3449 if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3450 (int) strlen(f->path) - 8, f->path,
3451 SD_ID128_FORMAT_VAL(f->header->seqnum_id),
3452 le64toh(f->header->head_entry_seqnum),
3453 le64toh(f->header->head_entry_realtime)) < 0)
3454 return -ENOMEM;
3455
3456 /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
3457 * ignore that case. */
3458 if (rename(f->path, p) < 0 && errno != ENOENT)
3459 return -errno;
3460
3461 /* Sync the rename to disk */
3462 (void) fsync_directory_of_file(f->fd);
3463
3464 /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
3465 * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
3466 * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue
3467 * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
3468 * occurs. */
3469 f->archive = true;
3470
3471 /* Currently, btrfs is not very good with out write patterns and fragments heavily. Let's defrag our journal
3472 * files when we archive them */
3473 f->defrag_on_close = true;
3474
3475 return 0;
3476 }
3477
3478 JournalFile* journal_initiate_close(
3479 JournalFile *f,
3480 Set *deferred_closes) {
3481
3482 int r;
3483
3484 assert(f);
3485
3486 if (deferred_closes) {
3487
3488 r = set_put(deferred_closes, f);
3489 if (r < 0)
3490 log_debug_errno(r, "Failed to add file to deferred close set, closing immediately.");
3491 else {
3492 (void) journal_file_set_offline(f, false);
3493 return NULL;
3494 }
3495 }
3496
3497 return journal_file_close(f);
3498 }
3499
3500 int journal_file_rotate(
3501 JournalFile **f,
3502 bool compress,
3503 uint64_t compress_threshold_bytes,
3504 bool seal,
3505 Set *deferred_closes) {
3506
3507 JournalFile *new_file = NULL;
3508 int r;
3509
3510 assert(f);
3511 assert(*f);
3512
3513 r = journal_file_archive(*f);
3514 if (r < 0)
3515 return r;
3516
3517 r = journal_file_open(
3518 -1,
3519 (*f)->path,
3520 (*f)->flags,
3521 (*f)->mode,
3522 compress,
3523 compress_threshold_bytes,
3524 seal,
3525 NULL, /* metrics */
3526 (*f)->mmap,
3527 deferred_closes,
3528 *f, /* template */
3529 &new_file);
3530
3531 journal_initiate_close(*f, deferred_closes);
3532 *f = new_file;
3533
3534 return r;
3535 }
3536
3537 int journal_file_dispose(int dir_fd, const char *fname) {
3538 _cleanup_free_ char *p = NULL;
3539 _cleanup_close_ int fd = -1;
3540
3541 assert(fname);
3542
3543 /* Renames a journal file to *.journal~, i.e. to mark it as corruped or otherwise uncleanly shutdown. Note that
3544 * this is done without looking into the file or changing any of its contents. The idea is that this is called
3545 * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
3546 * for writing anymore. */
3547
3548 if (!endswith(fname, ".journal"))
3549 return -EINVAL;
3550
3551 if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
3552 (int) strlen(fname) - 8, fname,
3553 now(CLOCK_REALTIME),
3554 random_u64()) < 0)
3555 return -ENOMEM;
3556
3557 if (renameat(dir_fd, fname, dir_fd, p) < 0)
3558 return -errno;
3559
3560 /* btrfs doesn't cope well with our write pattern and fragments heavily. Let's defrag all files we rotate */
3561 fd = openat(dir_fd, p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
3562 if (fd < 0)
3563 log_debug_errno(errno, "Failed to open file for defragmentation/FS_NOCOW_FL, ignoring: %m");
3564 else {
3565 (void) chattr_fd(fd, 0, FS_NOCOW_FL, NULL);
3566 (void) btrfs_defrag_fd(fd);
3567 }
3568
3569 return 0;
3570 }
3571
3572 int journal_file_open_reliably(
3573 const char *fname,
3574 int flags,
3575 mode_t mode,
3576 bool compress,
3577 uint64_t compress_threshold_bytes,
3578 bool seal,
3579 JournalMetrics *metrics,
3580 MMapCache *mmap_cache,
3581 Set *deferred_closes,
3582 JournalFile *template,
3583 JournalFile **ret) {
3584
3585 int r;
3586
3587 r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3588 deferred_closes, template, ret);
3589 if (!IN_SET(r,
3590 -EBADMSG, /* Corrupted */
3591 -ENODATA, /* Truncated */
3592 -EHOSTDOWN, /* Other machine */
3593 -EPROTONOSUPPORT, /* Incompatible feature */
3594 -EBUSY, /* Unclean shutdown */
3595 -ESHUTDOWN, /* Already archived */
3596 -EIO, /* IO error, including SIGBUS on mmap */
3597 -EIDRM, /* File has been deleted */
3598 -ETXTBSY)) /* File is from the future */
3599 return r;
3600
3601 if ((flags & O_ACCMODE) == O_RDONLY)
3602 return r;
3603
3604 if (!(flags & O_CREAT))
3605 return r;
3606
3607 if (!endswith(fname, ".journal"))
3608 return r;
3609
3610 /* The file is corrupted. Rotate it away and try it again (but only once) */
3611 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3612
3613 r = journal_file_dispose(AT_FDCWD, fname);
3614 if (r < 0)
3615 return r;
3616
3617 return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3618 deferred_closes, template, ret);
3619 }
3620
3621 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
3622 uint64_t i, n;
3623 uint64_t q, xor_hash = 0;
3624 int r;
3625 EntryItem *items;
3626 dual_timestamp ts;
3627 const sd_id128_t *boot_id;
3628
3629 assert(from);
3630 assert(to);
3631 assert(o);
3632 assert(p);
3633
3634 if (!to->writable)
3635 return -EPERM;
3636
3637 ts.monotonic = le64toh(o->entry.monotonic);
3638 ts.realtime = le64toh(o->entry.realtime);
3639 boot_id = &o->entry.boot_id;
3640
3641 n = journal_file_entry_n_items(o);
3642 /* alloca() can't take 0, hence let's allocate at least one */
3643 items = newa(EntryItem, MAX(1u, n));
3644
3645 for (i = 0; i < n; i++) {
3646 uint64_t l, h;
3647 le64_t le_hash;
3648 size_t t;
3649 void *data;
3650 Object *u;
3651
3652 q = le64toh(o->entry.items[i].object_offset);
3653 le_hash = o->entry.items[i].hash;
3654
3655 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3656 if (r < 0)
3657 return r;
3658
3659 if (le_hash != o->data.hash)
3660 return -EBADMSG;
3661
3662 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3663 t = (size_t) l;
3664
3665 /* We hit the limit on 32bit machines */
3666 if ((uint64_t) t != l)
3667 return -E2BIG;
3668
3669 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3670 #if HAVE_XZ || HAVE_LZ4
3671 size_t rsize = 0;
3672
3673 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3674 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3675 if (r < 0)
3676 return r;
3677
3678 data = from->compress_buffer;
3679 l = rsize;
3680 #else
3681 return -EPROTONOSUPPORT;
3682 #endif
3683 } else
3684 data = o->data.payload;
3685
3686 r = journal_file_append_data(to, data, l, &u, &h);
3687 if (r < 0)
3688 return r;
3689
3690 xor_hash ^= le64toh(u->data.hash);
3691 items[i].object_offset = htole64(h);
3692 items[i].hash = u->data.hash;
3693
3694 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3695 if (r < 0)
3696 return r;
3697 }
3698
3699 r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n,
3700 NULL, NULL, NULL);
3701
3702 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3703 return -EIO;
3704
3705 return r;
3706 }
3707
3708 void journal_reset_metrics(JournalMetrics *m) {
3709 assert(m);
3710
3711 /* Set everything to "pick automatic values". */
3712
3713 *m = (JournalMetrics) {
3714 .min_use = (uint64_t) -1,
3715 .max_use = (uint64_t) -1,
3716 .min_size = (uint64_t) -1,
3717 .max_size = (uint64_t) -1,
3718 .keep_free = (uint64_t) -1,
3719 .n_max_files = (uint64_t) -1,
3720 };
3721 }
3722
3723 void journal_default_metrics(JournalMetrics *m, int fd) {
3724 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3725 struct statvfs ss;
3726 uint64_t fs_size;
3727
3728 assert(m);
3729 assert(fd >= 0);
3730
3731 if (fstatvfs(fd, &ss) >= 0)
3732 fs_size = ss.f_frsize * ss.f_blocks;
3733 else {
3734 log_debug_errno(errno, "Failed to determine disk size: %m");
3735 fs_size = 0;
3736 }
3737
3738 if (m->max_use == (uint64_t) -1) {
3739
3740 if (fs_size > 0) {
3741 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3742
3743 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3744 m->max_use = DEFAULT_MAX_USE_UPPER;
3745
3746 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3747 m->max_use = DEFAULT_MAX_USE_LOWER;
3748 } else
3749 m->max_use = DEFAULT_MAX_USE_LOWER;
3750 } else {
3751 m->max_use = PAGE_ALIGN(m->max_use);
3752
3753 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3754 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3755 }
3756
3757 if (m->min_use == (uint64_t) -1)
3758 m->min_use = DEFAULT_MIN_USE;
3759
3760 if (m->min_use > m->max_use)
3761 m->min_use = m->max_use;
3762
3763 if (m->max_size == (uint64_t) -1) {
3764 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3765
3766 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3767 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3768 } else
3769 m->max_size = PAGE_ALIGN(m->max_size);
3770
3771 if (m->max_size != 0) {
3772 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3773 m->max_size = JOURNAL_FILE_SIZE_MIN;
3774
3775 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3776 m->max_use = m->max_size*2;
3777 }
3778
3779 if (m->min_size == (uint64_t) -1)
3780 m->min_size = JOURNAL_FILE_SIZE_MIN;
3781 else {
3782 m->min_size = PAGE_ALIGN(m->min_size);
3783
3784 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3785 m->min_size = JOURNAL_FILE_SIZE_MIN;
3786
3787 if (m->max_size != 0 && m->min_size > m->max_size)
3788 m->max_size = m->min_size;
3789 }
3790
3791 if (m->keep_free == (uint64_t) -1) {
3792
3793 if (fs_size > 0) {
3794 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3795
3796 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3797 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3798
3799 } else
3800 m->keep_free = DEFAULT_KEEP_FREE;
3801 }
3802
3803 if (m->n_max_files == (uint64_t) -1)
3804 m->n_max_files = DEFAULT_N_MAX_FILES;
3805
3806 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3807 format_bytes(a, sizeof(a), m->min_use),
3808 format_bytes(b, sizeof(b), m->max_use),
3809 format_bytes(c, sizeof(c), m->max_size),
3810 format_bytes(d, sizeof(d), m->min_size),
3811 format_bytes(e, sizeof(e), m->keep_free),
3812 m->n_max_files);
3813 }
3814
3815 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3816 assert(f);
3817 assert(f->header);
3818 assert(from || to);
3819
3820 if (from) {
3821 if (f->header->head_entry_realtime == 0)
3822 return -ENOENT;
3823
3824 *from = le64toh(f->header->head_entry_realtime);
3825 }
3826
3827 if (to) {
3828 if (f->header->tail_entry_realtime == 0)
3829 return -ENOENT;
3830
3831 *to = le64toh(f->header->tail_entry_realtime);
3832 }
3833
3834 return 1;
3835 }
3836
3837 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3838 Object *o;
3839 uint64_t p;
3840 int r;
3841
3842 assert(f);
3843 assert(from || to);
3844
3845 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3846 if (r <= 0)
3847 return r;
3848
3849 if (le64toh(o->data.n_entries) <= 0)
3850 return 0;
3851
3852 if (from) {
3853 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3854 if (r < 0)
3855 return r;
3856
3857 *from = le64toh(o->entry.monotonic);
3858 }
3859
3860 if (to) {
3861 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3862 if (r < 0)
3863 return r;
3864
3865 r = generic_array_get_plus_one(f,
3866 le64toh(o->data.entry_offset),
3867 le64toh(o->data.entry_array_offset),
3868 le64toh(o->data.n_entries)-1,
3869 &o, NULL);
3870 if (r <= 0)
3871 return r;
3872
3873 *to = le64toh(o->entry.monotonic);
3874 }
3875
3876 return 1;
3877 }
3878
3879 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3880 assert(f);
3881 assert(f->header);
3882
3883 /* If we gained new header fields we gained new features,
3884 * hence suggest a rotation */
3885 if (le64toh(f->header->header_size) < sizeof(Header)) {
3886 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3887 return true;
3888 }
3889
3890 /* Let's check if the hash tables grew over a certain fill
3891 * level (75%, borrowing this value from Java's hash table
3892 * implementation), and if so suggest a rotation. To calculate
3893 * the fill level we need the n_data field, which only exists
3894 * in newer versions. */
3895
3896 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3897 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3898 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3899 f->path,
3900 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3901 le64toh(f->header->n_data),
3902 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3903 (unsigned long long) f->last_stat.st_size,
3904 f->last_stat.st_size / le64toh(f->header->n_data));
3905 return true;
3906 }
3907
3908 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3909 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3910 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3911 f->path,
3912 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3913 le64toh(f->header->n_fields),
3914 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3915 return true;
3916 }
3917
3918 /* Are the data objects properly indexed by field objects? */
3919 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3920 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3921 le64toh(f->header->n_data) > 0 &&
3922 le64toh(f->header->n_fields) == 0)
3923 return true;
3924
3925 if (max_file_usec > 0) {
3926 usec_t t, h;
3927
3928 h = le64toh(f->header->head_entry_realtime);
3929 t = now(CLOCK_REALTIME);
3930
3931 if (h > 0 && t > h + max_file_usec)
3932 return true;
3933 }
3934
3935 return false;
3936 }