1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <linux/fs.h>
24 #include <pthread.h>
25 #include <stddef.h>
26 #include <sys/mman.h>
27 #include <sys/statvfs.h>
28 #include <sys/uio.h>
29 #include <unistd.h>
30
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
34 #include "compress.h"
35 #include "fd-util.h"
36 #include "fs-util.h"
37 #include "journal-authenticate.h"
38 #include "journal-def.h"
39 #include "journal-file.h"
40 #include "lookup3.h"
41 #include "parse-util.h"
42 #include "path-util.h"
43 #include "random-util.h"
44 #include "sd-event.h"
45 #include "set.h"
46 #include "stat-util.h"
47 #include "string-util.h"
48 #include "strv.h"
49 #include "xattr-util.h"
50
51 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
52 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
53
54 #define DEFAULT_COMPRESS_THRESHOLD (512ULL)
55 #define MIN_COMPRESS_THRESHOLD (8ULL)
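/* Illustrative sketch (not part of the original source): the per-file compression threshold is
 * presumably derived from these two constants by falling back to the default when nothing was
 * configured and clamping any configured value to the minimum, roughly:
 *
 *     uint64_t effective_compress_threshold(uint64_t configured) {
 *             if (configured == (uint64_t) -1)        <- "unset" sentinel, assumed here
 *                     return DEFAULT_COMPRESS_THRESHOLD;
 *             return MAX(MIN_COMPRESS_THRESHOLD, configured);
 *     }
 *
 * The helper name and the sentinel value are assumptions made for illustration only. */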
56
57 /* This is the minimum journal file size */
58 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
59
60 /* These are the lower and upper bounds if we deduce the max_use value
61 * from the file system size */
62 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
63 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
64
65 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
66 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
67
68 /* This is the upper bound if we deduce max_size from max_use */
69 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
70
71 /* This is the upper bound if we deduce the keep_free value from the
72 * file system size */
73 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
74
75 /* This is the keep_free value when we can't determine the system
76 * size */
77 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
78
79 /* This is the default maximum number of journal files to keep around. */
80 #define DEFAULT_N_MAX_FILES (100)
81
82 /* n_data was the first entry we added after the initial file format design */
83 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
84
85 /* How many entries to keep in the entry array chain cache at max */
86 #define CHAIN_CACHE_MAX 20
87
88 /* How much to increase the journal file size at once each time we allocate something new. */
89 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
90
91 /* Re-run fstat() on the file to detect deletions at least this often */
92 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
93
94 /* The mmap context to use for the header: we pick it as one above the last defined object type */
95 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
96
97 #ifdef __clang__
98 # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
99 #endif
100
101 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
102 * As a result we use atomic operations on f->offline_state for inter-thread communications with
103 * journal_file_set_offline() and journal_file_set_online(). */
104 static void journal_file_set_offline_internal(JournalFile *f) {
105 assert(f);
106 assert(f->fd >= 0);
107 assert(f->header);
108
109 for (;;) {
110 switch (f->offline_state) {
111 case OFFLINE_CANCEL:
112 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
113 continue;
114 return;
115
116 case OFFLINE_AGAIN_FROM_SYNCING:
117 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
118 continue;
119 break;
120
121 case OFFLINE_AGAIN_FROM_OFFLINING:
122 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
123 continue;
124 break;
125
126 case OFFLINE_SYNCING:
127 (void) fsync(f->fd);
128
129 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
130 continue;
131
132 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
133 (void) fsync(f->fd);
134 break;
135
136 case OFFLINE_OFFLINING:
137 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
138 continue;
139 _fallthrough_;
140 case OFFLINE_DONE:
141 return;
142
143 case OFFLINE_JOINED:
144 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
145 return;
146 }
147 }
148 }
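/* State machine driven by the loop above (derived from the switch cases):
 *
 *     OFFLINE_SYNCING              -> fsync(), then OFFLINE_OFFLINING, write header state, fsync()
 *     OFFLINE_OFFLINING            -> OFFLINE_DONE
 *     OFFLINE_AGAIN_FROM_SYNCING   -> back to OFFLINE_SYNCING (restart requested mid-sync)
 *     OFFLINE_AGAIN_FROM_OFFLINING -> back to OFFLINE_SYNCING (restart requested mid-offline)
 *     OFFLINE_CANCEL               -> OFFLINE_DONE (offline aborted)
 *
 * Every transition is attempted with a compare-and-swap, so a concurrent
 * journal_file_set_online()/_set_offline() may redirect the thread between steps. */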
149
150 static void * journal_file_set_offline_thread(void *arg) {
151 JournalFile *f = arg;
152
153 (void) pthread_setname_np(pthread_self(), "journal-offline");
154
155 journal_file_set_offline_internal(f);
156
157 return NULL;
158 }
159
160 static int journal_file_set_offline_thread_join(JournalFile *f) {
161 int r;
162
163 assert(f);
164
165 if (f->offline_state == OFFLINE_JOINED)
166 return 0;
167
168 r = pthread_join(f->offline_thread, NULL);
169 if (r)
170 return -r;
171
172 f->offline_state = OFFLINE_JOINED;
173
174 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
175 return -EIO;
176
177 return 0;
178 }
179
180 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
181 static bool journal_file_set_offline_try_restart(JournalFile *f) {
182 for (;;) {
183 switch (f->offline_state) {
184 case OFFLINE_AGAIN_FROM_SYNCING:
185 case OFFLINE_AGAIN_FROM_OFFLINING:
186 return true;
187
188 case OFFLINE_CANCEL:
189 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
190 continue;
191 return true;
192
193 case OFFLINE_SYNCING:
194 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
195 continue;
196 return true;
197
198 case OFFLINE_OFFLINING:
199 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
200 continue;
201 return true;
202
203 default:
204 return false;
205 }
206 }
207 }
208
209 /* Sets a journal offline.
210 *
211 * If wait is false then an offline is dispatched in a separate thread for a
212 * subsequent journal_file_set_offline() or journal_file_set_online() of the
213 * same journal to synchronize with.
214 *
215 * If wait is true, then either an existing offline thread will be restarted
216 * and joined, or if none exists the offline is simply performed in this
217 * context without involving another thread.
218 */
219 int journal_file_set_offline(JournalFile *f, bool wait) {
220 bool restarted;
221 int r;
222
223 assert(f);
224
225 if (!f->writable)
226 return -EPERM;
227
228 if (!(f->fd >= 0 && f->header))
229 return -EINVAL;
230
231 /* An offlining journal is implicitly online and may modify f->header->state;
232 * we must also join any potentially lingering offline thread when not online. */
233 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
234 return journal_file_set_offline_thread_join(f);
235
236 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
237 restarted = journal_file_set_offline_try_restart(f);
238 if ((restarted && wait) || !restarted) {
239 r = journal_file_set_offline_thread_join(f);
240 if (r < 0)
241 return r;
242 }
243
244 if (restarted)
245 return 0;
246
247 /* Initiate a new offline. */
248 f->offline_state = OFFLINE_SYNCING;
249
250 if (wait) /* Without using a thread if waiting. */
251 journal_file_set_offline_internal(f);
252 else {
253 sigset_t ss, saved_ss;
254 int k;
255
256 if (sigfillset(&ss) < 0)
257 return -errno;
258
259 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
260 if (r > 0)
261 return -r;
262
263 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
264
265 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
266 if (r > 0) {
267 f->offline_state = OFFLINE_JOINED;
268 return -r;
269 }
270 if (k > 0)
271 return -k;
272 }
273
274 return 0;
275 }
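/* Usage sketch (illustrative, not part of the original source): a writer that wants the archive
 * state flushed synchronously before closing would do roughly:
 *
 *     f->archive = true;
 *     r = journal_file_set_offline(f, true);   // blocks until fsync() and the state change hit disk
 *     if (r < 0)
 *             log_debug_errno(r, "Failed to take journal offline: %m");
 *
 * whereas passing wait=false merely kicks off the background thread and returns immediately,
 * leaving a later set_offline()/set_online() call to synchronize with it. */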
276
277 static int journal_file_set_online(JournalFile *f) {
278 bool joined = false;
279
280 assert(f);
281
282 if (!f->writable)
283 return -EPERM;
284
285 if (!(f->fd >= 0 && f->header))
286 return -EINVAL;
287
288 while (!joined) {
289 switch (f->offline_state) {
290 case OFFLINE_JOINED:
291 /* No offline thread, no need to wait. */
292 joined = true;
293 break;
294
295 case OFFLINE_SYNCING:
296 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
297 continue;
298 /* Canceled syncing prior to offlining, no need to wait. */
299 break;
300
301 case OFFLINE_AGAIN_FROM_SYNCING:
302 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
303 continue;
304 /* Canceled restart from syncing, no need to wait. */
305 break;
306
307 case OFFLINE_AGAIN_FROM_OFFLINING:
308 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
309 continue;
310 /* Canceled restart from offlining, must wait for offlining to complete however. */
311 _fallthrough_;
312 default: {
313 int r;
314
315 r = journal_file_set_offline_thread_join(f);
316 if (r < 0)
317 return r;
318
319 joined = true;
320 break;
321 }
322 }
323 }
324
325 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
326 return -EIO;
327
328 switch (f->header->state) {
329 case STATE_ONLINE:
330 return 0;
331
332 case STATE_OFFLINE:
333 f->header->state = STATE_ONLINE;
334 (void) fsync(f->fd);
335 return 0;
336
337 default:
338 return -EINVAL;
339 }
340 }
341
342 bool journal_file_is_offlining(JournalFile *f) {
343 assert(f);
344
345 __sync_synchronize();
346
347 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
348 return false;
349
350 return true;
351 }
352
353 JournalFile* journal_file_close(JournalFile *f) {
354 assert(f);
355
356 #if HAVE_GCRYPT
357 /* Write the final tag */
358 if (f->seal && f->writable) {
359 int r;
360
361 r = journal_file_append_tag(f);
362 if (r < 0)
363 log_error_errno(r, "Failed to append tag when closing journal: %m");
364 }
365 #endif
366
367 if (f->post_change_timer) {
368 int enabled;
369
370 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
371 if (enabled == SD_EVENT_ONESHOT)
372 journal_file_post_change(f);
373
374 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
375 sd_event_source_unref(f->post_change_timer);
376 }
377
378 journal_file_set_offline(f, true);
379
380 if (f->mmap && f->cache_fd)
381 mmap_cache_free_fd(f->mmap, f->cache_fd);
382
383 if (f->fd >= 0 && f->defrag_on_close) {
384
385 /* Be friendly to btrfs: turn COW back on again now,
386 * and defragment the file. We won't write to the file
387 * ever again, hence remove all fragmentation, and
388 * reenable all the good bits COW usually provides
389 * (such as data checksumming). */
390
391 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
392 (void) btrfs_defrag_fd(f->fd);
393 }
394
395 if (f->close_fd)
396 safe_close(f->fd);
397 free(f->path);
398
399 mmap_cache_unref(f->mmap);
400
401 ordered_hashmap_free_free(f->chain_cache);
402
403 #if HAVE_XZ || HAVE_LZ4
404 free(f->compress_buffer);
405 #endif
406
407 #if HAVE_GCRYPT
408 if (f->fss_file)
409 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
410 else
411 free(f->fsprg_state);
412
413 free(f->fsprg_seed);
414
415 if (f->hmac)
416 gcry_md_close(f->hmac);
417 #endif
418
419 return mfree(f);
420 }
421
422 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
423 Header h = {};
424 ssize_t k;
425 int r;
426
427 assert(f);
428
429 memcpy(h.signature, HEADER_SIGNATURE, 8);
430 h.header_size = htole64(ALIGN64(sizeof(h)));
431
432 h.incompatible_flags |= htole32(
433 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
434 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
435
436 h.compatible_flags = htole32(
437 f->seal * HEADER_COMPATIBLE_SEALED);
438
439 r = sd_id128_randomize(&h.file_id);
440 if (r < 0)
441 return r;
442
443 if (template) {
444 h.seqnum_id = template->header->seqnum_id;
445 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
446 } else
447 h.seqnum_id = h.file_id;
448
449 k = pwrite(f->fd, &h, sizeof(h), 0);
450 if (k < 0)
451 return -errno;
452
453 if (k != sizeof(h))
454 return -EIO;
455
456 return 0;
457 }
458
459 static int journal_file_refresh_header(JournalFile *f) {
460 sd_id128_t boot_id;
461 int r;
462
463 assert(f);
464 assert(f->header);
465
466 r = sd_id128_get_machine(&f->header->machine_id);
467 if (r < 0)
468 return r;
469
470 r = sd_id128_get_boot(&boot_id);
471 if (r < 0)
472 return r;
473
474 f->header->boot_id = boot_id;
475
476 r = journal_file_set_online(f);
477
478 /* Sync the online state to disk */
479 (void) fsync(f->fd);
480
481 /* We likely just created a new file, also sync the directory this file is located in. */
482 (void) fsync_directory_of_file(f->fd);
483
484 return r;
485 }
486
487 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
488 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
489 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
490 const char *type = compatible ? "compatible" : "incompatible";
491 uint32_t flags;
492
493 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
494
495 if (flags & ~supported) {
496 if (flags & ~any)
497 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
498 f->path, type, flags & ~any);
499 flags = (flags & any) & ~supported;
500 if (flags) {
501 const char* strv[3];
502 unsigned n = 0;
503 _cleanup_free_ char *t = NULL;
504
505 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
506 strv[n++] = "sealed";
507 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
508 strv[n++] = "xz-compressed";
509 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
510 strv[n++] = "lz4-compressed";
511 strv[n] = NULL;
512 assert(n < ELEMENTSOF(strv));
513
514 t = strv_join((char**) strv, ", ");
515 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
516 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
517 }
518 return true;
519 }
520
521 return false;
522 }
523
524 static int journal_file_verify_header(JournalFile *f) {
525 uint64_t arena_size, header_size;
526
527 assert(f);
528 assert(f->header);
529
530 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
531 return -EBADMSG;
532
533 /* In both read and write mode we refuse to open files with incompatible
534 * flags we don't know. */
535 if (warn_wrong_flags(f, false))
536 return -EPROTONOSUPPORT;
537
538 /* When open for writing we refuse to open files with unknown compatible flags, too. */
539 if (f->writable && warn_wrong_flags(f, true))
540 return -EPROTONOSUPPORT;
541
542 if (f->header->state >= _STATE_MAX)
543 return -EBADMSG;
544
545 header_size = le64toh(f->header->header_size);
546
547 /* The first addition was n_data, so check that we are at least this large */
548 if (header_size < HEADER_SIZE_MIN)
549 return -EBADMSG;
550
551 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
552 return -EBADMSG;
553
554 arena_size = le64toh(f->header->arena_size);
555
556 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
557 return -ENODATA;
558
559 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
560 return -ENODATA;
561
562 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
563 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
564 !VALID64(le64toh(f->header->tail_object_offset)) ||
565 !VALID64(le64toh(f->header->entry_array_offset)))
566 return -ENODATA;
567
568 if (f->writable) {
569 sd_id128_t machine_id;
570 uint8_t state;
571 int r;
572
573 r = sd_id128_get_machine(&machine_id);
574 if (r < 0)
575 return r;
576
577 if (!sd_id128_equal(machine_id, f->header->machine_id))
578 return -EHOSTDOWN;
579
580 state = f->header->state;
581
582 if (state == STATE_ARCHIVED)
583 return -ESHUTDOWN; /* Already archived */
584 else if (state == STATE_ONLINE) {
585 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
586 return -EBUSY;
587 } else if (state != STATE_OFFLINE) {
588 log_debug("Journal file %s has unknown state %i.", f->path, state);
589 return -EBUSY;
590 }
591
592 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
593 return -EBADMSG;
594
595 /* Don't permit appending to files from the future, as otherwise the realtime timestamps wouldn't
596 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
597 * bisection. */
598 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
599 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
600 return -ETXTBSY;
601 }
602 }
603
604 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
605 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
606
607 f->seal = JOURNAL_HEADER_SEALED(f->header);
608
609 return 0;
610 }
611
612 static int journal_file_fstat(JournalFile *f) {
613 int r;
614
615 assert(f);
616 assert(f->fd >= 0);
617
618 if (fstat(f->fd, &f->last_stat) < 0)
619 return -errno;
620
621 f->last_stat_usec = now(CLOCK_MONOTONIC);
622
623 /* Refuse dealing with files that aren't regular */
624 r = stat_verify_regular(&f->last_stat);
625 if (r < 0)
626 return r;
627
628 /* Refuse appending to files that are already deleted */
629 if (f->last_stat.st_nlink <= 0)
630 return -EIDRM;
631
632 return 0;
633 }
634
635 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
636 uint64_t old_size, new_size;
637 int r;
638
639 assert(f);
640 assert(f->header);
641
642 /* We assume that this file is not sparse, and we know that
643 * for sure, since we always call posix_fallocate()
644 * ourselves */
645
646 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
647 return -EIO;
648
649 old_size =
650 le64toh(f->header->header_size) +
651 le64toh(f->header->arena_size);
652
653 new_size = PAGE_ALIGN(offset + size);
654 if (new_size < le64toh(f->header->header_size))
655 new_size = le64toh(f->header->header_size);
656
657 if (new_size <= old_size) {
658
659 /* We already pre-allocated enough space, but before
660 * we write to it, let's check with fstat() if the
661 * file got deleted, in order to make sure we don't throw
662 * away the data immediately. Don't check fstat() for
663 * all writes though, but only every LAST_STAT_REFRESH_USEC (5s). */
664
665 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
666 return 0;
667
668 return journal_file_fstat(f);
669 }
670
671 /* Allocate more space. */
672
673 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
674 return -E2BIG;
675
676 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
677 struct statvfs svfs;
678
679 if (fstatvfs(f->fd, &svfs) >= 0) {
680 uint64_t available;
681
682 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
683
684 if (new_size - old_size > available)
685 return -E2BIG;
686 }
687 }
688
689 /* Increase by larger blocks at once */
690 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
691 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
692 new_size = f->metrics.max_size;
693
694 /* Note that the glibc fallocate() fallback is very
695 inefficient, hence we try to minimize the allocation area
696 as much as we can. */
697 r = posix_fallocate(f->fd, old_size, new_size - old_size);
698 if (r != 0)
699 return -r;
700
701 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
702
703 return journal_file_fstat(f);
704 }
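/* Worked example of the sizing logic above: with FILE_SIZE_INCREASE at 8 MiB, a request that
 * pushes new_size to, say, 9 MiB is first rounded up to the next multiple of the increase,
 * i.e. ((9 MiB + 8 MiB - 1) / 8 MiB) * 8 MiB = 16 MiB, and then clamped to f->metrics.max_size
 * if that is smaller, so the file grows in large steps instead of page by page. */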
705
706 static unsigned type_to_context(ObjectType type) {
707 /* One context for each type, plus one catch-all for the rest */
708 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
709 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
710 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
711 }
712
713 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
714 int r;
715
716 assert(f);
717 assert(ret);
718
719 if (size <= 0)
720 return -EINVAL;
721
722 /* Avoid SIGBUS on invalid accesses */
723 if (offset + size > (uint64_t) f->last_stat.st_size) {
724 /* Hmm, out of range? Let's refresh the fstat() data
725 * first, before we trust that check. */
726
727 r = journal_file_fstat(f);
728 if (r < 0)
729 return r;
730
731 if (offset + size > (uint64_t) f->last_stat.st_size)
732 return -EADDRNOTAVAIL;
733 }
734
735 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
736 }
737
738 static uint64_t minimum_header_size(Object *o) {
739
740 static const uint64_t table[] = {
741 [OBJECT_DATA] = sizeof(DataObject),
742 [OBJECT_FIELD] = sizeof(FieldObject),
743 [OBJECT_ENTRY] = sizeof(EntryObject),
744 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
745 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
746 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
747 [OBJECT_TAG] = sizeof(TagObject),
748 };
749
750 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
751 return sizeof(ObjectHeader);
752
753 return table[o->object.type];
754 }
755
756 /* Lightweight object checks. We want this to be fast, so that we won't
757 * slow down every journal_file_move_to_object() call too much. */
758 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
759 assert(f);
760 assert(o);
761
762 switch (o->object.type) {
763
764 case OBJECT_DATA: {
765 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
766 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
767 le64toh(o->data.n_entries), offset);
768 return -EBADMSG;
769 }
770
771 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
772 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
773 offsetof(DataObject, payload),
774 le64toh(o->object.size),
775 offset);
776 return -EBADMSG;
777 }
778
779 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
780 !VALID64(le64toh(o->data.next_field_offset)) ||
781 !VALID64(le64toh(o->data.entry_offset)) ||
782 !VALID64(le64toh(o->data.entry_array_offset))) {
783 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
784 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
785 le64toh(o->data.next_hash_offset),
786 le64toh(o->data.next_field_offset),
787 le64toh(o->data.entry_offset),
788 le64toh(o->data.entry_array_offset),
789 offset);
790 return -EBADMSG;
791 }
792
793 break;
794 }
795
796 case OBJECT_FIELD:
797 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
798 log_debug(
799 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
800 offsetof(FieldObject, payload),
801 le64toh(o->object.size),
802 offset);
803 return -EBADMSG;
804 }
805
806 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
807 !VALID64(le64toh(o->field.head_data_offset))) {
808 log_debug(
809 "Invalid offset, next_hash_offset="OFSfmt
810 ", head_data_offset="OFSfmt": %"PRIu64,
811 le64toh(o->field.next_hash_offset),
812 le64toh(o->field.head_data_offset),
813 offset);
814 return -EBADMSG;
815 }
816 break;
817
818 case OBJECT_ENTRY:
819 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
820 log_debug(
821 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
822 offsetof(EntryObject, items),
823 le64toh(o->object.size),
824 offset);
825 return -EBADMSG;
826 }
827
828 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
829 log_debug(
830 "Invalid number items in entry: %"PRIu64": %"PRIu64,
831 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
832 offset);
833 return -EBADMSG;
834 }
835
836 if (le64toh(o->entry.seqnum) <= 0) {
837 log_debug(
838 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
839 le64toh(o->entry.seqnum),
840 offset);
841 return -EBADMSG;
842 }
843
844 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
845 log_debug(
846 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
847 le64toh(o->entry.realtime),
848 offset);
849 return -EBADMSG;
850 }
851
852 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
853 log_debug(
854 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
855 le64toh(o->entry.monotonic),
856 offset);
857 return -EBADMSG;
858 }
859
860 break;
861
862 case OBJECT_DATA_HASH_TABLE:
863 case OBJECT_FIELD_HASH_TABLE:
864 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
865 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
866 log_debug(
867 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
868 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
869 le64toh(o->object.size),
870 offset);
871 return -EBADMSG;
872 }
873
874 break;
875
876 case OBJECT_ENTRY_ARRAY:
877 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
878 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
879 log_debug(
880 "Invalid object entry array size: %"PRIu64": %"PRIu64,
881 le64toh(o->object.size),
882 offset);
883 return -EBADMSG;
884 }
885
886 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
887 log_debug(
888 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
889 le64toh(o->entry_array.next_entry_array_offset),
890 offset);
891 return -EBADMSG;
892 }
893
894 break;
895
896 case OBJECT_TAG:
897 if (le64toh(o->object.size) != sizeof(TagObject)) {
898 log_debug(
899 "Invalid object tag size: %"PRIu64": %"PRIu64,
900 le64toh(o->object.size),
901 offset);
902 return -EBADMSG;
903 }
904
905 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
906 log_debug(
907 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
908 le64toh(o->tag.epoch),
909 offset);
910 return -EBADMSG;
911 }
912
913 break;
914 }
915
916 return 0;
917 }
918
919 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
920 int r;
921 void *t;
922 size_t tsize;
923 Object *o;
924 uint64_t s;
925
926 assert(f);
927 assert(ret);
928
929 /* Objects may only be located at multiples of 64 bits */
930 if (!VALID64(offset)) {
931 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
932 return -EBADMSG;
933 }
934
935 /* Object may not be located in the file header */
936 if (offset < le64toh(f->header->header_size)) {
937 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
938 return -EBADMSG;
939 }
940
941 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
942 if (r < 0)
943 return r;
944
945 o = (Object*) t;
946 s = le64toh(o->object.size);
947
948 if (s == 0) {
949 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
950 return -EBADMSG;
951 }
952 if (s < sizeof(ObjectHeader)) {
953 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
954 return -EBADMSG;
955 }
956
957 if (o->object.type <= OBJECT_UNUSED) {
958 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
959 return -EBADMSG;
960 }
961
962 if (s < minimum_header_size(o)) {
963 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
964 return -EBADMSG;
965 }
966
967 if (type > OBJECT_UNUSED && o->object.type != type) {
968 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
969 return -EBADMSG;
970 }
971
972 if (s > tsize) {
973 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
974 if (r < 0)
975 return r;
976
977 o = (Object*) t;
978 }
979
980 r = journal_file_check_object(f, offset, o);
981 if (r < 0)
982 return r;
983
984 *ret = o;
985 return 0;
986 }
987
988 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
989 uint64_t r;
990
991 assert(f);
992 assert(f->header);
993
994 r = le64toh(f->header->tail_entry_seqnum) + 1;
995
996 if (seqnum) {
997 /* If an external seqnum counter was passed, we update
998 * both the local and the external one, and set it to
999 * the maximum of both */
1000
1001 if (*seqnum + 1 > r)
1002 r = *seqnum + 1;
1003
1004 *seqnum = r;
1005 }
1006
1007 f->header->tail_entry_seqnum = htole64(r);
1008
1009 if (f->header->head_entry_seqnum == 0)
1010 f->header->head_entry_seqnum = htole64(r);
1011
1012 return r;
1013 }
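/* Worked example for the seqnum logic above: if this file's tail_entry_seqnum is 10 but the
 * caller passes an external counter of 15 (e.g. shared across a set of journal files), the new
 * entry gets seqnum 16 and both the header and *seqnum are advanced to 16; without an external
 * counter it would simply get 11. */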
1014
1015 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
1016 int r;
1017 uint64_t p;
1018 Object *tail, *o;
1019 void *t;
1020
1021 assert(f);
1022 assert(f->header);
1023 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
1024 assert(size >= sizeof(ObjectHeader));
1025 assert(offset);
1026 assert(ret);
1027
1028 r = journal_file_set_online(f);
1029 if (r < 0)
1030 return r;
1031
1032 p = le64toh(f->header->tail_object_offset);
1033 if (p == 0)
1034 p = le64toh(f->header->header_size);
1035 else {
1036 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
1037 if (r < 0)
1038 return r;
1039
1040 p += ALIGN64(le64toh(tail->object.size));
1041 }
1042
1043 r = journal_file_allocate(f, p, size);
1044 if (r < 0)
1045 return r;
1046
1047 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
1048 if (r < 0)
1049 return r;
1050
1051 o = (Object*) t;
1052
1053 zero(o->object);
1054 o->object.type = type;
1055 o->object.size = htole64(size);
1056
1057 f->header->tail_object_offset = htole64(p);
1058 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1059
1060 *ret = o;
1061 *offset = p;
1062
1063 return 0;
1064 }
1065
1066 static int journal_file_setup_data_hash_table(JournalFile *f) {
1067 uint64_t s, p;
1068 Object *o;
1069 int r;
1070
1071 assert(f);
1072 assert(f->header);
1073
1074 /* We estimate that we need 1 hash table entry per 768 bytes
1075 of journal file and we want to make sure we never get
1076 beyond 75% fill level. Calculate the hash table size for
1077 the maximum file size based on these metrics. */
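1077         /* Worked example: for a file with metrics.max_size of 128 MiB this reserves roughly
1077            128 MiB / 768 ≈ 174k expected data objects, divided by the 75% target fill level,
1077            i.e. about 233k hash buckets (times sizeof(HashItem) bytes of table), with
1077            DEFAULT_DATA_HASH_TABLE_SIZE as the floor for small files. */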
1078
1079 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
1080 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1081 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1082
1083 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
1084
1085 r = journal_file_append_object(f,
1086 OBJECT_DATA_HASH_TABLE,
1087 offsetof(Object, hash_table.items) + s,
1088 &o, &p);
1089 if (r < 0)
1090 return r;
1091
1092 memzero(o->hash_table.items, s);
1093
1094 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1095 f->header->data_hash_table_size = htole64(s);
1096
1097 return 0;
1098 }
1099
1100 static int journal_file_setup_field_hash_table(JournalFile *f) {
1101 uint64_t s, p;
1102 Object *o;
1103 int r;
1104
1105 assert(f);
1106 assert(f->header);
1107
1108 /* We use a fixed-size hash table for the fields, as this
1109 * number should grow only very slowly. */
1110
1111 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1112 r = journal_file_append_object(f,
1113 OBJECT_FIELD_HASH_TABLE,
1114 offsetof(Object, hash_table.items) + s,
1115 &o, &p);
1116 if (r < 0)
1117 return r;
1118
1119 memzero(o->hash_table.items, s);
1120
1121 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1122 f->header->field_hash_table_size = htole64(s);
1123
1124 return 0;
1125 }
1126
1127 int journal_file_map_data_hash_table(JournalFile *f) {
1128 uint64_t s, p;
1129 void *t;
1130 int r;
1131
1132 assert(f);
1133 assert(f->header);
1134
1135 if (f->data_hash_table)
1136 return 0;
1137
1138 p = le64toh(f->header->data_hash_table_offset);
1139 s = le64toh(f->header->data_hash_table_size);
1140
1141 r = journal_file_move_to(f,
1142 OBJECT_DATA_HASH_TABLE,
1143 true,
1144 p, s,
1145 &t, NULL);
1146 if (r < 0)
1147 return r;
1148
1149 f->data_hash_table = t;
1150 return 0;
1151 }
1152
1153 int journal_file_map_field_hash_table(JournalFile *f) {
1154 uint64_t s, p;
1155 void *t;
1156 int r;
1157
1158 assert(f);
1159 assert(f->header);
1160
1161 if (f->field_hash_table)
1162 return 0;
1163
1164 p = le64toh(f->header->field_hash_table_offset);
1165 s = le64toh(f->header->field_hash_table_size);
1166
1167 r = journal_file_move_to(f,
1168 OBJECT_FIELD_HASH_TABLE,
1169 true,
1170 p, s,
1171 &t, NULL);
1172 if (r < 0)
1173 return r;
1174
1175 f->field_hash_table = t;
1176 return 0;
1177 }
1178
1179 static int journal_file_link_field(
1180 JournalFile *f,
1181 Object *o,
1182 uint64_t offset,
1183 uint64_t hash) {
1184
1185 uint64_t p, h, m;
1186 int r;
1187
1188 assert(f);
1189 assert(f->header);
1190 assert(f->field_hash_table);
1191 assert(o);
1192 assert(offset > 0);
1193
1194 if (o->object.type != OBJECT_FIELD)
1195 return -EINVAL;
1196
1197 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1198 if (m <= 0)
1199 return -EBADMSG;
1200
1201 /* This might alter the window we are looking at */
1202 o->field.next_hash_offset = o->field.head_data_offset = 0;
1203
1204 h = hash % m;
1205 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1206 if (p == 0)
1207 f->field_hash_table[h].head_hash_offset = htole64(offset);
1208 else {
1209 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1210 if (r < 0)
1211 return r;
1212
1213 o->field.next_hash_offset = htole64(offset);
1214 }
1215
1216 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1217
1218 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1219 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1220
1221 return 0;
1222 }
1223
1224 static int journal_file_link_data(
1225 JournalFile *f,
1226 Object *o,
1227 uint64_t offset,
1228 uint64_t hash) {
1229
1230 uint64_t p, h, m;
1231 int r;
1232
1233 assert(f);
1234 assert(f->header);
1235 assert(f->data_hash_table);
1236 assert(o);
1237 assert(offset > 0);
1238
1239 if (o->object.type != OBJECT_DATA)
1240 return -EINVAL;
1241
1242 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1243 if (m <= 0)
1244 return -EBADMSG;
1245
1246 /* This might alter the window we are looking at */
1247 o->data.next_hash_offset = o->data.next_field_offset = 0;
1248 o->data.entry_offset = o->data.entry_array_offset = 0;
1249 o->data.n_entries = 0;
1250
1251 h = hash % m;
1252 p = le64toh(f->data_hash_table[h].tail_hash_offset);
1253 if (p == 0)
1254 /* Only entry in the hash table is easy */
1255 f->data_hash_table[h].head_hash_offset = htole64(offset);
1256 else {
1257 /* Move back to the previous data object, to patch in
1258 * the pointer */
1259
1260 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1261 if (r < 0)
1262 return r;
1263
1264 o->data.next_hash_offset = htole64(offset);
1265 }
1266
1267 f->data_hash_table[h].tail_hash_offset = htole64(offset);
1268
1269 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1270 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1271
1272 return 0;
1273 }
1274
1275 int journal_file_find_field_object_with_hash(
1276 JournalFile *f,
1277 const void *field, uint64_t size, uint64_t hash,
1278 Object **ret, uint64_t *offset) {
1279
1280 uint64_t p, osize, h, m;
1281 int r;
1282
1283 assert(f);
1284 assert(f->header);
1285 assert(field && size > 0);
1286
1287 /* If the field hash table is empty, we can't find anything */
1288 if (le64toh(f->header->field_hash_table_size) <= 0)
1289 return 0;
1290
1291 /* Map the field hash table, if it isn't mapped yet. */
1292 r = journal_file_map_field_hash_table(f);
1293 if (r < 0)
1294 return r;
1295
1296 osize = offsetof(Object, field.payload) + size;
1297
1298 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1299 if (m <= 0)
1300 return -EBADMSG;
1301
1302 h = hash % m;
1303 p = le64toh(f->field_hash_table[h].head_hash_offset);
1304
1305 while (p > 0) {
1306 Object *o;
1307
1308 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1309 if (r < 0)
1310 return r;
1311
1312 if (le64toh(o->field.hash) == hash &&
1313 le64toh(o->object.size) == osize &&
1314 memcmp(o->field.payload, field, size) == 0) {
1315
1316 if (ret)
1317 *ret = o;
1318 if (offset)
1319 *offset = p;
1320
1321 return 1;
1322 }
1323
1324 p = le64toh(o->field.next_hash_offset);
1325 }
1326
1327 return 0;
1328 }
1329
1330 int journal_file_find_field_object(
1331 JournalFile *f,
1332 const void *field, uint64_t size,
1333 Object **ret, uint64_t *offset) {
1334
1335 uint64_t hash;
1336
1337 assert(f);
1338 assert(field && size > 0);
1339
1340 hash = hash64(field, size);
1341
1342 return journal_file_find_field_object_with_hash(f,
1343 field, size, hash,
1344 ret, offset);
1345 }
1346
1347 int journal_file_find_data_object_with_hash(
1348 JournalFile *f,
1349 const void *data, uint64_t size, uint64_t hash,
1350 Object **ret, uint64_t *offset) {
1351
1352 uint64_t p, osize, h, m;
1353 int r;
1354
1355 assert(f);
1356 assert(f->header);
1357 assert(data || size == 0);
1358
1359 /* If there's no data hash table, then there's no entry. */
1360 if (le64toh(f->header->data_hash_table_size) <= 0)
1361 return 0;
1362
1363 /* Map the data hash table, if it isn't mapped yet. */
1364 r = journal_file_map_data_hash_table(f);
1365 if (r < 0)
1366 return r;
1367
1368 osize = offsetof(Object, data.payload) + size;
1369
1370 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1371 if (m <= 0)
1372 return -EBADMSG;
1373
1374 h = hash % m;
1375 p = le64toh(f->data_hash_table[h].head_hash_offset);
1376
1377 while (p > 0) {
1378 Object *o;
1379
1380 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1381 if (r < 0)
1382 return r;
1383
1384 if (le64toh(o->data.hash) != hash)
1385 goto next;
1386
1387 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
1388 #if HAVE_XZ || HAVE_LZ4
1389 uint64_t l;
1390 size_t rsize = 0;
1391
1392 l = le64toh(o->object.size);
1393 if (l <= offsetof(Object, data.payload))
1394 return -EBADMSG;
1395
1396 l -= offsetof(Object, data.payload);
1397
1398 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1399 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1400 if (r < 0)
1401 return r;
1402
1403 if (rsize == size &&
1404 memcmp(f->compress_buffer, data, size) == 0) {
1405
1406 if (ret)
1407 *ret = o;
1408
1409 if (offset)
1410 *offset = p;
1411
1412 return 1;
1413 }
1414 #else
1415 return -EPROTONOSUPPORT;
1416 #endif
1417 } else if (le64toh(o->object.size) == osize &&
1418 memcmp(o->data.payload, data, size) == 0) {
1419
1420 if (ret)
1421 *ret = o;
1422
1423 if (offset)
1424 *offset = p;
1425
1426 return 1;
1427 }
1428
1429 next:
1430 p = le64toh(o->data.next_hash_offset);
1431 }
1432
1433 return 0;
1434 }
1435
1436 int journal_file_find_data_object(
1437 JournalFile *f,
1438 const void *data, uint64_t size,
1439 Object **ret, uint64_t *offset) {
1440
1441 uint64_t hash;
1442
1443 assert(f);
1444 assert(data || size == 0);
1445
1446 hash = hash64(data, size);
1447
1448 return journal_file_find_data_object_with_hash(f,
1449 data, size, hash,
1450 ret, offset);
1451 }
1452
1453 static int journal_file_append_field(
1454 JournalFile *f,
1455 const void *field, uint64_t size,
1456 Object **ret, uint64_t *offset) {
1457
1458 uint64_t hash, p;
1459 uint64_t osize;
1460 Object *o;
1461 int r;
1462
1463 assert(f);
1464 assert(field && size > 0);
1465
1466 hash = hash64(field, size);
1467
1468 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1469 if (r < 0)
1470 return r;
1471 else if (r > 0) {
1472
1473 if (ret)
1474 *ret = o;
1475
1476 if (offset)
1477 *offset = p;
1478
1479 return 0;
1480 }
1481
1482 osize = offsetof(Object, field.payload) + size;
1483 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1484 if (r < 0)
1485 return r;
1486
1487 o->field.hash = htole64(hash);
1488 memcpy(o->field.payload, field, size);
1489
1490 r = journal_file_link_field(f, o, p, hash);
1491 if (r < 0)
1492 return r;
1493
1494 /* The linking might have altered the window, so let's
1495 * refresh our pointer */
1496 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1497 if (r < 0)
1498 return r;
1499
1500 #if HAVE_GCRYPT
1501 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1502 if (r < 0)
1503 return r;
1504 #endif
1505
1506 if (ret)
1507 *ret = o;
1508
1509 if (offset)
1510 *offset = p;
1511
1512 return 0;
1513 }
1514
1515 static int journal_file_append_data(
1516 JournalFile *f,
1517 const void *data, uint64_t size,
1518 Object **ret, uint64_t *offset) {
1519
1520 uint64_t hash, p;
1521 uint64_t osize;
1522 Object *o;
1523 int r, compression = 0;
1524 const void *eq;
1525
1526 assert(f);
1527 assert(data || size == 0);
1528
1529 hash = hash64(data, size);
1530
1531 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1532 if (r < 0)
1533 return r;
1534 if (r > 0) {
1535
1536 if (ret)
1537 *ret = o;
1538
1539 if (offset)
1540 *offset = p;
1541
1542 return 0;
1543 }
1544
1545 osize = offsetof(Object, data.payload) + size;
1546 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1547 if (r < 0)
1548 return r;
1549
1550 o->data.hash = htole64(hash);
1551
1552 #if HAVE_XZ || HAVE_LZ4
1553 if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
1554 size_t rsize = 0;
1555
1556 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1557
1558 if (compression >= 0) {
1559 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1560 o->object.flags |= compression;
1561
1562 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1563 size, rsize, object_compressed_to_string(compression));
1564 } else
1565 /* Compression didn't work, we don't really care why, let's continue without compression */
1566 compression = 0;
1567 }
1568 #endif
1569
1570 if (compression == 0)
1571 memcpy_safe(o->data.payload, data, size);
1572
1573 r = journal_file_link_data(f, o, p, hash);
1574 if (r < 0)
1575 return r;
1576
1577 #if HAVE_GCRYPT
1578 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1579 if (r < 0)
1580 return r;
1581 #endif
1582
1583 /* The linking might have altered the window, so let's
1584 * refresh our pointer */
1585 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1586 if (r < 0)
1587 return r;
1588
1589 if (!data)
1590 eq = NULL;
1591 else
1592 eq = memchr(data, '=', size);
1593 if (eq && eq > data) {
1594 Object *fo = NULL;
1595 uint64_t fp;
1596
1597 /* Create field object ... */
1598 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1599 if (r < 0)
1600 return r;
1601
1602 /* ... and link it in. */
1603 o->data.next_field_offset = fo->field.head_data_offset;
1604 fo->field.head_data_offset = le64toh(p);
1605 }
1606
1607 if (ret)
1608 *ret = o;
1609
1610 if (offset)
1611 *offset = p;
1612
1613 return 0;
1614 }
1615
1616 uint64_t journal_file_entry_n_items(Object *o) {
1617 assert(o);
1618
1619 if (o->object.type != OBJECT_ENTRY)
1620 return 0;
1621
1622 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1623 }
1624
1625 uint64_t journal_file_entry_array_n_items(Object *o) {
1626 assert(o);
1627
1628 if (o->object.type != OBJECT_ENTRY_ARRAY)
1629 return 0;
1630
1631 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1632 }
1633
1634 uint64_t journal_file_hash_table_n_items(Object *o) {
1635 assert(o);
1636
1637 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1638 return 0;
1639
1640 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1641 }
1642
1643 static int link_entry_into_array(JournalFile *f,
1644 le64_t *first,
1645 le64_t *idx,
1646 uint64_t p) {
1647 int r;
1648 uint64_t n = 0, ap = 0, q, i, a, hidx;
1649 Object *o;
1650
1651 assert(f);
1652 assert(f->header);
1653 assert(first);
1654 assert(idx);
1655 assert(p > 0);
1656
1657 a = le64toh(*first);
1658 i = hidx = le64toh(*idx);
1659 while (a > 0) {
1660
1661 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1662 if (r < 0)
1663 return r;
1664
1665 n = journal_file_entry_array_n_items(o);
1666 if (i < n) {
1667 o->entry_array.items[i] = htole64(p);
1668 *idx = htole64(hidx + 1);
1669 return 0;
1670 }
1671
1672 i -= n;
1673 ap = a;
1674 a = le64toh(o->entry_array.next_entry_array_offset);
1675 }
1676
1677 if (hidx > n)
1678 n = (hidx+1) * 2;
1679 else
1680 n = n * 2;
1681
1682 if (n < 4)
1683 n = 4;
1684
1685 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1686 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1687 &o, &q);
1688 if (r < 0)
1689 return r;
1690
1691 #if HAVE_GCRYPT
1692 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1693 if (r < 0)
1694 return r;
1695 #endif
1696
1697 o->entry_array.items[i] = htole64(p);
1698
1699 if (ap == 0)
1700 *first = htole64(q);
1701 else {
1702 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1703 if (r < 0)
1704 return r;
1705
1706 o->entry_array.next_entry_array_offset = htole64(q);
1707 }
1708
1709 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1710 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1711
1712 *idx = htole64(hidx + 1);
1713
1714 return 0;
1715 }
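/* Note on the growth pattern above: an entry array chain starts with a 4-item array and each
 * newly appended array doubles the item count of the previous one (4, 8, 16, ...), or jumps to
 * (hidx+1)*2 items when the requested index is already well past the last array, so the number
 * of arrays that must be chased grows only logarithmically with the number of entries. */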
1716
1717 static int link_entry_into_array_plus_one(JournalFile *f,
1718 le64_t *extra,
1719 le64_t *first,
1720 le64_t *idx,
1721 uint64_t p) {
1722
1723 int r;
1724
1725 assert(f);
1726 assert(extra);
1727 assert(first);
1728 assert(idx);
1729 assert(p > 0);
1730
1731 if (*idx == 0)
1732 *extra = htole64(p);
1733 else {
1734 le64_t i;
1735
1736 i = htole64(le64toh(*idx) - 1);
1737 r = link_entry_into_array(f, first, &i, p);
1738 if (r < 0)
1739 return r;
1740 }
1741
1742 *idx = htole64(le64toh(*idx) + 1);
1743 return 0;
1744 }
1745
1746 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1747 uint64_t p;
1748 int r;
1749 assert(f);
1750 assert(o);
1751 assert(offset > 0);
1752
1753 p = le64toh(o->entry.items[i].object_offset);
1754 if (p == 0)
1755 return -EINVAL;
1756
1757 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1758 if (r < 0)
1759 return r;
1760
1761 return link_entry_into_array_plus_one(f,
1762 &o->data.entry_offset,
1763 &o->data.entry_array_offset,
1764 &o->data.n_entries,
1765 offset);
1766 }
1767
1768 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1769 uint64_t n, i;
1770 int r;
1771
1772 assert(f);
1773 assert(f->header);
1774 assert(o);
1775 assert(offset > 0);
1776
1777 if (o->object.type != OBJECT_ENTRY)
1778 return -EINVAL;
1779
1780 __sync_synchronize();
1781
1782 /* Link up the entry itself */
1783 r = link_entry_into_array(f,
1784 &f->header->entry_array_offset,
1785 &f->header->n_entries,
1786 offset);
1787 if (r < 0)
1788 return r;
1789
1790 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1791
1792 if (f->header->head_entry_realtime == 0)
1793 f->header->head_entry_realtime = o->entry.realtime;
1794
1795 f->header->tail_entry_realtime = o->entry.realtime;
1796 f->header->tail_entry_monotonic = o->entry.monotonic;
1797
1798 /* Link up the items */
1799 n = journal_file_entry_n_items(o);
1800 for (i = 0; i < n; i++) {
1801 r = journal_file_link_entry_item(f, o, offset, i);
1802 if (r < 0)
1803 return r;
1804 }
1805
1806 return 0;
1807 }
1808
1809 static int journal_file_append_entry_internal(
1810 JournalFile *f,
1811 const dual_timestamp *ts,
1812 uint64_t xor_hash,
1813 const EntryItem items[], unsigned n_items,
1814 uint64_t *seqnum,
1815 Object **ret, uint64_t *offset) {
1816 uint64_t np;
1817 uint64_t osize;
1818 Object *o;
1819 int r;
1820
1821 assert(f);
1822 assert(f->header);
1823 assert(items || n_items == 0);
1824 assert(ts);
1825
1826 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1827
1828 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1829 if (r < 0)
1830 return r;
1831
1832 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1833 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
1834 o->entry.realtime = htole64(ts->realtime);
1835 o->entry.monotonic = htole64(ts->monotonic);
1836 o->entry.xor_hash = htole64(xor_hash);
1837 o->entry.boot_id = f->header->boot_id;
1838
1839 #if HAVE_GCRYPT
1840 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1841 if (r < 0)
1842 return r;
1843 #endif
1844
1845 r = journal_file_link_entry(f, o, np);
1846 if (r < 0)
1847 return r;
1848
1849 if (ret)
1850 *ret = o;
1851
1852 if (offset)
1853 *offset = np;
1854
1855 return 0;
1856 }
1857
1858 void journal_file_post_change(JournalFile *f) {
1859 assert(f);
1860
1861 /* inotify() does not receive IN_MODIFY events from file
1862 * accesses done via mmap(). After each access we hence
1863 * trigger IN_MODIFY by truncating the journal file to its
1864 * current size. */
1865
1866 __sync_synchronize();
1867
1868 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1869 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1870 }
1871
1872 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1873 assert(userdata);
1874
1875 journal_file_post_change(userdata);
1876
1877 return 1;
1878 }
1879
1880 static void schedule_post_change(JournalFile *f) {
1881 sd_event_source *timer;
1882 int enabled, r;
1883 uint64_t now;
1884
1885 assert(f);
1886 assert(f->post_change_timer);
1887
1888 timer = f->post_change_timer;
1889
1890 r = sd_event_source_get_enabled(timer, &enabled);
1891 if (r < 0) {
1892 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1893 goto fail;
1894 }
1895
1896 if (enabled == SD_EVENT_ONESHOT)
1897 return;
1898
1899 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1900 if (r < 0) {
1901 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1902 goto fail;
1903 }
1904
1905 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1906 if (r < 0) {
1907 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1908 goto fail;
1909 }
1910
1911 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1912 if (r < 0) {
1913 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1914 goto fail;
1915 }
1916
1917 return;
1918
1919 fail:
1920 /* On failure, let's simply post the change immediately. */
1921 journal_file_post_change(f);
1922 }
1923
1924 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1925 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1926 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1927 int r;
1928
1929 assert(f);
1930 assert_return(!f->post_change_timer, -EINVAL);
1931 assert(e);
1932 assert(t);
1933
1934 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1935 if (r < 0)
1936 return r;
1937
1938 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1939 if (r < 0)
1940 return r;
1941
1942 f->post_change_timer = timer;
1943 timer = NULL;
1944 f->post_change_timer_period = t;
1945
1946 return r;
1947 }
1948
1949 static int entry_item_cmp(const void *_a, const void *_b) {
1950 const EntryItem *a = _a, *b = _b;
1951
1952 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1953 return -1;
1954 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1955 return 1;
1956 return 0;
1957 }
1958
1959 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1960 unsigned i;
1961 EntryItem *items;
1962 int r;
1963 uint64_t xor_hash = 0;
1964 struct dual_timestamp _ts;
1965
1966 assert(f);
1967 assert(f->header);
1968 assert(iovec || n_iovec == 0);
1969
1970 if (!ts) {
1971 dual_timestamp_get(&_ts);
1972 ts = &_ts;
1973 }
1974
1975 #if HAVE_GCRYPT
1976 r = journal_file_maybe_append_tag(f, ts->realtime);
1977 if (r < 0)
1978 return r;
1979 #endif
1980
1981 /* alloca() can't take 0, hence let's allocate at least one */
1982 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1983
1984 for (i = 0; i < n_iovec; i++) {
1985 uint64_t p;
1986 Object *o;
1987
1988 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1989 if (r < 0)
1990 return r;
1991
1992 xor_hash ^= le64toh(o->data.hash);
1993 items[i].object_offset = htole64(p);
1994 items[i].hash = o->data.hash;
1995 }
1996
1997 /* Order by the position on disk, in order to improve seek
1998 * times for rotating media. */
1999 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
2000
2001 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
2002
2003 /* If the memory mapping triggered a SIGBUS then we return an
2004 * IO error and ignore the error code passed down to us, since
2005 * it is very likely just an effect of a nullified replacement
2006 * mapping page */
2007
2008 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
2009 r = -EIO;
2010
2011 if (f->post_change_timer)
2012 schedule_post_change(f);
2013 else
2014 journal_file_post_change(f);
2015
2016 return r;
2017 }
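/* Usage sketch (illustrative, not part of the original source): a caller typically feeds complete
 * "FIELD=value" items, e.g.:
 *
 *     struct iovec iov[2] = {
 *             { (char*) "MESSAGE=hello", strlen("MESSAGE=hello") },
 *             { (char*) "PRIORITY=6",    strlen("PRIORITY=6") },
 *     };
 *     r = journal_file_append_entry(f, NULL, iov, 2, &seqnum, NULL, NULL);
 *
 * Passing NULL for the timestamp makes the call stamp the entry with the current realtime and
 * monotonic clocks, as done above with dual_timestamp_get(). */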
2018
2019 typedef struct ChainCacheItem {
2020 uint64_t first; /* the array at the beginning of the chain */
2021 uint64_t array; /* the cached array */
2022 uint64_t begin; /* the first item in the cached array */
2023 uint64_t total; /* the total number of items in all arrays before this one in the chain */
2024 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
2025 } ChainCacheItem;
2026
2027 static void chain_cache_put(
2028 OrderedHashmap *h,
2029 ChainCacheItem *ci,
2030 uint64_t first,
2031 uint64_t array,
2032 uint64_t begin,
2033 uint64_t total,
2034 uint64_t last_index) {
2035
2036 if (!ci) {
2037 /* If the chain item to cache for this chain is the
2038 * first one, it's not worth caching anything */
2039 if (array == first)
2040 return;
2041
2042 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2043 ci = ordered_hashmap_steal_first(h);
2044 assert(ci);
2045 } else {
2046 ci = new(ChainCacheItem, 1);
2047 if (!ci)
2048 return;
2049 }
2050
2051 ci->first = first;
2052
2053 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2054 free(ci);
2055 return;
2056 }
2057 } else
2058 assert(ci->first == first);
2059
2060 ci->array = array;
2061 ci->begin = begin;
2062 ci->total = total;
2063 ci->last_index = last_index;
2064 }
2065
2066 static int generic_array_get(
2067 JournalFile *f,
2068 uint64_t first,
2069 uint64_t i,
2070 Object **ret, uint64_t *offset) {
2071
2072 Object *o;
2073 uint64_t p = 0, a, t = 0;
2074 int r;
2075 ChainCacheItem *ci;
2076
2077 assert(f);
2078
2079 a = first;
2080
2081 /* Try the chain cache first */
2082 ci = ordered_hashmap_get(f->chain_cache, &first);
2083 if (ci && i > ci->total) {
2084 a = ci->array;
2085 i -= ci->total;
2086 t = ci->total;
2087 }
2088
2089 while (a > 0) {
2090 uint64_t k;
2091
2092 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2093 if (r < 0)
2094 return r;
2095
2096 k = journal_file_entry_array_n_items(o);
2097 if (i < k) {
2098 p = le64toh(o->entry_array.items[i]);
2099 goto found;
2100 }
2101
2102 i -= k;
2103 t += k;
2104 a = le64toh(o->entry_array.next_entry_array_offset);
2105 }
2106
2107 return 0;
2108
2109 found:
2110 /* Let's cache this item for the next invocation */
2111 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
2112
2113 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2114 if (r < 0)
2115 return r;
2116
2117 if (ret)
2118 *ret = o;
2119
2120 if (offset)
2121 *offset = p;
2122
2123 return 1;
2124 }
2125
2126 static int generic_array_get_plus_one(
2127 JournalFile *f,
2128 uint64_t extra,
2129 uint64_t first,
2130 uint64_t i,
2131 Object **ret, uint64_t *offset) {
2132
2133 Object *o;
2134
2135 assert(f);
2136
2137 if (i == 0) {
2138 int r;
2139
2140 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2141 if (r < 0)
2142 return r;
2143
2144 if (ret)
2145 *ret = o;
2146
2147 if (offset)
2148 *offset = extra;
2149
2150 return 1;
2151 }
2152
2153 return generic_array_get(f, first, i-1, ret, offset);
2154 }
2155
2156 enum {
2157 TEST_FOUND,
2158 TEST_LEFT,
2159 TEST_RIGHT
2160 };
2161
2162 static int generic_array_bisect(
2163 JournalFile *f,
2164 uint64_t first,
2165 uint64_t n,
2166 uint64_t needle,
2167 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2168 direction_t direction,
2169 Object **ret,
2170 uint64_t *offset,
2171 uint64_t *idx) {
2172
2173 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
2174 bool subtract_one = false;
2175 Object *o, *array = NULL;
2176 int r;
2177 ChainCacheItem *ci;
2178
2179 assert(f);
2180 assert(test_object);
2181
2182 /* Start with the first array in the chain */
2183 a = first;
2184
2185 ci = ordered_hashmap_get(f->chain_cache, &first);
2186 if (ci && n > ci->total) {
2187 /* Ah, we have iterated this bisection array chain
2188 * previously! Let's see if we can skip ahead in the
2189 * chain, as far as the last time. But we can't jump
2190 * backwards in the chain, so let's check that
2191 * first. */
2192
2193 r = test_object(f, ci->begin, needle);
2194 if (r < 0)
2195 return r;
2196
2197 if (r == TEST_LEFT) {
2198 /* OK, what we are looking for is to the right of
2199 * the beginning of this EntryArray, so let's jump
2200 * straight to the previously cached array in the
2201 * chain */
2202
2203 a = ci->array;
2204 n -= ci->total;
2205 t = ci->total;
2206 last_index = ci->last_index;
2207 }
2208 }
2209
2210 while (a > 0) {
2211 uint64_t left, right, k, lp;
2212
2213 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
2214 if (r < 0)
2215 return r;
2216
2217 k = journal_file_entry_array_n_items(array);
2218 right = MIN(k, n);
2219 if (right <= 0)
2220 return 0;
2221
2222 i = right - 1;
2223 lp = p = le64toh(array->entry_array.items[i]);
2224 if (p <= 0)
2225 r = -EBADMSG;
2226 else
2227 r = test_object(f, p, needle);
2228 if (r == -EBADMSG) {
2229 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2230 n = i;
2231 continue;
2232 }
2233 if (r < 0)
2234 return r;
2235
2236 if (r == TEST_FOUND)
2237 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2238
2239 if (r == TEST_RIGHT) {
2240 left = 0;
2241 right -= 1;
2242
2243 if (last_index != (uint64_t) -1) {
2244 assert(last_index <= right);
2245
2246 /* If we cached the last index we
2247 * looked at, let's try not to jump
2248 * around too wildly and see if we can
2249 * narrow the range to look at early to
2250 * the immediate neighbors of the last
2251 * index we looked at. */
2252
2253 if (last_index > 0) {
2254 uint64_t x = last_index - 1;
2255
2256 p = le64toh(array->entry_array.items[x]);
2257 if (p <= 0)
2258 return -EBADMSG;
2259
2260 r = test_object(f, p, needle);
2261 if (r < 0)
2262 return r;
2263
2264 if (r == TEST_FOUND)
2265 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2266
2267 if (r == TEST_RIGHT)
2268 right = x;
2269 else
2270 left = x + 1;
2271 }
2272
2273 if (last_index < right) {
2274 uint64_t y = last_index + 1;
2275
2276 p = le64toh(array->entry_array.items[y]);
2277 if (p <= 0)
2278 return -EBADMSG;
2279
2280 r = test_object(f, p, needle);
2281 if (r < 0)
2282 return r;
2283
2284 if (r == TEST_FOUND)
2285 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2286
2287 if (r == TEST_RIGHT)
2288 right = y;
2289 else
2290 left = y + 1;
2291 }
2292 }
2293
2294 for (;;) {
2295 if (left == right) {
2296 if (direction == DIRECTION_UP)
2297 subtract_one = true;
2298
2299 i = left;
2300 goto found;
2301 }
2302
2303 assert(left < right);
2304 i = (left + right) / 2;
2305
2306 p = le64toh(array->entry_array.items[i]);
2307 if (p <= 0)
2308 r = -EBADMSG;
2309 else
2310 r = test_object(f, p, needle);
2311 if (r == -EBADMSG) {
2312 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2313 right = n = i;
2314 continue;
2315 }
2316 if (r < 0)
2317 return r;
2318
2319 if (r == TEST_FOUND)
2320 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2321
2322 if (r == TEST_RIGHT)
2323 right = i;
2324 else
2325 left = i + 1;
2326 }
2327 }
2328
2329 if (k >= n) {
2330 if (direction == DIRECTION_UP) {
2331 i = n;
2332 subtract_one = true;
2333 goto found;
2334 }
2335
2336 return 0;
2337 }
2338
2339 last_p = lp;
2340
2341 n -= k;
2342 t += k;
2343 last_index = (uint64_t) -1;
2344 a = le64toh(array->entry_array.next_entry_array_offset);
2345 }
2346
2347 return 0;
2348
2349 found:
2350 if (subtract_one && t == 0 && i == 0)
2351 return 0;
2352
2353 /* Let's cache this item for the next invocation */
2354 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
2355
2356 if (subtract_one && i == 0)
2357 p = last_p;
2358 else if (subtract_one)
2359 p = le64toh(array->entry_array.items[i-1]);
2360 else
2361 p = le64toh(array->entry_array.items[i]);
2362
2363 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2364 if (r < 0)
2365 return r;
2366
2367 if (ret)
2368 *ret = o;
2369
2370 if (offset)
2371 *offset = p;
2372
2373 if (idx)
2374 *idx = t + i + (subtract_one ? -1 : 0);
2375
2376 return 1;
2377 }
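
/* Illustrative sketch, not part of the original source: the TEST_LEFT/TEST_RIGHT
 * convention of generic_array_bisect(), applied to a plain sorted uint64_t array
 * in memory. TEST_LEFT means the probed item lies left of the needle (item <
 * needle), TEST_RIGHT that it lies at or right of it. Folding TEST_FOUND into
 * TEST_RIGHT, as done above for DIRECTION_DOWN, makes the search converge on the
 * earliest item at or after the needle. Function names are hypothetical. */
static int example_test_value(uint64_t item, uint64_t needle) {
        if (item == needle)
                return TEST_FOUND;
        if (item < needle)
                return TEST_LEFT;
        return TEST_RIGHT;
}

static int example_bisect_down(const uint64_t *items, uint64_t n, uint64_t needle, uint64_t *ret_index) {
        uint64_t left = 0, right = n;

        while (left < right) {
                uint64_t i = (left + right) / 2;
                int r = example_test_value(items[i], needle);

                if (r == TEST_FOUND)
                        r = TEST_RIGHT;  /* DIRECTION_DOWN: keep looking further left */

                if (r == TEST_RIGHT)
                        right = i;
                else
                        left = i + 1;
        }

        if (left >= n)
                return 0;                /* nothing at or after the needle */

        *ret_index = left;
        return 1;
}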
2378
2379 static int generic_array_bisect_plus_one(
2380 JournalFile *f,
2381 uint64_t extra,
2382 uint64_t first,
2383 uint64_t n,
2384 uint64_t needle,
2385 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2386 direction_t direction,
2387 Object **ret,
2388 uint64_t *offset,
2389 uint64_t *idx) {
2390
2391 int r;
2392 bool step_back = false;
2393 Object *o;
2394
2395 assert(f);
2396 assert(test_object);
2397
2398 if (n <= 0)
2399 return 0;
2400
2401 /* This bisects the array in object 'first', but first checks
2402 * the extra entry pointed to by 'extra'. */
2403 r = test_object(f, extra, needle);
2404 if (r < 0)
2405 return r;
2406
2407 if (r == TEST_FOUND)
2408 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2409
2410 /* If we are looking with DIRECTION_UP then we need to first
2411 check whether the actual array contains a matching entry, and
2412 return the last of those. But if there isn't any we need
2413 to return this one. Hence remember this, and return it
2414 below. */
2415 if (r == TEST_LEFT)
2416 step_back = direction == DIRECTION_UP;
2417
2418 if (r == TEST_RIGHT) {
2419 if (direction == DIRECTION_DOWN)
2420 goto found;
2421 else
2422 return 0;
2423 }
2424
2425 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2426
2427 if (r == 0 && step_back)
2428 goto found;
2429
2430 if (r > 0 && idx)
2431 (*idx)++;
2432
2433 return r;
2434
2435 found:
2436 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2437 if (r < 0)
2438 return r;
2439
2440 if (ret)
2441 *ret = o;
2442
2443 if (offset)
2444 *offset = extra;
2445
2446 if (idx)
2447 *idx = 0;
2448
2449 return 1;
2450 }
2451
2452 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2453 assert(f);
2454 assert(p > 0);
2455
2456 if (p == needle)
2457 return TEST_FOUND;
2458 else if (p < needle)
2459 return TEST_LEFT;
2460 else
2461 return TEST_RIGHT;
2462 }
2463
2464 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2465 Object *o;
2466 int r;
2467
2468 assert(f);
2469 assert(p > 0);
2470
2471 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2472 if (r < 0)
2473 return r;
2474
2475 if (le64toh(o->entry.seqnum) == needle)
2476 return TEST_FOUND;
2477 else if (le64toh(o->entry.seqnum) < needle)
2478 return TEST_LEFT;
2479 else
2480 return TEST_RIGHT;
2481 }
2482
2483 int journal_file_move_to_entry_by_seqnum(
2484 JournalFile *f,
2485 uint64_t seqnum,
2486 direction_t direction,
2487 Object **ret,
2488 uint64_t *offset) {
2489 assert(f);
2490 assert(f->header);
2491
2492 return generic_array_bisect(f,
2493 le64toh(f->header->entry_array_offset),
2494 le64toh(f->header->n_entries),
2495 seqnum,
2496 test_object_seqnum,
2497 direction,
2498 ret, offset, NULL);
2499 }
2500
2501 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2502 Object *o;
2503 int r;
2504
2505 assert(f);
2506 assert(p > 0);
2507
2508 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2509 if (r < 0)
2510 return r;
2511
2512 if (le64toh(o->entry.realtime) == needle)
2513 return TEST_FOUND;
2514 else if (le64toh(o->entry.realtime) < needle)
2515 return TEST_LEFT;
2516 else
2517 return TEST_RIGHT;
2518 }
2519
2520 int journal_file_move_to_entry_by_realtime(
2521 JournalFile *f,
2522 uint64_t realtime,
2523 direction_t direction,
2524 Object **ret,
2525 uint64_t *offset) {
2526 assert(f);
2527 assert(f->header);
2528
2529 return generic_array_bisect(f,
2530 le64toh(f->header->entry_array_offset),
2531 le64toh(f->header->n_entries),
2532 realtime,
2533 test_object_realtime,
2534 direction,
2535 ret, offset, NULL);
2536 }
2537
2538 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2539 Object *o;
2540 int r;
2541
2542 assert(f);
2543 assert(p > 0);
2544
2545 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2546 if (r < 0)
2547 return r;
2548
2549 if (le64toh(o->entry.monotonic) == needle)
2550 return TEST_FOUND;
2551 else if (le64toh(o->entry.monotonic) < needle)
2552 return TEST_LEFT;
2553 else
2554 return TEST_RIGHT;
2555 }
2556
2557 static int find_data_object_by_boot_id(
2558 JournalFile *f,
2559 sd_id128_t boot_id,
2560 Object **o,
2561 uint64_t *b) {
2562
2563 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2564
2565 sd_id128_to_string(boot_id, t + 9);
2566 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2567 }
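
/* Illustrative sketch, not part of the original source: what the match string
 * composed above looks like. sd_id128_to_string() writes 32 lowercase hex
 * characters plus a trailing NUL, hence the buffer holds exactly "_BOOT_ID="
 * followed by 32 hex digits, and sizeof(t) - 1 passes the key without the NUL.
 * The boot id below is an arbitrary example value. */
static void example_boot_id_key(void) {
        sd_id128_t id = SD_ID128_MAKE(de,ad,be,ef,de,ad,be,ef,de,ad,be,ef,de,ad,be,ef);
        char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";

        sd_id128_to_string(id, t + STRLEN("_BOOT_ID="));

        /* t now contains "_BOOT_ID=deadbeefdeadbeefdeadbeefdeadbeef" */
}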
2568
2569 int journal_file_move_to_entry_by_monotonic(
2570 JournalFile *f,
2571 sd_id128_t boot_id,
2572 uint64_t monotonic,
2573 direction_t direction,
2574 Object **ret,
2575 uint64_t *offset) {
2576
2577 Object *o;
2578 int r;
2579
2580 assert(f);
2581
2582 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2583 if (r < 0)
2584 return r;
2585 if (r == 0)
2586 return -ENOENT;
2587
2588 return generic_array_bisect_plus_one(f,
2589 le64toh(o->data.entry_offset),
2590 le64toh(o->data.entry_array_offset),
2591 le64toh(o->data.n_entries),
2592 monotonic,
2593 test_object_monotonic,
2594 direction,
2595 ret, offset, NULL);
2596 }
2597
2598 void journal_file_reset_location(JournalFile *f) {
2599 f->location_type = LOCATION_HEAD;
2600 f->current_offset = 0;
2601 f->current_seqnum = 0;
2602 f->current_realtime = 0;
2603 f->current_monotonic = 0;
2604 zero(f->current_boot_id);
2605 f->current_xor_hash = 0;
2606 }
2607
2608 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2609 f->location_type = LOCATION_SEEK;
2610 f->current_offset = offset;
2611 f->current_seqnum = le64toh(o->entry.seqnum);
2612 f->current_realtime = le64toh(o->entry.realtime);
2613 f->current_monotonic = le64toh(o->entry.monotonic);
2614 f->current_boot_id = o->entry.boot_id;
2615 f->current_xor_hash = le64toh(o->entry.xor_hash);
2616 }
2617
2618 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2619 assert(af);
2620 assert(af->header);
2621 assert(bf);
2622 assert(bf->header);
2623 assert(af->location_type == LOCATION_SEEK);
2624 assert(bf->location_type == LOCATION_SEEK);
2625
2626 /* If contents and timestamps match, these entries are
2627 * identical, even if the seqnum does not match */
2628 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2629 af->current_monotonic == bf->current_monotonic &&
2630 af->current_realtime == bf->current_realtime &&
2631 af->current_xor_hash == bf->current_xor_hash)
2632 return 0;
2633
2634 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2635
2636 /* If this is from the same seqnum source, compare
2637 * seqnums */
2638 if (af->current_seqnum < bf->current_seqnum)
2639 return -1;
2640 if (af->current_seqnum > bf->current_seqnum)
2641 return 1;
2642
2643 /* Wow! This is weird, different data but the same
2644 * seqnums? Something is borked, but let's make the
2645 * best of it and compare by time. */
2646 }
2647
2648 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2649
2650 /* If the boot id matches, compare monotonic time */
2651 if (af->current_monotonic < bf->current_monotonic)
2652 return -1;
2653 if (af->current_monotonic > bf->current_monotonic)
2654 return 1;
2655 }
2656
2657 /* Otherwise, compare UTC time */
2658 if (af->current_realtime < bf->current_realtime)
2659 return -1;
2660 if (af->current_realtime > bf->current_realtime)
2661 return 1;
2662
2663 /* Finally, compare by contents */
2664 if (af->current_xor_hash < bf->current_xor_hash)
2665 return -1;
2666 if (af->current_xor_hash > bf->current_xor_hash)
2667 return 1;
2668
2669 return 0;
2670 }
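
/* Illustrative usage sketch, not part of the original source: how a reader that
 * interleaves several journal files might use the comparison above to decide
 * whose current entry comes first. Both files are assumed to have had
 * journal_file_save_location() called on them, as the asserts require. The
 * helper name is hypothetical. */
static JournalFile* example_earlier_of(JournalFile *a, JournalFile *b) {
        return journal_file_compare_locations(a, b) <= 0 ? a : b;
}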
2671
2672 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2673
2674 /* Increase or decrease the specified index, in the right direction. */
2675
2676 if (direction == DIRECTION_DOWN) {
2677 if (*i >= n - 1)
2678 return 0;
2679
2680 (*i) ++;
2681 } else {
2682 if (*i <= 0)
2683 return 0;
2684
2685 (*i) --;
2686 }
2687
2688 return 1;
2689 }
2690
2691 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2692
2693 /* Consider it an error if either of the two offsets is uninitialized */
2694 if (old_offset == 0 || new_offset == 0)
2695 return false;
2696
2697 /* If we go down, the new offset must be larger than the old one. */
2698 return direction == DIRECTION_DOWN ?
2699 new_offset > old_offset :
2700 new_offset < old_offset;
2701 }
2702
2703 int journal_file_next_entry(
2704 JournalFile *f,
2705 uint64_t p,
2706 direction_t direction,
2707 Object **ret, uint64_t *offset) {
2708
2709 uint64_t i, n, ofs;
2710 int r;
2711
2712 assert(f);
2713 assert(f->header);
2714
2715 n = le64toh(f->header->n_entries);
2716 if (n <= 0)
2717 return 0;
2718
2719 if (p == 0)
2720 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2721 else {
2722 r = generic_array_bisect(f,
2723 le64toh(f->header->entry_array_offset),
2724 le64toh(f->header->n_entries),
2725 p,
2726 test_object_offset,
2727 DIRECTION_DOWN,
2728 NULL, NULL,
2729 &i);
2730 if (r <= 0)
2731 return r;
2732
2733 r = bump_array_index(&i, direction, n);
2734 if (r <= 0)
2735 return r;
2736 }
2737
2738 /* And jump to it */
2739 for (;;) {
2740 r = generic_array_get(f,
2741 le64toh(f->header->entry_array_offset),
2742 i,
2743 ret, &ofs);
2744 if (r > 0)
2745 break;
2746 if (r != -EBADMSG)
2747 return r;
2748
2749 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2750 * the next one might work for us instead. */
2751 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2752
2753 r = bump_array_index(&i, direction, n);
2754 if (r <= 0)
2755 return r;
2756 }
2757
2758 /* Ensure our array is properly ordered. */
2759 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2760 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
2761 return -EBADMSG;
2762 }
2763
2764 if (offset)
2765 *offset = ofs;
2766
2767 return 1;
2768 }
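
/* Illustrative usage sketch, not part of the original source: walking all
 * entries of a file front to back with the iterator above. Passing p == 0
 * starts at the head for DIRECTION_DOWN; afterwards the returned offset is fed
 * back in as the new position. The function name is hypothetical and error
 * handling is reduced to the minimum. */
static int example_walk_entries(JournalFile *f) {
        uint64_t p = 0;
        Object *o;
        int r;

        for (;;) {
                r = journal_file_next_entry(f, p, DIRECTION_DOWN, &o, &p);
                if (r < 0)
                        return r;        /* hard error */
                if (r == 0)
                        return 0;        /* no more entries */

                /* o now points at the next OBJECT_ENTRY, p at its offset */
        }
}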
2769
2770 int journal_file_next_entry_for_data(
2771 JournalFile *f,
2772 Object *o, uint64_t p,
2773 uint64_t data_offset,
2774 direction_t direction,
2775 Object **ret, uint64_t *offset) {
2776
2777 uint64_t i, n, ofs;
2778 Object *d;
2779 int r;
2780
2781 assert(f);
2782 assert(p > 0 || !o);
2783
2784 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2785 if (r < 0)
2786 return r;
2787
2788 n = le64toh(d->data.n_entries);
2789 if (n <= 0)
2790 return n;
2791
2792 if (!o)
2793 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2794 else {
2795 if (o->object.type != OBJECT_ENTRY)
2796 return -EINVAL;
2797
2798 r = generic_array_bisect_plus_one(f,
2799 le64toh(d->data.entry_offset),
2800 le64toh(d->data.entry_array_offset),
2801 le64toh(d->data.n_entries),
2802 p,
2803 test_object_offset,
2804 DIRECTION_DOWN,
2805 NULL, NULL,
2806 &i);
2807
2808 if (r <= 0)
2809 return r;
2810
2811 r = bump_array_index(&i, direction, n);
2812 if (r <= 0)
2813 return r;
2814 }
2815
2816 for (;;) {
2817 r = generic_array_get_plus_one(f,
2818 le64toh(d->data.entry_offset),
2819 le64toh(d->data.entry_array_offset),
2820 i,
2821 ret, &ofs);
2822 if (r > 0)
2823 break;
2824 if (r != -EBADMSG)
2825 return r;
2826
2827 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2828
2829 r = bump_array_index(&i, direction, n);
2830 if (r <= 0)
2831 return r;
2832 }
2833
2834 /* Ensure our array is properly ordered. */
2835 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2836 log_debug("%s: data entry array not properly ordered at entry %" PRIu64, f->path, i);
2837 return -EBADMSG;
2838 }
2839
2840 if (offset)
2841 *offset = ofs;
2842
2843 return 1;
2844 }
2845
2846 int journal_file_move_to_entry_by_offset_for_data(
2847 JournalFile *f,
2848 uint64_t data_offset,
2849 uint64_t p,
2850 direction_t direction,
2851 Object **ret, uint64_t *offset) {
2852
2853 int r;
2854 Object *d;
2855
2856 assert(f);
2857
2858 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2859 if (r < 0)
2860 return r;
2861
2862 return generic_array_bisect_plus_one(f,
2863 le64toh(d->data.entry_offset),
2864 le64toh(d->data.entry_array_offset),
2865 le64toh(d->data.n_entries),
2866 p,
2867 test_object_offset,
2868 direction,
2869 ret, offset, NULL);
2870 }
2871
2872 int journal_file_move_to_entry_by_monotonic_for_data(
2873 JournalFile *f,
2874 uint64_t data_offset,
2875 sd_id128_t boot_id,
2876 uint64_t monotonic,
2877 direction_t direction,
2878 Object **ret, uint64_t *offset) {
2879
2880 Object *o, *d;
2881 int r;
2882 uint64_t b, z;
2883
2884 assert(f);
2885
2886 /* First, seek by time */
2887 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2888 if (r < 0)
2889 return r;
2890 if (r == 0)
2891 return -ENOENT;
2892
2893 r = generic_array_bisect_plus_one(f,
2894 le64toh(o->data.entry_offset),
2895 le64toh(o->data.entry_array_offset),
2896 le64toh(o->data.n_entries),
2897 monotonic,
2898 test_object_monotonic,
2899 direction,
2900 NULL, &z, NULL);
2901 if (r <= 0)
2902 return r;
2903
2904 /* And now, continue seeking until we find an entry that
2905 * exists in both bisection arrays */
2906
2907 for (;;) {
2908 Object *qo;
2909 uint64_t p, q;
2910
2911 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2912 if (r < 0)
2913 return r;
2914
2915 r = generic_array_bisect_plus_one(f,
2916 le64toh(d->data.entry_offset),
2917 le64toh(d->data.entry_array_offset),
2918 le64toh(d->data.n_entries),
2919 z,
2920 test_object_offset,
2921 direction,
2922 NULL, &p, NULL);
2923 if (r <= 0)
2924 return r;
2925
2926 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2927 if (r < 0)
2928 return r;
2929
2930 r = generic_array_bisect_plus_one(f,
2931 le64toh(o->data.entry_offset),
2932 le64toh(o->data.entry_array_offset),
2933 le64toh(o->data.n_entries),
2934 p,
2935 test_object_offset,
2936 direction,
2937 &qo, &q, NULL);
2938
2939 if (r <= 0)
2940 return r;
2941
2942 if (p == q) {
2943 if (ret)
2944 *ret = qo;
2945 if (offset)
2946 *offset = q;
2947
2948 return 1;
2949 }
2950
2951 z = q;
2952 }
2953 }
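
/* Illustrative sketch, not part of the original source: the convergence idea of
 * the loop above, reduced to two sorted arrays in memory. Starting from a seed,
 * alternately seek to the first element >= the current candidate in either
 * array; once both seeks land on the same value, that value is the first one
 * the arrays have in common at or after the seed. Linear scans stand in for the
 * bisections used above; all names are hypothetical. */
static int example_first_common(
                const uint64_t *a, uint64_t n_a,
                const uint64_t *b, uint64_t n_b,
                uint64_t seed,
                uint64_t *ret) {

        uint64_t x = seed, i, j;

        for (;;) {
                /* First element >= x in a */
                for (i = 0; i < n_a && a[i] < x; i++)
                        ;
                if (i >= n_a)
                        return 0;
                x = a[i];

                /* First element >= x in b */
                for (j = 0; j < n_b && b[j] < x; j++)
                        ;
                if (j >= n_b)
                        return 0;

                if (b[j] == x) {
                        *ret = x;
                        return 1;        /* both arrays contain x */
                }

                x = b[j];                /* overshot, retry from here */
        }
}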
2954
2955 int journal_file_move_to_entry_by_seqnum_for_data(
2956 JournalFile *f,
2957 uint64_t data_offset,
2958 uint64_t seqnum,
2959 direction_t direction,
2960 Object **ret, uint64_t *offset) {
2961
2962 Object *d;
2963 int r;
2964
2965 assert(f);
2966
2967 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2968 if (r < 0)
2969 return r;
2970
2971 return generic_array_bisect_plus_one(f,
2972 le64toh(d->data.entry_offset),
2973 le64toh(d->data.entry_array_offset),
2974 le64toh(d->data.n_entries),
2975 seqnum,
2976 test_object_seqnum,
2977 direction,
2978 ret, offset, NULL);
2979 }
2980
2981 int journal_file_move_to_entry_by_realtime_for_data(
2982 JournalFile *f,
2983 uint64_t data_offset,
2984 uint64_t realtime,
2985 direction_t direction,
2986 Object **ret, uint64_t *offset) {
2987
2988 Object *d;
2989 int r;
2990
2991 assert(f);
2992
2993 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2994 if (r < 0)
2995 return r;
2996
2997 return generic_array_bisect_plus_one(f,
2998 le64toh(d->data.entry_offset),
2999 le64toh(d->data.entry_array_offset),
3000 le64toh(d->data.n_entries),
3001 realtime,
3002 test_object_realtime,
3003 direction,
3004 ret, offset, NULL);
3005 }
3006
3007 void journal_file_dump(JournalFile *f) {
3008 Object *o;
3009 int r;
3010 uint64_t p;
3011
3012 assert(f);
3013 assert(f->header);
3014
3015 journal_file_print_header(f);
3016
3017 p = le64toh(f->header->header_size);
3018 while (p != 0) {
3019 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
3020 if (r < 0)
3021 goto fail;
3022
3023 switch (o->object.type) {
3024
3025 case OBJECT_UNUSED:
3026 printf("Type: OBJECT_UNUSED\n");
3027 break;
3028
3029 case OBJECT_DATA:
3030 printf("Type: OBJECT_DATA\n");
3031 break;
3032
3033 case OBJECT_FIELD:
3034 printf("Type: OBJECT_FIELD\n");
3035 break;
3036
3037 case OBJECT_ENTRY:
3038 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3039 le64toh(o->entry.seqnum),
3040 le64toh(o->entry.monotonic),
3041 le64toh(o->entry.realtime));
3042 break;
3043
3044 case OBJECT_FIELD_HASH_TABLE:
3045 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3046 break;
3047
3048 case OBJECT_DATA_HASH_TABLE:
3049 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3050 break;
3051
3052 case OBJECT_ENTRY_ARRAY:
3053 printf("Type: OBJECT_ENTRY_ARRAY\n");
3054 break;
3055
3056 case OBJECT_TAG:
3057 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3058 le64toh(o->tag.seqnum),
3059 le64toh(o->tag.epoch));
3060 break;
3061
3062 default:
3063 printf("Type: unknown (%i)\n", o->object.type);
3064 break;
3065 }
3066
3067 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3068 printf("Flags: %s\n",
3069 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3070
3071 if (p == le64toh(f->header->tail_object_offset))
3072 p = 0;
3073 else
3074 p = p + ALIGN64(le64toh(o->object.size));
3075 }
3076
3077 return;
3078 fail:
3079 log_error("File corrupt");
3080 }
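
/* Illustrative sketch, not part of the original source: the stepping rule of
 * the dump loop above. Objects are stored back to back and 8-byte aligned, so
 * starting at header_size the offset of the next object is the current offset
 * plus the current object's size rounded up with ALIGN64(); the walk ends once
 * the object at tail_object_offset has been visited. The helper name is
 * hypothetical. */
static uint64_t example_next_object_offset(uint64_t p, uint64_t object_size) {
        return p + ALIGN64(object_size);
}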
3081
3082 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3083 const char *x;
3084
3085 x = format_timestamp(buf, l, t);
3086 if (x)
3087 return x;
3088 return " --- ";
3089 }
3090
3091 void journal_file_print_header(JournalFile *f) {
3092 char a[33], b[33], c[33], d[33];
3093 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
3094 struct stat st;
3095 char bytes[FORMAT_BYTES_MAX];
3096
3097 assert(f);
3098 assert(f->header);
3099
3100 printf("File Path: %s\n"
3101 "File ID: %s\n"
3102 "Machine ID: %s\n"
3103 "Boot ID: %s\n"
3104 "Sequential Number ID: %s\n"
3105 "State: %s\n"
3106 "Compatible Flags:%s%s\n"
3107 "Incompatible Flags:%s%s%s\n"
3108 "Header size: %"PRIu64"\n"
3109 "Arena size: %"PRIu64"\n"
3110 "Data Hash Table Size: %"PRIu64"\n"
3111 "Field Hash Table Size: %"PRIu64"\n"
3112 "Rotate Suggested: %s\n"
3113 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3114 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3115 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3116 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3117 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
3118 "Objects: %"PRIu64"\n"
3119 "Entry Objects: %"PRIu64"\n",
3120 f->path,
3121 sd_id128_to_string(f->header->file_id, a),
3122 sd_id128_to_string(f->header->machine_id, b),
3123 sd_id128_to_string(f->header->boot_id, c),
3124 sd_id128_to_string(f->header->seqnum_id, d),
3125 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3126 f->header->state == STATE_ONLINE ? "ONLINE" :
3127 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3128 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3129 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3130 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3131 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3132 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3133 le64toh(f->header->header_size),
3134 le64toh(f->header->arena_size),
3135 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3136 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3137 yes_no(journal_file_rotate_suggested(f, 0)),
3138 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3139 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3140 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3141 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3142 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3143 le64toh(f->header->n_objects),
3144 le64toh(f->header->n_entries));
3145
3146 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3147 printf("Data Objects: %"PRIu64"\n"
3148 "Data Hash Table Fill: %.1f%%\n",
3149 le64toh(f->header->n_data),
3150 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3151
3152 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3153 printf("Field Objects: %"PRIu64"\n"
3154 "Field Hash Table Fill: %.1f%%\n",
3155 le64toh(f->header->n_fields),
3156 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3157
3158 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3159 printf("Tag Objects: %"PRIu64"\n",
3160 le64toh(f->header->n_tags));
3161 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3162 printf("Entry Array Objects: %"PRIu64"\n",
3163 le64toh(f->header->n_entry_arrays));
3164
3165 if (fstat(f->fd, &st) >= 0)
3166 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
3167 }
3168
3169 static int journal_file_warn_btrfs(JournalFile *f) {
3170 unsigned attrs;
3171 int r;
3172
3173 assert(f);
3174
3175 /* Before we write anything, check if the COW logic is turned
3176 * off on btrfs. Given that our write pattern is quite
3177 * unfriendly to COW file systems, turning COW off should
3178 * greatly improve performance on such file systems, at the
3179 * expense of data integrity features (which shouldn't be too
3180 * bad, given that we do our own checksumming). */
3181
3182 r = btrfs_is_filesystem(f->fd);
3183 if (r < 0)
3184 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3185 if (!r)
3186 return 0;
3187
3188 r = read_attr_fd(f->fd, &attrs);
3189 if (r < 0)
3190 return log_warning_errno(r, "Failed to read file attributes: %m");
3191
3192 if (attrs & FS_NOCOW_FL) {
3193 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3194 return 0;
3195 }
3196
3197 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3198 "This is likely to slow down journal access substantially, please consider turning "
3199 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3200
3201 return 1;
3202 }
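
/* Illustrative sketch, not part of the original source: what "chattr +C"
 * amounts to at the API level, i.e. setting FS_NOCOW_FL, which only sticks on
 * empty files or on a directory (so that newly created files inherit it). This
 * roughly mirrors what chattr_path()/read_attr_fd() do internally; it is shown
 * for illustration only and assumes <sys/ioctl.h> is available. */
static int example_disable_cow(int fd) {
        unsigned attrs;

        if (ioctl(fd, FS_IOC_GETFLAGS, &attrs) < 0)
                return -errno;

        if (attrs & FS_NOCOW_FL)
                return 0;                /* already set */

        attrs |= FS_NOCOW_FL;

        if (ioctl(fd, FS_IOC_SETFLAGS, &attrs) < 0)
                return -errno;

        return 1;
}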
3203
3204 int journal_file_open(
3205 int fd,
3206 const char *fname,
3207 int flags,
3208 mode_t mode,
3209 bool compress,
3210 uint64_t compress_threshold_bytes,
3211 bool seal,
3212 JournalMetrics *metrics,
3213 MMapCache *mmap_cache,
3214 Set *deferred_closes,
3215 JournalFile *template,
3216 JournalFile **ret) {
3217
3218 bool newly_created = false;
3219 JournalFile *f;
3220 void *h;
3221 int r;
3222 char bytes[FORMAT_BYTES_MAX];
3223
3224 assert(ret);
3225 assert(fd >= 0 || fname);
3226
3227 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
3228 return -EINVAL;
3229
3230 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3231 return -EINVAL;
3232
3233 f = new0(JournalFile, 1);
3234 if (!f)
3235 return -ENOMEM;
3236
3237 f->fd = fd;
3238 f->mode = mode;
3239
3240 f->flags = flags;
3241 f->prot = prot_from_flags(flags);
3242 f->writable = (flags & O_ACCMODE) != O_RDONLY;
3243 #if HAVE_LZ4
3244 f->compress_lz4 = compress;
3245 #elif HAVE_XZ
3246 f->compress_xz = compress;
3247 #endif
3248
3249 if (compress_threshold_bytes == (uint64_t) -1)
3250 f->compress_threshold_bytes = DEFAULT_COMPRESS_THRESHOLD;
3251 else
3252 f->compress_threshold_bytes = MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes);
3253
3254 #if HAVE_GCRYPT
3255 f->seal = seal;
3256 #endif
3257
3258 log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3259 yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
3260 format_bytes(bytes, sizeof(bytes), f->compress_threshold_bytes));
3261
3262 if (mmap_cache)
3263 f->mmap = mmap_cache_ref(mmap_cache);
3264 else {
3265 f->mmap = mmap_cache_new();
3266 if (!f->mmap) {
3267 r = -ENOMEM;
3268 goto fail;
3269 }
3270 }
3271
3272 if (fname) {
3273 f->path = strdup(fname);
3274 if (!f->path) {
3275 r = -ENOMEM;
3276 goto fail;
3277 }
3278 } else {
3279 assert(fd >= 0);
3280
3281 /* If we don't know the path, fill in something explanatory and vaguely useful */
3282 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3283 r = -ENOMEM;
3284 goto fail;
3285 }
3286 }
3287
3288 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3289 if (!f->chain_cache) {
3290 r = -ENOMEM;
3291 goto fail;
3292 }
3293
3294 if (f->fd < 0) {
3295 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3296 * or so, we rather fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3297 * it doesn't hurt in that case. */
3298
3299 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
3300 if (f->fd < 0) {
3301 r = -errno;
3302 goto fail;
3303 }
3304
3305 /* fds we opened here by us should also be closed by us. */
3306 f->close_fd = true;
3307
3308 r = fd_nonblock(f->fd, false);
3309 if (r < 0)
3310 goto fail;
3311 }
3312
3313 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3314 if (!f->cache_fd) {
3315 r = -ENOMEM;
3316 goto fail;
3317 }
3318
3319 r = journal_file_fstat(f);
3320 if (r < 0)
3321 goto fail;
3322
3323 if (f->last_stat.st_size == 0 && f->writable) {
3324
3325 (void) journal_file_warn_btrfs(f);
3326
3327 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3328 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3329 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3330 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3331 * solely on mtime/atime/ctime of the file. */
3332 (void) fd_setcrtime(f->fd, 0);
3333
3334 #if HAVE_GCRYPT
3335 /* Try to load the FSPRG state, and if we can't, then
3336 * just don't do sealing */
3337 if (f->seal) {
3338 r = journal_file_fss_load(f);
3339 if (r < 0)
3340 f->seal = false;
3341 }
3342 #endif
3343
3344 r = journal_file_init_header(f, template);
3345 if (r < 0)
3346 goto fail;
3347
3348 r = journal_file_fstat(f);
3349 if (r < 0)
3350 goto fail;
3351
3352 newly_created = true;
3353 }
3354
3355 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3356 r = -ENODATA;
3357 goto fail;
3358 }
3359
3360 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
3361 if (r < 0)
3362 goto fail;
3363
3364 f->header = h;
3365
3366 if (!newly_created) {
3367 set_clear_with_destructor(deferred_closes, journal_file_close);
3368
3369 r = journal_file_verify_header(f);
3370 if (r < 0)
3371 goto fail;
3372 }
3373
3374 #if HAVE_GCRYPT
3375 if (!newly_created && f->writable) {
3376 r = journal_file_fss_load(f);
3377 if (r < 0)
3378 goto fail;
3379 }
3380 #endif
3381
3382 if (f->writable) {
3383 if (metrics) {
3384 journal_default_metrics(metrics, f->fd);
3385 f->metrics = *metrics;
3386 } else if (template)
3387 f->metrics = template->metrics;
3388
3389 r = journal_file_refresh_header(f);
3390 if (r < 0)
3391 goto fail;
3392 }
3393
3394 #if HAVE_GCRYPT
3395 r = journal_file_hmac_setup(f);
3396 if (r < 0)
3397 goto fail;
3398 #endif
3399
3400 if (newly_created) {
3401 r = journal_file_setup_field_hash_table(f);
3402 if (r < 0)
3403 goto fail;
3404
3405 r = journal_file_setup_data_hash_table(f);
3406 if (r < 0)
3407 goto fail;
3408
3409 #if HAVE_GCRYPT
3410 r = journal_file_append_first_tag(f);
3411 if (r < 0)
3412 goto fail;
3413 #endif
3414 }
3415
3416 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
3417 r = -EIO;
3418 goto fail;
3419 }
3420
3421 if (template && template->post_change_timer) {
3422 r = journal_file_enable_post_change_timer(
3423 f,
3424 sd_event_source_get_event(template->post_change_timer),
3425 template->post_change_timer_period);
3426
3427 if (r < 0)
3428 goto fail;
3429 }
3430
3431 /* The file is now successfully opened, thus we take possession of any passed-in fd. */
3432 f->close_fd = true;
3433
3434 *ret = f;
3435 return 0;
3436
3437 fail:
3438 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
3439 r = -EIO;
3440
3441 (void) journal_file_close(f);
3442
3443 return r;
3444 }
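
/* Illustrative usage sketch, not part of the original source: opening (or
 * creating) a journal file for writing with the defaults, i.e. no pre-opened
 * fd, the default compression threshold, no sealing, no shared mmap cache, no
 * deferred closes and no template. The path and function name are arbitrary
 * examples. */
static int example_open_for_writing(JournalFile **ret) {
        return journal_file_open(
                        -1,                                  /* open by name, not by fd */
                        "/var/log/journal/example.journal",
                        O_RDWR|O_CREAT,
                        0640,
                        true,                                /* compress */
                        (uint64_t) -1,                       /* default compress threshold */
                        false,                               /* seal */
                        NULL,                                /* metrics */
                        NULL,                                /* mmap_cache */
                        NULL,                                /* deferred_closes */
                        NULL,                                /* template */
                        ret);
}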
3445
3446 int journal_file_rotate(JournalFile **f, bool compress, uint64_t compress_threshold_bytes, bool seal, Set *deferred_closes) {
3447 _cleanup_free_ char *p = NULL;
3448 size_t l;
3449 JournalFile *old_file, *new_file = NULL;
3450 int r;
3451
3452 assert(f);
3453 assert(*f);
3454
3455 old_file = *f;
3456
3457 if (!old_file->writable)
3458 return -EINVAL;
3459
3460 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3461 * rotation, since we don't know the actual path, and hence couldn't rename the file. */
3462 if (path_startswith(old_file->path, "/proc/self/fd"))
3463 return -EINVAL;
3464
3465 if (!endswith(old_file->path, ".journal"))
3466 return -EINVAL;
3467
3468 l = strlen(old_file->path);
3469 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3470 (int) l - 8, old_file->path,
3471 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3472 le64toh((*f)->header->head_entry_seqnum),
3473 le64toh((*f)->header->head_entry_realtime));
3474 if (r < 0)
3475 return -ENOMEM;
3476
3477 /* Try to rename the file to the archived version. If the file
3478 * already was deleted, we'll get ENOENT, let's ignore that
3479 * case. */
3480 r = rename(old_file->path, p);
3481 if (r < 0 && errno != ENOENT)
3482 return -errno;
3483
3484 /* Sync the rename to disk */
3485 (void) fsync_directory_of_file(old_file->fd);
3486
3487 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3488 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3489 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3490 * would result in the rotated journal never getting fsync() called before closing.
3491 * Now we simply queue the archive state by setting an archive bit, leaving the state
3492 * as STATE_ONLINE so proper offlining occurs. */
3493 old_file->archive = true;
3494
3495 /* Currently, btrfs is not very good with our write patterns
3496 * and fragments heavily. Let's defrag our journal files when
3497 * we archive them. */
3498 old_file->defrag_on_close = true;
3499
3500 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress,
3501 compress_threshold_bytes, seal, NULL, old_file->mmap, deferred_closes,
3502 old_file, &new_file);
3503
3504 if (deferred_closes &&
3505 set_put(deferred_closes, old_file) >= 0)
3506 (void) journal_file_set_offline(old_file, false);
3507 else
3508 (void) journal_file_close(old_file);
3509
3510 *f = new_file;
3511 return r;
3512 }
3513
3514 int journal_file_open_reliably(
3515 const char *fname,
3516 int flags,
3517 mode_t mode,
3518 bool compress,
3519 uint64_t compress_threshold_bytes,
3520 bool seal,
3521 JournalMetrics *metrics,
3522 MMapCache *mmap_cache,
3523 Set *deferred_closes,
3524 JournalFile *template,
3525 JournalFile **ret) {
3526
3527 int r;
3528 size_t l;
3529 _cleanup_free_ char *p = NULL;
3530
3531 r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3532 deferred_closes, template, ret);
3533 if (!IN_SET(r,
3534 -EBADMSG, /* Corrupted */
3535 -ENODATA, /* Truncated */
3536 -EHOSTDOWN, /* Other machine */
3537 -EPROTONOSUPPORT, /* Incompatible feature */
3538 -EBUSY, /* Unclean shutdown */
3539 -ESHUTDOWN, /* Already archived */
3540 -EIO, /* IO error, including SIGBUS on mmap */
3541 -EIDRM, /* File has been deleted */
3542 -ETXTBSY)) /* File is from the future */
3543 return r;
3544
3545 if ((flags & O_ACCMODE) == O_RDONLY)
3546 return r;
3547
3548 if (!(flags & O_CREAT))
3549 return r;
3550
3551 if (!endswith(fname, ".journal"))
3552 return r;
3553
3554 /* The file is corrupted. Rotate it away and try it again (but only once) */
3555
3556 l = strlen(fname);
3557 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
3558 (int) l - 8, fname,
3559 now(CLOCK_REALTIME),
3560 random_u64()) < 0)
3561 return -ENOMEM;
3562
3563 if (rename(fname, p) < 0)
3564 return -errno;
3565
3566 /* btrfs doesn't cope well with our write pattern and
3567 * fragments heavily. Let's defrag all files we rotate */
3568
3569 (void) chattr_path(p, 0, FS_NOCOW_FL);
3570 (void) btrfs_defrag(p);
3571
3572 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3573
3574 return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3575 deferred_closes, template, ret);
3576 }
3577
3578 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3579 uint64_t i, n;
3580 uint64_t q, xor_hash = 0;
3581 int r;
3582 EntryItem *items;
3583 dual_timestamp ts;
3584
3585 assert(from);
3586 assert(to);
3587 assert(o);
3588 assert(p);
3589
3590 if (!to->writable)
3591 return -EPERM;
3592
3593 ts.monotonic = le64toh(o->entry.monotonic);
3594 ts.realtime = le64toh(o->entry.realtime);
3595
3596 n = journal_file_entry_n_items(o);
3597 /* alloca() can't take 0, hence let's allocate at least one */
3598 items = alloca(sizeof(EntryItem) * MAX(1u, n));
3599
3600 for (i = 0; i < n; i++) {
3601 uint64_t l, h;
3602 le64_t le_hash;
3603 size_t t;
3604 void *data;
3605 Object *u;
3606
3607 q = le64toh(o->entry.items[i].object_offset);
3608 le_hash = o->entry.items[i].hash;
3609
3610 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3611 if (r < 0)
3612 return r;
3613
3614 if (le_hash != o->data.hash)
3615 return -EBADMSG;
3616
3617 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3618 t = (size_t) l;
3619
3620 /* We hit the limit on 32bit machines */
3621 if ((uint64_t) t != l)
3622 return -E2BIG;
3623
3624 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3625 #if HAVE_XZ || HAVE_LZ4
3626 size_t rsize = 0;
3627
3628 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3629 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3630 if (r < 0)
3631 return r;
3632
3633 data = from->compress_buffer;
3634 l = rsize;
3635 #else
3636 return -EPROTONOSUPPORT;
3637 #endif
3638 } else
3639 data = o->data.payload;
3640
3641 r = journal_file_append_data(to, data, l, &u, &h);
3642 if (r < 0)
3643 return r;
3644
3645 xor_hash ^= le64toh(u->data.hash);
3646 items[i].object_offset = htole64(h);
3647 items[i].hash = u->data.hash;
3648
3649 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3650 if (r < 0)
3651 return r;
3652 }
3653
3654 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3655
3656 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3657 return -EIO;
3658
3659 return r;
3660 }
3661
3662 void journal_reset_metrics(JournalMetrics *m) {
3663 assert(m);
3664
3665 /* Set everything to "pick automatic values". */
3666
3667 *m = (JournalMetrics) {
3668 .min_use = (uint64_t) -1,
3669 .max_use = (uint64_t) -1,
3670 .min_size = (uint64_t) -1,
3671 .max_size = (uint64_t) -1,
3672 .keep_free = (uint64_t) -1,
3673 .n_max_files = (uint64_t) -1,
3674 };
3675 }
3676
3677 void journal_default_metrics(JournalMetrics *m, int fd) {
3678 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3679 struct statvfs ss;
3680 uint64_t fs_size;
3681
3682 assert(m);
3683 assert(fd >= 0);
3684
3685 if (fstatvfs(fd, &ss) >= 0)
3686 fs_size = ss.f_frsize * ss.f_blocks;
3687 else {
3688 log_debug_errno(errno, "Failed to determine disk size: %m");
3689 fs_size = 0;
3690 }
3691
3692 if (m->max_use == (uint64_t) -1) {
3693
3694 if (fs_size > 0) {
3695 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3696
3697 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3698 m->max_use = DEFAULT_MAX_USE_UPPER;
3699
3700 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3701 m->max_use = DEFAULT_MAX_USE_LOWER;
3702 } else
3703 m->max_use = DEFAULT_MAX_USE_LOWER;
3704 } else {
3705 m->max_use = PAGE_ALIGN(m->max_use);
3706
3707 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3708 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3709 }
3710
3711 if (m->min_use == (uint64_t) -1)
3712 m->min_use = DEFAULT_MIN_USE;
3713
3714 if (m->min_use > m->max_use)
3715 m->min_use = m->max_use;
3716
3717 if (m->max_size == (uint64_t) -1) {
3718 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3719
3720 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3721 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3722 } else
3723 m->max_size = PAGE_ALIGN(m->max_size);
3724
3725 if (m->max_size != 0) {
3726 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3727 m->max_size = JOURNAL_FILE_SIZE_MIN;
3728
3729 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3730 m->max_use = m->max_size*2;
3731 }
3732
3733 if (m->min_size == (uint64_t) -1)
3734 m->min_size = JOURNAL_FILE_SIZE_MIN;
3735 else {
3736 m->min_size = PAGE_ALIGN(m->min_size);
3737
3738 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3739 m->min_size = JOURNAL_FILE_SIZE_MIN;
3740
3741 if (m->max_size != 0 && m->min_size > m->max_size)
3742 m->max_size = m->min_size;
3743 }
3744
3745 if (m->keep_free == (uint64_t) -1) {
3746
3747 if (fs_size > 0) {
3748 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3749
3750 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3751 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3752
3753 } else
3754 m->keep_free = DEFAULT_KEEP_FREE;
3755 }
3756
3757 if (m->n_max_files == (uint64_t) -1)
3758 m->n_max_files = DEFAULT_N_MAX_FILES;
3759
3760 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3761 format_bytes(a, sizeof(a), m->min_use),
3762 format_bytes(b, sizeof(b), m->max_use),
3763 format_bytes(c, sizeof(c), m->max_size),
3764 format_bytes(d, sizeof(d), m->min_size),
3765 format_bytes(e, sizeof(e), m->keep_free),
3766 m->n_max_files);
3767 }
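
/* Illustrative usage sketch, not part of the original source: requesting fully
 * automatic limits by resetting all metrics to "pick automatic values" and then
 * letting journal_default_metrics() derive concrete numbers from the file
 * system the given fd lives on. journal_file_open() above similarly calls
 * journal_default_metrics() on caller-supplied metrics. Names are arbitrary. */
static void example_automatic_metrics(int journal_fd, JournalMetrics *ret) {
        journal_reset_metrics(ret);
        journal_default_metrics(ret, journal_fd);
}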
3768
3769 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3770 assert(f);
3771 assert(f->header);
3772 assert(from || to);
3773
3774 if (from) {
3775 if (f->header->head_entry_realtime == 0)
3776 return -ENOENT;
3777
3778 *from = le64toh(f->header->head_entry_realtime);
3779 }
3780
3781 if (to) {
3782 if (f->header->tail_entry_realtime == 0)
3783 return -ENOENT;
3784
3785 *to = le64toh(f->header->tail_entry_realtime);
3786 }
3787
3788 return 1;
3789 }
3790
3791 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3792 Object *o;
3793 uint64_t p;
3794 int r;
3795
3796 assert(f);
3797 assert(from || to);
3798
3799 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3800 if (r <= 0)
3801 return r;
3802
3803 if (le64toh(o->data.n_entries) <= 0)
3804 return 0;
3805
3806 if (from) {
3807 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3808 if (r < 0)
3809 return r;
3810
3811 *from = le64toh(o->entry.monotonic);
3812 }
3813
3814 if (to) {
3815 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3816 if (r < 0)
3817 return r;
3818
3819 r = generic_array_get_plus_one(f,
3820 le64toh(o->data.entry_offset),
3821 le64toh(o->data.entry_array_offset),
3822 le64toh(o->data.n_entries)-1,
3823 &o, NULL);
3824 if (r <= 0)
3825 return r;
3826
3827 *to = le64toh(o->entry.monotonic);
3828 }
3829
3830 return 1;
3831 }
3832
3833 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3834 assert(f);
3835 assert(f->header);
3836
3837 /* If we gained new header fields we gained new features,
3838 * hence suggest a rotation */
3839 if (le64toh(f->header->header_size) < sizeof(Header)) {
3840 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3841 return true;
3842 }
3843
3844 /* Let's check if the hash tables grew over a certain fill
3845 * level (75%, borrowing this value from Java's hash table
3846 * implementation), and if so suggest a rotation. To calculate
3847 * the fill level we need the n_data field, which only exists
3848 * in newer versions. */
3849
3850 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3851 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3852 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3853 f->path,
3854 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3855 le64toh(f->header->n_data),
3856 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3857 (unsigned long long) f->last_stat.st_size,
3858 f->last_stat.st_size / le64toh(f->header->n_data));
3859 return true;
3860 }
3861
3862 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3863 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3864 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3865 f->path,
3866 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3867 le64toh(f->header->n_fields),
3868 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3869 return true;
3870 }
3871
3872 /* Are the data objects properly indexed by field objects? */
3873 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3874 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3875 le64toh(f->header->n_data) > 0 &&
3876 le64toh(f->header->n_fields) == 0)
3877 return true;
3878
3879 if (max_file_usec > 0) {
3880 usec_t t, h;
3881
3882 h = le64toh(f->header->head_entry_realtime);
3883 t = now(CLOCK_REALTIME);
3884
3885 if (h > 0 && t > h + max_file_usec)
3886 return true;
3887 }
3888
3889 return false;
3890 }
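
/* Illustrative sketch, not part of the original source: the 75% fill-level test
 * used above, written out for a plain pair of counters. Comparing n * 4 against
 * buckets * 3 avoids floating point: n * 4 > buckets * 3 holds exactly when
 * n / buckets > 3 / 4. The helper name is hypothetical. */
static bool example_hash_table_too_full(uint64_t n_items, uint64_t n_buckets) {
        return n_items * 4 > n_buckets * 3;
}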