]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
Merge pull request #8461 from keszybz/oss-fuzz-fixes
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <linux/fs.h>
24 #include <pthread.h>
25 #include <stddef.h>
26 #include <sys/mman.h>
27 #include <sys/statvfs.h>
28 #include <sys/uio.h>
29 #include <unistd.h>
30
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
34 #include "compress.h"
35 #include "fd-util.h"
36 #include "fs-util.h"
37 #include "journal-authenticate.h"
38 #include "journal-def.h"
39 #include "journal-file.h"
40 #include "lookup3.h"
41 #include "parse-util.h"
42 #include "path-util.h"
43 #include "random-util.h"
44 #include "sd-event.h"
45 #include "set.h"
46 #include "stat-util.h"
47 #include "string-util.h"
48 #include "strv.h"
49 #include "xattr-util.h"
50
/* Default sizes, in bytes, of the data and field hash tables: room for 2047 and 333 HashItem slots
 * respectively. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* NOTE(review): presumably the payload size in bytes from which on compression is attempted; the user of
 * this constant is outside this part of the file -- confirm. */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header: we pick the one just above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX

/* Under clang, silence the diagnostic about taking the address of a packed struct member. */
#ifdef __clang__
# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
99
/* Worker that takes the journal file offline. It is driven entirely by f->offline_state and keeps
 * looping until that state reaches a value it returns on.
 *
 * This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
 * As a result we use atomic operations on f->offline_state for inter-thread communications with
 * journal_file_set_offline() and journal_file_set_online().
 *
 * Every transition is done with a compare-and-swap; when the swap fails the state was changed by the
 * other side in the meantime, so we "continue" to re-read it and dispatch again. */
static void journal_file_set_offline_internal(JournalFile *f) {
        assert(f);
        assert(f->fd >= 0);
        assert(f->header);

        for (;;) {
                switch (f->offline_state) {
                case OFFLINE_CANCEL:
                        /* A cancel was requested: acknowledge it by moving to DONE and stop. */
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
                                continue;
                        return;

                case OFFLINE_AGAIN_FROM_SYNCING:
                        /* A restart was requested: go back to SYNCING and run the loop again. */
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
                                continue;
                        break;

                case OFFLINE_AGAIN_FROM_OFFLINING:
                        /* Same as above, a restart always begins at SYNCING. */
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
                                continue;
                        break;

                case OFFLINE_SYNCING:
                        /* First flush what has been written so far (errors are deliberately ignored). */
                        (void) fsync(f->fd);

                        /* Only if nobody interfered while we were syncing do we go on to change the header. */
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
                                continue;

                        /* Mark the file archived or offline in its header, and flush that too. */
                        f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
                        (void) fsync(f->fd);
                        break;

                case OFFLINE_OFFLINING:
                        /* The header was updated; finish up unless the state was changed again. */
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
                                continue;
                        _fallthrough_;
                case OFFLINE_DONE:
                        return;

                case OFFLINE_JOINED:
                        /* This function is not expected to run in this state; only log and bail out. */
                        log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
                        return;
                }
        }
}
148
149 static void * journal_file_set_offline_thread(void *arg) {
150 JournalFile *f = arg;
151
152 (void) pthread_setname_np(pthread_self(), "journal-offline");
153
154 journal_file_set_offline_internal(f);
155
156 return NULL;
157 }
158
159 static int journal_file_set_offline_thread_join(JournalFile *f) {
160 int r;
161
162 assert(f);
163
164 if (f->offline_state == OFFLINE_JOINED)
165 return 0;
166
167 r = pthread_join(f->offline_thread, NULL);
168 if (r)
169 return -r;
170
171 f->offline_state = OFFLINE_JOINED;
172
173 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
174 return -EIO;
175
176 return 0;
177 }
178
/* Trigger a restart if the offline thread is mid-flight in a restartable state.
 *
 * Returns true if a restart is now pending (either one already was, or we just requested one), and
 * false if the state is none of the ones handled below (e.g. OFFLINE_DONE or OFFLINE_JOINED), in which
 * case nothing is changed. A failed compare-and-swap means the state changed concurrently, so it is
 * re-read and dispatched again. */
static bool journal_file_set_offline_try_restart(JournalFile *f) {
        for (;;) {
                switch (f->offline_state) {
                case OFFLINE_AGAIN_FROM_SYNCING:
                case OFFLINE_AGAIN_FROM_OFFLINING:
                        /* A restart has been requested already, nothing to do. */
                        return true;

                case OFFLINE_CANCEL:
                        /* Turn a pending cancel into a restart. */
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
                                continue;
                        return true;

                case OFFLINE_SYNCING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
                                continue;
                        return true;

                case OFFLINE_OFFLINING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
                                continue;
                        return true;

                default:
                        return false;
                }
        }
}
207
208 /* Sets a journal offline.
209 *
210 * If wait is false then an offline is dispatched in a separate thread for a
211 * subsequent journal_file_set_offline() or journal_file_set_online() of the
212 * same journal to synchronize with.
213 *
214 * If wait is true, then either an existing offline thread will be restarted
215 * and joined, or if none exists the offline is simply performed in this
216 * context without involving another thread.
217 */
218 int journal_file_set_offline(JournalFile *f, bool wait) {
219 bool restarted;
220 int r;
221
222 assert(f);
223
224 if (!f->writable)
225 return -EPERM;
226
227 if (!(f->fd >= 0 && f->header))
228 return -EINVAL;
229
230 /* An offlining journal is implicitly online and may modify f->header->state,
231 * we must also join any potentially lingering offline thread when not online. */
232 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
233 return journal_file_set_offline_thread_join(f);
234
235 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
236 restarted = journal_file_set_offline_try_restart(f);
237 if ((restarted && wait) || !restarted) {
238 r = journal_file_set_offline_thread_join(f);
239 if (r < 0)
240 return r;
241 }
242
243 if (restarted)
244 return 0;
245
246 /* Initiate a new offline. */
247 f->offline_state = OFFLINE_SYNCING;
248
249 if (wait) /* Without using a thread if waiting. */
250 journal_file_set_offline_internal(f);
251 else {
252 sigset_t ss, saved_ss;
253 int k;
254
255 if (sigfillset(&ss) < 0)
256 return -errno;
257
258 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
259 if (r > 0)
260 return -r;
261
262 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
263
264 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
265 if (r > 0) {
266 f->offline_state = OFFLINE_JOINED;
267 return -r;
268 }
269 if (k > 0)
270 return -k;
271 }
272
273 return 0;
274 }
275
/* Brings a writable journal file (back) online, cancelling or waiting for a pending offline first.
 *
 * Returns 0 on success, -EPERM if the file is not writable, -EINVAL if it has no fd or no mapped
 * header or if the header state is neither online nor offline, -EIO if a SIGBUS was recorded for the
 * mapping, or the error returned by joining the offline thread. */
static int journal_file_set_online(JournalFile *f) {
        bool joined = false;

        assert(f);

        if (!f->writable)
                return -EPERM;

        if (!(f->fd >= 0 && f->header))
                return -EINVAL;

        /* Loop until the offline thread (if any) has been joined. Note that the cases which merely
         * request a cancel leave "joined" false, hence the state is looked at again in the next
         * iteration. A failed compare-and-swap ("continue") likewise re-reads the state. */
        while (!joined) {
                switch (f->offline_state) {
                case OFFLINE_JOINED:
                        /* No offline thread, no need to wait. */
                        joined = true;
                        break;

                case OFFLINE_SYNCING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
                                continue;
                        /* Canceled syncing prior to offlining, no need to wait. */
                        break;

                case OFFLINE_AGAIN_FROM_SYNCING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
                                continue;
                        /* Canceled restart from syncing, no need to wait. */
                        break;

                case OFFLINE_AGAIN_FROM_OFFLINING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
                                continue;
                        /* Canceled restart from offlining, must wait for offlining to complete however. */
                        _fallthrough_;
                default: {
                        int r;

                        /* Every remaining state is handled by joining the offline thread. */
                        r = journal_file_set_offline_thread_join(f);
                        if (r < 0)
                                return r;

                        joined = true;
                        break;
                }
                }
        }

        if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
                return -EIO;

        switch (f->header->state) {
        case STATE_ONLINE:
                /* Already online, nothing to write. */
                return 0;

        case STATE_OFFLINE:
                /* Flip the header to online and flush it (fsync() errors are deliberately ignored). */
                f->header->state = STATE_ONLINE;
                (void) fsync(f->fd);
                return 0;

        default:
                /* Any other header state (e.g. archived) cannot be turned online. */
                return -EINVAL;
        }
}
340
341 bool journal_file_is_offlining(JournalFile *f) {
342 assert(f);
343
344 __sync_synchronize();
345
346 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
347 return false;
348
349 return true;
350 }
351
352 JournalFile* journal_file_close(JournalFile *f) {
353 assert(f);
354
355 #if HAVE_GCRYPT
356 /* Write the final tag */
357 if (f->seal && f->writable) {
358 int r;
359
360 r = journal_file_append_tag(f);
361 if (r < 0)
362 log_error_errno(r, "Failed to append tag when closing journal: %m");
363 }
364 #endif
365
366 if (f->post_change_timer) {
367 int enabled;
368
369 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
370 if (enabled == SD_EVENT_ONESHOT)
371 journal_file_post_change(f);
372
373 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
374 sd_event_source_unref(f->post_change_timer);
375 }
376
377 journal_file_set_offline(f, true);
378
379 if (f->mmap && f->cache_fd)
380 mmap_cache_free_fd(f->mmap, f->cache_fd);
381
382 if (f->fd >= 0 && f->defrag_on_close) {
383
384 /* Be friendly to btrfs: turn COW back on again now,
385 * and defragment the file. We won't write to the file
386 * ever again, hence remove all fragmentation, and
387 * reenable all the good bits COW usually provides
388 * (such as data checksumming). */
389
390 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
391 (void) btrfs_defrag_fd(f->fd);
392 }
393
394 if (f->close_fd)
395 safe_close(f->fd);
396 free(f->path);
397
398 mmap_cache_unref(f->mmap);
399
400 ordered_hashmap_free_free(f->chain_cache);
401
402 #if HAVE_XZ || HAVE_LZ4
403 free(f->compress_buffer);
404 #endif
405
406 #if HAVE_GCRYPT
407 if (f->fss_file)
408 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
409 else
410 free(f->fsprg_state);
411
412 free(f->fsprg_seed);
413
414 if (f->hmac)
415 gcry_md_close(f->hmac);
416 #endif
417
418 return mfree(f);
419 }
420
421 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
422 Header h = {};
423 ssize_t k;
424 int r;
425
426 assert(f);
427
428 memcpy(h.signature, HEADER_SIGNATURE, 8);
429 h.header_size = htole64(ALIGN64(sizeof(h)));
430
431 h.incompatible_flags |= htole32(
432 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
433 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
434
435 h.compatible_flags = htole32(
436 f->seal * HEADER_COMPATIBLE_SEALED);
437
438 r = sd_id128_randomize(&h.file_id);
439 if (r < 0)
440 return r;
441
442 if (template) {
443 h.seqnum_id = template->header->seqnum_id;
444 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
445 } else
446 h.seqnum_id = h.file_id;
447
448 k = pwrite(f->fd, &h, sizeof(h), 0);
449 if (k < 0)
450 return -errno;
451
452 if (k != sizeof(h))
453 return -EIO;
454
455 return 0;
456 }
457
458 static int journal_file_refresh_header(JournalFile *f) {
459 sd_id128_t boot_id;
460 int r;
461
462 assert(f);
463 assert(f->header);
464
465 r = sd_id128_get_machine(&f->header->machine_id);
466 if (r < 0)
467 return r;
468
469 r = sd_id128_get_boot(&boot_id);
470 if (r < 0)
471 return r;
472
473 if (sd_id128_equal(boot_id, f->header->boot_id))
474 f->tail_entry_monotonic_valid = true;
475
476 f->header->boot_id = boot_id;
477
478 r = journal_file_set_online(f);
479
480 /* Sync the online state to disk */
481 (void) fsync(f->fd);
482
483 /* We likely just created a new file, also sync the directory this file is located in. */
484 (void) fsync_directory_of_file(f->fd);
485
486 return r;
487 }
488
489 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
490 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
491 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
492 const char *type = compatible ? "compatible" : "incompatible";
493 uint32_t flags;
494
495 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
496
497 if (flags & ~supported) {
498 if (flags & ~any)
499 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
500 f->path, type, flags & ~any);
501 flags = (flags & any) & ~supported;
502 if (flags) {
503 const char* strv[3];
504 unsigned n = 0;
505 _cleanup_free_ char *t = NULL;
506
507 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
508 strv[n++] = "sealed";
509 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
510 strv[n++] = "xz-compressed";
511 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
512 strv[n++] = "lz4-compressed";
513 strv[n] = NULL;
514 assert(n < ELEMENTSOF(strv));
515
516 t = strv_join((char**) strv, ", ");
517 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
518 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
519 }
520 return true;
521 }
522
523 return false;
524 }
525
526 static int journal_file_verify_header(JournalFile *f) {
527 uint64_t arena_size, header_size;
528
529 assert(f);
530 assert(f->header);
531
532 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
533 return -EBADMSG;
534
535 /* In both read and write mode we refuse to open files with incompatible
536 * flags we don't know. */
537 if (warn_wrong_flags(f, false))
538 return -EPROTONOSUPPORT;
539
540 /* When open for writing we refuse to open files with compatible flags, too. */
541 if (f->writable && warn_wrong_flags(f, true))
542 return -EPROTONOSUPPORT;
543
544 if (f->header->state >= _STATE_MAX)
545 return -EBADMSG;
546
547 header_size = le64toh(f->header->header_size);
548
549 /* The first addition was n_data, so check that we are at least this large */
550 if (header_size < HEADER_SIZE_MIN)
551 return -EBADMSG;
552
553 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
554 return -EBADMSG;
555
556 arena_size = le64toh(f->header->arena_size);
557
558 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
559 return -ENODATA;
560
561 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
562 return -ENODATA;
563
564 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
565 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
566 !VALID64(le64toh(f->header->tail_object_offset)) ||
567 !VALID64(le64toh(f->header->entry_array_offset)))
568 return -ENODATA;
569
570 if (f->writable) {
571 sd_id128_t machine_id;
572 uint8_t state;
573 int r;
574
575 r = sd_id128_get_machine(&machine_id);
576 if (r < 0)
577 return r;
578
579 if (!sd_id128_equal(machine_id, f->header->machine_id))
580 return -EHOSTDOWN;
581
582 state = f->header->state;
583
584 if (state == STATE_ARCHIVED)
585 return -ESHUTDOWN; /* Already archived */
586 else if (state == STATE_ONLINE) {
587 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
588 return -EBUSY;
589 } else if (state != STATE_OFFLINE) {
590 log_debug("Journal file %s has unknown state %i.", f->path, state);
591 return -EBUSY;
592 }
593
594 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
595 return -EBADMSG;
596
597 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
598 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
599 * bisection. */
600 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
601 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
602 return -ETXTBSY;
603 }
604 }
605
606 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
607 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
608
609 f->seal = JOURNAL_HEADER_SEALED(f->header);
610
611 return 0;
612 }
613
614 static int journal_file_fstat(JournalFile *f) {
615 int r;
616
617 assert(f);
618 assert(f->fd >= 0);
619
620 if (fstat(f->fd, &f->last_stat) < 0)
621 return -errno;
622
623 f->last_stat_usec = now(CLOCK_MONOTONIC);
624
625 /* Refuse dealing with with files that aren't regular */
626 r = stat_verify_regular(&f->last_stat);
627 if (r < 0)
628 return r;
629
630 /* Refuse appending to files that are already deleted */
631 if (f->last_stat.st_nlink <= 0)
632 return -EIDRM;
633
634 return 0;
635 }
636
637 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
638 uint64_t old_size, new_size;
639 int r;
640
641 assert(f);
642 assert(f->header);
643
644 /* We assume that this file is not sparse, and we know that
645 * for sure, since we always call posix_fallocate()
646 * ourselves */
647
648 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
649 return -EIO;
650
651 old_size =
652 le64toh(f->header->header_size) +
653 le64toh(f->header->arena_size);
654
655 new_size = PAGE_ALIGN(offset + size);
656 if (new_size < le64toh(f->header->header_size))
657 new_size = le64toh(f->header->header_size);
658
659 if (new_size <= old_size) {
660
661 /* We already pre-allocated enough space, but before
662 * we write to it, let's check with fstat() if the
663 * file got deleted, in order make sure we don't throw
664 * away the data immediately. Don't check fstat() for
665 * all writes though, but only once ever 10s. */
666
667 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
668 return 0;
669
670 return journal_file_fstat(f);
671 }
672
673 /* Allocate more space. */
674
675 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
676 return -E2BIG;
677
678 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
679 struct statvfs svfs;
680
681 if (fstatvfs(f->fd, &svfs) >= 0) {
682 uint64_t available;
683
684 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
685
686 if (new_size - old_size > available)
687 return -E2BIG;
688 }
689 }
690
691 /* Increase by larger blocks at once */
692 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
693 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
694 new_size = f->metrics.max_size;
695
696 /* Note that the glibc fallocate() fallback is very
697 inefficient, hence we try to minimize the allocation area
698 as we can. */
699 r = posix_fallocate(f->fd, old_size, new_size - old_size);
700 if (r != 0)
701 return -r;
702
703 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
704
705 return journal_file_fstat(f);
706 }
707
708 static unsigned type_to_context(ObjectType type) {
709 /* One context for each type, plus one catch-all for the rest */
710 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
711 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
712 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
713 }
714
715 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
716 int r;
717
718 assert(f);
719 assert(ret);
720
721 if (size <= 0)
722 return -EINVAL;
723
724 /* Avoid SIGBUS on invalid accesses */
725 if (offset + size > (uint64_t) f->last_stat.st_size) {
726 /* Hmm, out of range? Let's refresh the fstat() data
727 * first, before we trust that check. */
728
729 r = journal_file_fstat(f);
730 if (r < 0)
731 return r;
732
733 if (offset + size > (uint64_t) f->last_stat.st_size)
734 return -EADDRNOTAVAIL;
735 }
736
737 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
738 }
739
740 static uint64_t minimum_header_size(Object *o) {
741
742 static const uint64_t table[] = {
743 [OBJECT_DATA] = sizeof(DataObject),
744 [OBJECT_FIELD] = sizeof(FieldObject),
745 [OBJECT_ENTRY] = sizeof(EntryObject),
746 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
747 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
748 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
749 [OBJECT_TAG] = sizeof(TagObject),
750 };
751
752 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
753 return sizeof(ObjectHeader);
754
755 return table[o->object.type];
756 }
757
758 /* Lightweight object checks. We want this to be fast, so that we won't
759 * slowdown every journal_file_move_to_object() call too much. */
760 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
761 assert(f);
762 assert(o);
763
764 switch (o->object.type) {
765
766 case OBJECT_DATA: {
767 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
768 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
769 le64toh(o->data.n_entries), offset);
770 return -EBADMSG;
771 }
772
773 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
774 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
775 offsetof(DataObject, payload),
776 le64toh(o->object.size),
777 offset);
778 return -EBADMSG;
779 }
780
781 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
782 !VALID64(le64toh(o->data.next_field_offset)) ||
783 !VALID64(le64toh(o->data.entry_offset)) ||
784 !VALID64(le64toh(o->data.entry_array_offset))) {
785 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
786 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
787 le64toh(o->data.next_hash_offset),
788 le64toh(o->data.next_field_offset),
789 le64toh(o->data.entry_offset),
790 le64toh(o->data.entry_array_offset),
791 offset);
792 return -EBADMSG;
793 }
794
795 break;
796 }
797
798 case OBJECT_FIELD:
799 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
800 log_debug(
801 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
802 offsetof(FieldObject, payload),
803 le64toh(o->object.size),
804 offset);
805 return -EBADMSG;
806 }
807
808 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
809 !VALID64(le64toh(o->field.head_data_offset))) {
810 log_debug(
811 "Invalid offset, next_hash_offset="OFSfmt
812 ", head_data_offset="OFSfmt": %"PRIu64,
813 le64toh(o->field.next_hash_offset),
814 le64toh(o->field.head_data_offset),
815 offset);
816 return -EBADMSG;
817 }
818 break;
819
820 case OBJECT_ENTRY:
821 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
822 log_debug(
823 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
824 offsetof(EntryObject, items),
825 le64toh(o->object.size),
826 offset);
827 return -EBADMSG;
828 }
829
830 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
831 log_debug(
832 "Invalid number items in entry: %"PRIu64": %"PRIu64,
833 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
834 offset);
835 return -EBADMSG;
836 }
837
838 if (le64toh(o->entry.seqnum) <= 0) {
839 log_debug(
840 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
841 le64toh(o->entry.seqnum),
842 offset);
843 return -EBADMSG;
844 }
845
846 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
847 log_debug(
848 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
849 le64toh(o->entry.realtime),
850 offset);
851 return -EBADMSG;
852 }
853
854 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
855 log_debug(
856 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
857 le64toh(o->entry.monotonic),
858 offset);
859 return -EBADMSG;
860 }
861
862 break;
863
864 case OBJECT_DATA_HASH_TABLE:
865 case OBJECT_FIELD_HASH_TABLE:
866 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
867 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
868 log_debug(
869 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
870 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
871 le64toh(o->object.size),
872 offset);
873 return -EBADMSG;
874 }
875
876 break;
877
878 case OBJECT_ENTRY_ARRAY:
879 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
880 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
881 log_debug(
882 "Invalid object entry array size: %"PRIu64": %"PRIu64,
883 le64toh(o->object.size),
884 offset);
885 return -EBADMSG;
886 }
887
888 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
889 log_debug(
890 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
891 le64toh(o->entry_array.next_entry_array_offset),
892 offset);
893 return -EBADMSG;
894 }
895
896 break;
897
898 case OBJECT_TAG:
899 if (le64toh(o->object.size) != sizeof(TagObject)) {
900 log_debug(
901 "Invalid object tag size: %"PRIu64": %"PRIu64,
902 le64toh(o->object.size),
903 offset);
904 return -EBADMSG;
905 }
906
907 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
908 log_debug(
909 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
910 le64toh(o->tag.epoch),
911 offset);
912 return -EBADMSG;
913 }
914
915 break;
916 }
917
918 return 0;
919 }
920
921 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
922 int r;
923 void *t;
924 size_t tsize;
925 Object *o;
926 uint64_t s;
927
928 assert(f);
929 assert(ret);
930
931 /* Objects may only be located at multiple of 64 bit */
932 if (!VALID64(offset)) {
933 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
934 return -EBADMSG;
935 }
936
937 /* Object may not be located in the file header */
938 if (offset < le64toh(f->header->header_size)) {
939 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
940 return -EBADMSG;
941 }
942
943 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
944 if (r < 0)
945 return r;
946
947 o = (Object*) t;
948 s = le64toh(o->object.size);
949
950 if (s == 0) {
951 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
952 return -EBADMSG;
953 }
954 if (s < sizeof(ObjectHeader)) {
955 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
956 return -EBADMSG;
957 }
958
959 if (o->object.type <= OBJECT_UNUSED) {
960 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
961 return -EBADMSG;
962 }
963
964 if (s < minimum_header_size(o)) {
965 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
966 return -EBADMSG;
967 }
968
969 if (type > OBJECT_UNUSED && o->object.type != type) {
970 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
971 return -EBADMSG;
972 }
973
974 if (s > tsize) {
975 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
976 if (r < 0)
977 return r;
978
979 o = (Object*) t;
980 }
981
982 r = journal_file_check_object(f, offset, o);
983 if (r < 0)
984 return r;
985
986 *ret = o;
987 return 0;
988 }
989
990 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
991 uint64_t r;
992
993 assert(f);
994 assert(f->header);
995
996 r = le64toh(f->header->tail_entry_seqnum) + 1;
997
998 if (seqnum) {
999 /* If an external seqnum counter was passed, we update
1000 * both the local and the external one, and set it to
1001 * the maximum of both */
1002
1003 if (*seqnum + 1 > r)
1004 r = *seqnum + 1;
1005
1006 *seqnum = r;
1007 }
1008
1009 f->header->tail_entry_seqnum = htole64(r);
1010
1011 if (f->header->head_entry_seqnum == 0)
1012 f->header->head_entry_seqnum = htole64(r);
1013
1014 return r;
1015 }
1016
1017 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
1018 int r;
1019 uint64_t p;
1020 Object *tail, *o;
1021 void *t;
1022
1023 assert(f);
1024 assert(f->header);
1025 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
1026 assert(size >= sizeof(ObjectHeader));
1027 assert(offset);
1028 assert(ret);
1029
1030 r = journal_file_set_online(f);
1031 if (r < 0)
1032 return r;
1033
1034 p = le64toh(f->header->tail_object_offset);
1035 if (p == 0)
1036 p = le64toh(f->header->header_size);
1037 else {
1038 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
1039 if (r < 0)
1040 return r;
1041
1042 p += ALIGN64(le64toh(tail->object.size));
1043 }
1044
1045 r = journal_file_allocate(f, p, size);
1046 if (r < 0)
1047 return r;
1048
1049 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
1050 if (r < 0)
1051 return r;
1052
1053 o = (Object*) t;
1054
1055 zero(o->object);
1056 o->object.type = type;
1057 o->object.size = htole64(size);
1058
1059 f->header->tail_object_offset = htole64(p);
1060 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1061
1062 *ret = o;
1063 *offset = p;
1064
1065 return 0;
1066 }
1067
1068 static int journal_file_setup_data_hash_table(JournalFile *f) {
1069 uint64_t s, p;
1070 Object *o;
1071 int r;
1072
1073 assert(f);
1074 assert(f->header);
1075
1076 /* We estimate that we need 1 hash table entry per 768 bytes
1077 of journal file and we want to make sure we never get
1078 beyond 75% fill level. Calculate the hash table size for
1079 the maximum file size based on these metrics. */
1080
1081 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
1082 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1083 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1084
1085 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
1086
1087 r = journal_file_append_object(f,
1088 OBJECT_DATA_HASH_TABLE,
1089 offsetof(Object, hash_table.items) + s,
1090 &o, &p);
1091 if (r < 0)
1092 return r;
1093
1094 memzero(o->hash_table.items, s);
1095
1096 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1097 f->header->data_hash_table_size = htole64(s);
1098
1099 return 0;
1100 }
1101
1102 static int journal_file_setup_field_hash_table(JournalFile *f) {
1103 uint64_t s, p;
1104 Object *o;
1105 int r;
1106
1107 assert(f);
1108 assert(f->header);
1109
1110 /* We use a fixed size hash table for the fields as this
1111 * number should grow very slowly only */
1112
1113 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1114 r = journal_file_append_object(f,
1115 OBJECT_FIELD_HASH_TABLE,
1116 offsetof(Object, hash_table.items) + s,
1117 &o, &p);
1118 if (r < 0)
1119 return r;
1120
1121 memzero(o->hash_table.items, s);
1122
1123 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1124 f->header->field_hash_table_size = htole64(s);
1125
1126 return 0;
1127 }
1128
1129 int journal_file_map_data_hash_table(JournalFile *f) {
1130 uint64_t s, p;
1131 void *t;
1132 int r;
1133
1134 assert(f);
1135 assert(f->header);
1136
1137 if (f->data_hash_table)
1138 return 0;
1139
1140 p = le64toh(f->header->data_hash_table_offset);
1141 s = le64toh(f->header->data_hash_table_size);
1142
1143 r = journal_file_move_to(f,
1144 OBJECT_DATA_HASH_TABLE,
1145 true,
1146 p, s,
1147 &t, NULL);
1148 if (r < 0)
1149 return r;
1150
1151 f->data_hash_table = t;
1152 return 0;
1153 }
1154
1155 int journal_file_map_field_hash_table(JournalFile *f) {
1156 uint64_t s, p;
1157 void *t;
1158 int r;
1159
1160 assert(f);
1161 assert(f->header);
1162
1163 if (f->field_hash_table)
1164 return 0;
1165
1166 p = le64toh(f->header->field_hash_table_offset);
1167 s = le64toh(f->header->field_hash_table_size);
1168
1169 r = journal_file_move_to(f,
1170 OBJECT_FIELD_HASH_TABLE,
1171 true,
1172 p, s,
1173 &t, NULL);
1174 if (r < 0)
1175 return r;
1176
1177 f->field_hash_table = t;
1178 return 0;
1179 }
1180
1181 static int journal_file_link_field(
1182 JournalFile *f,
1183 Object *o,
1184 uint64_t offset,
1185 uint64_t hash) {
1186
1187 uint64_t p, h, m;
1188 int r;
1189
1190 assert(f);
1191 assert(f->header);
1192 assert(f->field_hash_table);
1193 assert(o);
1194 assert(offset > 0);
1195
1196 if (o->object.type != OBJECT_FIELD)
1197 return -EINVAL;
1198
1199 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1200 if (m <= 0)
1201 return -EBADMSG;
1202
1203 /* This might alter the window we are looking at */
1204 o->field.next_hash_offset = o->field.head_data_offset = 0;
1205
1206 h = hash % m;
1207 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1208 if (p == 0)
1209 f->field_hash_table[h].head_hash_offset = htole64(offset);
1210 else {
1211 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1212 if (r < 0)
1213 return r;
1214
1215 o->field.next_hash_offset = htole64(offset);
1216 }
1217
1218 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1219
1220 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1221 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1222
1223 return 0;
1224 }
1225
1226 static int journal_file_link_data(
1227 JournalFile *f,
1228 Object *o,
1229 uint64_t offset,
1230 uint64_t hash) {
1231
1232 uint64_t p, h, m;
1233 int r;
1234
1235 assert(f);
1236 assert(f->header);
1237 assert(f->data_hash_table);
1238 assert(o);
1239 assert(offset > 0);
1240
1241 if (o->object.type != OBJECT_DATA)
1242 return -EINVAL;
1243
1244 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1245 if (m <= 0)
1246 return -EBADMSG;
1247
1248 /* This might alter the window we are looking at */
1249 o->data.next_hash_offset = o->data.next_field_offset = 0;
1250 o->data.entry_offset = o->data.entry_array_offset = 0;
1251 o->data.n_entries = 0;
1252
1253 h = hash % m;
1254 p = le64toh(f->data_hash_table[h].tail_hash_offset);
1255 if (p == 0)
1256 /* Only entry in the hash table is easy */
1257 f->data_hash_table[h].head_hash_offset = htole64(offset);
1258 else {
1259 /* Move back to the previous data object, to patch in
1260 * pointer */
1261
1262 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1263 if (r < 0)
1264 return r;
1265
1266 o->data.next_hash_offset = htole64(offset);
1267 }
1268
1269 f->data_hash_table[h].tail_hash_offset = htole64(offset);
1270
1271 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1272 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1273
1274 return 0;
1275 }
1276
1277 int journal_file_find_field_object_with_hash(
1278 JournalFile *f,
1279 const void *field, uint64_t size, uint64_t hash,
1280 Object **ret, uint64_t *offset) {
1281
1282 uint64_t p, osize, h, m;
1283 int r;
1284
1285 assert(f);
1286 assert(f->header);
1287 assert(field && size > 0);
1288
1289 /* If the field hash table is empty, we can't find anything */
1290 if (le64toh(f->header->field_hash_table_size) <= 0)
1291 return 0;
1292
1293 /* Map the field hash table, if it isn't mapped yet. */
1294 r = journal_file_map_field_hash_table(f);
1295 if (r < 0)
1296 return r;
1297
1298 osize = offsetof(Object, field.payload) + size;
1299
1300 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1301 if (m <= 0)
1302 return -EBADMSG;
1303
1304 h = hash % m;
1305 p = le64toh(f->field_hash_table[h].head_hash_offset);
1306
1307 while (p > 0) {
1308 Object *o;
1309
1310 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1311 if (r < 0)
1312 return r;
1313
1314 if (le64toh(o->field.hash) == hash &&
1315 le64toh(o->object.size) == osize &&
1316 memcmp(o->field.payload, field, size) == 0) {
1317
1318 if (ret)
1319 *ret = o;
1320 if (offset)
1321 *offset = p;
1322
1323 return 1;
1324 }
1325
1326 p = le64toh(o->field.next_hash_offset);
1327 }
1328
1329 return 0;
1330 }
1331
1332 int journal_file_find_field_object(
1333 JournalFile *f,
1334 const void *field, uint64_t size,
1335 Object **ret, uint64_t *offset) {
1336
1337 uint64_t hash;
1338
1339 assert(f);
1340 assert(field && size > 0);
1341
1342 hash = hash64(field, size);
1343
1344 return journal_file_find_field_object_with_hash(f,
1345 field, size, hash,
1346 ret, offset);
1347 }
1348
1349 int journal_file_find_data_object_with_hash(
1350 JournalFile *f,
1351 const void *data, uint64_t size, uint64_t hash,
1352 Object **ret, uint64_t *offset) {
1353
1354 uint64_t p, osize, h, m;
1355 int r;
1356
1357 assert(f);
1358 assert(f->header);
1359 assert(data || size == 0);
1360
1361 /* If there's no data hash table, then there's no entry. */
1362 if (le64toh(f->header->data_hash_table_size) <= 0)
1363 return 0;
1364
1365 /* Map the data hash table, if it isn't mapped yet. */
1366 r = journal_file_map_data_hash_table(f);
1367 if (r < 0)
1368 return r;
1369
1370 osize = offsetof(Object, data.payload) + size;
1371
1372 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1373 if (m <= 0)
1374 return -EBADMSG;
1375
1376 h = hash % m;
1377 p = le64toh(f->data_hash_table[h].head_hash_offset);
1378
1379 while (p > 0) {
1380 Object *o;
1381
1382 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1383 if (r < 0)
1384 return r;
1385
1386 if (le64toh(o->data.hash) != hash)
1387 goto next;
1388
1389 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
1390 #if HAVE_XZ || HAVE_LZ4
1391 uint64_t l;
1392 size_t rsize = 0;
1393
1394 l = le64toh(o->object.size);
1395 if (l <= offsetof(Object, data.payload))
1396 return -EBADMSG;
1397
1398 l -= offsetof(Object, data.payload);
1399
1400 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1401 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1402 if (r < 0)
1403 return r;
1404
1405 if (rsize == size &&
1406 memcmp(f->compress_buffer, data, size) == 0) {
1407
1408 if (ret)
1409 *ret = o;
1410
1411 if (offset)
1412 *offset = p;
1413
1414 return 1;
1415 }
1416 #else
1417 return -EPROTONOSUPPORT;
1418 #endif
1419 } else if (le64toh(o->object.size) == osize &&
1420 memcmp(o->data.payload, data, size) == 0) {
1421
1422 if (ret)
1423 *ret = o;
1424
1425 if (offset)
1426 *offset = p;
1427
1428 return 1;
1429 }
1430
1431 next:
1432 p = le64toh(o->data.next_hash_offset);
1433 }
1434
1435 return 0;
1436 }
1437
1438 int journal_file_find_data_object(
1439 JournalFile *f,
1440 const void *data, uint64_t size,
1441 Object **ret, uint64_t *offset) {
1442
1443 uint64_t hash;
1444
1445 assert(f);
1446 assert(data || size == 0);
1447
1448 hash = hash64(data, size);
1449
1450 return journal_file_find_data_object_with_hash(f,
1451 data, size, hash,
1452 ret, offset);
1453 }
1454
1455 static int journal_file_append_field(
1456 JournalFile *f,
1457 const void *field, uint64_t size,
1458 Object **ret, uint64_t *offset) {
1459
1460 uint64_t hash, p;
1461 uint64_t osize;
1462 Object *o;
1463 int r;
1464
1465 assert(f);
1466 assert(field && size > 0);
1467
1468 hash = hash64(field, size);
1469
1470 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1471 if (r < 0)
1472 return r;
1473 else if (r > 0) {
1474
1475 if (ret)
1476 *ret = o;
1477
1478 if (offset)
1479 *offset = p;
1480
1481 return 0;
1482 }
1483
1484 osize = offsetof(Object, field.payload) + size;
1485 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1486 if (r < 0)
1487 return r;
1488
1489 o->field.hash = htole64(hash);
1490 memcpy(o->field.payload, field, size);
1491
1492 r = journal_file_link_field(f, o, p, hash);
1493 if (r < 0)
1494 return r;
1495
1496 /* The linking might have altered the window, so let's
1497 * refresh our pointer */
1498 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1499 if (r < 0)
1500 return r;
1501
1502 #if HAVE_GCRYPT
1503 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1504 if (r < 0)
1505 return r;
1506 #endif
1507
1508 if (ret)
1509 *ret = o;
1510
1511 if (offset)
1512 *offset = p;
1513
1514 return 0;
1515 }
1516
1517 static int journal_file_append_data(
1518 JournalFile *f,
1519 const void *data, uint64_t size,
1520 Object **ret, uint64_t *offset) {
1521
1522 uint64_t hash, p;
1523 uint64_t osize;
1524 Object *o;
1525 int r, compression = 0;
1526 const void *eq;
1527
1528 assert(f);
1529 assert(data || size == 0);
1530
1531 hash = hash64(data, size);
1532
1533 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1534 if (r < 0)
1535 return r;
1536 if (r > 0) {
1537
1538 if (ret)
1539 *ret = o;
1540
1541 if (offset)
1542 *offset = p;
1543
1544 return 0;
1545 }
1546
1547 osize = offsetof(Object, data.payload) + size;
1548 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1549 if (r < 0)
1550 return r;
1551
1552 o->data.hash = htole64(hash);
1553
1554 #if HAVE_XZ || HAVE_LZ4
1555 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1556 size_t rsize = 0;
1557
1558 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1559
1560 if (compression >= 0) {
1561 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1562 o->object.flags |= compression;
1563
1564 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1565 size, rsize, object_compressed_to_string(compression));
1566 } else
1567 /* Compression didn't work, we don't really care why, let's continue without compression */
1568 compression = 0;
1569 }
1570 #endif
1571
1572 if (compression == 0)
1573 memcpy_safe(o->data.payload, data, size);
1574
1575 r = journal_file_link_data(f, o, p, hash);
1576 if (r < 0)
1577 return r;
1578
1579 #if HAVE_GCRYPT
1580 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1581 if (r < 0)
1582 return r;
1583 #endif
1584
1585 /* The linking might have altered the window, so let's
1586 * refresh our pointer */
1587 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1588 if (r < 0)
1589 return r;
1590
1591 if (!data)
1592 eq = NULL;
1593 else
1594 eq = memchr(data, '=', size);
1595 if (eq && eq > data) {
1596 Object *fo = NULL;
1597 uint64_t fp;
1598
1599 /* Create field object ... */
1600 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1601 if (r < 0)
1602 return r;
1603
1604 /* ... and link it in. */
1605 o->data.next_field_offset = fo->field.head_data_offset;
1606 fo->field.head_data_offset = le64toh(p);
1607 }
1608
1609 if (ret)
1610 *ret = o;
1611
1612 if (offset)
1613 *offset = p;
1614
1615 return 0;
1616 }
1617
1618 uint64_t journal_file_entry_n_items(Object *o) {
1619 assert(o);
1620
1621 if (o->object.type != OBJECT_ENTRY)
1622 return 0;
1623
1624 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1625 }
1626
1627 uint64_t journal_file_entry_array_n_items(Object *o) {
1628 assert(o);
1629
1630 if (o->object.type != OBJECT_ENTRY_ARRAY)
1631 return 0;
1632
1633 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1634 }
1635
1636 uint64_t journal_file_hash_table_n_items(Object *o) {
1637 assert(o);
1638
1639 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1640 return 0;
1641
1642 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1643 }
1644
1645 static int link_entry_into_array(JournalFile *f,
1646 le64_t *first,
1647 le64_t *idx,
1648 uint64_t p) {
1649 int r;
1650 uint64_t n = 0, ap = 0, q, i, a, hidx;
1651 Object *o;
1652
1653 assert(f);
1654 assert(f->header);
1655 assert(first);
1656 assert(idx);
1657 assert(p > 0);
1658
1659 a = le64toh(*first);
1660 i = hidx = le64toh(*idx);
1661 while (a > 0) {
1662
1663 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1664 if (r < 0)
1665 return r;
1666
1667 n = journal_file_entry_array_n_items(o);
1668 if (i < n) {
1669 o->entry_array.items[i] = htole64(p);
1670 *idx = htole64(hidx + 1);
1671 return 0;
1672 }
1673
1674 i -= n;
1675 ap = a;
1676 a = le64toh(o->entry_array.next_entry_array_offset);
1677 }
1678
1679 if (hidx > n)
1680 n = (hidx+1) * 2;
1681 else
1682 n = n * 2;
1683
1684 if (n < 4)
1685 n = 4;
1686
1687 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1688 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1689 &o, &q);
1690 if (r < 0)
1691 return r;
1692
1693 #if HAVE_GCRYPT
1694 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1695 if (r < 0)
1696 return r;
1697 #endif
1698
1699 o->entry_array.items[i] = htole64(p);
1700
1701 if (ap == 0)
1702 *first = htole64(q);
1703 else {
1704 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1705 if (r < 0)
1706 return r;
1707
1708 o->entry_array.next_entry_array_offset = htole64(q);
1709 }
1710
1711 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1712 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1713
1714 *idx = htole64(hidx + 1);
1715
1716 return 0;
1717 }
1718
1719 static int link_entry_into_array_plus_one(JournalFile *f,
1720 le64_t *extra,
1721 le64_t *first,
1722 le64_t *idx,
1723 uint64_t p) {
1724
1725 int r;
1726
1727 assert(f);
1728 assert(extra);
1729 assert(first);
1730 assert(idx);
1731 assert(p > 0);
1732
1733 if (*idx == 0)
1734 *extra = htole64(p);
1735 else {
1736 le64_t i;
1737
1738 i = htole64(le64toh(*idx) - 1);
1739 r = link_entry_into_array(f, first, &i, p);
1740 if (r < 0)
1741 return r;
1742 }
1743
1744 *idx = htole64(le64toh(*idx) + 1);
1745 return 0;
1746 }
1747
1748 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1749 uint64_t p;
1750 int r;
1751 assert(f);
1752 assert(o);
1753 assert(offset > 0);
1754
1755 p = le64toh(o->entry.items[i].object_offset);
1756 if (p == 0)
1757 return -EINVAL;
1758
1759 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1760 if (r < 0)
1761 return r;
1762
1763 return link_entry_into_array_plus_one(f,
1764 &o->data.entry_offset,
1765 &o->data.entry_array_offset,
1766 &o->data.n_entries,
1767 offset);
1768 }
1769
1770 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1771 uint64_t n, i;
1772 int r;
1773
1774 assert(f);
1775 assert(f->header);
1776 assert(o);
1777 assert(offset > 0);
1778
1779 if (o->object.type != OBJECT_ENTRY)
1780 return -EINVAL;
1781
1782 __sync_synchronize();
1783
1784 /* Link up the entry itself */
1785 r = link_entry_into_array(f,
1786 &f->header->entry_array_offset,
1787 &f->header->n_entries,
1788 offset);
1789 if (r < 0)
1790 return r;
1791
1792 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1793
1794 if (f->header->head_entry_realtime == 0)
1795 f->header->head_entry_realtime = o->entry.realtime;
1796
1797 f->header->tail_entry_realtime = o->entry.realtime;
1798 f->header->tail_entry_monotonic = o->entry.monotonic;
1799
1800 f->tail_entry_monotonic_valid = true;
1801
1802 /* Link up the items */
1803 n = journal_file_entry_n_items(o);
1804 for (i = 0; i < n; i++) {
1805 r = journal_file_link_entry_item(f, o, offset, i);
1806 if (r < 0)
1807 return r;
1808 }
1809
1810 return 0;
1811 }
1812
1813 static int journal_file_append_entry_internal(
1814 JournalFile *f,
1815 const dual_timestamp *ts,
1816 uint64_t xor_hash,
1817 const EntryItem items[], unsigned n_items,
1818 uint64_t *seqnum,
1819 Object **ret, uint64_t *offset) {
1820 uint64_t np;
1821 uint64_t osize;
1822 Object *o;
1823 int r;
1824
1825 assert(f);
1826 assert(f->header);
1827 assert(items || n_items == 0);
1828 assert(ts);
1829
1830 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1831
1832 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1833 if (r < 0)
1834 return r;
1835
1836 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1837 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
1838 o->entry.realtime = htole64(ts->realtime);
1839 o->entry.monotonic = htole64(ts->monotonic);
1840 o->entry.xor_hash = htole64(xor_hash);
1841 o->entry.boot_id = f->header->boot_id;
1842
1843 #if HAVE_GCRYPT
1844 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1845 if (r < 0)
1846 return r;
1847 #endif
1848
1849 r = journal_file_link_entry(f, o, np);
1850 if (r < 0)
1851 return r;
1852
1853 if (ret)
1854 *ret = o;
1855
1856 if (offset)
1857 *offset = np;
1858
1859 return 0;
1860 }
1861
1862 void journal_file_post_change(JournalFile *f) {
1863 assert(f);
1864
1865 /* inotify() does not receive IN_MODIFY events from file
1866 * accesses done via mmap(). After each access we hence
1867 * trigger IN_MODIFY by truncating the journal file to its
1868 * current size which triggers IN_MODIFY. */
1869
1870 __sync_synchronize();
1871
1872 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1873 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1874 }
1875
1876 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1877 assert(userdata);
1878
1879 journal_file_post_change(userdata);
1880
1881 return 1;
1882 }
1883
1884 static void schedule_post_change(JournalFile *f) {
1885 sd_event_source *timer;
1886 int enabled, r;
1887 uint64_t now;
1888
1889 assert(f);
1890 assert(f->post_change_timer);
1891
1892 timer = f->post_change_timer;
1893
1894 r = sd_event_source_get_enabled(timer, &enabled);
1895 if (r < 0) {
1896 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1897 goto fail;
1898 }
1899
1900 if (enabled == SD_EVENT_ONESHOT)
1901 return;
1902
1903 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1904 if (r < 0) {
1905 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1906 goto fail;
1907 }
1908
1909 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1910 if (r < 0) {
1911 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1912 goto fail;
1913 }
1914
1915 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1916 if (r < 0) {
1917 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1918 goto fail;
1919 }
1920
1921 return;
1922
1923 fail:
1924 /* On failure, let's simply post the change immediately. */
1925 journal_file_post_change(f);
1926 }
1927
1928 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1929 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1930 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1931 int r;
1932
1933 assert(f);
1934 assert_return(!f->post_change_timer, -EINVAL);
1935 assert(e);
1936 assert(t);
1937
1938 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1939 if (r < 0)
1940 return r;
1941
1942 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1943 if (r < 0)
1944 return r;
1945
1946 f->post_change_timer = timer;
1947 timer = NULL;
1948 f->post_change_timer_period = t;
1949
1950 return r;
1951 }
1952
1953 static int entry_item_cmp(const void *_a, const void *_b) {
1954 const EntryItem *a = _a, *b = _b;
1955
1956 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1957 return -1;
1958 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1959 return 1;
1960 return 0;
1961 }
1962
1963 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1964 unsigned i;
1965 EntryItem *items;
1966 int r;
1967 uint64_t xor_hash = 0;
1968 struct dual_timestamp _ts;
1969
1970 assert(f);
1971 assert(f->header);
1972 assert(iovec || n_iovec == 0);
1973
1974 if (!ts) {
1975 dual_timestamp_get(&_ts);
1976 ts = &_ts;
1977 }
1978
1979 #if HAVE_GCRYPT
1980 r = journal_file_maybe_append_tag(f, ts->realtime);
1981 if (r < 0)
1982 return r;
1983 #endif
1984
1985 /* alloca() can't take 0, hence let's allocate at least one */
1986 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1987
1988 for (i = 0; i < n_iovec; i++) {
1989 uint64_t p;
1990 Object *o;
1991
1992 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1993 if (r < 0)
1994 return r;
1995
1996 xor_hash ^= le64toh(o->data.hash);
1997 items[i].object_offset = htole64(p);
1998 items[i].hash = o->data.hash;
1999 }
2000
2001 /* Order by the position on disk, in order to improve seek
2002 * times for rotating media. */
2003 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
2004
2005 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
2006
2007 /* If the memory mapping triggered a SIGBUS then we return an
2008 * IO error and ignore the error code passed down to us, since
2009 * it is very likely just an effect of a nullified replacement
2010 * mapping page */
2011
2012 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
2013 r = -EIO;
2014
2015 if (f->post_change_timer)
2016 schedule_post_change(f);
2017 else
2018 journal_file_post_change(f);
2019
2020 return r;
2021 }
2022
/* One record of the per-file chain cache: remembers an array further down an entry array chain,
 * so that a later lookup on the same chain can resume there instead of at the chain's first
 * array. */
2023 typedef struct ChainCacheItem {
2024 uint64_t first; /* offset of the array at the beginning of the chain; its address also serves as the hashmap key */
2025 uint64_t array; /* offset of the cached array */
2026 uint64_t begin; /* the value of the first item (items[0]) in the cached array */
2027 uint64_t total; /* the total number of items in all arrays before this one in the chain */
2028 uint64_t last_index; /* the last index we looked at in the cached array, to optimize locality when bisecting */
2029 } ChainCacheItem;
2030
2031 static void chain_cache_put(
2032 OrderedHashmap *h,
2033 ChainCacheItem *ci,
2034 uint64_t first,
2035 uint64_t array,
2036 uint64_t begin,
2037 uint64_t total,
2038 uint64_t last_index) {
2039
2040 if (!ci) {
2041 /* If the chain item to cache for this chain is the
2042 * first one it's not worth caching anything */
2043 if (array == first)
2044 return;
2045
2046 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2047 ci = ordered_hashmap_steal_first(h);
2048 assert(ci);
2049 } else {
2050 ci = new(ChainCacheItem, 1);
2051 if (!ci)
2052 return;
2053 }
2054
2055 ci->first = first;
2056
2057 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2058 free(ci);
2059 return;
2060 }
2061 } else
2062 assert(ci->first == first);
2063
2064 ci->array = array;
2065 ci->begin = begin;
2066 ci->total = total;
2067 ci->last_index = last_index;
2068 }
2069
2070 static int generic_array_get(
2071 JournalFile *f,
2072 uint64_t first,
2073 uint64_t i,
2074 Object **ret, uint64_t *offset) {
2075
2076 Object *o;
2077 uint64_t p = 0, a, t = 0;
2078 int r;
2079 ChainCacheItem *ci;
2080
2081 assert(f);
2082
2083 a = first;
2084
2085 /* Try the chain cache first */
2086 ci = ordered_hashmap_get(f->chain_cache, &first);
2087 if (ci && i > ci->total) {
2088 a = ci->array;
2089 i -= ci->total;
2090 t = ci->total;
2091 }
2092
2093 while (a > 0) {
2094 uint64_t k;
2095
2096 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2097 if (r < 0)
2098 return r;
2099
2100 k = journal_file_entry_array_n_items(o);
2101 if (i < k) {
2102 p = le64toh(o->entry_array.items[i]);
2103 goto found;
2104 }
2105
2106 i -= k;
2107 t += k;
2108 a = le64toh(o->entry_array.next_entry_array_offset);
2109 }
2110
2111 return 0;
2112
2113 found:
2114 /* Let's cache this item for the next invocation */
2115 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
2116
2117 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2118 if (r < 0)
2119 return r;
2120
2121 if (ret)
2122 *ret = o;
2123
2124 if (offset)
2125 *offset = p;
2126
2127 return 1;
2128 }
2129
2130 static int generic_array_get_plus_one(
2131 JournalFile *f,
2132 uint64_t extra,
2133 uint64_t first,
2134 uint64_t i,
2135 Object **ret, uint64_t *offset) {
2136
2137 Object *o;
2138
2139 assert(f);
2140
2141 if (i == 0) {
2142 int r;
2143
2144 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2145 if (r < 0)
2146 return r;
2147
2148 if (ret)
2149 *ret = o;
2150
2151 if (offset)
2152 *offset = extra;
2153
2154 return 1;
2155 }
2156
2157 return generic_array_get(f, first, i-1, ret, offset);
2158 }
2159
/* Results of the test_object() callbacks used for bisecting: how the tested item relates to the
 * needle being searched for. */
2160 enum {
2161 TEST_FOUND, /* the tested item equals the needle */
2162 TEST_LEFT, /* the tested item is smaller than the needle, i.e. located left of it */
2163 TEST_RIGHT /* the tested item is larger than the needle, i.e. located right of it */
2164 };
2165
/* Searches the entry array chain starting at offset 'first', of which the first 'n' items are
 * considered, for 'needle'.
 *
 * test_object(f, p, needle) is invoked with the offset p of a candidate entry and reports
 * TEST_FOUND, TEST_LEFT or TEST_RIGHT, or a negative error. TEST_FOUND is folded into TEST_RIGHT
 * for DIRECTION_DOWN and into TEST_LEFT otherwise. The search settles on the first item that
 * tests TEST_RIGHT; for DIRECTION_UP the item right before that position is reported instead.
 *
 * Returns 1 and fills in 'ret' (entry object), 'offset' (its offset) and 'idx' (its index in the
 * chain), each of which is optional, if a suitable item exists; 0 if there is none; or a
 * negative error. */
2166 static int generic_array_bisect(
2167 JournalFile *f,
2168 uint64_t first,
2169 uint64_t n,
2170 uint64_t needle,
2171 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2172 direction_t direction,
2173 Object **ret,
2174 uint64_t *offset,
2175 uint64_t *idx) {
2176
2177 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
2178 bool subtract_one = false;
2179 Object *o, *array = NULL;
2180 int r;
2181 ChainCacheItem *ci;
2182
2183 assert(f);
2184 assert(test_object);
2185
2186 /* Start with the first array in the chain */
2187 a = first;
2188
2189 ci = ordered_hashmap_get(f->chain_cache, &first);
2190 if (ci && n > ci->total) {
2191 /* Ah, we have iterated this bisection array chain
2192 * previously! Let's see if we can skip ahead in the
2193 * chain, as far as the last time. But we can't jump
2194 * backwards in the chain, so let's check that
2195 * first. */
2196
2197 r = test_object(f, ci->begin, needle);
2198 if (r < 0)
2199 return r;
2200
2201 if (r == TEST_LEFT) {
2202 /* OK, what we are looking for is right of the
2203 * begin of this EntryArray, so let's jump
2204 * straight to previously cached array in the
2205 * chain */
2206
2207 a = ci->array;
2208 n -= ci->total;
2209 t = ci->total;
2210 last_index = ci->last_index;
2211 }
2212 }
2213
/* Walk the chain array by array: 'a' is the offset of the current array, 't' the number of
 * items in the arrays already passed and 'n' the number of items still to be considered. */
2214 while (a > 0) {
2215 uint64_t left, right, k, lp;
2216
2217 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
2218 if (r < 0)
2219 return r;
2220
/* Only the first 'right' items of this array are in scope */
2221 k = journal_file_entry_array_n_items(array);
2222 right = MIN(k, n);
2223 if (right <= 0)
2224 return 0;
2225
/* Probe the last in-scope item first: its result tells whether the position we look for is
 * inside this array or further down the chain. 'lp' keeps its offset for last_p below. */
2226 i = right - 1;
2227 lp = p = le64toh(array->entry_array.items[i]);
2228 if (p <= 0)
2229 r = -EBADMSG;
2230 else
2231 r = test_object(f, p, needle);
2232 if (r == -EBADMSG) {
/* Drop the bad item and everything after it from consideration, then retry the same array */
2233 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2234 n = i;
2235 continue;
2236 }
2237 if (r < 0)
2238 return r;
2239
2240 if (r == TEST_FOUND)
2241 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2242
2243 if (r == TEST_RIGHT) {
/* The last in-scope item tested TEST_RIGHT, hence the position we look for is in this
 * array. Narrow it down between 'left' and 'right', where 'right' now is the index of
 * that last item. */
2244 left = 0;
2245 right -= 1;
2246
2247 if (last_index != (uint64_t) -1) {
2248 assert(last_index <= right);
2249
2250 /* If we cached the last index we
2251 * looked at, let's try to not to jump
2252 * too wildly around and see if we can
2253 * limit the range to look at early to
2254 * the immediate neighbors of the last
2255 * index we looked at. */
2256
2257 if (last_index > 0) {
2258 uint64_t x = last_index - 1;
2259
2260 p = le64toh(array->entry_array.items[x]);
2261 if (p <= 0)
2262 return -EBADMSG;
2263
2264 r = test_object(f, p, needle);
2265 if (r < 0)
2266 return r;
2267
2268 if (r == TEST_FOUND)
2269 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2270
2271 if (r == TEST_RIGHT)
2272 right = x;
2273 else
2274 left = x + 1;
2275 }
2276
2277 if (last_index < right) {
2278 uint64_t y = last_index + 1;
2279
2280 p = le64toh(array->entry_array.items[y]);
2281 if (p <= 0)
2282 return -EBADMSG;
2283
2284 r = test_object(f, p, needle);
2285 if (r < 0)
2286 return r;
2287
2288 if (r == TEST_FOUND)
2289 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2290
2291 if (r == TEST_RIGHT)
2292 right = y;
2293 else
2294 left = y + 1;
2295 }
2296 }
2297
/* Plain bisection: items that test TEST_RIGHT move 'right' down to them, all others move
 * 'left' past them. */
2298 for (;;) {
2299 if (left == right) {
/* The interval collapsed, 'left' is the position settled on. For DIRECTION_UP the
 * item before it is the one to report. */
2300 if (direction == DIRECTION_UP)
2301 subtract_one = true;
2302
2303 i = left;
2304 goto found;
2305 }
2306
2307 assert(left < right);
2308 i = (left + right) / 2;
2309
2310 p = le64toh(array->entry_array.items[i]);
2311 if (p <= 0)
2312 r = -EBADMSG;
2313 else
2314 r = test_object(f, p, needle);
2315 if (r == -EBADMSG) {
/* Same recovery as above: cut off the bad item and everything after it, here also
 * clamping the interval being bisected. */
2316 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2317 right = n = i;
2318 continue;
2319 }
2320 if (r < 0)
2321 return r;
2322
2323 if (r == TEST_FOUND)
2324 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2325
2326 if (r == TEST_RIGHT)
2327 right = i;
2328 else
2329 left = i + 1;
2330 }
2331 }
2332
/* The last in-scope item did not test TEST_RIGHT. If this array covered all remaining items
 * then no item tests TEST_RIGHT at all: for DIRECTION_UP report the last in-scope item
 * (index n-1, expressed as i = n with subtract_one), otherwise there is no result. */
2333 if (k >= n) {
2334 if (direction == DIRECTION_UP) {
2335 i = n;
2336 subtract_one = true;
2337 goto found;
2338 }
2339
2340 return 0;
2341 }
2342
/* Remember the last item of this array: it is the one to report if the search settles on
 * item 0 of the next array with subtract_one set. */
2343 last_p = lp;
2344
/* Advance to the next array; the cached last index only applied to the array we started in */
2345 n -= k;
2346 t += k;
2347 last_index = (uint64_t) -1;
2348 a = le64toh(array->entry_array.next_entry_array_offset);
2349 }
2350
2351 return 0;
2352
2353 found:
/* Stepping back from the very first item considered is not possible, hence no result */
2354 if (subtract_one && t == 0 && i == 0)
2355 return 0;
2356
2357 /* Let's cache this item for the next invocation */
2358 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
2359
/* With subtract_one and i == 0 the item to report is the last one of the previous array,
 * whose offset was remembered in last_p. */
2360 if (subtract_one && i == 0)
2361 p = last_p;
2362 else if (subtract_one)
2363 p = le64toh(array->entry_array.items[i-1]);
2364 else
2365 p = le64toh(array->entry_array.items[i]);
2366
2367 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2368 if (r < 0)
2369 return r;
2370
2371 if (ret)
2372 *ret = o;
2373
2374 if (offset)
2375 *offset = p;
2376
/* Index within the whole chain: items in the arrays passed plus the index in this array */
2377 if (idx)
2378 *idx = t + i + (subtract_one ? -1 : 0);
2379
2380 return 1;
2381 }
2382
2383 static int generic_array_bisect_plus_one(
2384 JournalFile *f,
2385 uint64_t extra,
2386 uint64_t first,
2387 uint64_t n,
2388 uint64_t needle,
2389 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2390 direction_t direction,
2391 Object **ret,
2392 uint64_t *offset,
2393 uint64_t *idx) {
2394
2395 int r;
2396 bool step_back = false;
2397 Object *o;
2398
2399 assert(f);
2400 assert(test_object);
2401
2402 if (n <= 0)
2403 return 0;
2404
2405 /* This bisects the array in object 'first', but first checks
2406 * an extra */
2407 r = test_object(f, extra, needle);
2408 if (r < 0)
2409 return r;
2410
2411 if (r == TEST_FOUND)
2412 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2413
2414 /* if we are looking with DIRECTION_UP then we need to first
2415 see if in the actual array there is a matching entry, and
2416 return the last one of that. But if there isn't any we need
2417 to return this one. Hence remember this, and return it
2418 below. */
2419 if (r == TEST_LEFT)
2420 step_back = direction == DIRECTION_UP;
2421
2422 if (r == TEST_RIGHT) {
2423 if (direction == DIRECTION_DOWN)
2424 goto found;
2425 else
2426 return 0;
2427 }
2428
2429 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2430
2431 if (r == 0 && step_back)
2432 goto found;
2433
2434 if (r > 0 && idx)
2435 (*idx)++;
2436
2437 return r;
2438
2439 found:
2440 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2441 if (r < 0)
2442 return r;
2443
2444 if (ret)
2445 *ret = o;
2446
2447 if (offset)
2448 *offset = extra;
2449
2450 if (idx)
2451 *idx = 0;
2452
2453 return 1;
2454 }
2455
2456 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2457 assert(f);
2458 assert(p > 0);
2459
2460 if (p == needle)
2461 return TEST_FOUND;
2462 else if (p < needle)
2463 return TEST_LEFT;
2464 else
2465 return TEST_RIGHT;
2466 }
2467
2468 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2469 Object *o;
2470 int r;
2471
2472 assert(f);
2473 assert(p > 0);
2474
2475 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2476 if (r < 0)
2477 return r;
2478
2479 if (le64toh(o->entry.seqnum) == needle)
2480 return TEST_FOUND;
2481 else if (le64toh(o->entry.seqnum) < needle)
2482 return TEST_LEFT;
2483 else
2484 return TEST_RIGHT;
2485 }
2486
2487 int journal_file_move_to_entry_by_seqnum(
2488 JournalFile *f,
2489 uint64_t seqnum,
2490 direction_t direction,
2491 Object **ret,
2492 uint64_t *offset) {
2493 assert(f);
2494 assert(f->header);
2495
2496 return generic_array_bisect(f,
2497 le64toh(f->header->entry_array_offset),
2498 le64toh(f->header->n_entries),
2499 seqnum,
2500 test_object_seqnum,
2501 direction,
2502 ret, offset, NULL);
2503 }
2504
2505 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2506 Object *o;
2507 int r;
2508
2509 assert(f);
2510 assert(p > 0);
2511
2512 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2513 if (r < 0)
2514 return r;
2515
2516 if (le64toh(o->entry.realtime) == needle)
2517 return TEST_FOUND;
2518 else if (le64toh(o->entry.realtime) < needle)
2519 return TEST_LEFT;
2520 else
2521 return TEST_RIGHT;
2522 }
2523
2524 int journal_file_move_to_entry_by_realtime(
2525 JournalFile *f,
2526 uint64_t realtime,
2527 direction_t direction,
2528 Object **ret,
2529 uint64_t *offset) {
2530 assert(f);
2531 assert(f->header);
2532
2533 return generic_array_bisect(f,
2534 le64toh(f->header->entry_array_offset),
2535 le64toh(f->header->n_entries),
2536 realtime,
2537 test_object_realtime,
2538 direction,
2539 ret, offset, NULL);
2540 }
2541
2542 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2543 Object *o;
2544 int r;
2545
2546 assert(f);
2547 assert(p > 0);
2548
2549 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2550 if (r < 0)
2551 return r;
2552
2553 if (le64toh(o->entry.monotonic) == needle)
2554 return TEST_FOUND;
2555 else if (le64toh(o->entry.monotonic) < needle)
2556 return TEST_LEFT;
2557 else
2558 return TEST_RIGHT;
2559 }
2560
2561 static int find_data_object_by_boot_id(
2562 JournalFile *f,
2563 sd_id128_t boot_id,
2564 Object **o,
2565 uint64_t *b) {
2566
2567 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2568
2569 sd_id128_to_string(boot_id, t + 9);
2570 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2571 }
2572
2573 int journal_file_move_to_entry_by_monotonic(
2574 JournalFile *f,
2575 sd_id128_t boot_id,
2576 uint64_t monotonic,
2577 direction_t direction,
2578 Object **ret,
2579 uint64_t *offset) {
2580
2581 Object *o;
2582 int r;
2583
2584 assert(f);
2585
2586 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2587 if (r < 0)
2588 return r;
2589 if (r == 0)
2590 return -ENOENT;
2591
2592 return generic_array_bisect_plus_one(f,
2593 le64toh(o->data.entry_offset),
2594 le64toh(o->data.entry_array_offset),
2595 le64toh(o->data.n_entries),
2596 monotonic,
2597 test_object_monotonic,
2598 direction,
2599 ret, offset, NULL);
2600 }
2601
2602 void journal_file_reset_location(JournalFile *f) {
2603 f->location_type = LOCATION_HEAD;
2604 f->current_offset = 0;
2605 f->current_seqnum = 0;
2606 f->current_realtime = 0;
2607 f->current_monotonic = 0;
2608 zero(f->current_boot_id);
2609 f->current_xor_hash = 0;
2610 }
2611
2612 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2613 f->location_type = LOCATION_SEEK;
2614 f->current_offset = offset;
2615 f->current_seqnum = le64toh(o->entry.seqnum);
2616 f->current_realtime = le64toh(o->entry.realtime);
2617 f->current_monotonic = le64toh(o->entry.monotonic);
2618 f->current_boot_id = o->entry.boot_id;
2619 f->current_xor_hash = le64toh(o->entry.xor_hash);
2620 }
2621
2622 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2623 assert(af);
2624 assert(af->header);
2625 assert(bf);
2626 assert(bf->header);
2627 assert(af->location_type == LOCATION_SEEK);
2628 assert(bf->location_type == LOCATION_SEEK);
2629
2630 /* If contents and timestamps match, these entries are
2631 * identical, even if the seqnum does not match */
2632 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2633 af->current_monotonic == bf->current_monotonic &&
2634 af->current_realtime == bf->current_realtime &&
2635 af->current_xor_hash == bf->current_xor_hash)
2636 return 0;
2637
2638 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2639
2640 /* If this is from the same seqnum source, compare
2641 * seqnums */
2642 if (af->current_seqnum < bf->current_seqnum)
2643 return -1;
2644 if (af->current_seqnum > bf->current_seqnum)
2645 return 1;
2646
2647 /* Wow! This is weird, different data but the same
2648 * seqnums? Something is borked, but let's make the
2649 * best of it and compare by time. */
2650 }
2651
2652 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2653
2654 /* If the boot id matches, compare monotonic time */
2655 if (af->current_monotonic < bf->current_monotonic)
2656 return -1;
2657 if (af->current_monotonic > bf->current_monotonic)
2658 return 1;
2659 }
2660
2661 /* Otherwise, compare UTC time */
2662 if (af->current_realtime < bf->current_realtime)
2663 return -1;
2664 if (af->current_realtime > bf->current_realtime)
2665 return 1;
2666
2667 /* Finally, compare by contents */
2668 if (af->current_xor_hash < bf->current_xor_hash)
2669 return -1;
2670 if (af->current_xor_hash > bf->current_xor_hash)
2671 return 1;
2672
2673 return 0;
2674 }
2675
2676 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2677
2678 /* Increase or decrease the specified index, in the right direction. */
2679
2680 if (direction == DIRECTION_DOWN) {
2681 if (*i >= n - 1)
2682 return 0;
2683
2684 (*i) ++;
2685 } else {
2686 if (*i <= 0)
2687 return 0;
2688
2689 (*i) --;
2690 }
2691
2692 return 1;
2693 }
2694
2695 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2696
2697 /* Consider it an error if any of the two offsets is uninitialized */
2698 if (old_offset == 0 || new_offset == 0)
2699 return false;
2700
2701 /* If we go down, the new offset must be larger than the old one. */
2702 return direction == DIRECTION_DOWN ?
2703 new_offset > old_offset :
2704 new_offset < old_offset;
2705 }
2706
2707 int journal_file_next_entry(
2708 JournalFile *f,
2709 uint64_t p,
2710 direction_t direction,
2711 Object **ret, uint64_t *offset) {
2712
2713 uint64_t i, n, ofs;
2714 int r;
2715
2716 assert(f);
2717 assert(f->header);
2718
2719 n = le64toh(f->header->n_entries);
2720 if (n <= 0)
2721 return 0;
2722
2723 if (p == 0)
2724 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2725 else {
2726 r = generic_array_bisect(f,
2727 le64toh(f->header->entry_array_offset),
2728 le64toh(f->header->n_entries),
2729 p,
2730 test_object_offset,
2731 DIRECTION_DOWN,
2732 NULL, NULL,
2733 &i);
2734 if (r <= 0)
2735 return r;
2736
2737 r = bump_array_index(&i, direction, n);
2738 if (r <= 0)
2739 return r;
2740 }
2741
2742 /* And jump to it */
2743 for (;;) {
2744 r = generic_array_get(f,
2745 le64toh(f->header->entry_array_offset),
2746 i,
2747 ret, &ofs);
2748 if (r > 0)
2749 break;
2750 if (r != -EBADMSG)
2751 return r;
2752
2753 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2754 * the next one might work for us instead. */
2755 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2756
2757 r = bump_array_index(&i, direction, n);
2758 if (r <= 0)
2759 return r;
2760 }
2761
2762 /* Ensure our array is properly ordered. */
2763 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2764 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
2765 return -EBADMSG;
2766 }
2767
2768 if (offset)
2769 *offset = ofs;
2770
2771 return 1;
2772 }
2773
2774 int journal_file_next_entry_for_data(
2775 JournalFile *f,
2776 Object *o, uint64_t p,
2777 uint64_t data_offset,
2778 direction_t direction,
2779 Object **ret, uint64_t *offset) {
2780
2781 uint64_t i, n, ofs;
2782 Object *d;
2783 int r;
2784
2785 assert(f);
2786 assert(p > 0 || !o);
2787
2788 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2789 if (r < 0)
2790 return r;
2791
2792 n = le64toh(d->data.n_entries);
2793 if (n <= 0)
2794 return n;
2795
2796 if (!o)
2797 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2798 else {
2799 if (o->object.type != OBJECT_ENTRY)
2800 return -EINVAL;
2801
2802 r = generic_array_bisect_plus_one(f,
2803 le64toh(d->data.entry_offset),
2804 le64toh(d->data.entry_array_offset),
2805 le64toh(d->data.n_entries),
2806 p,
2807 test_object_offset,
2808 DIRECTION_DOWN,
2809 NULL, NULL,
2810 &i);
2811
2812 if (r <= 0)
2813 return r;
2814
2815 r = bump_array_index(&i, direction, n);
2816 if (r <= 0)
2817 return r;
2818 }
2819
2820 for (;;) {
2821 r = generic_array_get_plus_one(f,
2822 le64toh(d->data.entry_offset),
2823 le64toh(d->data.entry_array_offset),
2824 i,
2825 ret, &ofs);
2826 if (r > 0)
2827 break;
2828 if (r != -EBADMSG)
2829 return r;
2830
2831 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2832
2833 r = bump_array_index(&i, direction, n);
2834 if (r <= 0)
2835 return r;
2836 }
2837
2838 /* Ensure our array is properly ordered. */
2839 if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2840 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2841 return -EBADMSG;
2842 }
2843
2844 if (offset)
2845 *offset = ofs;
2846
2847 return 1;
2848 }
2849
2850 int journal_file_move_to_entry_by_offset_for_data(
2851 JournalFile *f,
2852 uint64_t data_offset,
2853 uint64_t p,
2854 direction_t direction,
2855 Object **ret, uint64_t *offset) {
2856
2857 int r;
2858 Object *d;
2859
2860 assert(f);
2861
2862 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2863 if (r < 0)
2864 return r;
2865
2866 return generic_array_bisect_plus_one(f,
2867 le64toh(d->data.entry_offset),
2868 le64toh(d->data.entry_array_offset),
2869 le64toh(d->data.n_entries),
2870 p,
2871 test_object_offset,
2872 direction,
2873 ret, offset, NULL);
2874 }
2875
2876 int journal_file_move_to_entry_by_monotonic_for_data(
2877 JournalFile *f,
2878 uint64_t data_offset,
2879 sd_id128_t boot_id,
2880 uint64_t monotonic,
2881 direction_t direction,
2882 Object **ret, uint64_t *offset) {
2883
2884 Object *o, *d;
2885 int r;
2886 uint64_t b, z;
2887
2888 assert(f);
2889
2890 /* First, seek by time */
2891 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2892 if (r < 0)
2893 return r;
2894 if (r == 0)
2895 return -ENOENT;
2896
2897 r = generic_array_bisect_plus_one(f,
2898 le64toh(o->data.entry_offset),
2899 le64toh(o->data.entry_array_offset),
2900 le64toh(o->data.n_entries),
2901 monotonic,
2902 test_object_monotonic,
2903 direction,
2904 NULL, &z, NULL);
2905 if (r <= 0)
2906 return r;
2907
2908 /* And now, continue seeking until we find an entry that
2909 * exists in both bisection arrays */
2910
2911 for (;;) {
2912 Object *qo;
2913 uint64_t p, q;
2914
2915 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2916 if (r < 0)
2917 return r;
2918
2919 r = generic_array_bisect_plus_one(f,
2920 le64toh(d->data.entry_offset),
2921 le64toh(d->data.entry_array_offset),
2922 le64toh(d->data.n_entries),
2923 z,
2924 test_object_offset,
2925 direction,
2926 NULL, &p, NULL);
2927 if (r <= 0)
2928 return r;
2929
2930 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2931 if (r < 0)
2932 return r;
2933
2934 r = generic_array_bisect_plus_one(f,
2935 le64toh(o->data.entry_offset),
2936 le64toh(o->data.entry_array_offset),
2937 le64toh(o->data.n_entries),
2938 p,
2939 test_object_offset,
2940 direction,
2941 &qo, &q, NULL);
2942
2943 if (r <= 0)
2944 return r;
2945
2946 if (p == q) {
2947 if (ret)
2948 *ret = qo;
2949 if (offset)
2950 *offset = q;
2951
2952 return 1;
2953 }
2954
2955 z = q;
2956 }
2957 }
2958
2959 int journal_file_move_to_entry_by_seqnum_for_data(
2960 JournalFile *f,
2961 uint64_t data_offset,
2962 uint64_t seqnum,
2963 direction_t direction,
2964 Object **ret, uint64_t *offset) {
2965
2966 Object *d;
2967 int r;
2968
2969 assert(f);
2970
2971 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2972 if (r < 0)
2973 return r;
2974
2975 return generic_array_bisect_plus_one(f,
2976 le64toh(d->data.entry_offset),
2977 le64toh(d->data.entry_array_offset),
2978 le64toh(d->data.n_entries),
2979 seqnum,
2980 test_object_seqnum,
2981 direction,
2982 ret, offset, NULL);
2983 }
2984
2985 int journal_file_move_to_entry_by_realtime_for_data(
2986 JournalFile *f,
2987 uint64_t data_offset,
2988 uint64_t realtime,
2989 direction_t direction,
2990 Object **ret, uint64_t *offset) {
2991
2992 Object *d;
2993 int r;
2994
2995 assert(f);
2996
2997 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2998 if (r < 0)
2999 return r;
3000
3001 return generic_array_bisect_plus_one(f,
3002 le64toh(d->data.entry_offset),
3003 le64toh(d->data.entry_array_offset),
3004 le64toh(d->data.n_entries),
3005 realtime,
3006 test_object_realtime,
3007 direction,
3008 ret, offset, NULL);
3009 }
3010
3011 void journal_file_dump(JournalFile *f) {
3012 Object *o;
3013 int r;
3014 uint64_t p;
3015
3016 assert(f);
3017 assert(f->header);
3018
3019 journal_file_print_header(f);
3020
3021 p = le64toh(f->header->header_size);
3022 while (p != 0) {
3023 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
3024 if (r < 0)
3025 goto fail;
3026
3027 switch (o->object.type) {
3028
3029 case OBJECT_UNUSED:
3030 printf("Type: OBJECT_UNUSED\n");
3031 break;
3032
3033 case OBJECT_DATA:
3034 printf("Type: OBJECT_DATA\n");
3035 break;
3036
3037 case OBJECT_FIELD:
3038 printf("Type: OBJECT_FIELD\n");
3039 break;
3040
3041 case OBJECT_ENTRY:
3042 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3043 le64toh(o->entry.seqnum),
3044 le64toh(o->entry.monotonic),
3045 le64toh(o->entry.realtime));
3046 break;
3047
3048 case OBJECT_FIELD_HASH_TABLE:
3049 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3050 break;
3051
3052 case OBJECT_DATA_HASH_TABLE:
3053 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3054 break;
3055
3056 case OBJECT_ENTRY_ARRAY:
3057 printf("Type: OBJECT_ENTRY_ARRAY\n");
3058 break;
3059
3060 case OBJECT_TAG:
3061 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3062 le64toh(o->tag.seqnum),
3063 le64toh(o->tag.epoch));
3064 break;
3065
3066 default:
3067 printf("Type: unknown (%i)\n", o->object.type);
3068 break;
3069 }
3070
3071 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3072 printf("Flags: %s\n",
3073 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3074
3075 if (p == le64toh(f->header->tail_object_offset))
3076 p = 0;
3077 else
3078 p = p + ALIGN64(le64toh(o->object.size));
3079 }
3080
3081 return;
3082 fail:
3083 log_error("File corrupt");
3084 }
3085
3086 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3087 const char *x;
3088
3089 x = format_timestamp(buf, l, t);
3090 if (x)
3091 return x;
3092 return " --- ";
3093 }
3094
3095 void journal_file_print_header(JournalFile *f) {
3096 char a[33], b[33], c[33], d[33];
3097 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
3098 struct stat st;
3099 char bytes[FORMAT_BYTES_MAX];
3100
3101 assert(f);
3102 assert(f->header);
3103
3104 printf("File Path: %s\n"
3105 "File ID: %s\n"
3106 "Machine ID: %s\n"
3107 "Boot ID: %s\n"
3108 "Sequential Number ID: %s\n"
3109 "State: %s\n"
3110 "Compatible Flags:%s%s\n"
3111 "Incompatible Flags:%s%s%s\n"
3112 "Header size: %"PRIu64"\n"
3113 "Arena size: %"PRIu64"\n"
3114 "Data Hash Table Size: %"PRIu64"\n"
3115 "Field Hash Table Size: %"PRIu64"\n"
3116 "Rotate Suggested: %s\n"
3117 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3118 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3119 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3120 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3121 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
3122 "Objects: %"PRIu64"\n"
3123 "Entry Objects: %"PRIu64"\n",
3124 f->path,
3125 sd_id128_to_string(f->header->file_id, a),
3126 sd_id128_to_string(f->header->machine_id, b),
3127 sd_id128_to_string(f->header->boot_id, c),
3128 sd_id128_to_string(f->header->seqnum_id, d),
3129 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3130 f->header->state == STATE_ONLINE ? "ONLINE" :
3131 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3132 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3133 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3134 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3135 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3136 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3137 le64toh(f->header->header_size),
3138 le64toh(f->header->arena_size),
3139 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3140 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3141 yes_no(journal_file_rotate_suggested(f, 0)),
3142 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3143 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3144 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3145 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3146 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3147 le64toh(f->header->n_objects),
3148 le64toh(f->header->n_entries));
3149
3150 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3151 printf("Data Objects: %"PRIu64"\n"
3152 "Data Hash Table Fill: %.1f%%\n",
3153 le64toh(f->header->n_data),
3154 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3155
3156 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3157 printf("Field Objects: %"PRIu64"\n"
3158 "Field Hash Table Fill: %.1f%%\n",
3159 le64toh(f->header->n_fields),
3160 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3161
3162 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3163 printf("Tag Objects: %"PRIu64"\n",
3164 le64toh(f->header->n_tags));
3165 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3166 printf("Entry Array Objects: %"PRIu64"\n",
3167 le64toh(f->header->n_entry_arrays));
3168
3169 if (fstat(f->fd, &st) >= 0)
3170 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
3171 }
3172
3173 static int journal_file_warn_btrfs(JournalFile *f) {
3174 unsigned attrs;
3175 int r;
3176
3177 assert(f);
3178
3179 /* Before we write anything, check if the COW logic is turned
3180 * off on btrfs. Given our write pattern that is quite
3181 * unfriendly to COW file systems this should greatly improve
3182 * performance on COW file systems, such as btrfs, at the
3183 * expense of data integrity features (which shouldn't be too
3184 * bad, given that we do our own checksumming). */
3185
3186 r = btrfs_is_filesystem(f->fd);
3187 if (r < 0)
3188 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3189 if (!r)
3190 return 0;
3191
3192 r = read_attr_fd(f->fd, &attrs);
3193 if (r < 0)
3194 return log_warning_errno(r, "Failed to read file attributes: %m");
3195
3196 if (attrs & FS_NOCOW_FL) {
3197 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3198 return 0;
3199 }
3200
3201 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3202 "This is likely to slow down journal access substantially, please consider turning "
3203 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3204
3205 return 1;
3206 }
3207
3208 int journal_file_open(
3209 int fd,
3210 const char *fname,
3211 int flags,
3212 mode_t mode,
3213 bool compress,
3214 bool seal,
3215 JournalMetrics *metrics,
3216 MMapCache *mmap_cache,
3217 Set *deferred_closes,
3218 JournalFile *template,
3219 JournalFile **ret) {
3220
3221 bool newly_created = false;
3222 JournalFile *f;
3223 void *h;
3224 int r;
3225
3226 assert(ret);
3227 assert(fd >= 0 || fname);
3228
3229 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
3230 return -EINVAL;
3231
3232 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3233 return -EINVAL;
3234
3235 f = new0(JournalFile, 1);
3236 if (!f)
3237 return -ENOMEM;
3238
3239 f->fd = fd;
3240 f->mode = mode;
3241
3242 f->flags = flags;
3243 f->prot = prot_from_flags(flags);
3244 f->writable = (flags & O_ACCMODE) != O_RDONLY;
3245 #if HAVE_LZ4
3246 f->compress_lz4 = compress;
3247 #elif HAVE_XZ
3248 f->compress_xz = compress;
3249 #endif
3250 #if HAVE_GCRYPT
3251 f->seal = seal;
3252 #endif
3253
3254 if (mmap_cache)
3255 f->mmap = mmap_cache_ref(mmap_cache);
3256 else {
3257 f->mmap = mmap_cache_new();
3258 if (!f->mmap) {
3259 r = -ENOMEM;
3260 goto fail;
3261 }
3262 }
3263
3264 if (fname) {
3265 f->path = strdup(fname);
3266 if (!f->path) {
3267 r = -ENOMEM;
3268 goto fail;
3269 }
3270 } else {
3271 assert(fd >= 0);
3272
3273 /* If we don't know the path, fill in something explanatory and vaguely useful */
3274 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3275 r = -ENOMEM;
3276 goto fail;
3277 }
3278 }
3279
3280 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3281 if (!f->chain_cache) {
3282 r = -ENOMEM;
3283 goto fail;
3284 }
3285
3286 if (f->fd < 0) {
3287 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3288 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3289 * it doesn't hurt in that case. */
3290
3291 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
3292 if (f->fd < 0) {
3293 r = -errno;
3294 goto fail;
3295 }
3296
3297 /* fds we opened here by us should also be closed by us. */
3298 f->close_fd = true;
3299
3300 r = fd_nonblock(f->fd, false);
3301 if (r < 0)
3302 goto fail;
3303 }
3304
3305 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3306 if (!f->cache_fd) {
3307 r = -ENOMEM;
3308 goto fail;
3309 }
3310
3311 r = journal_file_fstat(f);
3312 if (r < 0)
3313 goto fail;
3314
3315 if (f->last_stat.st_size == 0 && f->writable) {
3316
3317 (void) journal_file_warn_btrfs(f);
3318
3319 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3320 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3321 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3322 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3323 * solely on mtime/atime/ctime of the file. */
3324 (void) fd_setcrtime(f->fd, 0);
3325
3326 #if HAVE_GCRYPT
3327 /* Try to load the FSPRG state, and if we can't, then
3328 * just don't do sealing */
3329 if (f->seal) {
3330 r = journal_file_fss_load(f);
3331 if (r < 0)
3332 f->seal = false;
3333 }
3334 #endif
3335
3336 r = journal_file_init_header(f, template);
3337 if (r < 0)
3338 goto fail;
3339
3340 r = journal_file_fstat(f);
3341 if (r < 0)
3342 goto fail;
3343
3344 newly_created = true;
3345 }
3346
3347 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3348 r = -ENODATA;
3349 goto fail;
3350 }
3351
3352 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
3353 if (r < 0)
3354 goto fail;
3355
3356 f->header = h;
3357
3358 if (!newly_created) {
3359 set_clear_with_destructor(deferred_closes, journal_file_close);
3360
3361 r = journal_file_verify_header(f);
3362 if (r < 0)
3363 goto fail;
3364 }
3365
3366 #if HAVE_GCRYPT
3367 if (!newly_created && f->writable) {
3368 r = journal_file_fss_load(f);
3369 if (r < 0)
3370 goto fail;
3371 }
3372 #endif
3373
3374 if (f->writable) {
3375 if (metrics) {
3376 journal_default_metrics(metrics, f->fd);
3377 f->metrics = *metrics;
3378 } else if (template)
3379 f->metrics = template->metrics;
3380
3381 r = journal_file_refresh_header(f);
3382 if (r < 0)
3383 goto fail;
3384 }
3385
3386 #if HAVE_GCRYPT
3387 r = journal_file_hmac_setup(f);
3388 if (r < 0)
3389 goto fail;
3390 #endif
3391
3392 if (newly_created) {
3393 r = journal_file_setup_field_hash_table(f);
3394 if (r < 0)
3395 goto fail;
3396
3397 r = journal_file_setup_data_hash_table(f);
3398 if (r < 0)
3399 goto fail;
3400
3401 #if HAVE_GCRYPT
3402 r = journal_file_append_first_tag(f);
3403 if (r < 0)
3404 goto fail;
3405 #endif
3406 }
3407
3408 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
3409 r = -EIO;
3410 goto fail;
3411 }
3412
3413 if (template && template->post_change_timer) {
3414 r = journal_file_enable_post_change_timer(
3415 f,
3416 sd_event_source_get_event(template->post_change_timer),
3417 template->post_change_timer_period);
3418
3419 if (r < 0)
3420 goto fail;
3421 }
3422
3423 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3424 f->close_fd = true;
3425
3426 *ret = f;
3427 return 0;
3428
3429 fail:
3430 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
3431 r = -EIO;
3432
3433 (void) journal_file_close(f);
3434
3435 return r;
3436 }
3437
3438 int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
3439 _cleanup_free_ char *p = NULL;
3440 size_t l;
3441 JournalFile *old_file, *new_file = NULL;
3442 int r;
3443
3444 assert(f);
3445 assert(*f);
3446
3447 old_file = *f;
3448
3449 if (!old_file->writable)
3450 return -EINVAL;
3451
3452 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3453 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3454 if (path_startswith(old_file->path, "/proc/self/fd"))
3455 return -EINVAL;
3456
3457 if (!endswith(old_file->path, ".journal"))
3458 return -EINVAL;
3459
3460 l = strlen(old_file->path);
3461 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3462 (int) l - 8, old_file->path,
3463 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3464 le64toh((*f)->header->head_entry_seqnum),
3465 le64toh((*f)->header->head_entry_realtime));
3466 if (r < 0)
3467 return -ENOMEM;
3468
3469 /* Try to rename the file to the archived version. If the file
3470 * already was deleted, we'll get ENOENT, let's ignore that
3471 * case. */
3472 r = rename(old_file->path, p);
3473 if (r < 0 && errno != ENOENT)
3474 return -errno;
3475
3476 /* Sync the rename to disk */
3477 (void) fsync_directory_of_file(old_file->fd);
3478
3479 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3480 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3481 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3482 * would result in the rotated journal never getting fsync() called before closing.
3483 * Now we simply queue the archive state by setting an archive bit, leaving the state
3484 * as STATE_ONLINE so proper offlining occurs. */
3485 old_file->archive = true;
3486
3487 /* Currently, btrfs is not very good with out write patterns
3488 * and fragments heavily. Let's defrag our journal files when
3489 * we archive them */
3490 old_file->defrag_on_close = true;
3491
3492 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
3493
3494 if (deferred_closes &&
3495 set_put(deferred_closes, old_file) >= 0)
3496 (void) journal_file_set_offline(old_file, false);
3497 else
3498 (void) journal_file_close(old_file);
3499
3500 *f = new_file;
3501 return r;
3502 }
3503
3504 int journal_file_open_reliably(
3505 const char *fname,
3506 int flags,
3507 mode_t mode,
3508 bool compress,
3509 bool seal,
3510 JournalMetrics *metrics,
3511 MMapCache *mmap_cache,
3512 Set *deferred_closes,
3513 JournalFile *template,
3514 JournalFile **ret) {
3515
3516 int r;
3517 size_t l;
3518 _cleanup_free_ char *p = NULL;
3519
3520 r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
3521 if (!IN_SET(r,
3522 -EBADMSG, /* Corrupted */
3523 -ENODATA, /* Truncated */
3524 -EHOSTDOWN, /* Other machine */
3525 -EPROTONOSUPPORT, /* Incompatible feature */
3526 -EBUSY, /* Unclean shutdown */
3527 -ESHUTDOWN, /* Already archived */
3528 -EIO, /* IO error, including SIGBUS on mmap */
3529 -EIDRM, /* File has been deleted */
3530 -ETXTBSY)) /* File is from the future */
3531 return r;
3532
3533 if ((flags & O_ACCMODE) == O_RDONLY)
3534 return r;
3535
3536 if (!(flags & O_CREAT))
3537 return r;
3538
3539 if (!endswith(fname, ".journal"))
3540 return r;
3541
3542 /* The file is corrupted. Rotate it away and try it again (but only once) */
3543
3544 l = strlen(fname);
3545 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
3546 (int) l - 8, fname,
3547 now(CLOCK_REALTIME),
3548 random_u64()) < 0)
3549 return -ENOMEM;
3550
3551 if (rename(fname, p) < 0)
3552 return -errno;
3553
3554 /* btrfs doesn't cope well with our write pattern and
3555 * fragments heavily. Let's defrag all files we rotate */
3556
3557 (void) chattr_path(p, 0, FS_NOCOW_FL);
3558 (void) btrfs_defrag(p);
3559
3560 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3561
3562 return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
3563 }
3564
3565 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3566 uint64_t i, n;
3567 uint64_t q, xor_hash = 0;
3568 int r;
3569 EntryItem *items;
3570 dual_timestamp ts;
3571
3572 assert(from);
3573 assert(to);
3574 assert(o);
3575 assert(p);
3576
3577 if (!to->writable)
3578 return -EPERM;
3579
3580 ts.monotonic = le64toh(o->entry.monotonic);
3581 ts.realtime = le64toh(o->entry.realtime);
3582
3583 n = journal_file_entry_n_items(o);
3584 /* alloca() can't take 0, hence let's allocate at least one */
3585 items = alloca(sizeof(EntryItem) * MAX(1u, n));
3586
3587 for (i = 0; i < n; i++) {
3588 uint64_t l, h;
3589 le64_t le_hash;
3590 size_t t;
3591 void *data;
3592 Object *u;
3593
3594 q = le64toh(o->entry.items[i].object_offset);
3595 le_hash = o->entry.items[i].hash;
3596
3597 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3598 if (r < 0)
3599 return r;
3600
3601 if (le_hash != o->data.hash)
3602 return -EBADMSG;
3603
3604 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3605 t = (size_t) l;
3606
3607 /* We hit the limit on 32bit machines */
3608 if ((uint64_t) t != l)
3609 return -E2BIG;
3610
3611 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3612 #if HAVE_XZ || HAVE_LZ4
3613 size_t rsize = 0;
3614
3615 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3616 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3617 if (r < 0)
3618 return r;
3619
3620 data = from->compress_buffer;
3621 l = rsize;
3622 #else
3623 return -EPROTONOSUPPORT;
3624 #endif
3625 } else
3626 data = o->data.payload;
3627
3628 r = journal_file_append_data(to, data, l, &u, &h);
3629 if (r < 0)
3630 return r;
3631
3632 xor_hash ^= le64toh(u->data.hash);
3633 items[i].object_offset = htole64(h);
3634 items[i].hash = u->data.hash;
3635
3636 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3637 if (r < 0)
3638 return r;
3639 }
3640
3641 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3642
3643 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3644 return -EIO;
3645
3646 return r;
3647 }
3648
3649 void journal_reset_metrics(JournalMetrics *m) {
3650 assert(m);
3651
3652 /* Set everything to "pick automatic values". */
3653
3654 *m = (JournalMetrics) {
3655 .min_use = (uint64_t) -1,
3656 .max_use = (uint64_t) -1,
3657 .min_size = (uint64_t) -1,
3658 .max_size = (uint64_t) -1,
3659 .keep_free = (uint64_t) -1,
3660 .n_max_files = (uint64_t) -1,
3661 };
3662 }
3663
3664 void journal_default_metrics(JournalMetrics *m, int fd) {
3665 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3666 struct statvfs ss;
3667 uint64_t fs_size;
3668
3669 assert(m);
3670 assert(fd >= 0);
3671
3672 if (fstatvfs(fd, &ss) >= 0)
3673 fs_size = ss.f_frsize * ss.f_blocks;
3674 else {
3675 log_debug_errno(errno, "Failed to determine disk size: %m");
3676 fs_size = 0;
3677 }
3678
3679 if (m->max_use == (uint64_t) -1) {
3680
3681 if (fs_size > 0) {
3682 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3683
3684 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3685 m->max_use = DEFAULT_MAX_USE_UPPER;
3686
3687 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3688 m->max_use = DEFAULT_MAX_USE_LOWER;
3689 } else
3690 m->max_use = DEFAULT_MAX_USE_LOWER;
3691 } else {
3692 m->max_use = PAGE_ALIGN(m->max_use);
3693
3694 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3695 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3696 }
3697
3698 if (m->min_use == (uint64_t) -1)
3699 m->min_use = DEFAULT_MIN_USE;
3700
3701 if (m->min_use > m->max_use)
3702 m->min_use = m->max_use;
3703
3704 if (m->max_size == (uint64_t) -1) {
3705 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3706
3707 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3708 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3709 } else
3710 m->max_size = PAGE_ALIGN(m->max_size);
3711
3712 if (m->max_size != 0) {
3713 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3714 m->max_size = JOURNAL_FILE_SIZE_MIN;
3715
3716 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3717 m->max_use = m->max_size*2;
3718 }
3719
3720 if (m->min_size == (uint64_t) -1)
3721 m->min_size = JOURNAL_FILE_SIZE_MIN;
3722 else {
3723 m->min_size = PAGE_ALIGN(m->min_size);
3724
3725 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3726 m->min_size = JOURNAL_FILE_SIZE_MIN;
3727
3728 if (m->max_size != 0 && m->min_size > m->max_size)
3729 m->max_size = m->min_size;
3730 }
3731
3732 if (m->keep_free == (uint64_t) -1) {
3733
3734 if (fs_size > 0) {
3735 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3736
3737 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3738 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3739
3740 } else
3741 m->keep_free = DEFAULT_KEEP_FREE;
3742 }
3743
3744 if (m->n_max_files == (uint64_t) -1)
3745 m->n_max_files = DEFAULT_N_MAX_FILES;
3746
3747 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3748 format_bytes(a, sizeof(a), m->min_use),
3749 format_bytes(b, sizeof(b), m->max_use),
3750 format_bytes(c, sizeof(c), m->max_size),
3751 format_bytes(d, sizeof(d), m->min_size),
3752 format_bytes(e, sizeof(e), m->keep_free),
3753 m->n_max_files);
3754 }
3755
3756 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3757 assert(f);
3758 assert(f->header);
3759 assert(from || to);
3760
3761 if (from) {
3762 if (f->header->head_entry_realtime == 0)
3763 return -ENOENT;
3764
3765 *from = le64toh(f->header->head_entry_realtime);
3766 }
3767
3768 if (to) {
3769 if (f->header->tail_entry_realtime == 0)
3770 return -ENOENT;
3771
3772 *to = le64toh(f->header->tail_entry_realtime);
3773 }
3774
3775 return 1;
3776 }
3777
3778 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3779 Object *o;
3780 uint64_t p;
3781 int r;
3782
3783 assert(f);
3784 assert(from || to);
3785
3786 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3787 if (r <= 0)
3788 return r;
3789
3790 if (le64toh(o->data.n_entries) <= 0)
3791 return 0;
3792
3793 if (from) {
3794 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3795 if (r < 0)
3796 return r;
3797
3798 *from = le64toh(o->entry.monotonic);
3799 }
3800
3801 if (to) {
3802 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3803 if (r < 0)
3804 return r;
3805
3806 r = generic_array_get_plus_one(f,
3807 le64toh(o->data.entry_offset),
3808 le64toh(o->data.entry_array_offset),
3809 le64toh(o->data.n_entries)-1,
3810 &o, NULL);
3811 if (r <= 0)
3812 return r;
3813
3814 *to = le64toh(o->entry.monotonic);
3815 }
3816
3817 return 1;
3818 }
3819
3820 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3821 assert(f);
3822 assert(f->header);
3823
3824 /* If we gained new header fields we gained new features,
3825 * hence suggest a rotation */
3826 if (le64toh(f->header->header_size) < sizeof(Header)) {
3827 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3828 return true;
3829 }
3830
3831 /* Let's check if the hash tables grew over a certain fill
3832 * level (75%, borrowing this value from Java's hash table
3833 * implementation), and if so suggest a rotation. To calculate
3834 * the fill level we need the n_data field, which only exists
3835 * in newer versions. */
3836
3837 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3838 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3839 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3840 f->path,
3841 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3842 le64toh(f->header->n_data),
3843 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3844 (unsigned long long) f->last_stat.st_size,
3845 f->last_stat.st_size / le64toh(f->header->n_data));
3846 return true;
3847 }
3848
3849 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3850 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3851 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3852 f->path,
3853 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3854 le64toh(f->header->n_fields),
3855 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3856 return true;
3857 }
3858
3859 /* Are the data objects properly indexed by field objects? */
3860 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3861 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3862 le64toh(f->header->n_data) > 0 &&
3863 le64toh(f->header->n_fields) == 0)
3864 return true;
3865
3866 if (max_file_usec > 0) {
3867 usec_t t, h;
3868
3869 h = le64toh(f->header->head_entry_realtime);
3870 t = now(CLOCK_REALTIME);
3871
3872 if (h > 0 && t > h + max_file_usec)
3873 return true;
3874 }
3875
3876 return false;
3877 }