]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
tree-wide: make use of new STRLEN() macro everywhere (#7639)
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <linux/fs.h>
24 #include <pthread.h>
25 #include <stddef.h>
26 #include <sys/mman.h>
27 #include <sys/statvfs.h>
28 #include <sys/uio.h>
29 #include <unistd.h>
30
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
34 #include "compress.h"
35 #include "fd-util.h"
36 #include "journal-authenticate.h"
37 #include "journal-def.h"
38 #include "journal-file.h"
39 #include "lookup3.h"
40 #include "parse-util.h"
41 #include "path-util.h"
42 #include "random-util.h"
43 #include "sd-event.h"
44 #include "set.h"
45 #include "string-util.h"
46 #include "strv.h"
47 #include "xattr-util.h"
48
49 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
50 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
51
52 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
53
54 /* This is the minimum journal file size */
55 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
56
57 /* These are the lower and upper bounds if we deduce the max_use value
58 * from the file system size */
59 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
60 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61
62 /* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
63 #define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
64
65 /* This is the upper bound if we deduce max_size from max_use */
66 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
67
68 /* This is the upper bound if we deduce the keep_free value from the
69 * file system size */
70 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
71
72 /* This is the keep_free value when we can't determine the system
73 * size */
74 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
75
76 /* This is the default maximum number of journal files to keep around. */
77 #define DEFAULT_N_MAX_FILES (100)
78
79 /* n_data was the first entry we added after the initial file format design */
80 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
81
82 /* How many entries to keep in the entry array chain cache at max */
83 #define CHAIN_CACHE_MAX 20
84
85 /* How much to increase the journal file size at once each time we allocate something new. */
86 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
87
88 /* Reread fstat() of the file for detecting deletions at least this often */
89 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
90
91 /* The mmap context to use for the header: we pick the one right above the last defined object type */
92 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
93
94 #ifdef __clang__
95 # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
96 #endif
97
98 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
99 * As a result we use atomic operations on f->offline_state for inter-thread communications with
100 * journal_file_set_offline() and journal_file_set_online(). */
101 static void journal_file_set_offline_internal(JournalFile *f) {
102 assert(f);
103 assert(f->fd >= 0);
104 assert(f->header);
105
106 for (;;) {
107 switch (f->offline_state) {
108 case OFFLINE_CANCEL:
109 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
110 continue;
111 return;
112
113 case OFFLINE_AGAIN_FROM_SYNCING:
114 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
115 continue;
116 break;
117
118 case OFFLINE_AGAIN_FROM_OFFLINING:
119 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
120 continue;
121 break;
122
123 case OFFLINE_SYNCING:
124 (void) fsync(f->fd);
125
126 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
127 continue;
128
129 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
130 (void) fsync(f->fd);
131 break;
132
133 case OFFLINE_OFFLINING:
134 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
135 continue;
136 _fallthrough_;
137 case OFFLINE_DONE:
138 return;
139
140 case OFFLINE_JOINED:
141 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
142 return;
143 }
144 }
145 }
146
147 static void * journal_file_set_offline_thread(void *arg) {
148 JournalFile *f = arg;
149
150 journal_file_set_offline_internal(f);
151
152 return NULL;
153 }
154
155 static int journal_file_set_offline_thread_join(JournalFile *f) {
156 int r;
157
158 assert(f);
159
160 if (f->offline_state == OFFLINE_JOINED)
161 return 0;
162
163 r = pthread_join(f->offline_thread, NULL);
164 if (r)
165 return -r;
166
167 f->offline_state = OFFLINE_JOINED;
168
169 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
170 return -EIO;
171
172 return 0;
173 }
174
175 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
176 static bool journal_file_set_offline_try_restart(JournalFile *f) {
177 for (;;) {
178 switch (f->offline_state) {
179 case OFFLINE_AGAIN_FROM_SYNCING:
180 case OFFLINE_AGAIN_FROM_OFFLINING:
181 return true;
182
183 case OFFLINE_CANCEL:
184 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
185 continue;
186 return true;
187
188 case OFFLINE_SYNCING:
189 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
190 continue;
191 return true;
192
193 case OFFLINE_OFFLINING:
194 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
195 continue;
196 return true;
197
198 default:
199 return false;
200 }
201 }
202 }
203
204 /* Sets a journal offline.
205 *
206 * If wait is false then an offline is dispatched in a separate thread for a
207 * subsequent journal_file_set_offline() or journal_file_set_online() of the
208 * same journal to synchronize with.
209 *
210 * If wait is true, then either an existing offline thread will be restarted
211 * and joined, or if none exists the offline is simply performed in this
212 * context without involving another thread.
213 */
214 int journal_file_set_offline(JournalFile *f, bool wait) {
215 bool restarted;
216 int r;
217
218 assert(f);
219
220 if (!f->writable)
221 return -EPERM;
222
223 if (!(f->fd >= 0 && f->header))
224 return -EINVAL;
225
226 /* An offlining journal is implicitly online and may modify f->header->state,
227 * we must also join any potentially lingering offline thread when not online. */
228 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
229 return journal_file_set_offline_thread_join(f);
230
231 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
232 restarted = journal_file_set_offline_try_restart(f);
233 if ((restarted && wait) || !restarted) {
234 r = journal_file_set_offline_thread_join(f);
235 if (r < 0)
236 return r;
237 }
238
239 if (restarted)
240 return 0;
241
242 /* Initiate a new offline. */
243 f->offline_state = OFFLINE_SYNCING;
244
245 if (wait) /* Without using a thread if waiting. */
246 journal_file_set_offline_internal(f);
247 else {
248 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
249 if (r > 0) {
250 f->offline_state = OFFLINE_JOINED;
251 return -r;
252 }
253 }
254
255 return 0;
256 }
257
258 static int journal_file_set_online(JournalFile *f) {
259 bool joined = false;
260
261 assert(f);
262
263 if (!f->writable)
264 return -EPERM;
265
266 if (!(f->fd >= 0 && f->header))
267 return -EINVAL;
268
269 while (!joined) {
270 switch (f->offline_state) {
271 case OFFLINE_JOINED:
272 /* No offline thread, no need to wait. */
273 joined = true;
274 break;
275
276 case OFFLINE_SYNCING:
277 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
278 continue;
279 /* Canceled syncing prior to offlining, no need to wait. */
280 break;
281
282 case OFFLINE_AGAIN_FROM_SYNCING:
283 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
284 continue;
285 /* Canceled restart from syncing, no need to wait. */
286 break;
287
288 case OFFLINE_AGAIN_FROM_OFFLINING:
289 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
290 continue;
291 /* Canceled restart from offlining, must wait for offlining to complete however. */
292 _fallthrough_;
293 default: {
294 int r;
295
296 r = journal_file_set_offline_thread_join(f);
297 if (r < 0)
298 return r;
299
300 joined = true;
301 break;
302 }
303 }
304 }
305
306 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
307 return -EIO;
308
309 switch (f->header->state) {
310 case STATE_ONLINE:
311 return 0;
312
313 case STATE_OFFLINE:
314 f->header->state = STATE_ONLINE;
315 (void) fsync(f->fd);
316 return 0;
317
318 default:
319 return -EINVAL;
320 }
321 }
322
323 bool journal_file_is_offlining(JournalFile *f) {
324 assert(f);
325
326 __sync_synchronize();
327
328 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
329 return false;
330
331 return true;
332 }
333
334 JournalFile* journal_file_close(JournalFile *f) {
335 assert(f);
336
337 #if HAVE_GCRYPT
338 /* Write the final tag */
339 if (f->seal && f->writable) {
340 int r;
341
342 r = journal_file_append_tag(f);
343 if (r < 0)
344 log_error_errno(r, "Failed to append tag when closing journal: %m");
345 }
346 #endif
347
348 if (f->post_change_timer) {
349 int enabled;
350
351 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
352 if (enabled == SD_EVENT_ONESHOT)
353 journal_file_post_change(f);
354
355 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
356 sd_event_source_unref(f->post_change_timer);
357 }
358
359 journal_file_set_offline(f, true);
360
361 if (f->mmap && f->cache_fd)
362 mmap_cache_free_fd(f->mmap, f->cache_fd);
363
364 if (f->fd >= 0 && f->defrag_on_close) {
365
366 /* Be friendly to btrfs: turn COW back on again now,
367 * and defragment the file. We won't write to the file
368 * ever again, hence remove all fragmentation, and
369 * reenable all the good bits COW usually provides
370 * (such as data checksumming). */
371
372 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
373 (void) btrfs_defrag_fd(f->fd);
374 }
375
376 if (f->close_fd)
377 safe_close(f->fd);
378 free(f->path);
379
380 mmap_cache_unref(f->mmap);
381
382 ordered_hashmap_free_free(f->chain_cache);
383
384 #if HAVE_XZ || HAVE_LZ4
385 free(f->compress_buffer);
386 #endif
387
388 #if HAVE_GCRYPT
389 if (f->fss_file)
390 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
391 else
392 free(f->fsprg_state);
393
394 free(f->fsprg_seed);
395
396 if (f->hmac)
397 gcry_md_close(f->hmac);
398 #endif
399
400 return mfree(f);
401 }
402
403 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
404 Header h = {};
405 ssize_t k;
406 int r;
407
408 assert(f);
409
410 memcpy(h.signature, HEADER_SIGNATURE, 8);
411 h.header_size = htole64(ALIGN64(sizeof(h)));
412
413 h.incompatible_flags |= htole32(
414 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
415 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
416
417 h.compatible_flags = htole32(
418 f->seal * HEADER_COMPATIBLE_SEALED);
419
420 r = sd_id128_randomize(&h.file_id);
421 if (r < 0)
422 return r;
423
424 if (template) {
425 h.seqnum_id = template->header->seqnum_id;
426 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
427 } else
428 h.seqnum_id = h.file_id;
429
430 k = pwrite(f->fd, &h, sizeof(h), 0);
431 if (k < 0)
432 return -errno;
433
434 if (k != sizeof(h))
435 return -EIO;
436
437 return 0;
438 }
439
440 static int fsync_directory_of_file(int fd) {
441 _cleanup_free_ char *path = NULL, *dn = NULL;
442 _cleanup_close_ int dfd = -1;
443 struct stat st;
444 int r;
445
446 if (fstat(fd, &st) < 0)
447 return -errno;
448
449 if (!S_ISREG(st.st_mode))
450 return -EBADFD;
451
452 r = fd_get_path(fd, &path);
453 if (r < 0)
454 return r;
455
456 if (!path_is_absolute(path))
457 return -EINVAL;
458
459 dn = dirname_malloc(path);
460 if (!dn)
461 return -ENOMEM;
462
463 dfd = open(dn, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
464 if (dfd < 0)
465 return -errno;
466
467 if (fsync(dfd) < 0)
468 return -errno;
469
470 return 0;
471 }
472
473 static int journal_file_refresh_header(JournalFile *f) {
474 sd_id128_t boot_id;
475 int r;
476
477 assert(f);
478 assert(f->header);
479
480 r = sd_id128_get_machine(&f->header->machine_id);
481 if (r < 0)
482 return r;
483
484 r = sd_id128_get_boot(&boot_id);
485 if (r < 0)
486 return r;
487
488 if (sd_id128_equal(boot_id, f->header->boot_id))
489 f->tail_entry_monotonic_valid = true;
490
491 f->header->boot_id = boot_id;
492
493 r = journal_file_set_online(f);
494
495 /* Sync the online state to disk */
496 (void) fsync(f->fd);
497
498 /* We likely just created a new file, also sync the directory this file is located in. */
499 (void) fsync_directory_of_file(f->fd);
500
501 return r;
502 }
503
504 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
505 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
506 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
507 const char *type = compatible ? "compatible" : "incompatible";
508 uint32_t flags;
509
510 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
511
512 if (flags & ~supported) {
513 if (flags & ~any)
514 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
515 f->path, type, flags & ~any);
516 flags = (flags & any) & ~supported;
517 if (flags) {
518 const char* strv[3];
519 unsigned n = 0;
520 _cleanup_free_ char *t = NULL;
521
522 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
523 strv[n++] = "sealed";
524 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
525 strv[n++] = "xz-compressed";
526 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
527 strv[n++] = "lz4-compressed";
528 strv[n] = NULL;
529 assert(n < ELEMENTSOF(strv));
530
531 t = strv_join((char**) strv, ", ");
532 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
533 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
534 }
535 return true;
536 }
537
538 return false;
539 }
540
541 static int journal_file_verify_header(JournalFile *f) {
542 uint64_t arena_size, header_size;
543
544 assert(f);
545 assert(f->header);
546
547 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
548 return -EBADMSG;
549
550 /* In both read and write mode we refuse to open files with incompatible
551 * flags we don't know. */
552 if (warn_wrong_flags(f, false))
553 return -EPROTONOSUPPORT;
554
555 /* When open for writing we refuse to open files with compatible flags, too. */
556 if (f->writable && warn_wrong_flags(f, true))
557 return -EPROTONOSUPPORT;
558
559 if (f->header->state >= _STATE_MAX)
560 return -EBADMSG;
561
562 header_size = le64toh(f->header->header_size);
563
564 /* The first addition was n_data, so check that we are at least this large */
565 if (header_size < HEADER_SIZE_MIN)
566 return -EBADMSG;
567
568 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
569 return -EBADMSG;
570
571 arena_size = le64toh(f->header->arena_size);
572
573 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
574 return -ENODATA;
575
576 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
577 return -ENODATA;
578
579 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
580 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
581 !VALID64(le64toh(f->header->tail_object_offset)) ||
582 !VALID64(le64toh(f->header->entry_array_offset)))
583 return -ENODATA;
584
585 if (f->writable) {
586 sd_id128_t machine_id;
587 uint8_t state;
588 int r;
589
590 r = sd_id128_get_machine(&machine_id);
591 if (r < 0)
592 return r;
593
594 if (!sd_id128_equal(machine_id, f->header->machine_id))
595 return -EHOSTDOWN;
596
597 state = f->header->state;
598
599 if (state == STATE_ARCHIVED)
600 return -ESHUTDOWN; /* Already archived */
601 else if (state == STATE_ONLINE) {
602 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
603 return -EBUSY;
604 } else if (state != STATE_OFFLINE) {
605 log_debug("Journal file %s has unknown state %i.", f->path, state);
606 return -EBUSY;
607 }
608
609 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
610 return -EBADMSG;
611
612 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
613 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
614 * bisection. */
615 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
616 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
617 return -ETXTBSY;
618 }
619 }
620
621 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
622 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
623
624 f->seal = JOURNAL_HEADER_SEALED(f->header);
625
626 return 0;
627 }
628
629 static int journal_file_fstat(JournalFile *f) {
630 assert(f);
631 assert(f->fd >= 0);
632
633 if (fstat(f->fd, &f->last_stat) < 0)
634 return -errno;
635
636 f->last_stat_usec = now(CLOCK_MONOTONIC);
637
638 /* Refuse appending to files that are already deleted */
639 if (f->last_stat.st_nlink <= 0)
640 return -EIDRM;
641
642 return 0;
643 }
644
645 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
646 uint64_t old_size, new_size;
647 int r;
648
649 assert(f);
650 assert(f->header);
651
652 /* We assume that this file is not sparse, and we know that
653 * for sure, since we always call posix_fallocate()
654 * ourselves */
655
656 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
657 return -EIO;
658
659 old_size =
660 le64toh(f->header->header_size) +
661 le64toh(f->header->arena_size);
662
663 new_size = PAGE_ALIGN(offset + size);
664 if (new_size < le64toh(f->header->header_size))
665 new_size = le64toh(f->header->header_size);
666
667 if (new_size <= old_size) {
668
669 /* We already pre-allocated enough space, but before
670 * we write to it, let's check with fstat() if the
671 * file got deleted, in order make sure we don't throw
672 * away the data immediately. Don't check fstat() for
673 * all writes though, but only once ever 10s. */
674
675 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
676 return 0;
677
678 return journal_file_fstat(f);
679 }
680
681 /* Allocate more space. */
682
683 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
684 return -E2BIG;
685
686 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
687 struct statvfs svfs;
688
689 if (fstatvfs(f->fd, &svfs) >= 0) {
690 uint64_t available;
691
692 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
693
694 if (new_size - old_size > available)
695 return -E2BIG;
696 }
697 }
698
699 /* Increase by larger blocks at once */
700 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
701 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
702 new_size = f->metrics.max_size;
703
704 /* Note that the glibc fallocate() fallback is very
705 inefficient, hence we try to minimize the allocation area
706 as we can. */
707 r = posix_fallocate(f->fd, old_size, new_size - old_size);
708 if (r != 0)
709 return -r;
710
711 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
712
713 return journal_file_fstat(f);
714 }
715
716 static unsigned type_to_context(ObjectType type) {
717 /* One context for each type, plus one catch-all for the rest */
718 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
719 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
720 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
721 }
722
723 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
724 int r;
725
726 assert(f);
727 assert(ret);
728
729 if (size <= 0)
730 return -EINVAL;
731
732 /* Avoid SIGBUS on invalid accesses */
733 if (offset + size > (uint64_t) f->last_stat.st_size) {
734 /* Hmm, out of range? Let's refresh the fstat() data
735 * first, before we trust that check. */
736
737 r = journal_file_fstat(f);
738 if (r < 0)
739 return r;
740
741 if (offset + size > (uint64_t) f->last_stat.st_size)
742 return -EADDRNOTAVAIL;
743 }
744
745 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
746 }
747
748 static uint64_t minimum_header_size(Object *o) {
749
750 static const uint64_t table[] = {
751 [OBJECT_DATA] = sizeof(DataObject),
752 [OBJECT_FIELD] = sizeof(FieldObject),
753 [OBJECT_ENTRY] = sizeof(EntryObject),
754 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
755 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
756 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
757 [OBJECT_TAG] = sizeof(TagObject),
758 };
759
760 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
761 return sizeof(ObjectHeader);
762
763 return table[o->object.type];
764 }
765
766 /* Lightweight object checks. We want this to be fast, so that we won't
767 * slowdown every journal_file_move_to_object() call too much. */
768 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
769 assert(f);
770 assert(o);
771
772 switch (o->object.type) {
773
774 case OBJECT_DATA: {
775 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
776 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
777 le64toh(o->data.n_entries), offset);
778 return -EBADMSG;
779 }
780
781 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
782 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
783 offsetof(DataObject, payload),
784 le64toh(o->object.size),
785 offset);
786 return -EBADMSG;
787 }
788
789 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
790 !VALID64(le64toh(o->data.next_field_offset)) ||
791 !VALID64(le64toh(o->data.entry_offset)) ||
792 !VALID64(le64toh(o->data.entry_array_offset))) {
793 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
794 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
795 le64toh(o->data.next_hash_offset),
796 le64toh(o->data.next_field_offset),
797 le64toh(o->data.entry_offset),
798 le64toh(o->data.entry_array_offset),
799 offset);
800 return -EBADMSG;
801 }
802
803 break;
804 }
805
806 case OBJECT_FIELD:
807 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
808 log_debug(
809 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
810 offsetof(FieldObject, payload),
811 le64toh(o->object.size),
812 offset);
813 return -EBADMSG;
814 }
815
816 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
817 !VALID64(le64toh(o->field.head_data_offset))) {
818 log_debug(
819 "Invalid offset, next_hash_offset="OFSfmt
820 ", head_data_offset="OFSfmt": %"PRIu64,
821 le64toh(o->field.next_hash_offset),
822 le64toh(o->field.head_data_offset),
823 offset);
824 return -EBADMSG;
825 }
826 break;
827
828 case OBJECT_ENTRY:
829 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
830 log_debug(
831 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
832 offsetof(EntryObject, items),
833 le64toh(o->object.size),
834 offset);
835 return -EBADMSG;
836 }
837
838 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
839 log_debug(
840 "Invalid number items in entry: %"PRIu64": %"PRIu64,
841 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
842 offset);
843 return -EBADMSG;
844 }
845
846 if (le64toh(o->entry.seqnum) <= 0) {
847 log_debug(
848 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
849 le64toh(o->entry.seqnum),
850 offset);
851 return -EBADMSG;
852 }
853
854 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
855 log_debug(
856 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
857 le64toh(o->entry.realtime),
858 offset);
859 return -EBADMSG;
860 }
861
862 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
863 log_debug(
864 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
865 le64toh(o->entry.monotonic),
866 offset);
867 return -EBADMSG;
868 }
869
870 break;
871
872 case OBJECT_DATA_HASH_TABLE:
873 case OBJECT_FIELD_HASH_TABLE:
874 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
875 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
876 log_debug(
877 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
878 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
879 le64toh(o->object.size),
880 offset);
881 return -EBADMSG;
882 }
883
884 break;
885
886 case OBJECT_ENTRY_ARRAY:
887 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
888 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
889 log_debug(
890 "Invalid object entry array size: %"PRIu64": %"PRIu64,
891 le64toh(o->object.size),
892 offset);
893 return -EBADMSG;
894 }
895
896 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
897 log_debug(
898 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
899 le64toh(o->entry_array.next_entry_array_offset),
900 offset);
901 return -EBADMSG;
902 }
903
904 break;
905
906 case OBJECT_TAG:
907 if (le64toh(o->object.size) != sizeof(TagObject)) {
908 log_debug(
909 "Invalid object tag size: %"PRIu64": %"PRIu64,
910 le64toh(o->object.size),
911 offset);
912 return -EBADMSG;
913 }
914
915 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
916 log_debug(
917 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
918 le64toh(o->tag.epoch),
919 offset);
920 return -EBADMSG;
921 }
922
923 break;
924 }
925
926 return 0;
927 }
928
929 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
930 int r;
931 void *t;
932 size_t tsize;
933 Object *o;
934 uint64_t s;
935
936 assert(f);
937 assert(ret);
938
939 /* Objects may only be located at multiple of 64 bit */
940 if (!VALID64(offset)) {
941 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
942 return -EBADMSG;
943 }
944
945 /* Object may not be located in the file header */
946 if (offset < le64toh(f->header->header_size)) {
947 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
948 return -EBADMSG;
949 }
950
951 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
952 if (r < 0)
953 return r;
954
955 o = (Object*) t;
956 s = le64toh(o->object.size);
957
958 if (s == 0) {
959 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
960 return -EBADMSG;
961 }
962 if (s < sizeof(ObjectHeader)) {
963 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
964 return -EBADMSG;
965 }
966
967 if (o->object.type <= OBJECT_UNUSED) {
968 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
969 return -EBADMSG;
970 }
971
972 if (s < minimum_header_size(o)) {
973 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
974 return -EBADMSG;
975 }
976
977 if (type > OBJECT_UNUSED && o->object.type != type) {
978 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
979 return -EBADMSG;
980 }
981
982 if (s > tsize) {
983 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
984 if (r < 0)
985 return r;
986
987 o = (Object*) t;
988 }
989
990 r = journal_file_check_object(f, offset, o);
991 if (r < 0)
992 return r;
993
994 *ret = o;
995 return 0;
996 }
997
998 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
999 uint64_t r;
1000
1001 assert(f);
1002 assert(f->header);
1003
1004 r = le64toh(f->header->tail_entry_seqnum) + 1;
1005
1006 if (seqnum) {
1007 /* If an external seqnum counter was passed, we update
1008 * both the local and the external one, and set it to
1009 * the maximum of both */
1010
1011 if (*seqnum + 1 > r)
1012 r = *seqnum + 1;
1013
1014 *seqnum = r;
1015 }
1016
1017 f->header->tail_entry_seqnum = htole64(r);
1018
1019 if (f->header->head_entry_seqnum == 0)
1020 f->header->head_entry_seqnum = htole64(r);
1021
1022 return r;
1023 }
1024
1025 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
1026 int r;
1027 uint64_t p;
1028 Object *tail, *o;
1029 void *t;
1030
1031 assert(f);
1032 assert(f->header);
1033 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
1034 assert(size >= sizeof(ObjectHeader));
1035 assert(offset);
1036 assert(ret);
1037
1038 r = journal_file_set_online(f);
1039 if (r < 0)
1040 return r;
1041
1042 p = le64toh(f->header->tail_object_offset);
1043 if (p == 0)
1044 p = le64toh(f->header->header_size);
1045 else {
1046 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
1047 if (r < 0)
1048 return r;
1049
1050 p += ALIGN64(le64toh(tail->object.size));
1051 }
1052
1053 r = journal_file_allocate(f, p, size);
1054 if (r < 0)
1055 return r;
1056
1057 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
1058 if (r < 0)
1059 return r;
1060
1061 o = (Object*) t;
1062
1063 zero(o->object);
1064 o->object.type = type;
1065 o->object.size = htole64(size);
1066
1067 f->header->tail_object_offset = htole64(p);
1068 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1069
1070 *ret = o;
1071 *offset = p;
1072
1073 return 0;
1074 }
1075
1076 static int journal_file_setup_data_hash_table(JournalFile *f) {
1077 uint64_t s, p;
1078 Object *o;
1079 int r;
1080
1081 assert(f);
1082 assert(f->header);
1083
1084 /* We estimate that we need 1 hash table entry per 768 bytes
1085 of journal file and we want to make sure we never get
1086 beyond 75% fill level. Calculate the hash table size for
1087 the maximum file size based on these metrics. */
1088
1089 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
1090 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1091 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1092
1093 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
1094
1095 r = journal_file_append_object(f,
1096 OBJECT_DATA_HASH_TABLE,
1097 offsetof(Object, hash_table.items) + s,
1098 &o, &p);
1099 if (r < 0)
1100 return r;
1101
1102 memzero(o->hash_table.items, s);
1103
1104 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1105 f->header->data_hash_table_size = htole64(s);
1106
1107 return 0;
1108 }
1109
1110 static int journal_file_setup_field_hash_table(JournalFile *f) {
1111 uint64_t s, p;
1112 Object *o;
1113 int r;
1114
1115 assert(f);
1116 assert(f->header);
1117
1118 /* We use a fixed size hash table for the fields as this
1119 * number should grow very slowly only */
1120
1121 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1122 r = journal_file_append_object(f,
1123 OBJECT_FIELD_HASH_TABLE,
1124 offsetof(Object, hash_table.items) + s,
1125 &o, &p);
1126 if (r < 0)
1127 return r;
1128
1129 memzero(o->hash_table.items, s);
1130
1131 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1132 f->header->field_hash_table_size = htole64(s);
1133
1134 return 0;
1135 }
1136
1137 int journal_file_map_data_hash_table(JournalFile *f) {
1138 uint64_t s, p;
1139 void *t;
1140 int r;
1141
1142 assert(f);
1143 assert(f->header);
1144
1145 if (f->data_hash_table)
1146 return 0;
1147
1148 p = le64toh(f->header->data_hash_table_offset);
1149 s = le64toh(f->header->data_hash_table_size);
1150
1151 r = journal_file_move_to(f,
1152 OBJECT_DATA_HASH_TABLE,
1153 true,
1154 p, s,
1155 &t, NULL);
1156 if (r < 0)
1157 return r;
1158
1159 f->data_hash_table = t;
1160 return 0;
1161 }
1162
1163 int journal_file_map_field_hash_table(JournalFile *f) {
1164 uint64_t s, p;
1165 void *t;
1166 int r;
1167
1168 assert(f);
1169 assert(f->header);
1170
1171 if (f->field_hash_table)
1172 return 0;
1173
1174 p = le64toh(f->header->field_hash_table_offset);
1175 s = le64toh(f->header->field_hash_table_size);
1176
1177 r = journal_file_move_to(f,
1178 OBJECT_FIELD_HASH_TABLE,
1179 true,
1180 p, s,
1181 &t, NULL);
1182 if (r < 0)
1183 return r;
1184
1185 f->field_hash_table = t;
1186 return 0;
1187 }
1188
1189 static int journal_file_link_field(
1190 JournalFile *f,
1191 Object *o,
1192 uint64_t offset,
1193 uint64_t hash) {
1194
1195 uint64_t p, h, m;
1196 int r;
1197
1198 assert(f);
1199 assert(f->header);
1200 assert(f->field_hash_table);
1201 assert(o);
1202 assert(offset > 0);
1203
1204 if (o->object.type != OBJECT_FIELD)
1205 return -EINVAL;
1206
1207 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1208 if (m <= 0)
1209 return -EBADMSG;
1210
1211 /* This might alter the window we are looking at */
1212 o->field.next_hash_offset = o->field.head_data_offset = 0;
1213
1214 h = hash % m;
1215 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1216 if (p == 0)
1217 f->field_hash_table[h].head_hash_offset = htole64(offset);
1218 else {
1219 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1220 if (r < 0)
1221 return r;
1222
1223 o->field.next_hash_offset = htole64(offset);
1224 }
1225
1226 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1227
1228 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1229 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1230
1231 return 0;
1232 }
1233
1234 static int journal_file_link_data(
1235 JournalFile *f,
1236 Object *o,
1237 uint64_t offset,
1238 uint64_t hash) {
1239
1240 uint64_t p, h, m;
1241 int r;
1242
1243 assert(f);
1244 assert(f->header);
1245 assert(f->data_hash_table);
1246 assert(o);
1247 assert(offset > 0);
1248
1249 if (o->object.type != OBJECT_DATA)
1250 return -EINVAL;
1251
1252 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1253 if (m <= 0)
1254 return -EBADMSG;
1255
1256 /* This might alter the window we are looking at */
1257 o->data.next_hash_offset = o->data.next_field_offset = 0;
1258 o->data.entry_offset = o->data.entry_array_offset = 0;
1259 o->data.n_entries = 0;
1260
1261 h = hash % m;
1262 p = le64toh(f->data_hash_table[h].tail_hash_offset);
1263 if (p == 0)
1264 /* Only entry in the hash table is easy */
1265 f->data_hash_table[h].head_hash_offset = htole64(offset);
1266 else {
1267 /* Move back to the previous data object, to patch in
1268 * pointer */
1269
1270 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1271 if (r < 0)
1272 return r;
1273
1274 o->data.next_hash_offset = htole64(offset);
1275 }
1276
1277 f->data_hash_table[h].tail_hash_offset = htole64(offset);
1278
1279 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1280 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1281
1282 return 0;
1283 }
1284
1285 int journal_file_find_field_object_with_hash(
1286 JournalFile *f,
1287 const void *field, uint64_t size, uint64_t hash,
1288 Object **ret, uint64_t *offset) {
1289
1290 uint64_t p, osize, h, m;
1291 int r;
1292
1293 assert(f);
1294 assert(f->header);
1295 assert(field && size > 0);
1296
1297 /* If the field hash table is empty, we can't find anything */
1298 if (le64toh(f->header->field_hash_table_size) <= 0)
1299 return 0;
1300
1301 /* Map the field hash table, if it isn't mapped yet. */
1302 r = journal_file_map_field_hash_table(f);
1303 if (r < 0)
1304 return r;
1305
1306 osize = offsetof(Object, field.payload) + size;
1307
1308 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1309 if (m <= 0)
1310 return -EBADMSG;
1311
1312 h = hash % m;
1313 p = le64toh(f->field_hash_table[h].head_hash_offset);
1314
1315 while (p > 0) {
1316 Object *o;
1317
1318 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1319 if (r < 0)
1320 return r;
1321
1322 if (le64toh(o->field.hash) == hash &&
1323 le64toh(o->object.size) == osize &&
1324 memcmp(o->field.payload, field, size) == 0) {
1325
1326 if (ret)
1327 *ret = o;
1328 if (offset)
1329 *offset = p;
1330
1331 return 1;
1332 }
1333
1334 p = le64toh(o->field.next_hash_offset);
1335 }
1336
1337 return 0;
1338 }
1339
1340 int journal_file_find_field_object(
1341 JournalFile *f,
1342 const void *field, uint64_t size,
1343 Object **ret, uint64_t *offset) {
1344
1345 uint64_t hash;
1346
1347 assert(f);
1348 assert(field && size > 0);
1349
1350 hash = hash64(field, size);
1351
1352 return journal_file_find_field_object_with_hash(f,
1353 field, size, hash,
1354 ret, offset);
1355 }
1356
1357 int journal_file_find_data_object_with_hash(
1358 JournalFile *f,
1359 const void *data, uint64_t size, uint64_t hash,
1360 Object **ret, uint64_t *offset) {
1361
1362 uint64_t p, osize, h, m;
1363 int r;
1364
1365 assert(f);
1366 assert(f->header);
1367 assert(data || size == 0);
1368
1369 /* If there's no data hash table, then there's no entry. */
1370 if (le64toh(f->header->data_hash_table_size) <= 0)
1371 return 0;
1372
1373 /* Map the data hash table, if it isn't mapped yet. */
1374 r = journal_file_map_data_hash_table(f);
1375 if (r < 0)
1376 return r;
1377
1378 osize = offsetof(Object, data.payload) + size;
1379
1380 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1381 if (m <= 0)
1382 return -EBADMSG;
1383
1384 h = hash % m;
1385 p = le64toh(f->data_hash_table[h].head_hash_offset);
1386
1387 while (p > 0) {
1388 Object *o;
1389
1390 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1391 if (r < 0)
1392 return r;
1393
1394 if (le64toh(o->data.hash) != hash)
1395 goto next;
1396
1397 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
1398 #if HAVE_XZ || HAVE_LZ4
1399 uint64_t l;
1400 size_t rsize = 0;
1401
1402 l = le64toh(o->object.size);
1403 if (l <= offsetof(Object, data.payload))
1404 return -EBADMSG;
1405
1406 l -= offsetof(Object, data.payload);
1407
1408 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1409 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1410 if (r < 0)
1411 return r;
1412
1413 if (rsize == size &&
1414 memcmp(f->compress_buffer, data, size) == 0) {
1415
1416 if (ret)
1417 *ret = o;
1418
1419 if (offset)
1420 *offset = p;
1421
1422 return 1;
1423 }
1424 #else
1425 return -EPROTONOSUPPORT;
1426 #endif
1427 } else if (le64toh(o->object.size) == osize &&
1428 memcmp(o->data.payload, data, size) == 0) {
1429
1430 if (ret)
1431 *ret = o;
1432
1433 if (offset)
1434 *offset = p;
1435
1436 return 1;
1437 }
1438
1439 next:
1440 p = le64toh(o->data.next_hash_offset);
1441 }
1442
1443 return 0;
1444 }
1445
1446 int journal_file_find_data_object(
1447 JournalFile *f,
1448 const void *data, uint64_t size,
1449 Object **ret, uint64_t *offset) {
1450
1451 uint64_t hash;
1452
1453 assert(f);
1454 assert(data || size == 0);
1455
1456 hash = hash64(data, size);
1457
1458 return journal_file_find_data_object_with_hash(f,
1459 data, size, hash,
1460 ret, offset);
1461 }
1462
1463 static int journal_file_append_field(
1464 JournalFile *f,
1465 const void *field, uint64_t size,
1466 Object **ret, uint64_t *offset) {
1467
1468 uint64_t hash, p;
1469 uint64_t osize;
1470 Object *o;
1471 int r;
1472
1473 assert(f);
1474 assert(field && size > 0);
1475
1476 hash = hash64(field, size);
1477
1478 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1479 if (r < 0)
1480 return r;
1481 else if (r > 0) {
1482
1483 if (ret)
1484 *ret = o;
1485
1486 if (offset)
1487 *offset = p;
1488
1489 return 0;
1490 }
1491
1492 osize = offsetof(Object, field.payload) + size;
1493 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1494 if (r < 0)
1495 return r;
1496
1497 o->field.hash = htole64(hash);
1498 memcpy(o->field.payload, field, size);
1499
1500 r = journal_file_link_field(f, o, p, hash);
1501 if (r < 0)
1502 return r;
1503
1504 /* The linking might have altered the window, so let's
1505 * refresh our pointer */
1506 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1507 if (r < 0)
1508 return r;
1509
1510 #if HAVE_GCRYPT
1511 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1512 if (r < 0)
1513 return r;
1514 #endif
1515
1516 if (ret)
1517 *ret = o;
1518
1519 if (offset)
1520 *offset = p;
1521
1522 return 0;
1523 }
1524
1525 static int journal_file_append_data(
1526 JournalFile *f,
1527 const void *data, uint64_t size,
1528 Object **ret, uint64_t *offset) {
1529
1530 uint64_t hash, p;
1531 uint64_t osize;
1532 Object *o;
1533 int r, compression = 0;
1534 const void *eq;
1535
1536 assert(f);
1537 assert(data || size == 0);
1538
1539 hash = hash64(data, size);
1540
1541 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1542 if (r < 0)
1543 return r;
1544 if (r > 0) {
1545
1546 if (ret)
1547 *ret = o;
1548
1549 if (offset)
1550 *offset = p;
1551
1552 return 0;
1553 }
1554
1555 osize = offsetof(Object, data.payload) + size;
1556 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1557 if (r < 0)
1558 return r;
1559
1560 o->data.hash = htole64(hash);
1561
1562 #if HAVE_XZ || HAVE_LZ4
1563 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1564 size_t rsize = 0;
1565
1566 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1567
1568 if (compression >= 0) {
1569 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1570 o->object.flags |= compression;
1571
1572 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1573 size, rsize, object_compressed_to_string(compression));
1574 } else
1575 /* Compression didn't work, we don't really care why, let's continue without compression */
1576 compression = 0;
1577 }
1578 #endif
1579
1580 if (compression == 0)
1581 memcpy_safe(o->data.payload, data, size);
1582
1583 r = journal_file_link_data(f, o, p, hash);
1584 if (r < 0)
1585 return r;
1586
1587 #if HAVE_GCRYPT
1588 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1589 if (r < 0)
1590 return r;
1591 #endif
1592
1593 /* The linking might have altered the window, so let's
1594 * refresh our pointer */
1595 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1596 if (r < 0)
1597 return r;
1598
1599 if (!data)
1600 eq = NULL;
1601 else
1602 eq = memchr(data, '=', size);
1603 if (eq && eq > data) {
1604 Object *fo = NULL;
1605 uint64_t fp;
1606
1607 /* Create field object ... */
1608 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1609 if (r < 0)
1610 return r;
1611
1612 /* ... and link it in. */
1613 o->data.next_field_offset = fo->field.head_data_offset;
1614 fo->field.head_data_offset = le64toh(p);
1615 }
1616
1617 if (ret)
1618 *ret = o;
1619
1620 if (offset)
1621 *offset = p;
1622
1623 return 0;
1624 }
1625
1626 uint64_t journal_file_entry_n_items(Object *o) {
1627 assert(o);
1628
1629 if (o->object.type != OBJECT_ENTRY)
1630 return 0;
1631
1632 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1633 }
1634
1635 uint64_t journal_file_entry_array_n_items(Object *o) {
1636 assert(o);
1637
1638 if (o->object.type != OBJECT_ENTRY_ARRAY)
1639 return 0;
1640
1641 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1642 }
1643
1644 uint64_t journal_file_hash_table_n_items(Object *o) {
1645 assert(o);
1646
1647 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1648 return 0;
1649
1650 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1651 }
1652
1653 static int link_entry_into_array(JournalFile *f,
1654 le64_t *first,
1655 le64_t *idx,
1656 uint64_t p) {
1657 int r;
1658 uint64_t n = 0, ap = 0, q, i, a, hidx;
1659 Object *o;
1660
1661 assert(f);
1662 assert(f->header);
1663 assert(first);
1664 assert(idx);
1665 assert(p > 0);
1666
1667 a = le64toh(*first);
1668 i = hidx = le64toh(*idx);
1669 while (a > 0) {
1670
1671 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1672 if (r < 0)
1673 return r;
1674
1675 n = journal_file_entry_array_n_items(o);
1676 if (i < n) {
1677 o->entry_array.items[i] = htole64(p);
1678 *idx = htole64(hidx + 1);
1679 return 0;
1680 }
1681
1682 i -= n;
1683 ap = a;
1684 a = le64toh(o->entry_array.next_entry_array_offset);
1685 }
1686
1687 if (hidx > n)
1688 n = (hidx+1) * 2;
1689 else
1690 n = n * 2;
1691
1692 if (n < 4)
1693 n = 4;
1694
1695 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1696 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1697 &o, &q);
1698 if (r < 0)
1699 return r;
1700
1701 #if HAVE_GCRYPT
1702 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1703 if (r < 0)
1704 return r;
1705 #endif
1706
1707 o->entry_array.items[i] = htole64(p);
1708
1709 if (ap == 0)
1710 *first = htole64(q);
1711 else {
1712 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1713 if (r < 0)
1714 return r;
1715
1716 o->entry_array.next_entry_array_offset = htole64(q);
1717 }
1718
1719 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1720 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1721
1722 *idx = htole64(hidx + 1);
1723
1724 return 0;
1725 }
1726
1727 static int link_entry_into_array_plus_one(JournalFile *f,
1728 le64_t *extra,
1729 le64_t *first,
1730 le64_t *idx,
1731 uint64_t p) {
1732
1733 int r;
1734
1735 assert(f);
1736 assert(extra);
1737 assert(first);
1738 assert(idx);
1739 assert(p > 0);
1740
1741 if (*idx == 0)
1742 *extra = htole64(p);
1743 else {
1744 le64_t i;
1745
1746 i = htole64(le64toh(*idx) - 1);
1747 r = link_entry_into_array(f, first, &i, p);
1748 if (r < 0)
1749 return r;
1750 }
1751
1752 *idx = htole64(le64toh(*idx) + 1);
1753 return 0;
1754 }
1755
1756 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1757 uint64_t p;
1758 int r;
1759 assert(f);
1760 assert(o);
1761 assert(offset > 0);
1762
1763 p = le64toh(o->entry.items[i].object_offset);
1764 if (p == 0)
1765 return -EINVAL;
1766
1767 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1768 if (r < 0)
1769 return r;
1770
1771 return link_entry_into_array_plus_one(f,
1772 &o->data.entry_offset,
1773 &o->data.entry_array_offset,
1774 &o->data.n_entries,
1775 offset);
1776 }
1777
1778 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1779 uint64_t n, i;
1780 int r;
1781
1782 assert(f);
1783 assert(f->header);
1784 assert(o);
1785 assert(offset > 0);
1786
1787 if (o->object.type != OBJECT_ENTRY)
1788 return -EINVAL;
1789
1790 __sync_synchronize();
1791
1792 /* Link up the entry itself */
1793 r = link_entry_into_array(f,
1794 &f->header->entry_array_offset,
1795 &f->header->n_entries,
1796 offset);
1797 if (r < 0)
1798 return r;
1799
1800 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1801
1802 if (f->header->head_entry_realtime == 0)
1803 f->header->head_entry_realtime = o->entry.realtime;
1804
1805 f->header->tail_entry_realtime = o->entry.realtime;
1806 f->header->tail_entry_monotonic = o->entry.monotonic;
1807
1808 f->tail_entry_monotonic_valid = true;
1809
1810 /* Link up the items */
1811 n = journal_file_entry_n_items(o);
1812 for (i = 0; i < n; i++) {
1813 r = journal_file_link_entry_item(f, o, offset, i);
1814 if (r < 0)
1815 return r;
1816 }
1817
1818 return 0;
1819 }
1820
1821 static int journal_file_append_entry_internal(
1822 JournalFile *f,
1823 const dual_timestamp *ts,
1824 uint64_t xor_hash,
1825 const EntryItem items[], unsigned n_items,
1826 uint64_t *seqnum,
1827 Object **ret, uint64_t *offset) {
1828 uint64_t np;
1829 uint64_t osize;
1830 Object *o;
1831 int r;
1832
1833 assert(f);
1834 assert(f->header);
1835 assert(items || n_items == 0);
1836 assert(ts);
1837
1838 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1839
1840 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1841 if (r < 0)
1842 return r;
1843
1844 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1845 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
1846 o->entry.realtime = htole64(ts->realtime);
1847 o->entry.monotonic = htole64(ts->monotonic);
1848 o->entry.xor_hash = htole64(xor_hash);
1849 o->entry.boot_id = f->header->boot_id;
1850
1851 #if HAVE_GCRYPT
1852 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1853 if (r < 0)
1854 return r;
1855 #endif
1856
1857 r = journal_file_link_entry(f, o, np);
1858 if (r < 0)
1859 return r;
1860
1861 if (ret)
1862 *ret = o;
1863
1864 if (offset)
1865 *offset = np;
1866
1867 return 0;
1868 }
1869
1870 void journal_file_post_change(JournalFile *f) {
1871 assert(f);
1872
1873 /* inotify() does not receive IN_MODIFY events from file
1874 * accesses done via mmap(). After each access we hence
1875 * trigger IN_MODIFY by truncating the journal file to its
1876 * current size which triggers IN_MODIFY. */
1877
1878 __sync_synchronize();
1879
1880 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1881 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1882 }
1883
1884 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1885 assert(userdata);
1886
1887 journal_file_post_change(userdata);
1888
1889 return 1;
1890 }
1891
1892 static void schedule_post_change(JournalFile *f) {
1893 sd_event_source *timer;
1894 int enabled, r;
1895 uint64_t now;
1896
1897 assert(f);
1898 assert(f->post_change_timer);
1899
1900 timer = f->post_change_timer;
1901
1902 r = sd_event_source_get_enabled(timer, &enabled);
1903 if (r < 0) {
1904 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1905 goto fail;
1906 }
1907
1908 if (enabled == SD_EVENT_ONESHOT)
1909 return;
1910
1911 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1912 if (r < 0) {
1913 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1914 goto fail;
1915 }
1916
1917 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1918 if (r < 0) {
1919 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1920 goto fail;
1921 }
1922
1923 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1924 if (r < 0) {
1925 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1926 goto fail;
1927 }
1928
1929 return;
1930
1931 fail:
1932 /* On failure, let's simply post the change immediately. */
1933 journal_file_post_change(f);
1934 }
1935
1936 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1937 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1938 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1939 int r;
1940
1941 assert(f);
1942 assert_return(!f->post_change_timer, -EINVAL);
1943 assert(e);
1944 assert(t);
1945
1946 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1947 if (r < 0)
1948 return r;
1949
1950 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1951 if (r < 0)
1952 return r;
1953
1954 f->post_change_timer = timer;
1955 timer = NULL;
1956 f->post_change_timer_period = t;
1957
1958 return r;
1959 }
1960
1961 static int entry_item_cmp(const void *_a, const void *_b) {
1962 const EntryItem *a = _a, *b = _b;
1963
1964 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1965 return -1;
1966 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1967 return 1;
1968 return 0;
1969 }
1970
1971 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1972 unsigned i;
1973 EntryItem *items;
1974 int r;
1975 uint64_t xor_hash = 0;
1976 struct dual_timestamp _ts;
1977
1978 assert(f);
1979 assert(f->header);
1980 assert(iovec || n_iovec == 0);
1981
1982 if (!ts) {
1983 dual_timestamp_get(&_ts);
1984 ts = &_ts;
1985 }
1986
1987 #if HAVE_GCRYPT
1988 r = journal_file_maybe_append_tag(f, ts->realtime);
1989 if (r < 0)
1990 return r;
1991 #endif
1992
1993 /* alloca() can't take 0, hence let's allocate at least one */
1994 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1995
1996 for (i = 0; i < n_iovec; i++) {
1997 uint64_t p;
1998 Object *o;
1999
2000 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
2001 if (r < 0)
2002 return r;
2003
2004 xor_hash ^= le64toh(o->data.hash);
2005 items[i].object_offset = htole64(p);
2006 items[i].hash = o->data.hash;
2007 }
2008
2009 /* Order by the position on disk, in order to improve seek
2010 * times for rotating media. */
2011 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
2012
2013 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
2014
2015 /* If the memory mapping triggered a SIGBUS then we return an
2016 * IO error and ignore the error code passed down to us, since
2017 * it is very likely just an effect of a nullified replacement
2018 * mapping page */
2019
2020 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
2021 r = -EIO;
2022
2023 if (f->post_change_timer)
2024 schedule_post_change(f);
2025 else
2026 journal_file_post_change(f);
2027
2028 return r;
2029 }
2030
/* One cached position within a chain of entry arrays, keyed by the offset of the chain's first
 * array. */
typedef struct ChainCacheItem {
        /* The array at the beginning of the chain */
        uint64_t first;

        /* The cached array */
        uint64_t array;

        /* The first item in the cached array */
        uint64_t begin;

        /* The total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* The last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
2038
2039 static void chain_cache_put(
2040 OrderedHashmap *h,
2041 ChainCacheItem *ci,
2042 uint64_t first,
2043 uint64_t array,
2044 uint64_t begin,
2045 uint64_t total,
2046 uint64_t last_index) {
2047
2048 if (!ci) {
2049 /* If the chain item to cache for this chain is the
2050 * first one it's not worth caching anything */
2051 if (array == first)
2052 return;
2053
2054 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2055 ci = ordered_hashmap_steal_first(h);
2056 assert(ci);
2057 } else {
2058 ci = new(ChainCacheItem, 1);
2059 if (!ci)
2060 return;
2061 }
2062
2063 ci->first = first;
2064
2065 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2066 free(ci);
2067 return;
2068 }
2069 } else
2070 assert(ci->first == first);
2071
2072 ci->array = array;
2073 ci->begin = begin;
2074 ci->total = total;
2075 ci->last_index = last_index;
2076 }
2077
2078 static int generic_array_get(
2079 JournalFile *f,
2080 uint64_t first,
2081 uint64_t i,
2082 Object **ret, uint64_t *offset) {
2083
2084 Object *o;
2085 uint64_t p = 0, a, t = 0;
2086 int r;
2087 ChainCacheItem *ci;
2088
2089 assert(f);
2090
2091 a = first;
2092
2093 /* Try the chain cache first */
2094 ci = ordered_hashmap_get(f->chain_cache, &first);
2095 if (ci && i > ci->total) {
2096 a = ci->array;
2097 i -= ci->total;
2098 t = ci->total;
2099 }
2100
2101 while (a > 0) {
2102 uint64_t k;
2103
2104 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2105 if (r < 0)
2106 return r;
2107
2108 k = journal_file_entry_array_n_items(o);
2109 if (i < k) {
2110 p = le64toh(o->entry_array.items[i]);
2111 goto found;
2112 }
2113
2114 i -= k;
2115 t += k;
2116 a = le64toh(o->entry_array.next_entry_array_offset);
2117 }
2118
2119 return 0;
2120
2121 found:
2122 /* Let's cache this item for the next invocation */
2123 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
2124
2125 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2126 if (r < 0)
2127 return r;
2128
2129 if (ret)
2130 *ret = o;
2131
2132 if (offset)
2133 *offset = p;
2134
2135 return 1;
2136 }
2137
2138 static int generic_array_get_plus_one(
2139 JournalFile *f,
2140 uint64_t extra,
2141 uint64_t first,
2142 uint64_t i,
2143 Object **ret, uint64_t *offset) {
2144
2145 Object *o;
2146
2147 assert(f);
2148
2149 if (i == 0) {
2150 int r;
2151
2152 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2153 if (r < 0)
2154 return r;
2155
2156 if (ret)
2157 *ret = o;
2158
2159 if (offset)
2160 *offset = extra;
2161
2162 return 1;
2163 }
2164
2165 return generic_array_get(f, first, i-1, ret, offset);
2166 }
2167
/* Results of the test_object() callbacks used when bisecting entry arrays. */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2,
};
2173
/* Binary search over a chain of entry array objects.
 *
 * 'first' is the file offset of the first entry array of the chain, 'n' the number of items to
 * consider across the whole chain. 'test_object' is called with the offset stored in an array
 * item and with 'needle'; it returns TEST_FOUND, TEST_LEFT or TEST_RIGHT, or a negative
 * errno-style value. A TEST_FOUND result is treated like TEST_RIGHT for DIRECTION_DOWN and like
 * TEST_LEFT otherwise.
 *
 * Returns 1 if an entry was located (then *ret, *offset and *idx are set, each only if non-NULL),
 * 0 if there is none, and a negative value on error. In the two main probes (marked (1) and (2)
 * in the log messages) an item offset of 0 or -EBADMSG from test_object() does not fail the
 * search, but cuts the searched range short at that item. The array the result was found in is
 * remembered in the chain cache, together with the index looked at. */
2174 static int generic_array_bisect(
2175 JournalFile *f,
2176 uint64_t first,
2177 uint64_t n,
2178 uint64_t needle,
2179 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2180 direction_t direction,
2181 Object **ret,
2182 uint64_t *offset,
2183 uint64_t *idx) {
2184
2185 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
2186 bool subtract_one = false;
2187 Object *o, *array = NULL;
2188 int r;
2189 ChainCacheItem *ci;
2190
2191 assert(f);
2192 assert(test_object);
2193
2194 /* Start with the first array in the chain */
2195 a = first;
2196
2197 ci = ordered_hashmap_get(f->chain_cache, &first);
2198 if (ci && n > ci->total) {
2199 /* Ah, we have iterated this bisection array chain
2200 * previously! Let's see if we can skip ahead in the
2201 * chain, as far as the last time. But we can't jump
2202 * backwards in the chain, so let's check that
2203 * first. */
2204
2205 r = test_object(f, ci->begin, needle);
2206 if (r < 0)
2207 return r;
2208
2209 if (r == TEST_LEFT) {
2210 /* OK, what we are looking for is right of the
2211 * begin of this EntryArray, so let's jump
2212 * straight to previously cached array in the
2213 * chain */
2214
2215 a = ci->array;
2216 n -= ci->total;
2217 t = ci->total;
2218 last_index = ci->last_index;
2219 }
2220 }
2221
/* Walk the chain array by array. 't' counts the items of the arrays already passed, 'n' the
 * items still to be considered from the current array onwards. */
2222 while (a > 0) {
2223 uint64_t left, right, k, lp;
2224
2225 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
2226 if (r < 0)
2227 return r;
2228
2229 k = journal_file_entry_array_n_items(array);
2230 right = MIN(k, n);
2231 if (right <= 0)
2232 return 0;
2233
/* Probe the last usable item of this array first: only if it tests as TEST_RIGHT the array
 * itself is bisected, otherwise the search proceeds to the next array (if there is one). */
2234 i = right - 1;
2235 lp = p = le64toh(array->entry_array.items[i]);
2236 if (p <= 0)
2237 r = -EBADMSG;
2238 else
2239 r = test_object(f, p, needle);
2240 if (r == -EBADMSG) {
2241 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2242 n = i;
2243 continue;
2244 }
2245 if (r < 0)
2246 return r;
2247
2248 if (r == TEST_FOUND)
2249 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2250
2251 if (r == TEST_RIGHT) {
2252 left = 0;
2253 right -= 1;
2254
2255 if (last_index != (uint64_t) -1) {
2256 assert(last_index <= right);
2257
2258 /* If we cached the last index we
2259 * looked at, let's try to not to jump
2260 * too wildly around and see if we can
2261 * limit the range to look at early to
2262 * the immediate neighbors of the last
2263 * index we looked at. */
2264
2265 if (last_index > 0) {
2266 uint64_t x = last_index - 1;
2267
2268 p = le64toh(array->entry_array.items[x]);
2269 if (p <= 0)
2270 return -EBADMSG;
2271
2272 r = test_object(f, p, needle);
2273 if (r < 0)
2274 return r;
2275
2276 if (r == TEST_FOUND)
2277 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2278
2279 if (r == TEST_RIGHT)
2280 right = x;
2281 else
2282 left = x + 1;
2283 }
2284
2285 if (last_index < right) {
2286 uint64_t y = last_index + 1;
2287
2288 p = le64toh(array->entry_array.items[y]);
2289 if (p <= 0)
2290 return -EBADMSG;
2291
2292 r = test_object(f, p, needle);
2293 if (r < 0)
2294 return r;
2295
2296 if (r == TEST_FOUND)
2297 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2298
2299 if (r == TEST_RIGHT)
2300 right = y;
2301 else
2302 left = y + 1;
2303 }
2304 }
2305
/* Plain bisection of [left, right]; it terminates via "goto found" once the range has
 * collapsed to a single position. */
2306 for (;;) {
2307 if (left == right) {
2308 if (direction == DIRECTION_UP)
2309 subtract_one = true;
2310
2311 i = left;
2312 goto found;
2313 }
2314
2315 assert(left < right);
2316 i = (left + right) / 2;
2317
2318 p = le64toh(array->entry_array.items[i]);
2319 if (p <= 0)
2320 r = -EBADMSG;
2321 else
2322 r = test_object(f, p, needle);
2323 if (r == -EBADMSG) {
2324 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2325 right = n = i;
2326 continue;
2327 }
2328 if (r < 0)
2329 return r;
2330
2331 if (r == TEST_FOUND)
2332 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2333
2334 if (r == TEST_RIGHT)
2335 right = i;
2336 else
2337 left = i + 1;
2338 }
2339 }
2340
/* The remaining item budget 'n' ends within this array, hence no later array is looked at. */
2341 if (k >= n) {
2342 if (direction == DIRECTION_UP) {
2343 i = n;
2344 subtract_one = true;
2345 goto found;
2346 }
2347
2348 return 0;
2349 }
2350
/* Remember the last item of this array: the "found" path below falls back to it when it has to
 * step back from position 0 of the next array. */
2351 last_p = lp;
2352
2353 n -= k;
2354 t += k;
2355 last_index = (uint64_t) -1;
2356 a = le64toh(array->entry_array.next_entry_array_offset);
2357 }
2358
2359 return 0;
2360
2361 found:
/* With subtract_one set the result is the item right before position 'i'. If 'i' is the very
 * first position of the chain there is no such item. */
2362 if (subtract_one && t == 0 && i == 0)
2363 return 0;
2364
2365 /* Let's cache this item for the next invocation */
2366 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
2367
2368 if (subtract_one && i == 0)
2369 p = last_p;
2370 else if (subtract_one)
2371 p = le64toh(array->entry_array.items[i-1]);
2372 else
2373 p = le64toh(array->entry_array.items[i]);
2374
2375 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2376 if (r < 0)
2377 return r;
2378
2379 if (ret)
2380 *ret = o;
2381
2382 if (offset)
2383 *offset = p;
2384
2385 if (idx)
2386 *idx = t + i + (subtract_one ? -1 : 0);
2387
2388 return 1;
2389 }
2390
2391 static int generic_array_bisect_plus_one(
2392 JournalFile *f,
2393 uint64_t extra,
2394 uint64_t first,
2395 uint64_t n,
2396 uint64_t needle,
2397 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2398 direction_t direction,
2399 Object **ret,
2400 uint64_t *offset,
2401 uint64_t *idx) {
2402
2403 int r;
2404 bool step_back = false;
2405 Object *o;
2406
2407 assert(f);
2408 assert(test_object);
2409
2410 if (n <= 0)
2411 return 0;
2412
2413 /* This bisects the array in object 'first', but first checks
2414 * an extra */
2415 r = test_object(f, extra, needle);
2416 if (r < 0)
2417 return r;
2418
2419 if (r == TEST_FOUND)
2420 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2421
2422 /* if we are looking with DIRECTION_UP then we need to first
2423 see if in the actual array there is a matching entry, and
2424 return the last one of that. But if there isn't any we need
2425 to return this one. Hence remember this, and return it
2426 below. */
2427 if (r == TEST_LEFT)
2428 step_back = direction == DIRECTION_UP;
2429
2430 if (r == TEST_RIGHT) {
2431 if (direction == DIRECTION_DOWN)
2432 goto found;
2433 else
2434 return 0;
2435 }
2436
2437 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2438
2439 if (r == 0 && step_back)
2440 goto found;
2441
2442 if (r > 0 && idx)
2443 (*idx)++;
2444
2445 return r;
2446
2447 found:
2448 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2449 if (r < 0)
2450 return r;
2451
2452 if (ret)
2453 *ret = o;
2454
2455 if (offset)
2456 *offset = extra;
2457
2458 if (idx)
2459 *idx = 0;
2460
2461 return 1;
2462 }
2463
2464 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2465 assert(f);
2466 assert(p > 0);
2467
2468 if (p == needle)
2469 return TEST_FOUND;
2470 else if (p < needle)
2471 return TEST_LEFT;
2472 else
2473 return TEST_RIGHT;
2474 }
2475
2476 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2477 Object *o;
2478 int r;
2479
2480 assert(f);
2481 assert(p > 0);
2482
2483 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2484 if (r < 0)
2485 return r;
2486
2487 if (le64toh(o->entry.seqnum) == needle)
2488 return TEST_FOUND;
2489 else if (le64toh(o->entry.seqnum) < needle)
2490 return TEST_LEFT;
2491 else
2492 return TEST_RIGHT;
2493 }
2494
2495 int journal_file_move_to_entry_by_seqnum(
2496 JournalFile *f,
2497 uint64_t seqnum,
2498 direction_t direction,
2499 Object **ret,
2500 uint64_t *offset) {
2501 assert(f);
2502 assert(f->header);
2503
2504 return generic_array_bisect(f,
2505 le64toh(f->header->entry_array_offset),
2506 le64toh(f->header->n_entries),
2507 seqnum,
2508 test_object_seqnum,
2509 direction,
2510 ret, offset, NULL);
2511 }
2512
2513 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2514 Object *o;
2515 int r;
2516
2517 assert(f);
2518 assert(p > 0);
2519
2520 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2521 if (r < 0)
2522 return r;
2523
2524 if (le64toh(o->entry.realtime) == needle)
2525 return TEST_FOUND;
2526 else if (le64toh(o->entry.realtime) < needle)
2527 return TEST_LEFT;
2528 else
2529 return TEST_RIGHT;
2530 }
2531
2532 int journal_file_move_to_entry_by_realtime(
2533 JournalFile *f,
2534 uint64_t realtime,
2535 direction_t direction,
2536 Object **ret,
2537 uint64_t *offset) {
2538 assert(f);
2539 assert(f->header);
2540
2541 return generic_array_bisect(f,
2542 le64toh(f->header->entry_array_offset),
2543 le64toh(f->header->n_entries),
2544 realtime,
2545 test_object_realtime,
2546 direction,
2547 ret, offset, NULL);
2548 }
2549
2550 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2551 Object *o;
2552 int r;
2553
2554 assert(f);
2555 assert(p > 0);
2556
2557 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2558 if (r < 0)
2559 return r;
2560
2561 if (le64toh(o->entry.monotonic) == needle)
2562 return TEST_FOUND;
2563 else if (le64toh(o->entry.monotonic) < needle)
2564 return TEST_LEFT;
2565 else
2566 return TEST_RIGHT;
2567 }
2568
2569 static int find_data_object_by_boot_id(
2570 JournalFile *f,
2571 sd_id128_t boot_id,
2572 Object **o,
2573 uint64_t *b) {
2574
2575 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2576
2577 sd_id128_to_string(boot_id, t + 9);
2578 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2579 }
2580
2581 int journal_file_move_to_entry_by_monotonic(
2582 JournalFile *f,
2583 sd_id128_t boot_id,
2584 uint64_t monotonic,
2585 direction_t direction,
2586 Object **ret,
2587 uint64_t *offset) {
2588
2589 Object *o;
2590 int r;
2591
2592 assert(f);
2593
2594 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2595 if (r < 0)
2596 return r;
2597 if (r == 0)
2598 return -ENOENT;
2599
2600 return generic_array_bisect_plus_one(f,
2601 le64toh(o->data.entry_offset),
2602 le64toh(o->data.entry_array_offset),
2603 le64toh(o->data.n_entries),
2604 monotonic,
2605 test_object_monotonic,
2606 direction,
2607 ret, offset, NULL);
2608 }
2609
2610 void journal_file_reset_location(JournalFile *f) {
2611 f->location_type = LOCATION_HEAD;
2612 f->current_offset = 0;
2613 f->current_seqnum = 0;
2614 f->current_realtime = 0;
2615 f->current_monotonic = 0;
2616 zero(f->current_boot_id);
2617 f->current_xor_hash = 0;
2618 }
2619
2620 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2621 f->location_type = LOCATION_SEEK;
2622 f->current_offset = offset;
2623 f->current_seqnum = le64toh(o->entry.seqnum);
2624 f->current_realtime = le64toh(o->entry.realtime);
2625 f->current_monotonic = le64toh(o->entry.monotonic);
2626 f->current_boot_id = o->entry.boot_id;
2627 f->current_xor_hash = le64toh(o->entry.xor_hash);
2628 }
2629
2630 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2631 assert(af);
2632 assert(af->header);
2633 assert(bf);
2634 assert(bf->header);
2635 assert(af->location_type == LOCATION_SEEK);
2636 assert(bf->location_type == LOCATION_SEEK);
2637
2638 /* If contents and timestamps match, these entries are
2639 * identical, even if the seqnum does not match */
2640 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2641 af->current_monotonic == bf->current_monotonic &&
2642 af->current_realtime == bf->current_realtime &&
2643 af->current_xor_hash == bf->current_xor_hash)
2644 return 0;
2645
2646 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2647
2648 /* If this is from the same seqnum source, compare
2649 * seqnums */
2650 if (af->current_seqnum < bf->current_seqnum)
2651 return -1;
2652 if (af->current_seqnum > bf->current_seqnum)
2653 return 1;
2654
2655 /* Wow! This is weird, different data but the same
2656 * seqnums? Something is borked, but let's make the
2657 * best of it and compare by time. */
2658 }
2659
2660 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2661
2662 /* If the boot id matches, compare monotonic time */
2663 if (af->current_monotonic < bf->current_monotonic)
2664 return -1;
2665 if (af->current_monotonic > bf->current_monotonic)
2666 return 1;
2667 }
2668
2669 /* Otherwise, compare UTC time */
2670 if (af->current_realtime < bf->current_realtime)
2671 return -1;
2672 if (af->current_realtime > bf->current_realtime)
2673 return 1;
2674
2675 /* Finally, compare by contents */
2676 if (af->current_xor_hash < bf->current_xor_hash)
2677 return -1;
2678 if (af->current_xor_hash > bf->current_xor_hash)
2679 return 1;
2680
2681 return 0;
2682 }
2683
2684 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2685
2686 /* Increase or decrease the specified index, in the right direction. */
2687
2688 if (direction == DIRECTION_DOWN) {
2689 if (*i >= n - 1)
2690 return 0;
2691
2692 (*i) ++;
2693 } else {
2694 if (*i <= 0)
2695 return 0;
2696
2697 (*i) --;
2698 }
2699
2700 return 1;
2701 }
2702
2703 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2704
2705 /* Consider it an error if any of the two offsets is uninitialized */
2706 if (old_offset == 0 || new_offset == 0)
2707 return false;
2708
2709 /* If we go down, the new offset must be larger than the old one. */
2710 return direction == DIRECTION_DOWN ?
2711 new_offset > old_offset :
2712 new_offset < old_offset;
2713 }
2714
2715 int journal_file_next_entry(
2716 JournalFile *f,
2717 uint64_t p,
2718 direction_t direction,
2719 Object **ret, uint64_t *offset) {
2720
2721 uint64_t i, n, ofs;
2722 int r;
2723
2724 assert(f);
2725 assert(f->header);
2726
2727 n = le64toh(f->header->n_entries);
2728 if (n <= 0)
2729 return 0;
2730
2731 if (p == 0)
2732 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2733 else {
2734 r = generic_array_bisect(f,
2735 le64toh(f->header->entry_array_offset),
2736 le64toh(f->header->n_entries),
2737 p,
2738 test_object_offset,
2739 DIRECTION_DOWN,
2740 NULL, NULL,
2741 &i);
2742 if (r <= 0)
2743 return r;
2744
2745 r = bump_array_index(&i, direction, n);
2746 if (r <= 0)
2747 return r;
2748 }
2749
2750 /* And jump to it */
2751 for (;;) {
2752 r = generic_array_get(f,
2753 le64toh(f->header->entry_array_offset),
2754 i,
2755 ret, &ofs);
2756 if (r > 0)
2757 break;
2758 if (r != -EBADMSG)
2759 return r;
2760
2761 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2762 * the next one might work for us instead. */
2763 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2764
2765 r = bump_array_index(&i, direction, n);
2766 if (r <= 0)
2767 return r;
2768 }
2769
2770 /* Ensure our array is properly ordered. */
2771 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2772 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
2773 return -EBADMSG;
2774 }
2775
2776 if (offset)
2777 *offset = ofs;
2778
2779 return 1;
2780 }
2781
2782 int journal_file_next_entry_for_data(
2783 JournalFile *f,
2784 Object *o, uint64_t p,
2785 uint64_t data_offset,
2786 direction_t direction,
2787 Object **ret, uint64_t *offset) {
2788
2789 uint64_t i, n, ofs;
2790 Object *d;
2791 int r;
2792
2793 assert(f);
2794 assert(p > 0 || !o);
2795
2796 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2797 if (r < 0)
2798 return r;
2799
2800 n = le64toh(d->data.n_entries);
2801 if (n <= 0)
2802 return n;
2803
2804 if (!o)
2805 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2806 else {
2807 if (o->object.type != OBJECT_ENTRY)
2808 return -EINVAL;
2809
2810 r = generic_array_bisect_plus_one(f,
2811 le64toh(d->data.entry_offset),
2812 le64toh(d->data.entry_array_offset),
2813 le64toh(d->data.n_entries),
2814 p,
2815 test_object_offset,
2816 DIRECTION_DOWN,
2817 NULL, NULL,
2818 &i);
2819
2820 if (r <= 0)
2821 return r;
2822
2823 r = bump_array_index(&i, direction, n);
2824 if (r <= 0)
2825 return r;
2826 }
2827
2828 for (;;) {
2829 r = generic_array_get_plus_one(f,
2830 le64toh(d->data.entry_offset),
2831 le64toh(d->data.entry_array_offset),
2832 i,
2833 ret, &ofs);
2834 if (r > 0)
2835 break;
2836 if (r != -EBADMSG)
2837 return r;
2838
2839 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2840
2841 r = bump_array_index(&i, direction, n);
2842 if (r <= 0)
2843 return r;
2844 }
2845
2846 /* Ensure our array is properly ordered. */
2847 if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2848 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2849 return -EBADMSG;
2850 }
2851
2852 if (offset)
2853 *offset = ofs;
2854
2855 return 1;
2856 }
2857
2858 int journal_file_move_to_entry_by_offset_for_data(
2859 JournalFile *f,
2860 uint64_t data_offset,
2861 uint64_t p,
2862 direction_t direction,
2863 Object **ret, uint64_t *offset) {
2864
2865 int r;
2866 Object *d;
2867
2868 assert(f);
2869
2870 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2871 if (r < 0)
2872 return r;
2873
2874 return generic_array_bisect_plus_one(f,
2875 le64toh(d->data.entry_offset),
2876 le64toh(d->data.entry_array_offset),
2877 le64toh(d->data.n_entries),
2878 p,
2879 test_object_offset,
2880 direction,
2881 ret, offset, NULL);
2882 }
2883
2884 int journal_file_move_to_entry_by_monotonic_for_data(
2885 JournalFile *f,
2886 uint64_t data_offset,
2887 sd_id128_t boot_id,
2888 uint64_t monotonic,
2889 direction_t direction,
2890 Object **ret, uint64_t *offset) {
2891
2892 Object *o, *d;
2893 int r;
2894 uint64_t b, z;
2895
2896 assert(f);
2897
2898 /* First, seek by time */
2899 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2900 if (r < 0)
2901 return r;
2902 if (r == 0)
2903 return -ENOENT;
2904
2905 r = generic_array_bisect_plus_one(f,
2906 le64toh(o->data.entry_offset),
2907 le64toh(o->data.entry_array_offset),
2908 le64toh(o->data.n_entries),
2909 monotonic,
2910 test_object_monotonic,
2911 direction,
2912 NULL, &z, NULL);
2913 if (r <= 0)
2914 return r;
2915
2916 /* And now, continue seeking until we find an entry that
2917 * exists in both bisection arrays */
2918
2919 for (;;) {
2920 Object *qo;
2921 uint64_t p, q;
2922
2923 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2924 if (r < 0)
2925 return r;
2926
2927 r = generic_array_bisect_plus_one(f,
2928 le64toh(d->data.entry_offset),
2929 le64toh(d->data.entry_array_offset),
2930 le64toh(d->data.n_entries),
2931 z,
2932 test_object_offset,
2933 direction,
2934 NULL, &p, NULL);
2935 if (r <= 0)
2936 return r;
2937
2938 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2939 if (r < 0)
2940 return r;
2941
2942 r = generic_array_bisect_plus_one(f,
2943 le64toh(o->data.entry_offset),
2944 le64toh(o->data.entry_array_offset),
2945 le64toh(o->data.n_entries),
2946 p,
2947 test_object_offset,
2948 direction,
2949 &qo, &q, NULL);
2950
2951 if (r <= 0)
2952 return r;
2953
2954 if (p == q) {
2955 if (ret)
2956 *ret = qo;
2957 if (offset)
2958 *offset = q;
2959
2960 return 1;
2961 }
2962
2963 z = q;
2964 }
2965 }
2966
2967 int journal_file_move_to_entry_by_seqnum_for_data(
2968 JournalFile *f,
2969 uint64_t data_offset,
2970 uint64_t seqnum,
2971 direction_t direction,
2972 Object **ret, uint64_t *offset) {
2973
2974 Object *d;
2975 int r;
2976
2977 assert(f);
2978
2979 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2980 if (r < 0)
2981 return r;
2982
2983 return generic_array_bisect_plus_one(f,
2984 le64toh(d->data.entry_offset),
2985 le64toh(d->data.entry_array_offset),
2986 le64toh(d->data.n_entries),
2987 seqnum,
2988 test_object_seqnum,
2989 direction,
2990 ret, offset, NULL);
2991 }
2992
2993 int journal_file_move_to_entry_by_realtime_for_data(
2994 JournalFile *f,
2995 uint64_t data_offset,
2996 uint64_t realtime,
2997 direction_t direction,
2998 Object **ret, uint64_t *offset) {
2999
3000 Object *d;
3001 int r;
3002
3003 assert(f);
3004
3005 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
3006 if (r < 0)
3007 return r;
3008
3009 return generic_array_bisect_plus_one(f,
3010 le64toh(d->data.entry_offset),
3011 le64toh(d->data.entry_array_offset),
3012 le64toh(d->data.n_entries),
3013 realtime,
3014 test_object_realtime,
3015 direction,
3016 ret, offset, NULL);
3017 }
3018
3019 void journal_file_dump(JournalFile *f) {
3020 Object *o;
3021 int r;
3022 uint64_t p;
3023
3024 assert(f);
3025 assert(f->header);
3026
3027 journal_file_print_header(f);
3028
3029 p = le64toh(f->header->header_size);
3030 while (p != 0) {
3031 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
3032 if (r < 0)
3033 goto fail;
3034
3035 switch (o->object.type) {
3036
3037 case OBJECT_UNUSED:
3038 printf("Type: OBJECT_UNUSED\n");
3039 break;
3040
3041 case OBJECT_DATA:
3042 printf("Type: OBJECT_DATA\n");
3043 break;
3044
3045 case OBJECT_FIELD:
3046 printf("Type: OBJECT_FIELD\n");
3047 break;
3048
3049 case OBJECT_ENTRY:
3050 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3051 le64toh(o->entry.seqnum),
3052 le64toh(o->entry.monotonic),
3053 le64toh(o->entry.realtime));
3054 break;
3055
3056 case OBJECT_FIELD_HASH_TABLE:
3057 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3058 break;
3059
3060 case OBJECT_DATA_HASH_TABLE:
3061 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3062 break;
3063
3064 case OBJECT_ENTRY_ARRAY:
3065 printf("Type: OBJECT_ENTRY_ARRAY\n");
3066 break;
3067
3068 case OBJECT_TAG:
3069 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3070 le64toh(o->tag.seqnum),
3071 le64toh(o->tag.epoch));
3072 break;
3073
3074 default:
3075 printf("Type: unknown (%i)\n", o->object.type);
3076 break;
3077 }
3078
3079 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3080 printf("Flags: %s\n",
3081 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3082
3083 if (p == le64toh(f->header->tail_object_offset))
3084 p = 0;
3085 else
3086 p = p + ALIGN64(le64toh(o->object.size));
3087 }
3088
3089 return;
3090 fail:
3091 log_error("File corrupt");
3092 }
3093
3094 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3095 const char *x;
3096
3097 x = format_timestamp(buf, l, t);
3098 if (x)
3099 return x;
3100 return " --- ";
3101 }
3102
3103 void journal_file_print_header(JournalFile *f) {
3104 char a[33], b[33], c[33], d[33];
3105 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
3106 struct stat st;
3107 char bytes[FORMAT_BYTES_MAX];
3108
3109 assert(f);
3110 assert(f->header);
3111
3112 printf("File Path: %s\n"
3113 "File ID: %s\n"
3114 "Machine ID: %s\n"
3115 "Boot ID: %s\n"
3116 "Sequential Number ID: %s\n"
3117 "State: %s\n"
3118 "Compatible Flags:%s%s\n"
3119 "Incompatible Flags:%s%s%s\n"
3120 "Header size: %"PRIu64"\n"
3121 "Arena size: %"PRIu64"\n"
3122 "Data Hash Table Size: %"PRIu64"\n"
3123 "Field Hash Table Size: %"PRIu64"\n"
3124 "Rotate Suggested: %s\n"
3125 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3126 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3127 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3128 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3129 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
3130 "Objects: %"PRIu64"\n"
3131 "Entry Objects: %"PRIu64"\n",
3132 f->path,
3133 sd_id128_to_string(f->header->file_id, a),
3134 sd_id128_to_string(f->header->machine_id, b),
3135 sd_id128_to_string(f->header->boot_id, c),
3136 sd_id128_to_string(f->header->seqnum_id, d),
3137 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3138 f->header->state == STATE_ONLINE ? "ONLINE" :
3139 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3140 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3141 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3142 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3143 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3144 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3145 le64toh(f->header->header_size),
3146 le64toh(f->header->arena_size),
3147 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3148 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3149 yes_no(journal_file_rotate_suggested(f, 0)),
3150 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3151 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3152 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3153 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3154 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3155 le64toh(f->header->n_objects),
3156 le64toh(f->header->n_entries));
3157
3158 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3159 printf("Data Objects: %"PRIu64"\n"
3160 "Data Hash Table Fill: %.1f%%\n",
3161 le64toh(f->header->n_data),
3162 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3163
3164 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3165 printf("Field Objects: %"PRIu64"\n"
3166 "Field Hash Table Fill: %.1f%%\n",
3167 le64toh(f->header->n_fields),
3168 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3169
3170 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3171 printf("Tag Objects: %"PRIu64"\n",
3172 le64toh(f->header->n_tags));
3173 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3174 printf("Entry Array Objects: %"PRIu64"\n",
3175 le64toh(f->header->n_entry_arrays));
3176
3177 if (fstat(f->fd, &st) >= 0)
3178 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
3179 }
3180
3181 static int journal_file_warn_btrfs(JournalFile *f) {
3182 unsigned attrs;
3183 int r;
3184
3185 assert(f);
3186
3187 /* Before we write anything, check if the COW logic is turned
3188 * off on btrfs. Given our write pattern that is quite
3189 * unfriendly to COW file systems this should greatly improve
3190 * performance on COW file systems, such as btrfs, at the
3191 * expense of data integrity features (which shouldn't be too
3192 * bad, given that we do our own checksumming). */
3193
3194 r = btrfs_is_filesystem(f->fd);
3195 if (r < 0)
3196 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3197 if (!r)
3198 return 0;
3199
3200 r = read_attr_fd(f->fd, &attrs);
3201 if (r < 0)
3202 return log_warning_errno(r, "Failed to read file attributes: %m");
3203
3204 if (attrs & FS_NOCOW_FL) {
3205 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3206 return 0;
3207 }
3208
3209 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3210 "This is likely to slow down journal access substantially, please consider turning "
3211 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3212
3213 return 1;
3214 }
3215
3216 int journal_file_open(
3217 int fd,
3218 const char *fname,
3219 int flags,
3220 mode_t mode,
3221 bool compress,
3222 bool seal,
3223 JournalMetrics *metrics,
3224 MMapCache *mmap_cache,
3225 Set *deferred_closes,
3226 JournalFile *template,
3227 JournalFile **ret) {
3228
3229 bool newly_created = false;
3230 JournalFile *f;
3231 void *h;
3232 int r;
3233
3234 assert(ret);
3235 assert(fd >= 0 || fname);
3236
3237 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
3238 return -EINVAL;
3239
3240 if (fname) {
3241 if (!endswith(fname, ".journal") &&
3242 !endswith(fname, ".journal~"))
3243 return -EINVAL;
3244 }
3245
3246 f = new0(JournalFile, 1);
3247 if (!f)
3248 return -ENOMEM;
3249
3250 f->fd = fd;
3251 f->mode = mode;
3252
3253 f->flags = flags;
3254 f->prot = prot_from_flags(flags);
3255 f->writable = (flags & O_ACCMODE) != O_RDONLY;
3256 #if HAVE_LZ4
3257 f->compress_lz4 = compress;
3258 #elif HAVE_XZ
3259 f->compress_xz = compress;
3260 #endif
3261 #if HAVE_GCRYPT
3262 f->seal = seal;
3263 #endif
3264
3265 if (mmap_cache)
3266 f->mmap = mmap_cache_ref(mmap_cache);
3267 else {
3268 f->mmap = mmap_cache_new();
3269 if (!f->mmap) {
3270 r = -ENOMEM;
3271 goto fail;
3272 }
3273 }
3274
3275 if (fname) {
3276 f->path = strdup(fname);
3277 if (!f->path) {
3278 r = -ENOMEM;
3279 goto fail;
3280 }
3281 } else {
3282 /* If we don't know the path, fill in something explanatory and vaguely useful */
3283 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3284 r = -ENOMEM;
3285 goto fail;
3286 }
3287 }
3288
3289 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3290 if (!f->chain_cache) {
3291 r = -ENOMEM;
3292 goto fail;
3293 }
3294
3295 if (f->fd < 0) {
3296 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
3297 if (f->fd < 0) {
3298 r = -errno;
3299 goto fail;
3300 }
3301
3302 /* fds we opened here by us should also be closed by us. */
3303 f->close_fd = true;
3304 }
3305
3306 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3307 if (!f->cache_fd) {
3308 r = -ENOMEM;
3309 goto fail;
3310 }
3311
3312 r = journal_file_fstat(f);
3313 if (r < 0)
3314 goto fail;
3315
3316 if (f->last_stat.st_size == 0 && f->writable) {
3317
3318 (void) journal_file_warn_btrfs(f);
3319
3320 /* Let's attach the creation time to the journal file,
3321 * so that the vacuuming code knows the age of this
3322 * file even if the file might end up corrupted one
3323 * day... Ideally we'd just use the creation time many
3324 * file systems maintain for each file, but there is
3325 * currently no usable API to query this, hence let's
3326 * emulate this via extended attributes. If extended
3327 * attributes are not supported we'll just skip this,
3328 * and rely solely on mtime/atime/ctime of the file. */
3329
3330 fd_setcrtime(f->fd, 0);
3331
3332 #if HAVE_GCRYPT
3333 /* Try to load the FSPRG state, and if we can't, then
3334 * just don't do sealing */
3335 if (f->seal) {
3336 r = journal_file_fss_load(f);
3337 if (r < 0)
3338 f->seal = false;
3339 }
3340 #endif
3341
3342 r = journal_file_init_header(f, template);
3343 if (r < 0)
3344 goto fail;
3345
3346 r = journal_file_fstat(f);
3347 if (r < 0)
3348 goto fail;
3349
3350 newly_created = true;
3351 }
3352
3353 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3354 r = -ENODATA;
3355 goto fail;
3356 }
3357
3358 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
3359 if (r < 0)
3360 goto fail;
3361
3362 f->header = h;
3363
3364 if (!newly_created) {
3365 set_clear_with_destructor(deferred_closes, journal_file_close);
3366
3367 r = journal_file_verify_header(f);
3368 if (r < 0)
3369 goto fail;
3370 }
3371
3372 #if HAVE_GCRYPT
3373 if (!newly_created && f->writable) {
3374 r = journal_file_fss_load(f);
3375 if (r < 0)
3376 goto fail;
3377 }
3378 #endif
3379
3380 if (f->writable) {
3381 if (metrics) {
3382 journal_default_metrics(metrics, f->fd);
3383 f->metrics = *metrics;
3384 } else if (template)
3385 f->metrics = template->metrics;
3386
3387 r = journal_file_refresh_header(f);
3388 if (r < 0)
3389 goto fail;
3390 }
3391
3392 #if HAVE_GCRYPT
3393 r = journal_file_hmac_setup(f);
3394 if (r < 0)
3395 goto fail;
3396 #endif
3397
3398 if (newly_created) {
3399 r = journal_file_setup_field_hash_table(f);
3400 if (r < 0)
3401 goto fail;
3402
3403 r = journal_file_setup_data_hash_table(f);
3404 if (r < 0)
3405 goto fail;
3406
3407 #if HAVE_GCRYPT
3408 r = journal_file_append_first_tag(f);
3409 if (r < 0)
3410 goto fail;
3411 #endif
3412 }
3413
3414 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
3415 r = -EIO;
3416 goto fail;
3417 }
3418
3419 if (template && template->post_change_timer) {
3420 r = journal_file_enable_post_change_timer(
3421 f,
3422 sd_event_source_get_event(template->post_change_timer),
3423 template->post_change_timer_period);
3424
3425 if (r < 0)
3426 goto fail;
3427 }
3428
3429 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3430 f->close_fd = true;
3431
3432 *ret = f;
3433 return 0;
3434
3435 fail:
3436 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
3437 r = -EIO;
3438
3439 (void) journal_file_close(f);
3440
3441 return r;
3442 }
3443
3444 int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
3445 _cleanup_free_ char *p = NULL;
3446 size_t l;
3447 JournalFile *old_file, *new_file = NULL;
3448 int r;
3449
3450 assert(f);
3451 assert(*f);
3452
3453 old_file = *f;
3454
3455 if (!old_file->writable)
3456 return -EINVAL;
3457
3458 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3459 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3460 if (path_startswith(old_file->path, "/proc/self/fd"))
3461 return -EINVAL;
3462
3463 if (!endswith(old_file->path, ".journal"))
3464 return -EINVAL;
3465
3466 l = strlen(old_file->path);
3467 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3468 (int) l - 8, old_file->path,
3469 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3470 le64toh((*f)->header->head_entry_seqnum),
3471 le64toh((*f)->header->head_entry_realtime));
3472 if (r < 0)
3473 return -ENOMEM;
3474
3475 /* Try to rename the file to the archived version. If the file
3476 * already was deleted, we'll get ENOENT, let's ignore that
3477 * case. */
3478 r = rename(old_file->path, p);
3479 if (r < 0 && errno != ENOENT)
3480 return -errno;
3481
3482 /* Sync the rename to disk */
3483 (void) fsync_directory_of_file(old_file->fd);
3484
3485 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3486 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3487 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3488 * would result in the rotated journal never getting fsync() called before closing.
3489 * Now we simply queue the archive state by setting an archive bit, leaving the state
3490 * as STATE_ONLINE so proper offlining occurs. */
3491 old_file->archive = true;
3492
3493 /* Currently, btrfs is not very good with out write patterns
3494 * and fragments heavily. Let's defrag our journal files when
3495 * we archive them */
3496 old_file->defrag_on_close = true;
3497
3498 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
3499
3500 if (deferred_closes &&
3501 set_put(deferred_closes, old_file) >= 0)
3502 (void) journal_file_set_offline(old_file, false);
3503 else
3504 (void) journal_file_close(old_file);
3505
3506 *f = new_file;
3507 return r;
3508 }
3509
3510 int journal_file_open_reliably(
3511 const char *fname,
3512 int flags,
3513 mode_t mode,
3514 bool compress,
3515 bool seal,
3516 JournalMetrics *metrics,
3517 MMapCache *mmap_cache,
3518 Set *deferred_closes,
3519 JournalFile *template,
3520 JournalFile **ret) {
3521
3522 int r;
3523 size_t l;
3524 _cleanup_free_ char *p = NULL;
3525
3526 r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
3527 if (!IN_SET(r,
3528 -EBADMSG, /* Corrupted */
3529 -ENODATA, /* Truncated */
3530 -EHOSTDOWN, /* Other machine */
3531 -EPROTONOSUPPORT, /* Incompatible feature */
3532 -EBUSY, /* Unclean shutdown */
3533 -ESHUTDOWN, /* Already archived */
3534 -EIO, /* IO error, including SIGBUS on mmap */
3535 -EIDRM, /* File has been deleted */
3536 -ETXTBSY)) /* File is from the future */
3537 return r;
3538
3539 if ((flags & O_ACCMODE) == O_RDONLY)
3540 return r;
3541
3542 if (!(flags & O_CREAT))
3543 return r;
3544
3545 if (!endswith(fname, ".journal"))
3546 return r;
3547
3548 /* The file is corrupted. Rotate it away and try it again (but only once) */
3549
3550 l = strlen(fname);
3551 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
3552 (int) l - 8, fname,
3553 now(CLOCK_REALTIME),
3554 random_u64()) < 0)
3555 return -ENOMEM;
3556
3557 if (rename(fname, p) < 0)
3558 return -errno;
3559
3560 /* btrfs doesn't cope well with our write pattern and
3561 * fragments heavily. Let's defrag all files we rotate */
3562
3563 (void) chattr_path(p, 0, FS_NOCOW_FL);
3564 (void) btrfs_defrag(p);
3565
3566 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3567
3568 return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
3569 }
3570
3571 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3572 uint64_t i, n;
3573 uint64_t q, xor_hash = 0;
3574 int r;
3575 EntryItem *items;
3576 dual_timestamp ts;
3577
3578 assert(from);
3579 assert(to);
3580 assert(o);
3581 assert(p);
3582
3583 if (!to->writable)
3584 return -EPERM;
3585
3586 ts.monotonic = le64toh(o->entry.monotonic);
3587 ts.realtime = le64toh(o->entry.realtime);
3588
3589 n = journal_file_entry_n_items(o);
3590 /* alloca() can't take 0, hence let's allocate at least one */
3591 items = alloca(sizeof(EntryItem) * MAX(1u, n));
3592
3593 for (i = 0; i < n; i++) {
3594 uint64_t l, h;
3595 le64_t le_hash;
3596 size_t t;
3597 void *data;
3598 Object *u;
3599
3600 q = le64toh(o->entry.items[i].object_offset);
3601 le_hash = o->entry.items[i].hash;
3602
3603 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3604 if (r < 0)
3605 return r;
3606
3607 if (le_hash != o->data.hash)
3608 return -EBADMSG;
3609
3610 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3611 t = (size_t) l;
3612
3613 /* We hit the limit on 32bit machines */
3614 if ((uint64_t) t != l)
3615 return -E2BIG;
3616
3617 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3618 #if HAVE_XZ || HAVE_LZ4
3619 size_t rsize = 0;
3620
3621 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3622 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3623 if (r < 0)
3624 return r;
3625
3626 data = from->compress_buffer;
3627 l = rsize;
3628 #else
3629 return -EPROTONOSUPPORT;
3630 #endif
3631 } else
3632 data = o->data.payload;
3633
3634 r = journal_file_append_data(to, data, l, &u, &h);
3635 if (r < 0)
3636 return r;
3637
3638 xor_hash ^= le64toh(u->data.hash);
3639 items[i].object_offset = htole64(h);
3640 items[i].hash = u->data.hash;
3641
3642 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3643 if (r < 0)
3644 return r;
3645 }
3646
3647 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3648
3649 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3650 return -EIO;
3651
3652 return r;
3653 }
3654
3655 void journal_reset_metrics(JournalMetrics *m) {
3656 assert(m);
3657
3658 /* Set everything to "pick automatic values". */
3659
3660 *m = (JournalMetrics) {
3661 .min_use = (uint64_t) -1,
3662 .max_use = (uint64_t) -1,
3663 .min_size = (uint64_t) -1,
3664 .max_size = (uint64_t) -1,
3665 .keep_free = (uint64_t) -1,
3666 .n_max_files = (uint64_t) -1,
3667 };
3668 }
3669
3670 void journal_default_metrics(JournalMetrics *m, int fd) {
3671 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3672 struct statvfs ss;
3673 uint64_t fs_size;
3674
3675 assert(m);
3676 assert(fd >= 0);
3677
3678 if (fstatvfs(fd, &ss) >= 0)
3679 fs_size = ss.f_frsize * ss.f_blocks;
3680 else {
3681 log_debug_errno(errno, "Failed to detremine disk size: %m");
3682 fs_size = 0;
3683 }
3684
3685 if (m->max_use == (uint64_t) -1) {
3686
3687 if (fs_size > 0) {
3688 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3689
3690 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3691 m->max_use = DEFAULT_MAX_USE_UPPER;
3692
3693 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3694 m->max_use = DEFAULT_MAX_USE_LOWER;
3695 } else
3696 m->max_use = DEFAULT_MAX_USE_LOWER;
3697 } else {
3698 m->max_use = PAGE_ALIGN(m->max_use);
3699
3700 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3701 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3702 }
3703
3704 if (m->min_use == (uint64_t) -1)
3705 m->min_use = DEFAULT_MIN_USE;
3706
3707 if (m->min_use > m->max_use)
3708 m->min_use = m->max_use;
3709
3710 if (m->max_size == (uint64_t) -1) {
3711 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3712
3713 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3714 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3715 } else
3716 m->max_size = PAGE_ALIGN(m->max_size);
3717
3718 if (m->max_size != 0) {
3719 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3720 m->max_size = JOURNAL_FILE_SIZE_MIN;
3721
3722 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3723 m->max_use = m->max_size*2;
3724 }
3725
3726 if (m->min_size == (uint64_t) -1)
3727 m->min_size = JOURNAL_FILE_SIZE_MIN;
3728 else {
3729 m->min_size = PAGE_ALIGN(m->min_size);
3730
3731 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3732 m->min_size = JOURNAL_FILE_SIZE_MIN;
3733
3734 if (m->max_size != 0 && m->min_size > m->max_size)
3735 m->max_size = m->min_size;
3736 }
3737
3738 if (m->keep_free == (uint64_t) -1) {
3739
3740 if (fs_size > 0) {
3741 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3742
3743 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3744 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3745
3746 } else
3747 m->keep_free = DEFAULT_KEEP_FREE;
3748 }
3749
3750 if (m->n_max_files == (uint64_t) -1)
3751 m->n_max_files = DEFAULT_N_MAX_FILES;
3752
3753 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3754 format_bytes(a, sizeof(a), m->min_use),
3755 format_bytes(b, sizeof(b), m->max_use),
3756 format_bytes(c, sizeof(c), m->max_size),
3757 format_bytes(d, sizeof(d), m->min_size),
3758 format_bytes(e, sizeof(e), m->keep_free),
3759 m->n_max_files);
3760 }
3761
3762 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3763 assert(f);
3764 assert(f->header);
3765 assert(from || to);
3766
3767 if (from) {
3768 if (f->header->head_entry_realtime == 0)
3769 return -ENOENT;
3770
3771 *from = le64toh(f->header->head_entry_realtime);
3772 }
3773
3774 if (to) {
3775 if (f->header->tail_entry_realtime == 0)
3776 return -ENOENT;
3777
3778 *to = le64toh(f->header->tail_entry_realtime);
3779 }
3780
3781 return 1;
3782 }
3783
3784 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3785 Object *o;
3786 uint64_t p;
3787 int r;
3788
3789 assert(f);
3790 assert(from || to);
3791
3792 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3793 if (r <= 0)
3794 return r;
3795
3796 if (le64toh(o->data.n_entries) <= 0)
3797 return 0;
3798
3799 if (from) {
3800 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3801 if (r < 0)
3802 return r;
3803
3804 *from = le64toh(o->entry.monotonic);
3805 }
3806
3807 if (to) {
3808 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3809 if (r < 0)
3810 return r;
3811
3812 r = generic_array_get_plus_one(f,
3813 le64toh(o->data.entry_offset),
3814 le64toh(o->data.entry_array_offset),
3815 le64toh(o->data.n_entries)-1,
3816 &o, NULL);
3817 if (r <= 0)
3818 return r;
3819
3820 *to = le64toh(o->entry.monotonic);
3821 }
3822
3823 return 1;
3824 }
3825
3826 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3827 assert(f);
3828 assert(f->header);
3829
3830 /* If we gained new header fields we gained new features,
3831 * hence suggest a rotation */
3832 if (le64toh(f->header->header_size) < sizeof(Header)) {
3833 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3834 return true;
3835 }
3836
3837 /* Let's check if the hash tables grew over a certain fill
3838 * level (75%, borrowing this value from Java's hash table
3839 * implementation), and if so suggest a rotation. To calculate
3840 * the fill level we need the n_data field, which only exists
3841 * in newer versions. */
3842
3843 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3844 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3845 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3846 f->path,
3847 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3848 le64toh(f->header->n_data),
3849 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3850 (unsigned long long) f->last_stat.st_size,
3851 f->last_stat.st_size / le64toh(f->header->n_data));
3852 return true;
3853 }
3854
3855 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3856 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3857 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3858 f->path,
3859 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3860 le64toh(f->header->n_fields),
3861 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3862 return true;
3863 }
3864
3865 /* Are the data objects properly indexed by field objects? */
3866 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3867 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3868 le64toh(f->header->n_data) > 0 &&
3869 le64toh(f->header->n_fields) == 0)
3870 return true;
3871
3872 if (max_file_usec > 0) {
3873 usec_t t, h;
3874
3875 h = le64toh(f->header->head_entry_realtime);
3876 t = now(CLOCK_REALTIME);
3877
3878 if (h > 0 && t > h + max_file_usec)
3879 return true;
3880 }
3881
3882 return false;
3883 }