]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
Merge pull request #7395 from poettering/nametohandleat-loop
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <linux/fs.h>
24 #include <pthread.h>
25 #include <stddef.h>
26 #include <sys/mman.h>
27 #include <sys/statvfs.h>
28 #include <sys/uio.h>
29 #include <unistd.h>
30
31 #include "alloc-util.h"
32 #include "btrfs-util.h"
33 #include "chattr-util.h"
34 #include "compress.h"
35 #include "fd-util.h"
36 #include "journal-authenticate.h"
37 #include "journal-def.h"
38 #include "journal-file.h"
39 #include "lookup3.h"
40 #include "parse-util.h"
41 #include "path-util.h"
42 #include "random-util.h"
43 #include "sd-event.h"
44 #include "set.h"
45 #include "string-util.h"
46 #include "strv.h"
47 #include "xattr-util.h"
48
/* Sizes in bytes of the hash tables of a newly set up file: the data value is the lower bound applied
 * when the data hash table is dimensioned, the field value is used as-is for the field hash table. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* NOTE(review): presumably the payload size in bytes from which compression is attempted; the code using
 * this constant is not visible in this part of the file — confirm. */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header: we pick the one right above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX

/* Only when building with clang: silence -Waddress-of-packed-member from here on. */
#ifdef __clang__
# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
97
98 /* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
99 * As a result we use atomic operations on f->offline_state for inter-thread communications with
100 * journal_file_set_offline() and journal_file_set_online(). */
101 static void journal_file_set_offline_internal(JournalFile *f) {
102 assert(f);
103 assert(f->fd >= 0);
104 assert(f->header);
105
106 for (;;) {
107 switch (f->offline_state) {
108 case OFFLINE_CANCEL:
109 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
110 continue;
111 return;
112
113 case OFFLINE_AGAIN_FROM_SYNCING:
114 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
115 continue;
116 break;
117
118 case OFFLINE_AGAIN_FROM_OFFLINING:
119 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
120 continue;
121 break;
122
123 case OFFLINE_SYNCING:
124 (void) fsync(f->fd);
125
126 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
127 continue;
128
129 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
130 (void) fsync(f->fd);
131 break;
132
133 case OFFLINE_OFFLINING:
134 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
135 continue;
136 _fallthrough_;
137 case OFFLINE_DONE:
138 return;
139
140 case OFFLINE_JOINED:
141 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
142 return;
143 }
144 }
145 }
146
147 static void * journal_file_set_offline_thread(void *arg) {
148 JournalFile *f = arg;
149
150 journal_file_set_offline_internal(f);
151
152 return NULL;
153 }
154
155 static int journal_file_set_offline_thread_join(JournalFile *f) {
156 int r;
157
158 assert(f);
159
160 if (f->offline_state == OFFLINE_JOINED)
161 return 0;
162
163 r = pthread_join(f->offline_thread, NULL);
164 if (r)
165 return -r;
166
167 f->offline_state = OFFLINE_JOINED;
168
169 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
170 return -EIO;
171
172 return 0;
173 }
174
175 /* Trigger a restart if the offline thread is mid-flight in a restartable state. */
176 static bool journal_file_set_offline_try_restart(JournalFile *f) {
177 for (;;) {
178 switch (f->offline_state) {
179 case OFFLINE_AGAIN_FROM_SYNCING:
180 case OFFLINE_AGAIN_FROM_OFFLINING:
181 return true;
182
183 case OFFLINE_CANCEL:
184 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
185 continue;
186 return true;
187
188 case OFFLINE_SYNCING:
189 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
190 continue;
191 return true;
192
193 case OFFLINE_OFFLINING:
194 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
195 continue;
196 return true;
197
198 default:
199 return false;
200 }
201 }
202 }
203
204 /* Sets a journal offline.
205 *
206 * If wait is false then an offline is dispatched in a separate thread for a
207 * subsequent journal_file_set_offline() or journal_file_set_online() of the
208 * same journal to synchronize with.
209 *
210 * If wait is true, then either an existing offline thread will be restarted
211 * and joined, or if none exists the offline is simply performed in this
212 * context without involving another thread.
213 */
214 int journal_file_set_offline(JournalFile *f, bool wait) {
215 bool restarted;
216 int r;
217
218 assert(f);
219
220 if (!f->writable)
221 return -EPERM;
222
223 if (!(f->fd >= 0 && f->header))
224 return -EINVAL;
225
226 /* An offlining journal is implicitly online and may modify f->header->state,
227 * we must also join any potentially lingering offline thread when not online. */
228 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
229 return journal_file_set_offline_thread_join(f);
230
231 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
232 restarted = journal_file_set_offline_try_restart(f);
233 if ((restarted && wait) || !restarted) {
234 r = journal_file_set_offline_thread_join(f);
235 if (r < 0)
236 return r;
237 }
238
239 if (restarted)
240 return 0;
241
242 /* Initiate a new offline. */
243 f->offline_state = OFFLINE_SYNCING;
244
245 if (wait) /* Without using a thread if waiting. */
246 journal_file_set_offline_internal(f);
247 else {
248 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
249 if (r > 0) {
250 f->offline_state = OFFLINE_JOINED;
251 return -r;
252 }
253 }
254
255 return 0;
256 }
257
258 static int journal_file_set_online(JournalFile *f) {
259 bool joined = false;
260
261 assert(f);
262
263 if (!f->writable)
264 return -EPERM;
265
266 if (!(f->fd >= 0 && f->header))
267 return -EINVAL;
268
269 while (!joined) {
270 switch (f->offline_state) {
271 case OFFLINE_JOINED:
272 /* No offline thread, no need to wait. */
273 joined = true;
274 break;
275
276 case OFFLINE_SYNCING:
277 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
278 continue;
279 /* Canceled syncing prior to offlining, no need to wait. */
280 break;
281
282 case OFFLINE_AGAIN_FROM_SYNCING:
283 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
284 continue;
285 /* Canceled restart from syncing, no need to wait. */
286 break;
287
288 case OFFLINE_AGAIN_FROM_OFFLINING:
289 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
290 continue;
291 /* Canceled restart from offlining, must wait for offlining to complete however. */
292 _fallthrough_;
293 default: {
294 int r;
295
296 r = journal_file_set_offline_thread_join(f);
297 if (r < 0)
298 return r;
299
300 joined = true;
301 break;
302 }
303 }
304 }
305
306 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
307 return -EIO;
308
309 switch (f->header->state) {
310 case STATE_ONLINE:
311 return 0;
312
313 case STATE_OFFLINE:
314 f->header->state = STATE_ONLINE;
315 (void) fsync(f->fd);
316 return 0;
317
318 default:
319 return -EINVAL;
320 }
321 }
322
323 bool journal_file_is_offlining(JournalFile *f) {
324 assert(f);
325
326 __sync_synchronize();
327
328 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
329 return false;
330
331 return true;
332 }
333
334 JournalFile* journal_file_close(JournalFile *f) {
335 assert(f);
336
337 #if HAVE_GCRYPT
338 /* Write the final tag */
339 if (f->seal && f->writable) {
340 int r;
341
342 r = journal_file_append_tag(f);
343 if (r < 0)
344 log_error_errno(r, "Failed to append tag when closing journal: %m");
345 }
346 #endif
347
348 if (f->post_change_timer) {
349 int enabled;
350
351 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
352 if (enabled == SD_EVENT_ONESHOT)
353 journal_file_post_change(f);
354
355 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
356 sd_event_source_unref(f->post_change_timer);
357 }
358
359 journal_file_set_offline(f, true);
360
361 if (f->mmap && f->cache_fd)
362 mmap_cache_free_fd(f->mmap, f->cache_fd);
363
364 if (f->fd >= 0 && f->defrag_on_close) {
365
366 /* Be friendly to btrfs: turn COW back on again now,
367 * and defragment the file. We won't write to the file
368 * ever again, hence remove all fragmentation, and
369 * reenable all the good bits COW usually provides
370 * (such as data checksumming). */
371
372 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
373 (void) btrfs_defrag_fd(f->fd);
374 }
375
376 if (f->close_fd)
377 safe_close(f->fd);
378 free(f->path);
379
380 mmap_cache_unref(f->mmap);
381
382 ordered_hashmap_free_free(f->chain_cache);
383
384 #if HAVE_XZ || HAVE_LZ4
385 free(f->compress_buffer);
386 #endif
387
388 #if HAVE_GCRYPT
389 if (f->fss_file)
390 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
391 else
392 free(f->fsprg_state);
393
394 free(f->fsprg_seed);
395
396 if (f->hmac)
397 gcry_md_close(f->hmac);
398 #endif
399
400 return mfree(f);
401 }
402
403 void journal_file_close_set(Set *s) {
404 JournalFile *f;
405
406 assert(s);
407
408 while ((f = set_steal_first(s)))
409 (void) journal_file_close(f);
410 }
411
412 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
413 Header h = {};
414 ssize_t k;
415 int r;
416
417 assert(f);
418
419 memcpy(h.signature, HEADER_SIGNATURE, 8);
420 h.header_size = htole64(ALIGN64(sizeof(h)));
421
422 h.incompatible_flags |= htole32(
423 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
424 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
425
426 h.compatible_flags = htole32(
427 f->seal * HEADER_COMPATIBLE_SEALED);
428
429 r = sd_id128_randomize(&h.file_id);
430 if (r < 0)
431 return r;
432
433 if (template) {
434 h.seqnum_id = template->header->seqnum_id;
435 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
436 } else
437 h.seqnum_id = h.file_id;
438
439 k = pwrite(f->fd, &h, sizeof(h), 0);
440 if (k < 0)
441 return -errno;
442
443 if (k != sizeof(h))
444 return -EIO;
445
446 return 0;
447 }
448
449 static int fsync_directory_of_file(int fd) {
450 _cleanup_free_ char *path = NULL, *dn = NULL;
451 _cleanup_close_ int dfd = -1;
452 struct stat st;
453 int r;
454
455 if (fstat(fd, &st) < 0)
456 return -errno;
457
458 if (!S_ISREG(st.st_mode))
459 return -EBADFD;
460
461 r = fd_get_path(fd, &path);
462 if (r < 0)
463 return r;
464
465 if (!path_is_absolute(path))
466 return -EINVAL;
467
468 dn = dirname_malloc(path);
469 if (!dn)
470 return -ENOMEM;
471
472 dfd = open(dn, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
473 if (dfd < 0)
474 return -errno;
475
476 if (fsync(dfd) < 0)
477 return -errno;
478
479 return 0;
480 }
481
482 static int journal_file_refresh_header(JournalFile *f) {
483 sd_id128_t boot_id;
484 int r;
485
486 assert(f);
487 assert(f->header);
488
489 r = sd_id128_get_machine(&f->header->machine_id);
490 if (r < 0)
491 return r;
492
493 r = sd_id128_get_boot(&boot_id);
494 if (r < 0)
495 return r;
496
497 if (sd_id128_equal(boot_id, f->header->boot_id))
498 f->tail_entry_monotonic_valid = true;
499
500 f->header->boot_id = boot_id;
501
502 r = journal_file_set_online(f);
503
504 /* Sync the online state to disk */
505 (void) fsync(f->fd);
506
507 /* We likely just created a new file, also sync the directory this file is located in. */
508 (void) fsync_directory_of_file(f->fd);
509
510 return r;
511 }
512
513 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
514 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
515 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
516 const char *type = compatible ? "compatible" : "incompatible";
517 uint32_t flags;
518
519 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
520
521 if (flags & ~supported) {
522 if (flags & ~any)
523 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
524 f->path, type, flags & ~any);
525 flags = (flags & any) & ~supported;
526 if (flags) {
527 const char* strv[3];
528 unsigned n = 0;
529 _cleanup_free_ char *t = NULL;
530
531 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
532 strv[n++] = "sealed";
533 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
534 strv[n++] = "xz-compressed";
535 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
536 strv[n++] = "lz4-compressed";
537 strv[n] = NULL;
538 assert(n < ELEMENTSOF(strv));
539
540 t = strv_join((char**) strv, ", ");
541 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
542 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
543 }
544 return true;
545 }
546
547 return false;
548 }
549
550 static int journal_file_verify_header(JournalFile *f) {
551 uint64_t arena_size, header_size;
552
553 assert(f);
554 assert(f->header);
555
556 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
557 return -EBADMSG;
558
559 /* In both read and write mode we refuse to open files with incompatible
560 * flags we don't know. */
561 if (warn_wrong_flags(f, false))
562 return -EPROTONOSUPPORT;
563
564 /* When open for writing we refuse to open files with compatible flags, too. */
565 if (f->writable && warn_wrong_flags(f, true))
566 return -EPROTONOSUPPORT;
567
568 if (f->header->state >= _STATE_MAX)
569 return -EBADMSG;
570
571 header_size = le64toh(f->header->header_size);
572
573 /* The first addition was n_data, so check that we are at least this large */
574 if (header_size < HEADER_SIZE_MIN)
575 return -EBADMSG;
576
577 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
578 return -EBADMSG;
579
580 arena_size = le64toh(f->header->arena_size);
581
582 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
583 return -ENODATA;
584
585 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
586 return -ENODATA;
587
588 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
589 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
590 !VALID64(le64toh(f->header->tail_object_offset)) ||
591 !VALID64(le64toh(f->header->entry_array_offset)))
592 return -ENODATA;
593
594 if (f->writable) {
595 sd_id128_t machine_id;
596 uint8_t state;
597 int r;
598
599 r = sd_id128_get_machine(&machine_id);
600 if (r < 0)
601 return r;
602
603 if (!sd_id128_equal(machine_id, f->header->machine_id))
604 return -EHOSTDOWN;
605
606 state = f->header->state;
607
608 if (state == STATE_ARCHIVED)
609 return -ESHUTDOWN; /* Already archived */
610 else if (state == STATE_ONLINE) {
611 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
612 return -EBUSY;
613 } else if (state != STATE_OFFLINE) {
614 log_debug("Journal file %s has unknown state %i.", f->path, state);
615 return -EBUSY;
616 }
617
618 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
619 return -EBADMSG;
620
621 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
622 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
623 * bisection. */
624 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
625 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
626 return -ETXTBSY;
627 }
628 }
629
630 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
631 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
632
633 f->seal = JOURNAL_HEADER_SEALED(f->header);
634
635 return 0;
636 }
637
638 static int journal_file_fstat(JournalFile *f) {
639 assert(f);
640 assert(f->fd >= 0);
641
642 if (fstat(f->fd, &f->last_stat) < 0)
643 return -errno;
644
645 f->last_stat_usec = now(CLOCK_MONOTONIC);
646
647 /* Refuse appending to files that are already deleted */
648 if (f->last_stat.st_nlink <= 0)
649 return -EIDRM;
650
651 return 0;
652 }
653
654 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
655 uint64_t old_size, new_size;
656 int r;
657
658 assert(f);
659 assert(f->header);
660
661 /* We assume that this file is not sparse, and we know that
662 * for sure, since we always call posix_fallocate()
663 * ourselves */
664
665 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
666 return -EIO;
667
668 old_size =
669 le64toh(f->header->header_size) +
670 le64toh(f->header->arena_size);
671
672 new_size = PAGE_ALIGN(offset + size);
673 if (new_size < le64toh(f->header->header_size))
674 new_size = le64toh(f->header->header_size);
675
676 if (new_size <= old_size) {
677
678 /* We already pre-allocated enough space, but before
679 * we write to it, let's check with fstat() if the
680 * file got deleted, in order make sure we don't throw
681 * away the data immediately. Don't check fstat() for
682 * all writes though, but only once ever 10s. */
683
684 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
685 return 0;
686
687 return journal_file_fstat(f);
688 }
689
690 /* Allocate more space. */
691
692 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
693 return -E2BIG;
694
695 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
696 struct statvfs svfs;
697
698 if (fstatvfs(f->fd, &svfs) >= 0) {
699 uint64_t available;
700
701 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
702
703 if (new_size - old_size > available)
704 return -E2BIG;
705 }
706 }
707
708 /* Increase by larger blocks at once */
709 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
710 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
711 new_size = f->metrics.max_size;
712
713 /* Note that the glibc fallocate() fallback is very
714 inefficient, hence we try to minimize the allocation area
715 as we can. */
716 r = posix_fallocate(f->fd, old_size, new_size - old_size);
717 if (r != 0)
718 return -r;
719
720 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
721
722 return journal_file_fstat(f);
723 }
724
725 static unsigned type_to_context(ObjectType type) {
726 /* One context for each type, plus one catch-all for the rest */
727 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
728 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
729 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
730 }
731
732 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
733 int r;
734
735 assert(f);
736 assert(ret);
737
738 if (size <= 0)
739 return -EINVAL;
740
741 /* Avoid SIGBUS on invalid accesses */
742 if (offset + size > (uint64_t) f->last_stat.st_size) {
743 /* Hmm, out of range? Let's refresh the fstat() data
744 * first, before we trust that check. */
745
746 r = journal_file_fstat(f);
747 if (r < 0)
748 return r;
749
750 if (offset + size > (uint64_t) f->last_stat.st_size)
751 return -EADDRNOTAVAIL;
752 }
753
754 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
755 }
756
757 static uint64_t minimum_header_size(Object *o) {
758
759 static const uint64_t table[] = {
760 [OBJECT_DATA] = sizeof(DataObject),
761 [OBJECT_FIELD] = sizeof(FieldObject),
762 [OBJECT_ENTRY] = sizeof(EntryObject),
763 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
764 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
765 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
766 [OBJECT_TAG] = sizeof(TagObject),
767 };
768
769 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
770 return sizeof(ObjectHeader);
771
772 return table[o->object.type];
773 }
774
775 /* Lightweight object checks. We want this to be fast, so that we won't
776 * slowdown every journal_file_move_to_object() call too much. */
777 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
778 assert(f);
779 assert(o);
780
781 switch (o->object.type) {
782
783 case OBJECT_DATA: {
784 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
785 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
786 le64toh(o->data.n_entries), offset);
787 return -EBADMSG;
788 }
789
790 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
791 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
792 offsetof(DataObject, payload),
793 le64toh(o->object.size),
794 offset);
795 return -EBADMSG;
796 }
797
798 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
799 !VALID64(le64toh(o->data.next_field_offset)) ||
800 !VALID64(le64toh(o->data.entry_offset)) ||
801 !VALID64(le64toh(o->data.entry_array_offset))) {
802 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
803 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
804 le64toh(o->data.next_hash_offset),
805 le64toh(o->data.next_field_offset),
806 le64toh(o->data.entry_offset),
807 le64toh(o->data.entry_array_offset),
808 offset);
809 return -EBADMSG;
810 }
811
812 break;
813 }
814
815 case OBJECT_FIELD:
816 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
817 log_debug(
818 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
819 offsetof(FieldObject, payload),
820 le64toh(o->object.size),
821 offset);
822 return -EBADMSG;
823 }
824
825 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
826 !VALID64(le64toh(o->field.head_data_offset))) {
827 log_debug(
828 "Invalid offset, next_hash_offset="OFSfmt
829 ", head_data_offset="OFSfmt": %"PRIu64,
830 le64toh(o->field.next_hash_offset),
831 le64toh(o->field.head_data_offset),
832 offset);
833 return -EBADMSG;
834 }
835 break;
836
837 case OBJECT_ENTRY:
838 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
839 log_debug(
840 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
841 offsetof(EntryObject, items),
842 le64toh(o->object.size),
843 offset);
844 return -EBADMSG;
845 }
846
847 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
848 log_debug(
849 "Invalid number items in entry: %"PRIu64": %"PRIu64,
850 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
851 offset);
852 return -EBADMSG;
853 }
854
855 if (le64toh(o->entry.seqnum) <= 0) {
856 log_debug(
857 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
858 le64toh(o->entry.seqnum),
859 offset);
860 return -EBADMSG;
861 }
862
863 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
864 log_debug(
865 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
866 le64toh(o->entry.realtime),
867 offset);
868 return -EBADMSG;
869 }
870
871 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
872 log_debug(
873 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
874 le64toh(o->entry.monotonic),
875 offset);
876 return -EBADMSG;
877 }
878
879 break;
880
881 case OBJECT_DATA_HASH_TABLE:
882 case OBJECT_FIELD_HASH_TABLE:
883 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
884 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
885 log_debug(
886 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
887 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
888 le64toh(o->object.size),
889 offset);
890 return -EBADMSG;
891 }
892
893 break;
894
895 case OBJECT_ENTRY_ARRAY:
896 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
897 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
898 log_debug(
899 "Invalid object entry array size: %"PRIu64": %"PRIu64,
900 le64toh(o->object.size),
901 offset);
902 return -EBADMSG;
903 }
904
905 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
906 log_debug(
907 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
908 le64toh(o->entry_array.next_entry_array_offset),
909 offset);
910 return -EBADMSG;
911 }
912
913 break;
914
915 case OBJECT_TAG:
916 if (le64toh(o->object.size) != sizeof(TagObject)) {
917 log_debug(
918 "Invalid object tag size: %"PRIu64": %"PRIu64,
919 le64toh(o->object.size),
920 offset);
921 return -EBADMSG;
922 }
923
924 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
925 log_debug(
926 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
927 le64toh(o->tag.epoch),
928 offset);
929 return -EBADMSG;
930 }
931
932 break;
933 }
934
935 return 0;
936 }
937
938 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
939 int r;
940 void *t;
941 size_t tsize;
942 Object *o;
943 uint64_t s;
944
945 assert(f);
946 assert(ret);
947
948 /* Objects may only be located at multiple of 64 bit */
949 if (!VALID64(offset)) {
950 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
951 return -EBADMSG;
952 }
953
954 /* Object may not be located in the file header */
955 if (offset < le64toh(f->header->header_size)) {
956 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
957 return -EBADMSG;
958 }
959
960 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
961 if (r < 0)
962 return r;
963
964 o = (Object*) t;
965 s = le64toh(o->object.size);
966
967 if (s == 0) {
968 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
969 return -EBADMSG;
970 }
971 if (s < sizeof(ObjectHeader)) {
972 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
973 return -EBADMSG;
974 }
975
976 if (o->object.type <= OBJECT_UNUSED) {
977 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
978 return -EBADMSG;
979 }
980
981 if (s < minimum_header_size(o)) {
982 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
983 return -EBADMSG;
984 }
985
986 if (type > OBJECT_UNUSED && o->object.type != type) {
987 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
988 return -EBADMSG;
989 }
990
991 if (s > tsize) {
992 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
993 if (r < 0)
994 return r;
995
996 o = (Object*) t;
997 }
998
999 r = journal_file_check_object(f, offset, o);
1000 if (r < 0)
1001 return r;
1002
1003 *ret = o;
1004 return 0;
1005 }
1006
1007 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
1008 uint64_t r;
1009
1010 assert(f);
1011 assert(f->header);
1012
1013 r = le64toh(f->header->tail_entry_seqnum) + 1;
1014
1015 if (seqnum) {
1016 /* If an external seqnum counter was passed, we update
1017 * both the local and the external one, and set it to
1018 * the maximum of both */
1019
1020 if (*seqnum + 1 > r)
1021 r = *seqnum + 1;
1022
1023 *seqnum = r;
1024 }
1025
1026 f->header->tail_entry_seqnum = htole64(r);
1027
1028 if (f->header->head_entry_seqnum == 0)
1029 f->header->head_entry_seqnum = htole64(r);
1030
1031 return r;
1032 }
1033
1034 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
1035 int r;
1036 uint64_t p;
1037 Object *tail, *o;
1038 void *t;
1039
1040 assert(f);
1041 assert(f->header);
1042 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
1043 assert(size >= sizeof(ObjectHeader));
1044 assert(offset);
1045 assert(ret);
1046
1047 r = journal_file_set_online(f);
1048 if (r < 0)
1049 return r;
1050
1051 p = le64toh(f->header->tail_object_offset);
1052 if (p == 0)
1053 p = le64toh(f->header->header_size);
1054 else {
1055 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
1056 if (r < 0)
1057 return r;
1058
1059 p += ALIGN64(le64toh(tail->object.size));
1060 }
1061
1062 r = journal_file_allocate(f, p, size);
1063 if (r < 0)
1064 return r;
1065
1066 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
1067 if (r < 0)
1068 return r;
1069
1070 o = (Object*) t;
1071
1072 zero(o->object);
1073 o->object.type = type;
1074 o->object.size = htole64(size);
1075
1076 f->header->tail_object_offset = htole64(p);
1077 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1078
1079 *ret = o;
1080 *offset = p;
1081
1082 return 0;
1083 }
1084
1085 static int journal_file_setup_data_hash_table(JournalFile *f) {
1086 uint64_t s, p;
1087 Object *o;
1088 int r;
1089
1090 assert(f);
1091 assert(f->header);
1092
1093 /* We estimate that we need 1 hash table entry per 768 bytes
1094 of journal file and we want to make sure we never get
1095 beyond 75% fill level. Calculate the hash table size for
1096 the maximum file size based on these metrics. */
1097
1098 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
1099 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1100 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1101
1102 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
1103
1104 r = journal_file_append_object(f,
1105 OBJECT_DATA_HASH_TABLE,
1106 offsetof(Object, hash_table.items) + s,
1107 &o, &p);
1108 if (r < 0)
1109 return r;
1110
1111 memzero(o->hash_table.items, s);
1112
1113 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1114 f->header->data_hash_table_size = htole64(s);
1115
1116 return 0;
1117 }
1118
1119 static int journal_file_setup_field_hash_table(JournalFile *f) {
1120 uint64_t s, p;
1121 Object *o;
1122 int r;
1123
1124 assert(f);
1125 assert(f->header);
1126
1127 /* We use a fixed size hash table for the fields as this
1128 * number should grow very slowly only */
1129
1130 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1131 r = journal_file_append_object(f,
1132 OBJECT_FIELD_HASH_TABLE,
1133 offsetof(Object, hash_table.items) + s,
1134 &o, &p);
1135 if (r < 0)
1136 return r;
1137
1138 memzero(o->hash_table.items, s);
1139
1140 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1141 f->header->field_hash_table_size = htole64(s);
1142
1143 return 0;
1144 }
1145
1146 int journal_file_map_data_hash_table(JournalFile *f) {
1147 uint64_t s, p;
1148 void *t;
1149 int r;
1150
1151 assert(f);
1152 assert(f->header);
1153
1154 if (f->data_hash_table)
1155 return 0;
1156
1157 p = le64toh(f->header->data_hash_table_offset);
1158 s = le64toh(f->header->data_hash_table_size);
1159
1160 r = journal_file_move_to(f,
1161 OBJECT_DATA_HASH_TABLE,
1162 true,
1163 p, s,
1164 &t, NULL);
1165 if (r < 0)
1166 return r;
1167
1168 f->data_hash_table = t;
1169 return 0;
1170 }
1171
1172 int journal_file_map_field_hash_table(JournalFile *f) {
1173 uint64_t s, p;
1174 void *t;
1175 int r;
1176
1177 assert(f);
1178 assert(f->header);
1179
1180 if (f->field_hash_table)
1181 return 0;
1182
1183 p = le64toh(f->header->field_hash_table_offset);
1184 s = le64toh(f->header->field_hash_table_size);
1185
1186 r = journal_file_move_to(f,
1187 OBJECT_FIELD_HASH_TABLE,
1188 true,
1189 p, s,
1190 &t, NULL);
1191 if (r < 0)
1192 return r;
1193
1194 f->field_hash_table = t;
1195 return 0;
1196 }
1197
1198 static int journal_file_link_field(
1199 JournalFile *f,
1200 Object *o,
1201 uint64_t offset,
1202 uint64_t hash) {
1203
1204 uint64_t p, h, m;
1205 int r;
1206
1207 assert(f);
1208 assert(f->header);
1209 assert(f->field_hash_table);
1210 assert(o);
1211 assert(offset > 0);
1212
1213 if (o->object.type != OBJECT_FIELD)
1214 return -EINVAL;
1215
1216 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1217 if (m <= 0)
1218 return -EBADMSG;
1219
1220 /* This might alter the window we are looking at */
1221 o->field.next_hash_offset = o->field.head_data_offset = 0;
1222
1223 h = hash % m;
1224 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1225 if (p == 0)
1226 f->field_hash_table[h].head_hash_offset = htole64(offset);
1227 else {
1228 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1229 if (r < 0)
1230 return r;
1231
1232 o->field.next_hash_offset = htole64(offset);
1233 }
1234
1235 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1236
1237 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1238 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1239
1240 return 0;
1241 }
1242
1243 static int journal_file_link_data(
1244 JournalFile *f,
1245 Object *o,
1246 uint64_t offset,
1247 uint64_t hash) {
1248
1249 uint64_t p, h, m;
1250 int r;
1251
1252 assert(f);
1253 assert(f->header);
1254 assert(f->data_hash_table);
1255 assert(o);
1256 assert(offset > 0);
1257
1258 if (o->object.type != OBJECT_DATA)
1259 return -EINVAL;
1260
1261 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1262 if (m <= 0)
1263 return -EBADMSG;
1264
1265 /* This might alter the window we are looking at */
1266 o->data.next_hash_offset = o->data.next_field_offset = 0;
1267 o->data.entry_offset = o->data.entry_array_offset = 0;
1268 o->data.n_entries = 0;
1269
1270 h = hash % m;
1271 p = le64toh(f->data_hash_table[h].tail_hash_offset);
1272 if (p == 0)
1273 /* Only entry in the hash table is easy */
1274 f->data_hash_table[h].head_hash_offset = htole64(offset);
1275 else {
1276 /* Move back to the previous data object, to patch in
1277 * pointer */
1278
1279 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1280 if (r < 0)
1281 return r;
1282
1283 o->data.next_hash_offset = htole64(offset);
1284 }
1285
1286 f->data_hash_table[h].tail_hash_offset = htole64(offset);
1287
1288 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1289 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1290
1291 return 0;
1292 }
1293
1294 int journal_file_find_field_object_with_hash(
1295 JournalFile *f,
1296 const void *field, uint64_t size, uint64_t hash,
1297 Object **ret, uint64_t *offset) {
1298
1299 uint64_t p, osize, h, m;
1300 int r;
1301
1302 assert(f);
1303 assert(f->header);
1304 assert(field && size > 0);
1305
1306 /* If the field hash table is empty, we can't find anything */
1307 if (le64toh(f->header->field_hash_table_size) <= 0)
1308 return 0;
1309
1310 /* Map the field hash table, if it isn't mapped yet. */
1311 r = journal_file_map_field_hash_table(f);
1312 if (r < 0)
1313 return r;
1314
1315 osize = offsetof(Object, field.payload) + size;
1316
1317 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1318 if (m <= 0)
1319 return -EBADMSG;
1320
1321 h = hash % m;
1322 p = le64toh(f->field_hash_table[h].head_hash_offset);
1323
1324 while (p > 0) {
1325 Object *o;
1326
1327 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1328 if (r < 0)
1329 return r;
1330
1331 if (le64toh(o->field.hash) == hash &&
1332 le64toh(o->object.size) == osize &&
1333 memcmp(o->field.payload, field, size) == 0) {
1334
1335 if (ret)
1336 *ret = o;
1337 if (offset)
1338 *offset = p;
1339
1340 return 1;
1341 }
1342
1343 p = le64toh(o->field.next_hash_offset);
1344 }
1345
1346 return 0;
1347 }
1348
1349 int journal_file_find_field_object(
1350 JournalFile *f,
1351 const void *field, uint64_t size,
1352 Object **ret, uint64_t *offset) {
1353
1354 uint64_t hash;
1355
1356 assert(f);
1357 assert(field && size > 0);
1358
1359 hash = hash64(field, size);
1360
1361 return journal_file_find_field_object_with_hash(f,
1362 field, size, hash,
1363 ret, offset);
1364 }
1365
1366 int journal_file_find_data_object_with_hash(
1367 JournalFile *f,
1368 const void *data, uint64_t size, uint64_t hash,
1369 Object **ret, uint64_t *offset) {
1370
1371 uint64_t p, osize, h, m;
1372 int r;
1373
1374 assert(f);
1375 assert(f->header);
1376 assert(data || size == 0);
1377
1378 /* If there's no data hash table, then there's no entry. */
1379 if (le64toh(f->header->data_hash_table_size) <= 0)
1380 return 0;
1381
1382 /* Map the data hash table, if it isn't mapped yet. */
1383 r = journal_file_map_data_hash_table(f);
1384 if (r < 0)
1385 return r;
1386
1387 osize = offsetof(Object, data.payload) + size;
1388
1389 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1390 if (m <= 0)
1391 return -EBADMSG;
1392
1393 h = hash % m;
1394 p = le64toh(f->data_hash_table[h].head_hash_offset);
1395
1396 while (p > 0) {
1397 Object *o;
1398
1399 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1400 if (r < 0)
1401 return r;
1402
1403 if (le64toh(o->data.hash) != hash)
1404 goto next;
1405
1406 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
1407 #if HAVE_XZ || HAVE_LZ4
1408 uint64_t l;
1409 size_t rsize = 0;
1410
1411 l = le64toh(o->object.size);
1412 if (l <= offsetof(Object, data.payload))
1413 return -EBADMSG;
1414
1415 l -= offsetof(Object, data.payload);
1416
1417 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1418 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1419 if (r < 0)
1420 return r;
1421
1422 if (rsize == size &&
1423 memcmp(f->compress_buffer, data, size) == 0) {
1424
1425 if (ret)
1426 *ret = o;
1427
1428 if (offset)
1429 *offset = p;
1430
1431 return 1;
1432 }
1433 #else
1434 return -EPROTONOSUPPORT;
1435 #endif
1436 } else if (le64toh(o->object.size) == osize &&
1437 memcmp(o->data.payload, data, size) == 0) {
1438
1439 if (ret)
1440 *ret = o;
1441
1442 if (offset)
1443 *offset = p;
1444
1445 return 1;
1446 }
1447
1448 next:
1449 p = le64toh(o->data.next_hash_offset);
1450 }
1451
1452 return 0;
1453 }
1454
1455 int journal_file_find_data_object(
1456 JournalFile *f,
1457 const void *data, uint64_t size,
1458 Object **ret, uint64_t *offset) {
1459
1460 uint64_t hash;
1461
1462 assert(f);
1463 assert(data || size == 0);
1464
1465 hash = hash64(data, size);
1466
1467 return journal_file_find_data_object_with_hash(f,
1468 data, size, hash,
1469 ret, offset);
1470 }
1471
1472 static int journal_file_append_field(
1473 JournalFile *f,
1474 const void *field, uint64_t size,
1475 Object **ret, uint64_t *offset) {
1476
1477 uint64_t hash, p;
1478 uint64_t osize;
1479 Object *o;
1480 int r;
1481
1482 assert(f);
1483 assert(field && size > 0);
1484
1485 hash = hash64(field, size);
1486
1487 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1488 if (r < 0)
1489 return r;
1490 else if (r > 0) {
1491
1492 if (ret)
1493 *ret = o;
1494
1495 if (offset)
1496 *offset = p;
1497
1498 return 0;
1499 }
1500
1501 osize = offsetof(Object, field.payload) + size;
1502 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1503 if (r < 0)
1504 return r;
1505
1506 o->field.hash = htole64(hash);
1507 memcpy(o->field.payload, field, size);
1508
1509 r = journal_file_link_field(f, o, p, hash);
1510 if (r < 0)
1511 return r;
1512
1513 /* The linking might have altered the window, so let's
1514 * refresh our pointer */
1515 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1516 if (r < 0)
1517 return r;
1518
1519 #if HAVE_GCRYPT
1520 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1521 if (r < 0)
1522 return r;
1523 #endif
1524
1525 if (ret)
1526 *ret = o;
1527
1528 if (offset)
1529 *offset = p;
1530
1531 return 0;
1532 }
1533
1534 static int journal_file_append_data(
1535 JournalFile *f,
1536 const void *data, uint64_t size,
1537 Object **ret, uint64_t *offset) {
1538
1539 uint64_t hash, p;
1540 uint64_t osize;
1541 Object *o;
1542 int r, compression = 0;
1543 const void *eq;
1544
1545 assert(f);
1546 assert(data || size == 0);
1547
1548 hash = hash64(data, size);
1549
1550 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1551 if (r < 0)
1552 return r;
1553 if (r > 0) {
1554
1555 if (ret)
1556 *ret = o;
1557
1558 if (offset)
1559 *offset = p;
1560
1561 return 0;
1562 }
1563
1564 osize = offsetof(Object, data.payload) + size;
1565 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1566 if (r < 0)
1567 return r;
1568
1569 o->data.hash = htole64(hash);
1570
1571 #if HAVE_XZ || HAVE_LZ4
1572 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
1573 size_t rsize = 0;
1574
1575 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1576
1577 if (compression >= 0) {
1578 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1579 o->object.flags |= compression;
1580
1581 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1582 size, rsize, object_compressed_to_string(compression));
1583 } else
1584 /* Compression didn't work, we don't really care why, let's continue without compression */
1585 compression = 0;
1586 }
1587 #endif
1588
1589 if (compression == 0)
1590 memcpy_safe(o->data.payload, data, size);
1591
1592 r = journal_file_link_data(f, o, p, hash);
1593 if (r < 0)
1594 return r;
1595
1596 #if HAVE_GCRYPT
1597 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1598 if (r < 0)
1599 return r;
1600 #endif
1601
1602 /* The linking might have altered the window, so let's
1603 * refresh our pointer */
1604 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1605 if (r < 0)
1606 return r;
1607
1608 if (!data)
1609 eq = NULL;
1610 else
1611 eq = memchr(data, '=', size);
1612 if (eq && eq > data) {
1613 Object *fo = NULL;
1614 uint64_t fp;
1615
1616 /* Create field object ... */
1617 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1618 if (r < 0)
1619 return r;
1620
1621 /* ... and link it in. */
1622 o->data.next_field_offset = fo->field.head_data_offset;
1623 fo->field.head_data_offset = le64toh(p);
1624 }
1625
1626 if (ret)
1627 *ret = o;
1628
1629 if (offset)
1630 *offset = p;
1631
1632 return 0;
1633 }
1634
1635 uint64_t journal_file_entry_n_items(Object *o) {
1636 assert(o);
1637
1638 if (o->object.type != OBJECT_ENTRY)
1639 return 0;
1640
1641 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1642 }
1643
1644 uint64_t journal_file_entry_array_n_items(Object *o) {
1645 assert(o);
1646
1647 if (o->object.type != OBJECT_ENTRY_ARRAY)
1648 return 0;
1649
1650 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1651 }
1652
1653 uint64_t journal_file_hash_table_n_items(Object *o) {
1654 assert(o);
1655
1656 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1657 return 0;
1658
1659 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1660 }
1661
1662 static int link_entry_into_array(JournalFile *f,
1663 le64_t *first,
1664 le64_t *idx,
1665 uint64_t p) {
1666 int r;
1667 uint64_t n = 0, ap = 0, q, i, a, hidx;
1668 Object *o;
1669
1670 assert(f);
1671 assert(f->header);
1672 assert(first);
1673 assert(idx);
1674 assert(p > 0);
1675
1676 a = le64toh(*first);
1677 i = hidx = le64toh(*idx);
1678 while (a > 0) {
1679
1680 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1681 if (r < 0)
1682 return r;
1683
1684 n = journal_file_entry_array_n_items(o);
1685 if (i < n) {
1686 o->entry_array.items[i] = htole64(p);
1687 *idx = htole64(hidx + 1);
1688 return 0;
1689 }
1690
1691 i -= n;
1692 ap = a;
1693 a = le64toh(o->entry_array.next_entry_array_offset);
1694 }
1695
1696 if (hidx > n)
1697 n = (hidx+1) * 2;
1698 else
1699 n = n * 2;
1700
1701 if (n < 4)
1702 n = 4;
1703
1704 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1705 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1706 &o, &q);
1707 if (r < 0)
1708 return r;
1709
1710 #if HAVE_GCRYPT
1711 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1712 if (r < 0)
1713 return r;
1714 #endif
1715
1716 o->entry_array.items[i] = htole64(p);
1717
1718 if (ap == 0)
1719 *first = htole64(q);
1720 else {
1721 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1722 if (r < 0)
1723 return r;
1724
1725 o->entry_array.next_entry_array_offset = htole64(q);
1726 }
1727
1728 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1729 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1730
1731 *idx = htole64(hidx + 1);
1732
1733 return 0;
1734 }
1735
1736 static int link_entry_into_array_plus_one(JournalFile *f,
1737 le64_t *extra,
1738 le64_t *first,
1739 le64_t *idx,
1740 uint64_t p) {
1741
1742 int r;
1743
1744 assert(f);
1745 assert(extra);
1746 assert(first);
1747 assert(idx);
1748 assert(p > 0);
1749
1750 if (*idx == 0)
1751 *extra = htole64(p);
1752 else {
1753 le64_t i;
1754
1755 i = htole64(le64toh(*idx) - 1);
1756 r = link_entry_into_array(f, first, &i, p);
1757 if (r < 0)
1758 return r;
1759 }
1760
1761 *idx = htole64(le64toh(*idx) + 1);
1762 return 0;
1763 }
1764
1765 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1766 uint64_t p;
1767 int r;
1768 assert(f);
1769 assert(o);
1770 assert(offset > 0);
1771
1772 p = le64toh(o->entry.items[i].object_offset);
1773 if (p == 0)
1774 return -EINVAL;
1775
1776 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1777 if (r < 0)
1778 return r;
1779
1780 return link_entry_into_array_plus_one(f,
1781 &o->data.entry_offset,
1782 &o->data.entry_array_offset,
1783 &o->data.n_entries,
1784 offset);
1785 }
1786
1787 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1788 uint64_t n, i;
1789 int r;
1790
1791 assert(f);
1792 assert(f->header);
1793 assert(o);
1794 assert(offset > 0);
1795
1796 if (o->object.type != OBJECT_ENTRY)
1797 return -EINVAL;
1798
1799 __sync_synchronize();
1800
1801 /* Link up the entry itself */
1802 r = link_entry_into_array(f,
1803 &f->header->entry_array_offset,
1804 &f->header->n_entries,
1805 offset);
1806 if (r < 0)
1807 return r;
1808
1809 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1810
1811 if (f->header->head_entry_realtime == 0)
1812 f->header->head_entry_realtime = o->entry.realtime;
1813
1814 f->header->tail_entry_realtime = o->entry.realtime;
1815 f->header->tail_entry_monotonic = o->entry.monotonic;
1816
1817 f->tail_entry_monotonic_valid = true;
1818
1819 /* Link up the items */
1820 n = journal_file_entry_n_items(o);
1821 for (i = 0; i < n; i++) {
1822 r = journal_file_link_entry_item(f, o, offset, i);
1823 if (r < 0)
1824 return r;
1825 }
1826
1827 return 0;
1828 }
1829
1830 static int journal_file_append_entry_internal(
1831 JournalFile *f,
1832 const dual_timestamp *ts,
1833 uint64_t xor_hash,
1834 const EntryItem items[], unsigned n_items,
1835 uint64_t *seqnum,
1836 Object **ret, uint64_t *offset) {
1837 uint64_t np;
1838 uint64_t osize;
1839 Object *o;
1840 int r;
1841
1842 assert(f);
1843 assert(f->header);
1844 assert(items || n_items == 0);
1845 assert(ts);
1846
1847 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1848
1849 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1850 if (r < 0)
1851 return r;
1852
1853 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1854 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
1855 o->entry.realtime = htole64(ts->realtime);
1856 o->entry.monotonic = htole64(ts->monotonic);
1857 o->entry.xor_hash = htole64(xor_hash);
1858 o->entry.boot_id = f->header->boot_id;
1859
1860 #if HAVE_GCRYPT
1861 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1862 if (r < 0)
1863 return r;
1864 #endif
1865
1866 r = journal_file_link_entry(f, o, np);
1867 if (r < 0)
1868 return r;
1869
1870 if (ret)
1871 *ret = o;
1872
1873 if (offset)
1874 *offset = np;
1875
1876 return 0;
1877 }
1878
1879 void journal_file_post_change(JournalFile *f) {
1880 assert(f);
1881
1882 /* inotify() does not receive IN_MODIFY events from file
1883 * accesses done via mmap(). After each access we hence
1884 * trigger IN_MODIFY by truncating the journal file to its
1885 * current size which triggers IN_MODIFY. */
1886
1887 __sync_synchronize();
1888
1889 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1890 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
1891 }
1892
1893 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1894 assert(userdata);
1895
1896 journal_file_post_change(userdata);
1897
1898 return 1;
1899 }
1900
1901 static void schedule_post_change(JournalFile *f) {
1902 sd_event_source *timer;
1903 int enabled, r;
1904 uint64_t now;
1905
1906 assert(f);
1907 assert(f->post_change_timer);
1908
1909 timer = f->post_change_timer;
1910
1911 r = sd_event_source_get_enabled(timer, &enabled);
1912 if (r < 0) {
1913 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1914 goto fail;
1915 }
1916
1917 if (enabled == SD_EVENT_ONESHOT)
1918 return;
1919
1920 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1921 if (r < 0) {
1922 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1923 goto fail;
1924 }
1925
1926 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1927 if (r < 0) {
1928 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1929 goto fail;
1930 }
1931
1932 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1933 if (r < 0) {
1934 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1935 goto fail;
1936 }
1937
1938 return;
1939
1940 fail:
1941 /* On failure, let's simply post the change immediately. */
1942 journal_file_post_change(f);
1943 }
1944
1945 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1946 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1947 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1948 int r;
1949
1950 assert(f);
1951 assert_return(!f->post_change_timer, -EINVAL);
1952 assert(e);
1953 assert(t);
1954
1955 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1956 if (r < 0)
1957 return r;
1958
1959 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1960 if (r < 0)
1961 return r;
1962
1963 f->post_change_timer = timer;
1964 timer = NULL;
1965 f->post_change_timer_period = t;
1966
1967 return r;
1968 }
1969
1970 static int entry_item_cmp(const void *_a, const void *_b) {
1971 const EntryItem *a = _a, *b = _b;
1972
1973 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1974 return -1;
1975 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1976 return 1;
1977 return 0;
1978 }
1979
1980 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1981 unsigned i;
1982 EntryItem *items;
1983 int r;
1984 uint64_t xor_hash = 0;
1985 struct dual_timestamp _ts;
1986
1987 assert(f);
1988 assert(f->header);
1989 assert(iovec || n_iovec == 0);
1990
1991 if (!ts) {
1992 dual_timestamp_get(&_ts);
1993 ts = &_ts;
1994 }
1995
1996 #if HAVE_GCRYPT
1997 r = journal_file_maybe_append_tag(f, ts->realtime);
1998 if (r < 0)
1999 return r;
2000 #endif
2001
2002 /* alloca() can't take 0, hence let's allocate at least one */
2003 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
2004
2005 for (i = 0; i < n_iovec; i++) {
2006 uint64_t p;
2007 Object *o;
2008
2009 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
2010 if (r < 0)
2011 return r;
2012
2013 xor_hash ^= le64toh(o->data.hash);
2014 items[i].object_offset = htole64(p);
2015 items[i].hash = o->data.hash;
2016 }
2017
2018 /* Order by the position on disk, in order to improve seek
2019 * times for rotating media. */
2020 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
2021
2022 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
2023
2024 /* If the memory mapping triggered a SIGBUS then we return an
2025 * IO error and ignore the error code passed down to us, since
2026 * it is very likely just an effect of a nullified replacement
2027 * mapping page */
2028
2029 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
2030 r = -EIO;
2031
2032 if (f->post_change_timer)
2033 schedule_post_change(f);
2034 else
2035 journal_file_post_change(f);
2036
2037 return r;
2038 }
2039
/* Remembers a position within one chain of entry arrays, so that later lookups in the same chain
 * can resume from a previously visited array instead of walking from the start. Items are stored in
 * a hashmap keyed by 'first' (see chain_cache_put()). */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
2047
2048 static void chain_cache_put(
2049 OrderedHashmap *h,
2050 ChainCacheItem *ci,
2051 uint64_t first,
2052 uint64_t array,
2053 uint64_t begin,
2054 uint64_t total,
2055 uint64_t last_index) {
2056
2057 if (!ci) {
2058 /* If the chain item to cache for this chain is the
2059 * first one it's not worth caching anything */
2060 if (array == first)
2061 return;
2062
2063 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2064 ci = ordered_hashmap_steal_first(h);
2065 assert(ci);
2066 } else {
2067 ci = new(ChainCacheItem, 1);
2068 if (!ci)
2069 return;
2070 }
2071
2072 ci->first = first;
2073
2074 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2075 free(ci);
2076 return;
2077 }
2078 } else
2079 assert(ci->first == first);
2080
2081 ci->array = array;
2082 ci->begin = begin;
2083 ci->total = total;
2084 ci->last_index = last_index;
2085 }
2086
2087 static int generic_array_get(
2088 JournalFile *f,
2089 uint64_t first,
2090 uint64_t i,
2091 Object **ret, uint64_t *offset) {
2092
2093 Object *o;
2094 uint64_t p = 0, a, t = 0;
2095 int r;
2096 ChainCacheItem *ci;
2097
2098 assert(f);
2099
2100 a = first;
2101
2102 /* Try the chain cache first */
2103 ci = ordered_hashmap_get(f->chain_cache, &first);
2104 if (ci && i > ci->total) {
2105 a = ci->array;
2106 i -= ci->total;
2107 t = ci->total;
2108 }
2109
2110 while (a > 0) {
2111 uint64_t k;
2112
2113 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2114 if (r < 0)
2115 return r;
2116
2117 k = journal_file_entry_array_n_items(o);
2118 if (i < k) {
2119 p = le64toh(o->entry_array.items[i]);
2120 goto found;
2121 }
2122
2123 i -= k;
2124 t += k;
2125 a = le64toh(o->entry_array.next_entry_array_offset);
2126 }
2127
2128 return 0;
2129
2130 found:
2131 /* Let's cache this item for the next invocation */
2132 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
2133
2134 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2135 if (r < 0)
2136 return r;
2137
2138 if (ret)
2139 *ret = o;
2140
2141 if (offset)
2142 *offset = p;
2143
2144 return 1;
2145 }
2146
2147 static int generic_array_get_plus_one(
2148 JournalFile *f,
2149 uint64_t extra,
2150 uint64_t first,
2151 uint64_t i,
2152 Object **ret, uint64_t *offset) {
2153
2154 Object *o;
2155
2156 assert(f);
2157
2158 if (i == 0) {
2159 int r;
2160
2161 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2162 if (r < 0)
2163 return r;
2164
2165 if (ret)
2166 *ret = o;
2167
2168 if (offset)
2169 *offset = extra;
2170
2171 return 1;
2172 }
2173
2174 return generic_array_get(f, first, i-1, ret, offset);
2175 }
2176
/* Results of the test_object callbacks used when bisecting entry arrays */
enum {
        TEST_FOUND, /* the tested object matches the needle */
        TEST_LEFT,  /* the tested object's value is smaller than the needle */
        TEST_RIGHT  /* the tested object's value is larger than the needle */
};
2182
/* Bisects the chain of entry arrays starting at 'first', of which the first 'n' items are
 * considered, for 'needle'. Each candidate item is classified by calling test_object() on its
 * offset; a TEST_FOUND result is treated as TEST_RIGHT for DIRECTION_DOWN and as TEST_LEFT
 * otherwise.
 *
 * Returns 1 if an entry was located, in which case the entry object, its offset and its index
 * within the whole chain are stored in 'ret', 'offset' and 'idx' (each of which may be NULL).
 * Returns 0 if nothing was located, and a negative error if test_object() or moving to an object
 * failed. Items for which the test yields -EBADMSG (or whose offset is 0) cut the searched range
 * short instead of failing the whole search, except in the neighbor checks around the cached
 * index, where a zero offset is returned as -EBADMSG. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *offset,
                uint64_t *idx) {

        /* a: offset of the array currently looked at; t: number of items in the arrays before it;
         * i: index within the current array; last_p: last tested item of the previous array;
         * last_index: cached index to start near, or (uint64_t) -1 if there is none */
        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                /* Only look at the items of this array that are within the remaining range */
                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Test the last item in range first, to decide whether this array is the one to
                 * bisect or whether to move on to the next array in the chain */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        r = -EBADMSG;
                else
                        r = test_object(f, p, needle);
                if (r == -EBADMSG) {
                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
                        /* Shrink the range so that it ends before the invalid item, then retry */
                        n = i;
                        continue;
                }
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        left = 0;
                        right -= 1;

                        if (last_index != (uint64_t) -1) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        /* Plain bisection of [left, right] within the current array */
                        for (;;) {
                                if (left == right) {
                                        /* For DIRECTION_UP the item before the one we converged on
                                         * is the one to return */
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        r = -EBADMSG;
                                else
                                        r = test_object(f, p, needle);
                                if (r == -EBADMSG) {
                                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
                                        /* Shrink the range so that it ends at the invalid item */
                                        right = n = i;
                                        continue;
                                }
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* The last item in range tested as left of the needle. If the range ends within
                 * this array there is nothing further to look at. */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Remember the last item of this array, in case we have to step back to it from
                 * the beginning of the next array */
                last_p = lp;

                n -= k;
                t += k;
                last_index = (uint64_t) -1;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* Stepping back from the very first item of the chain is not possible */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);

        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        if (idx)
                *idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
2399
2400 static int generic_array_bisect_plus_one(
2401 JournalFile *f,
2402 uint64_t extra,
2403 uint64_t first,
2404 uint64_t n,
2405 uint64_t needle,
2406 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2407 direction_t direction,
2408 Object **ret,
2409 uint64_t *offset,
2410 uint64_t *idx) {
2411
2412 int r;
2413 bool step_back = false;
2414 Object *o;
2415
2416 assert(f);
2417 assert(test_object);
2418
2419 if (n <= 0)
2420 return 0;
2421
2422 /* This bisects the array in object 'first', but first checks
2423 * an extra */
2424 r = test_object(f, extra, needle);
2425 if (r < 0)
2426 return r;
2427
2428 if (r == TEST_FOUND)
2429 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2430
2431 /* if we are looking with DIRECTION_UP then we need to first
2432 see if in the actual array there is a matching entry, and
2433 return the last one of that. But if there isn't any we need
2434 to return this one. Hence remember this, and return it
2435 below. */
2436 if (r == TEST_LEFT)
2437 step_back = direction == DIRECTION_UP;
2438
2439 if (r == TEST_RIGHT) {
2440 if (direction == DIRECTION_DOWN)
2441 goto found;
2442 else
2443 return 0;
2444 }
2445
2446 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2447
2448 if (r == 0 && step_back)
2449 goto found;
2450
2451 if (r > 0 && idx)
2452 (*idx)++;
2453
2454 return r;
2455
2456 found:
2457 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2458 if (r < 0)
2459 return r;
2460
2461 if (ret)
2462 *ret = o;
2463
2464 if (offset)
2465 *offset = extra;
2466
2467 if (idx)
2468 *idx = 0;
2469
2470 return 1;
2471 }
2472
2473 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2474 assert(f);
2475 assert(p > 0);
2476
2477 if (p == needle)
2478 return TEST_FOUND;
2479 else if (p < needle)
2480 return TEST_LEFT;
2481 else
2482 return TEST_RIGHT;
2483 }
2484
2485 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2486 Object *o;
2487 int r;
2488
2489 assert(f);
2490 assert(p > 0);
2491
2492 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2493 if (r < 0)
2494 return r;
2495
2496 if (le64toh(o->entry.seqnum) == needle)
2497 return TEST_FOUND;
2498 else if (le64toh(o->entry.seqnum) < needle)
2499 return TEST_LEFT;
2500 else
2501 return TEST_RIGHT;
2502 }
2503
2504 int journal_file_move_to_entry_by_seqnum(
2505 JournalFile *f,
2506 uint64_t seqnum,
2507 direction_t direction,
2508 Object **ret,
2509 uint64_t *offset) {
2510 assert(f);
2511 assert(f->header);
2512
2513 return generic_array_bisect(f,
2514 le64toh(f->header->entry_array_offset),
2515 le64toh(f->header->n_entries),
2516 seqnum,
2517 test_object_seqnum,
2518 direction,
2519 ret, offset, NULL);
2520 }
2521
2522 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2523 Object *o;
2524 int r;
2525
2526 assert(f);
2527 assert(p > 0);
2528
2529 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2530 if (r < 0)
2531 return r;
2532
2533 if (le64toh(o->entry.realtime) == needle)
2534 return TEST_FOUND;
2535 else if (le64toh(o->entry.realtime) < needle)
2536 return TEST_LEFT;
2537 else
2538 return TEST_RIGHT;
2539 }
2540
2541 int journal_file_move_to_entry_by_realtime(
2542 JournalFile *f,
2543 uint64_t realtime,
2544 direction_t direction,
2545 Object **ret,
2546 uint64_t *offset) {
2547 assert(f);
2548 assert(f->header);
2549
2550 return generic_array_bisect(f,
2551 le64toh(f->header->entry_array_offset),
2552 le64toh(f->header->n_entries),
2553 realtime,
2554 test_object_realtime,
2555 direction,
2556 ret, offset, NULL);
2557 }
2558
2559 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2560 Object *o;
2561 int r;
2562
2563 assert(f);
2564 assert(p > 0);
2565
2566 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2567 if (r < 0)
2568 return r;
2569
2570 if (le64toh(o->entry.monotonic) == needle)
2571 return TEST_FOUND;
2572 else if (le64toh(o->entry.monotonic) < needle)
2573 return TEST_LEFT;
2574 else
2575 return TEST_RIGHT;
2576 }
2577
2578 static int find_data_object_by_boot_id(
2579 JournalFile *f,
2580 sd_id128_t boot_id,
2581 Object **o,
2582 uint64_t *b) {
2583
2584 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2585
2586 sd_id128_to_string(boot_id, t + 9);
2587 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2588 }
2589
2590 int journal_file_move_to_entry_by_monotonic(
2591 JournalFile *f,
2592 sd_id128_t boot_id,
2593 uint64_t monotonic,
2594 direction_t direction,
2595 Object **ret,
2596 uint64_t *offset) {
2597
2598 Object *o;
2599 int r;
2600
2601 assert(f);
2602
2603 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2604 if (r < 0)
2605 return r;
2606 if (r == 0)
2607 return -ENOENT;
2608
2609 return generic_array_bisect_plus_one(f,
2610 le64toh(o->data.entry_offset),
2611 le64toh(o->data.entry_array_offset),
2612 le64toh(o->data.n_entries),
2613 monotonic,
2614 test_object_monotonic,
2615 direction,
2616 ret, offset, NULL);
2617 }
2618
2619 void journal_file_reset_location(JournalFile *f) {
2620 f->location_type = LOCATION_HEAD;
2621 f->current_offset = 0;
2622 f->current_seqnum = 0;
2623 f->current_realtime = 0;
2624 f->current_monotonic = 0;
2625 zero(f->current_boot_id);
2626 f->current_xor_hash = 0;
2627 }
2628
2629 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2630 f->location_type = LOCATION_SEEK;
2631 f->current_offset = offset;
2632 f->current_seqnum = le64toh(o->entry.seqnum);
2633 f->current_realtime = le64toh(o->entry.realtime);
2634 f->current_monotonic = le64toh(o->entry.monotonic);
2635 f->current_boot_id = o->entry.boot_id;
2636 f->current_xor_hash = le64toh(o->entry.xor_hash);
2637 }
2638
2639 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2640 assert(af);
2641 assert(af->header);
2642 assert(bf);
2643 assert(bf->header);
2644 assert(af->location_type == LOCATION_SEEK);
2645 assert(bf->location_type == LOCATION_SEEK);
2646
2647 /* If contents and timestamps match, these entries are
2648 * identical, even if the seqnum does not match */
2649 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2650 af->current_monotonic == bf->current_monotonic &&
2651 af->current_realtime == bf->current_realtime &&
2652 af->current_xor_hash == bf->current_xor_hash)
2653 return 0;
2654
2655 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2656
2657 /* If this is from the same seqnum source, compare
2658 * seqnums */
2659 if (af->current_seqnum < bf->current_seqnum)
2660 return -1;
2661 if (af->current_seqnum > bf->current_seqnum)
2662 return 1;
2663
2664 /* Wow! This is weird, different data but the same
2665 * seqnums? Something is borked, but let's make the
2666 * best of it and compare by time. */
2667 }
2668
2669 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2670
2671 /* If the boot id matches, compare monotonic time */
2672 if (af->current_monotonic < bf->current_monotonic)
2673 return -1;
2674 if (af->current_monotonic > bf->current_monotonic)
2675 return 1;
2676 }
2677
2678 /* Otherwise, compare UTC time */
2679 if (af->current_realtime < bf->current_realtime)
2680 return -1;
2681 if (af->current_realtime > bf->current_realtime)
2682 return 1;
2683
2684 /* Finally, compare by contents */
2685 if (af->current_xor_hash < bf->current_xor_hash)
2686 return -1;
2687 if (af->current_xor_hash > bf->current_xor_hash)
2688 return 1;
2689
2690 return 0;
2691 }
2692
2693 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2694
2695 /* Increase or decrease the specified index, in the right direction. */
2696
2697 if (direction == DIRECTION_DOWN) {
2698 if (*i >= n - 1)
2699 return 0;
2700
2701 (*i) ++;
2702 } else {
2703 if (*i <= 0)
2704 return 0;
2705
2706 (*i) --;
2707 }
2708
2709 return 1;
2710 }
2711
2712 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2713
2714 /* Consider it an error if any of the two offsets is uninitialized */
2715 if (old_offset == 0 || new_offset == 0)
2716 return false;
2717
2718 /* If we go down, the new offset must be larger than the old one. */
2719 return direction == DIRECTION_DOWN ?
2720 new_offset > old_offset :
2721 new_offset < old_offset;
2722 }
2723
2724 int journal_file_next_entry(
2725 JournalFile *f,
2726 uint64_t p,
2727 direction_t direction,
2728 Object **ret, uint64_t *offset) {
2729
2730 uint64_t i, n, ofs;
2731 int r;
2732
2733 assert(f);
2734 assert(f->header);
2735
2736 n = le64toh(f->header->n_entries);
2737 if (n <= 0)
2738 return 0;
2739
2740 if (p == 0)
2741 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2742 else {
2743 r = generic_array_bisect(f,
2744 le64toh(f->header->entry_array_offset),
2745 le64toh(f->header->n_entries),
2746 p,
2747 test_object_offset,
2748 DIRECTION_DOWN,
2749 NULL, NULL,
2750 &i);
2751 if (r <= 0)
2752 return r;
2753
2754 r = bump_array_index(&i, direction, n);
2755 if (r <= 0)
2756 return r;
2757 }
2758
2759 /* And jump to it */
2760 for (;;) {
2761 r = generic_array_get(f,
2762 le64toh(f->header->entry_array_offset),
2763 i,
2764 ret, &ofs);
2765 if (r > 0)
2766 break;
2767 if (r != -EBADMSG)
2768 return r;
2769
2770 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2771 * the next one might work for us instead. */
2772 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2773
2774 r = bump_array_index(&i, direction, n);
2775 if (r <= 0)
2776 return r;
2777 }
2778
2779 /* Ensure our array is properly ordered. */
2780 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2781 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
2782 return -EBADMSG;
2783 }
2784
2785 if (offset)
2786 *offset = ofs;
2787
2788 return 1;
2789 }
2790
2791 int journal_file_next_entry_for_data(
2792 JournalFile *f,
2793 Object *o, uint64_t p,
2794 uint64_t data_offset,
2795 direction_t direction,
2796 Object **ret, uint64_t *offset) {
2797
2798 uint64_t i, n, ofs;
2799 Object *d;
2800 int r;
2801
2802 assert(f);
2803 assert(p > 0 || !o);
2804
2805 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2806 if (r < 0)
2807 return r;
2808
2809 n = le64toh(d->data.n_entries);
2810 if (n <= 0)
2811 return n;
2812
2813 if (!o)
2814 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2815 else {
2816 if (o->object.type != OBJECT_ENTRY)
2817 return -EINVAL;
2818
2819 r = generic_array_bisect_plus_one(f,
2820 le64toh(d->data.entry_offset),
2821 le64toh(d->data.entry_array_offset),
2822 le64toh(d->data.n_entries),
2823 p,
2824 test_object_offset,
2825 DIRECTION_DOWN,
2826 NULL, NULL,
2827 &i);
2828
2829 if (r <= 0)
2830 return r;
2831
2832 r = bump_array_index(&i, direction, n);
2833 if (r <= 0)
2834 return r;
2835 }
2836
2837 for (;;) {
2838 r = generic_array_get_plus_one(f,
2839 le64toh(d->data.entry_offset),
2840 le64toh(d->data.entry_array_offset),
2841 i,
2842 ret, &ofs);
2843 if (r > 0)
2844 break;
2845 if (r != -EBADMSG)
2846 return r;
2847
2848 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2849
2850 r = bump_array_index(&i, direction, n);
2851 if (r <= 0)
2852 return r;
2853 }
2854
2855 /* Ensure our array is properly ordered. */
2856 if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2857 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2858 return -EBADMSG;
2859 }
2860
2861 if (offset)
2862 *offset = ofs;
2863
2864 return 1;
2865 }
2866
2867 int journal_file_move_to_entry_by_offset_for_data(
2868 JournalFile *f,
2869 uint64_t data_offset,
2870 uint64_t p,
2871 direction_t direction,
2872 Object **ret, uint64_t *offset) {
2873
2874 int r;
2875 Object *d;
2876
2877 assert(f);
2878
2879 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2880 if (r < 0)
2881 return r;
2882
2883 return generic_array_bisect_plus_one(f,
2884 le64toh(d->data.entry_offset),
2885 le64toh(d->data.entry_array_offset),
2886 le64toh(d->data.n_entries),
2887 p,
2888 test_object_offset,
2889 direction,
2890 ret, offset, NULL);
2891 }
2892
2893 int journal_file_move_to_entry_by_monotonic_for_data(
2894 JournalFile *f,
2895 uint64_t data_offset,
2896 sd_id128_t boot_id,
2897 uint64_t monotonic,
2898 direction_t direction,
2899 Object **ret, uint64_t *offset) {
2900
2901 Object *o, *d;
2902 int r;
2903 uint64_t b, z;
2904
2905 assert(f);
2906
2907 /* First, seek by time */
2908 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2909 if (r < 0)
2910 return r;
2911 if (r == 0)
2912 return -ENOENT;
2913
2914 r = generic_array_bisect_plus_one(f,
2915 le64toh(o->data.entry_offset),
2916 le64toh(o->data.entry_array_offset),
2917 le64toh(o->data.n_entries),
2918 monotonic,
2919 test_object_monotonic,
2920 direction,
2921 NULL, &z, NULL);
2922 if (r <= 0)
2923 return r;
2924
2925 /* And now, continue seeking until we find an entry that
2926 * exists in both bisection arrays */
2927
2928 for (;;) {
2929 Object *qo;
2930 uint64_t p, q;
2931
2932 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2933 if (r < 0)
2934 return r;
2935
2936 r = generic_array_bisect_plus_one(f,
2937 le64toh(d->data.entry_offset),
2938 le64toh(d->data.entry_array_offset),
2939 le64toh(d->data.n_entries),
2940 z,
2941 test_object_offset,
2942 direction,
2943 NULL, &p, NULL);
2944 if (r <= 0)
2945 return r;
2946
2947 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2948 if (r < 0)
2949 return r;
2950
2951 r = generic_array_bisect_plus_one(f,
2952 le64toh(o->data.entry_offset),
2953 le64toh(o->data.entry_array_offset),
2954 le64toh(o->data.n_entries),
2955 p,
2956 test_object_offset,
2957 direction,
2958 &qo, &q, NULL);
2959
2960 if (r <= 0)
2961 return r;
2962
2963 if (p == q) {
2964 if (ret)
2965 *ret = qo;
2966 if (offset)
2967 *offset = q;
2968
2969 return 1;
2970 }
2971
2972 z = q;
2973 }
2974 }
2975
2976 int journal_file_move_to_entry_by_seqnum_for_data(
2977 JournalFile *f,
2978 uint64_t data_offset,
2979 uint64_t seqnum,
2980 direction_t direction,
2981 Object **ret, uint64_t *offset) {
2982
2983 Object *d;
2984 int r;
2985
2986 assert(f);
2987
2988 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2989 if (r < 0)
2990 return r;
2991
2992 return generic_array_bisect_plus_one(f,
2993 le64toh(d->data.entry_offset),
2994 le64toh(d->data.entry_array_offset),
2995 le64toh(d->data.n_entries),
2996 seqnum,
2997 test_object_seqnum,
2998 direction,
2999 ret, offset, NULL);
3000 }
3001
3002 int journal_file_move_to_entry_by_realtime_for_data(
3003 JournalFile *f,
3004 uint64_t data_offset,
3005 uint64_t realtime,
3006 direction_t direction,
3007 Object **ret, uint64_t *offset) {
3008
3009 Object *d;
3010 int r;
3011
3012 assert(f);
3013
3014 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
3015 if (r < 0)
3016 return r;
3017
3018 return generic_array_bisect_plus_one(f,
3019 le64toh(d->data.entry_offset),
3020 le64toh(d->data.entry_array_offset),
3021 le64toh(d->data.n_entries),
3022 realtime,
3023 test_object_realtime,
3024 direction,
3025 ret, offset, NULL);
3026 }
3027
3028 void journal_file_dump(JournalFile *f) {
3029 Object *o;
3030 int r;
3031 uint64_t p;
3032
3033 assert(f);
3034 assert(f->header);
3035
3036 journal_file_print_header(f);
3037
3038 p = le64toh(f->header->header_size);
3039 while (p != 0) {
3040 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
3041 if (r < 0)
3042 goto fail;
3043
3044 switch (o->object.type) {
3045
3046 case OBJECT_UNUSED:
3047 printf("Type: OBJECT_UNUSED\n");
3048 break;
3049
3050 case OBJECT_DATA:
3051 printf("Type: OBJECT_DATA\n");
3052 break;
3053
3054 case OBJECT_FIELD:
3055 printf("Type: OBJECT_FIELD\n");
3056 break;
3057
3058 case OBJECT_ENTRY:
3059 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3060 le64toh(o->entry.seqnum),
3061 le64toh(o->entry.monotonic),
3062 le64toh(o->entry.realtime));
3063 break;
3064
3065 case OBJECT_FIELD_HASH_TABLE:
3066 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3067 break;
3068
3069 case OBJECT_DATA_HASH_TABLE:
3070 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3071 break;
3072
3073 case OBJECT_ENTRY_ARRAY:
3074 printf("Type: OBJECT_ENTRY_ARRAY\n");
3075 break;
3076
3077 case OBJECT_TAG:
3078 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3079 le64toh(o->tag.seqnum),
3080 le64toh(o->tag.epoch));
3081 break;
3082
3083 default:
3084 printf("Type: unknown (%i)\n", o->object.type);
3085 break;
3086 }
3087
3088 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3089 printf("Flags: %s\n",
3090 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3091
3092 if (p == le64toh(f->header->tail_object_offset))
3093 p = 0;
3094 else
3095 p = p + ALIGN64(le64toh(o->object.size));
3096 }
3097
3098 return;
3099 fail:
3100 log_error("File corrupt");
3101 }
3102
3103 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3104 const char *x;
3105
3106 x = format_timestamp(buf, l, t);
3107 if (x)
3108 return x;
3109 return " --- ";
3110 }
3111
3112 void journal_file_print_header(JournalFile *f) {
3113 char a[33], b[33], c[33], d[33];
3114 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
3115 struct stat st;
3116 char bytes[FORMAT_BYTES_MAX];
3117
3118 assert(f);
3119 assert(f->header);
3120
3121 printf("File Path: %s\n"
3122 "File ID: %s\n"
3123 "Machine ID: %s\n"
3124 "Boot ID: %s\n"
3125 "Sequential Number ID: %s\n"
3126 "State: %s\n"
3127 "Compatible Flags:%s%s\n"
3128 "Incompatible Flags:%s%s%s\n"
3129 "Header size: %"PRIu64"\n"
3130 "Arena size: %"PRIu64"\n"
3131 "Data Hash Table Size: %"PRIu64"\n"
3132 "Field Hash Table Size: %"PRIu64"\n"
3133 "Rotate Suggested: %s\n"
3134 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3135 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3136 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3137 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3138 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
3139 "Objects: %"PRIu64"\n"
3140 "Entry Objects: %"PRIu64"\n",
3141 f->path,
3142 sd_id128_to_string(f->header->file_id, a),
3143 sd_id128_to_string(f->header->machine_id, b),
3144 sd_id128_to_string(f->header->boot_id, c),
3145 sd_id128_to_string(f->header->seqnum_id, d),
3146 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3147 f->header->state == STATE_ONLINE ? "ONLINE" :
3148 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3149 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3150 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3151 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3152 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3153 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3154 le64toh(f->header->header_size),
3155 le64toh(f->header->arena_size),
3156 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3157 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3158 yes_no(journal_file_rotate_suggested(f, 0)),
3159 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3160 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3161 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3162 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3163 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3164 le64toh(f->header->n_objects),
3165 le64toh(f->header->n_entries));
3166
3167 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3168 printf("Data Objects: %"PRIu64"\n"
3169 "Data Hash Table Fill: %.1f%%\n",
3170 le64toh(f->header->n_data),
3171 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3172
3173 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3174 printf("Field Objects: %"PRIu64"\n"
3175 "Field Hash Table Fill: %.1f%%\n",
3176 le64toh(f->header->n_fields),
3177 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3178
3179 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3180 printf("Tag Objects: %"PRIu64"\n",
3181 le64toh(f->header->n_tags));
3182 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3183 printf("Entry Array Objects: %"PRIu64"\n",
3184 le64toh(f->header->n_entry_arrays));
3185
3186 if (fstat(f->fd, &st) >= 0)
3187 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
3188 }
3189
3190 static int journal_file_warn_btrfs(JournalFile *f) {
3191 unsigned attrs;
3192 int r;
3193
3194 assert(f);
3195
3196 /* Before we write anything, check if the COW logic is turned
3197 * off on btrfs. Given our write pattern that is quite
3198 * unfriendly to COW file systems this should greatly improve
3199 * performance on COW file systems, such as btrfs, at the
3200 * expense of data integrity features (which shouldn't be too
3201 * bad, given that we do our own checksumming). */
3202
3203 r = btrfs_is_filesystem(f->fd);
3204 if (r < 0)
3205 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3206 if (!r)
3207 return 0;
3208
3209 r = read_attr_fd(f->fd, &attrs);
3210 if (r < 0)
3211 return log_warning_errno(r, "Failed to read file attributes: %m");
3212
3213 if (attrs & FS_NOCOW_FL) {
3214 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3215 return 0;
3216 }
3217
3218 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3219 "This is likely to slow down journal access substantially, please consider turning "
3220 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3221
3222 return 1;
3223 }
3224
3225 int journal_file_open(
3226 int fd,
3227 const char *fname,
3228 int flags,
3229 mode_t mode,
3230 bool compress,
3231 bool seal,
3232 JournalMetrics *metrics,
3233 MMapCache *mmap_cache,
3234 Set *deferred_closes,
3235 JournalFile *template,
3236 JournalFile **ret) {
3237
3238 bool newly_created = false;
3239 JournalFile *f;
3240 void *h;
3241 int r;
3242
3243 assert(ret);
3244 assert(fd >= 0 || fname);
3245
3246 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
3247 return -EINVAL;
3248
3249 if (fname) {
3250 if (!endswith(fname, ".journal") &&
3251 !endswith(fname, ".journal~"))
3252 return -EINVAL;
3253 }
3254
3255 f = new0(JournalFile, 1);
3256 if (!f)
3257 return -ENOMEM;
3258
3259 f->fd = fd;
3260 f->mode = mode;
3261
3262 f->flags = flags;
3263 f->prot = prot_from_flags(flags);
3264 f->writable = (flags & O_ACCMODE) != O_RDONLY;
3265 #if HAVE_LZ4
3266 f->compress_lz4 = compress;
3267 #elif HAVE_XZ
3268 f->compress_xz = compress;
3269 #endif
3270 #if HAVE_GCRYPT
3271 f->seal = seal;
3272 #endif
3273
3274 if (mmap_cache)
3275 f->mmap = mmap_cache_ref(mmap_cache);
3276 else {
3277 f->mmap = mmap_cache_new();
3278 if (!f->mmap) {
3279 r = -ENOMEM;
3280 goto fail;
3281 }
3282 }
3283
3284 if (fname) {
3285 f->path = strdup(fname);
3286 if (!f->path) {
3287 r = -ENOMEM;
3288 goto fail;
3289 }
3290 } else {
3291 /* If we don't know the path, fill in something explanatory and vaguely useful */
3292 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3293 r = -ENOMEM;
3294 goto fail;
3295 }
3296 }
3297
3298 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
3299 if (!f->chain_cache) {
3300 r = -ENOMEM;
3301 goto fail;
3302 }
3303
3304 if (f->fd < 0) {
3305 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
3306 if (f->fd < 0) {
3307 r = -errno;
3308 goto fail;
3309 }
3310
3311 /* fds we opened here by us should also be closed by us. */
3312 f->close_fd = true;
3313 }
3314
3315 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3316 if (!f->cache_fd) {
3317 r = -ENOMEM;
3318 goto fail;
3319 }
3320
3321 r = journal_file_fstat(f);
3322 if (r < 0)
3323 goto fail;
3324
3325 if (f->last_stat.st_size == 0 && f->writable) {
3326
3327 (void) journal_file_warn_btrfs(f);
3328
3329 /* Let's attach the creation time to the journal file,
3330 * so that the vacuuming code knows the age of this
3331 * file even if the file might end up corrupted one
3332 * day... Ideally we'd just use the creation time many
3333 * file systems maintain for each file, but there is
3334 * currently no usable API to query this, hence let's
3335 * emulate this via extended attributes. If extended
3336 * attributes are not supported we'll just skip this,
3337 * and rely solely on mtime/atime/ctime of the file. */
3338
3339 fd_setcrtime(f->fd, 0);
3340
3341 #if HAVE_GCRYPT
3342 /* Try to load the FSPRG state, and if we can't, then
3343 * just don't do sealing */
3344 if (f->seal) {
3345 r = journal_file_fss_load(f);
3346 if (r < 0)
3347 f->seal = false;
3348 }
3349 #endif
3350
3351 r = journal_file_init_header(f, template);
3352 if (r < 0)
3353 goto fail;
3354
3355 r = journal_file_fstat(f);
3356 if (r < 0)
3357 goto fail;
3358
3359 newly_created = true;
3360 }
3361
3362 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
3363 r = -ENODATA;
3364 goto fail;
3365 }
3366
3367 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
3368 if (r < 0)
3369 goto fail;
3370
3371 f->header = h;
3372
3373 if (!newly_created) {
3374 if (deferred_closes)
3375 journal_file_close_set(deferred_closes);
3376
3377 r = journal_file_verify_header(f);
3378 if (r < 0)
3379 goto fail;
3380 }
3381
3382 #if HAVE_GCRYPT
3383 if (!newly_created && f->writable) {
3384 r = journal_file_fss_load(f);
3385 if (r < 0)
3386 goto fail;
3387 }
3388 #endif
3389
3390 if (f->writable) {
3391 if (metrics) {
3392 journal_default_metrics(metrics, f->fd);
3393 f->metrics = *metrics;
3394 } else if (template)
3395 f->metrics = template->metrics;
3396
3397 r = journal_file_refresh_header(f);
3398 if (r < 0)
3399 goto fail;
3400 }
3401
3402 #if HAVE_GCRYPT
3403 r = journal_file_hmac_setup(f);
3404 if (r < 0)
3405 goto fail;
3406 #endif
3407
3408 if (newly_created) {
3409 r = journal_file_setup_field_hash_table(f);
3410 if (r < 0)
3411 goto fail;
3412
3413 r = journal_file_setup_data_hash_table(f);
3414 if (r < 0)
3415 goto fail;
3416
3417 #if HAVE_GCRYPT
3418 r = journal_file_append_first_tag(f);
3419 if (r < 0)
3420 goto fail;
3421 #endif
3422 }
3423
3424 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
3425 r = -EIO;
3426 goto fail;
3427 }
3428
3429 if (template && template->post_change_timer) {
3430 r = journal_file_enable_post_change_timer(
3431 f,
3432 sd_event_source_get_event(template->post_change_timer),
3433 template->post_change_timer_period);
3434
3435 if (r < 0)
3436 goto fail;
3437 }
3438
3439 /* The file is opened now successfully, thus we take possession of any passed in fd. */
3440 f->close_fd = true;
3441
3442 *ret = f;
3443 return 0;
3444
3445 fail:
3446 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
3447 r = -EIO;
3448
3449 (void) journal_file_close(f);
3450
3451 return r;
3452 }
3453
3454 int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
3455 _cleanup_free_ char *p = NULL;
3456 size_t l;
3457 JournalFile *old_file, *new_file = NULL;
3458 int r;
3459
3460 assert(f);
3461 assert(*f);
3462
3463 old_file = *f;
3464
3465 if (!old_file->writable)
3466 return -EINVAL;
3467
3468 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3469 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3470 if (path_startswith(old_file->path, "/proc/self/fd"))
3471 return -EINVAL;
3472
3473 if (!endswith(old_file->path, ".journal"))
3474 return -EINVAL;
3475
3476 l = strlen(old_file->path);
3477 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3478 (int) l - 8, old_file->path,
3479 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3480 le64toh((*f)->header->head_entry_seqnum),
3481 le64toh((*f)->header->head_entry_realtime));
3482 if (r < 0)
3483 return -ENOMEM;
3484
3485 /* Try to rename the file to the archived version. If the file
3486 * already was deleted, we'll get ENOENT, let's ignore that
3487 * case. */
3488 r = rename(old_file->path, p);
3489 if (r < 0 && errno != ENOENT)
3490 return -errno;
3491
3492 /* Sync the rename to disk */
3493 (void) fsync_directory_of_file(old_file->fd);
3494
3495 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3496 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3497 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3498 * would result in the rotated journal never getting fsync() called before closing.
3499 * Now we simply queue the archive state by setting an archive bit, leaving the state
3500 * as STATE_ONLINE so proper offlining occurs. */
3501 old_file->archive = true;
3502
3503 /* Currently, btrfs is not very good with out write patterns
3504 * and fragments heavily. Let's defrag our journal files when
3505 * we archive them */
3506 old_file->defrag_on_close = true;
3507
3508 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
3509
3510 if (deferred_closes &&
3511 set_put(deferred_closes, old_file) >= 0)
3512 (void) journal_file_set_offline(old_file, false);
3513 else
3514 (void) journal_file_close(old_file);
3515
3516 *f = new_file;
3517 return r;
3518 }
3519
3520 int journal_file_open_reliably(
3521 const char *fname,
3522 int flags,
3523 mode_t mode,
3524 bool compress,
3525 bool seal,
3526 JournalMetrics *metrics,
3527 MMapCache *mmap_cache,
3528 Set *deferred_closes,
3529 JournalFile *template,
3530 JournalFile **ret) {
3531
3532 int r;
3533 size_t l;
3534 _cleanup_free_ char *p = NULL;
3535
3536 r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
3537 if (!IN_SET(r,
3538 -EBADMSG, /* Corrupted */
3539 -ENODATA, /* Truncated */
3540 -EHOSTDOWN, /* Other machine */
3541 -EPROTONOSUPPORT, /* Incompatible feature */
3542 -EBUSY, /* Unclean shutdown */
3543 -ESHUTDOWN, /* Already archived */
3544 -EIO, /* IO error, including SIGBUS on mmap */
3545 -EIDRM, /* File has been deleted */
3546 -ETXTBSY)) /* File is from the future */
3547 return r;
3548
3549 if ((flags & O_ACCMODE) == O_RDONLY)
3550 return r;
3551
3552 if (!(flags & O_CREAT))
3553 return r;
3554
3555 if (!endswith(fname, ".journal"))
3556 return r;
3557
3558 /* The file is corrupted. Rotate it away and try it again (but only once) */
3559
3560 l = strlen(fname);
3561 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
3562 (int) l - 8, fname,
3563 now(CLOCK_REALTIME),
3564 random_u64()) < 0)
3565 return -ENOMEM;
3566
3567 if (rename(fname, p) < 0)
3568 return -errno;
3569
3570 /* btrfs doesn't cope well with our write pattern and
3571 * fragments heavily. Let's defrag all files we rotate */
3572
3573 (void) chattr_path(p, 0, FS_NOCOW_FL);
3574 (void) btrfs_defrag(p);
3575
3576 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3577
3578 return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
3579 }
3580
3581 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3582 uint64_t i, n;
3583 uint64_t q, xor_hash = 0;
3584 int r;
3585 EntryItem *items;
3586 dual_timestamp ts;
3587
3588 assert(from);
3589 assert(to);
3590 assert(o);
3591 assert(p);
3592
3593 if (!to->writable)
3594 return -EPERM;
3595
3596 ts.monotonic = le64toh(o->entry.monotonic);
3597 ts.realtime = le64toh(o->entry.realtime);
3598
3599 n = journal_file_entry_n_items(o);
3600 /* alloca() can't take 0, hence let's allocate at least one */
3601 items = alloca(sizeof(EntryItem) * MAX(1u, n));
3602
3603 for (i = 0; i < n; i++) {
3604 uint64_t l, h;
3605 le64_t le_hash;
3606 size_t t;
3607 void *data;
3608 Object *u;
3609
3610 q = le64toh(o->entry.items[i].object_offset);
3611 le_hash = o->entry.items[i].hash;
3612
3613 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3614 if (r < 0)
3615 return r;
3616
3617 if (le_hash != o->data.hash)
3618 return -EBADMSG;
3619
3620 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3621 t = (size_t) l;
3622
3623 /* We hit the limit on 32bit machines */
3624 if ((uint64_t) t != l)
3625 return -E2BIG;
3626
3627 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3628 #if HAVE_XZ || HAVE_LZ4
3629 size_t rsize = 0;
3630
3631 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3632 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3633 if (r < 0)
3634 return r;
3635
3636 data = from->compress_buffer;
3637 l = rsize;
3638 #else
3639 return -EPROTONOSUPPORT;
3640 #endif
3641 } else
3642 data = o->data.payload;
3643
3644 r = journal_file_append_data(to, data, l, &u, &h);
3645 if (r < 0)
3646 return r;
3647
3648 xor_hash ^= le64toh(u->data.hash);
3649 items[i].object_offset = htole64(h);
3650 items[i].hash = u->data.hash;
3651
3652 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3653 if (r < 0)
3654 return r;
3655 }
3656
3657 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3658
3659 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
3660 return -EIO;
3661
3662 return r;
3663 }
3664
3665 void journal_reset_metrics(JournalMetrics *m) {
3666 assert(m);
3667
3668 /* Set everything to "pick automatic values". */
3669
3670 *m = (JournalMetrics) {
3671 .min_use = (uint64_t) -1,
3672 .max_use = (uint64_t) -1,
3673 .min_size = (uint64_t) -1,
3674 .max_size = (uint64_t) -1,
3675 .keep_free = (uint64_t) -1,
3676 .n_max_files = (uint64_t) -1,
3677 };
3678 }
3679
3680 void journal_default_metrics(JournalMetrics *m, int fd) {
3681 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
3682 struct statvfs ss;
3683 uint64_t fs_size;
3684
3685 assert(m);
3686 assert(fd >= 0);
3687
3688 if (fstatvfs(fd, &ss) >= 0)
3689 fs_size = ss.f_frsize * ss.f_blocks;
3690 else {
3691 log_debug_errno(errno, "Failed to detremine disk size: %m");
3692 fs_size = 0;
3693 }
3694
3695 if (m->max_use == (uint64_t) -1) {
3696
3697 if (fs_size > 0) {
3698 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3699
3700 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3701 m->max_use = DEFAULT_MAX_USE_UPPER;
3702
3703 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3704 m->max_use = DEFAULT_MAX_USE_LOWER;
3705 } else
3706 m->max_use = DEFAULT_MAX_USE_LOWER;
3707 } else {
3708 m->max_use = PAGE_ALIGN(m->max_use);
3709
3710 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3711 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3712 }
3713
3714 if (m->min_use == (uint64_t) -1)
3715 m->min_use = DEFAULT_MIN_USE;
3716
3717 if (m->min_use > m->max_use)
3718 m->min_use = m->max_use;
3719
3720 if (m->max_size == (uint64_t) -1) {
3721 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3722
3723 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3724 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3725 } else
3726 m->max_size = PAGE_ALIGN(m->max_size);
3727
3728 if (m->max_size != 0) {
3729 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3730 m->max_size = JOURNAL_FILE_SIZE_MIN;
3731
3732 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3733 m->max_use = m->max_size*2;
3734 }
3735
3736 if (m->min_size == (uint64_t) -1)
3737 m->min_size = JOURNAL_FILE_SIZE_MIN;
3738 else {
3739 m->min_size = PAGE_ALIGN(m->min_size);
3740
3741 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3742 m->min_size = JOURNAL_FILE_SIZE_MIN;
3743
3744 if (m->max_size != 0 && m->min_size > m->max_size)
3745 m->max_size = m->min_size;
3746 }
3747
3748 if (m->keep_free == (uint64_t) -1) {
3749
3750 if (fs_size > 0) {
3751 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3752
3753 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3754 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3755
3756 } else
3757 m->keep_free = DEFAULT_KEEP_FREE;
3758 }
3759
3760 if (m->n_max_files == (uint64_t) -1)
3761 m->n_max_files = DEFAULT_N_MAX_FILES;
3762
3763 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3764 format_bytes(a, sizeof(a), m->min_use),
3765 format_bytes(b, sizeof(b), m->max_use),
3766 format_bytes(c, sizeof(c), m->max_size),
3767 format_bytes(d, sizeof(d), m->min_size),
3768 format_bytes(e, sizeof(e), m->keep_free),
3769 m->n_max_files);
3770 }
3771
3772 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3773 assert(f);
3774 assert(f->header);
3775 assert(from || to);
3776
3777 if (from) {
3778 if (f->header->head_entry_realtime == 0)
3779 return -ENOENT;
3780
3781 *from = le64toh(f->header->head_entry_realtime);
3782 }
3783
3784 if (to) {
3785 if (f->header->tail_entry_realtime == 0)
3786 return -ENOENT;
3787
3788 *to = le64toh(f->header->tail_entry_realtime);
3789 }
3790
3791 return 1;
3792 }
3793
3794 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3795 Object *o;
3796 uint64_t p;
3797 int r;
3798
3799 assert(f);
3800 assert(from || to);
3801
3802 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3803 if (r <= 0)
3804 return r;
3805
3806 if (le64toh(o->data.n_entries) <= 0)
3807 return 0;
3808
3809 if (from) {
3810 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3811 if (r < 0)
3812 return r;
3813
3814 *from = le64toh(o->entry.monotonic);
3815 }
3816
3817 if (to) {
3818 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3819 if (r < 0)
3820 return r;
3821
3822 r = generic_array_get_plus_one(f,
3823 le64toh(o->data.entry_offset),
3824 le64toh(o->data.entry_array_offset),
3825 le64toh(o->data.n_entries)-1,
3826 &o, NULL);
3827 if (r <= 0)
3828 return r;
3829
3830 *to = le64toh(o->entry.monotonic);
3831 }
3832
3833 return 1;
3834 }
3835
3836 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3837 assert(f);
3838 assert(f->header);
3839
3840 /* If we gained new header fields we gained new features,
3841 * hence suggest a rotation */
3842 if (le64toh(f->header->header_size) < sizeof(Header)) {
3843 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3844 return true;
3845 }
3846
3847 /* Let's check if the hash tables grew over a certain fill
3848 * level (75%, borrowing this value from Java's hash table
3849 * implementation), and if so suggest a rotation. To calculate
3850 * the fill level we need the n_data field, which only exists
3851 * in newer versions. */
3852
3853 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3854 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3855 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3856 f->path,
3857 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3858 le64toh(f->header->n_data),
3859 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3860 (unsigned long long) f->last_stat.st_size,
3861 f->last_stat.st_size / le64toh(f->header->n_data));
3862 return true;
3863 }
3864
3865 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3866 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3867 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3868 f->path,
3869 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3870 le64toh(f->header->n_fields),
3871 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3872 return true;
3873 }
3874
3875 /* Are the data objects properly indexed by field objects? */
3876 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3877 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3878 le64toh(f->header->n_data) > 0 &&
3879 le64toh(f->header->n_fields) == 0)
3880 return true;
3881
3882 if (max_file_usec > 0) {
3883 usec_t t, h;
3884
3885 h = le64toh(f->header->head_entry_realtime);
3886 t = now(CLOCK_REALTIME);
3887
3888 if (h > 0 && t > h + max_file_usec)
3889 return true;
3890 }
3891
3892 return false;
3893 }