]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/libsystemd/sd-journal/journal-file.c
Merge pull request #22791 from keszybz/bootctl-invert-order
[thirdparty/systemd.git] / src / libsystemd / sd-journal / journal-file.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/fs.h>
6 #include <linux/magic.h>
7 #include <pthread.h>
8 #include <stddef.h>
9 #include <sys/mman.h>
10 #include <sys/statvfs.h>
11 #include <sys/uio.h>
12 #include <unistd.h>
13
14 #include "sd-event.h"
15
16 #include "alloc-util.h"
17 #include "chattr-util.h"
18 #include "compress.h"
19 #include "env-util.h"
20 #include "fd-util.h"
21 #include "format-util.h"
22 #include "fs-util.h"
23 #include "journal-authenticate.h"
24 #include "journal-def.h"
25 #include "journal-file.h"
26 #include "lookup3.h"
27 #include "memory-util.h"
28 #include "path-util.h"
29 #include "random-util.h"
30 #include "set.h"
31 #include "sort-util.h"
32 #include "stat-util.h"
33 #include "string-table.h"
34 #include "string-util.h"
35 #include "strv.h"
36 #include "sync-util.h"
37 #include "xattr-util.h"
38
39 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
41
42 #define DEFAULT_COMPRESS_THRESHOLD (512ULL)
43 #define MIN_COMPRESS_THRESHOLD (8ULL)
44
45 /* This is the minimum journal file size */
46 #define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL) /* 512 KiB */
47
48 /* These are the lower and upper bounds if we deduce the max_use value
49 * from the file system size */
50 #define MAX_USE_LOWER (1 * 1024 * 1024ULL) /* 1 MiB */
51 #define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
52
53 /* Those are the lower and upper bounds for the minimal use limit,
54 * i.e. how much we'll use even if keep_free suggests otherwise. */
55 #define MIN_USE_LOW (1 * 1024 * 1024ULL) /* 1 MiB */
56 #define MIN_USE_HIGH (16 * 1024 * 1024ULL) /* 16 MiB */
57
58 /* This is the upper bound if we deduce max_size from max_use */
59 #define MAX_SIZE_UPPER (128 * 1024 * 1024ULL) /* 128 MiB */
60
61 /* This is the upper bound if we deduce the keep_free value from the
62 * file system size */
63 #define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
64
65 /* This is the keep_free value when we can't determine the system
66 * size */
67 #define DEFAULT_KEEP_FREE (1024 * 1024ULL) /* 1 MB */
68
69 /* This is the default maximum number of journal files to keep around. */
70 #define DEFAULT_N_MAX_FILES 100
71
72 /* n_data was the first entry we added after the initial file format design */
73 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
74
75 /* How many entries to keep in the entry array chain cache at max */
76 #define CHAIN_CACHE_MAX 20
77
78 /* How much to increase the journal file size at once each time we allocate something new. */
79 #define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL) /* 8MB */
80
81 /* Reread fstat() of the file for detecting deletions at least this often */
82 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
83
84 /* The mmap context to use for the header we pick as one above the last defined typed */
85 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
86
87 /* Longest hash chain to rotate after */
88 #define HASH_CHAIN_DEPTH_MAX 100
89
90 #ifdef __clang__
91 # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
92 #endif
93
94 int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset) {
95 uint64_t p;
96 int r;
97
98 assert(f);
99 assert(f->header);
100 assert(ret_offset);
101
102 /* Same as journal_file_tail_end_by_mmap() below, but operates with pread() to avoid the mmap cache
103 * (and thus is thread safe) */
104
105 p = le64toh(f->header->tail_object_offset);
106 if (p == 0)
107 p = le64toh(f->header->header_size);
108 else {
109 Object tail;
110 uint64_t sz;
111
112 r = journal_file_read_object_header(f, OBJECT_UNUSED, p, &tail);
113 if (r < 0)
114 return r;
115
116 sz = le64toh(tail.object.size);
117 if (sz > UINT64_MAX - sizeof(uint64_t) + 1)
118 return -EBADMSG;
119
120 sz = ALIGN64(sz);
121 if (p > UINT64_MAX - sz)
122 return -EBADMSG;
123
124 p += sz;
125 }
126
127 *ret_offset = p;
128
129 return 0;
130 }
131
132 int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset) {
133 uint64_t p;
134 int r;
135
136 assert(f);
137 assert(f->header);
138 assert(ret_offset);
139
140 /* Same as journal_file_tail_end_by_pread() above, but operates with the usual mmap logic */
141
142 p = le64toh(f->header->tail_object_offset);
143 if (p == 0)
144 p = le64toh(f->header->header_size);
145 else {
146 Object *tail;
147 uint64_t sz;
148
149 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
150 if (r < 0)
151 return r;
152
153 sz = le64toh(READ_NOW(tail->object.size));
154 if (sz > UINT64_MAX - sizeof(uint64_t) + 1)
155 return -EBADMSG;
156
157 sz = ALIGN64(sz);
158 if (p > UINT64_MAX - sz)
159 return -EBADMSG;
160
161 p += sz;
162 }
163
164 *ret_offset = p;
165
166 return 0;
167 }
168
169 int journal_file_set_offline_thread_join(JournalFile *f) {
170 int r;
171
172 assert(f);
173
174 if (f->offline_state == OFFLINE_JOINED)
175 return 0;
176
177 r = pthread_join(f->offline_thread, NULL);
178 if (r)
179 return -r;
180
181 f->offline_state = OFFLINE_JOINED;
182
183 if (mmap_cache_fd_got_sigbus(f->cache_fd))
184 return -EIO;
185
186 return 0;
187 }
188
/* Transitions the file to STATE_ONLINE so that new objects may be appended. If an asynchronous
 * offline thread is active, cancels it at a safe point via compare-and-swap transitions, or joins
 * it when cancellation is no longer possible. Requires a writable, open, mapped file; returns 0 on
 * success or a negative errno-style error. */
static int journal_file_set_online(JournalFile *f) {
        bool wait = true;

        assert(f);

        if (!f->writable)
                return -EPERM;

        if (f->fd < 0 || !f->header)
                return -EINVAL;

        /* The offline thread advances f->offline_state concurrently; each CAS below only succeeds
         * if the state did not move under us, otherwise we re-read and retry. */
        while (wait) {
                switch (f->offline_state) {
                case OFFLINE_JOINED:
                        /* No offline thread, no need to wait. */
                        wait = false;
                        break;

                case OFFLINE_SYNCING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
                                continue; /* state changed concurrently, re-evaluate */
                        /* Canceled syncing prior to offlining, no need to wait. */
                        wait = false;
                        break;

                case OFFLINE_AGAIN_FROM_SYNCING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
                                continue; /* state changed concurrently, re-evaluate */
                        /* Canceled restart from syncing, no need to wait. */
                        wait = false;
                        break;

                case OFFLINE_AGAIN_FROM_OFFLINING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
                                continue; /* state changed concurrently, re-evaluate */
                        /* Canceled restart from offlining, must wait for offlining to complete however. */
                        _fallthrough_;
                default: {
                        int r;

                        r = journal_file_set_offline_thread_join(f);
                        if (r < 0)
                                return r;

                        wait = false;
                        break;
                }
                }
        }

        if (mmap_cache_fd_got_sigbus(f->cache_fd))
                return -EIO;

        switch (f->header->state) {
        case STATE_ONLINE:
                return 0;

        case STATE_OFFLINE:
                f->header->state = STATE_ONLINE;
                /* Make the state change durable before we start appending. */
                (void) fsync(f->fd);
                return 0;

        default:
                /* E.g. STATE_ARCHIVED: not eligible for going online again. */
                return -EINVAL;
        }
}
255
/* Releases all resources associated with the journal file and frees it. NULL-safe; always returns
 * NULL so callers can write "f = journal_file_close(f);". */
JournalFile* journal_file_close(JournalFile *f) {
        if (!f)
                return NULL;

        if (f->cache_fd)
                mmap_cache_fd_free(f->cache_fd);

        /* Only close the fd if we own it — it may have been passed in by the caller. */
        if (f->close_fd)
                safe_close(f->fd);
        free(f->path);

        ordered_hashmap_free_free(f->chain_cache);

#if HAVE_COMPRESSION
        free(f->compress_buffer);
#endif

#if HAVE_GCRYPT
        /* fss_file is an mmap()ed region; otherwise fsprg_state is a separately allocated buffer
         * (only one of the two release paths applies, as the if/else shows). */
        if (f->fss_file)
                munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
        else
                free(f->fsprg_state);

        free(f->fsprg_seed);

        if (f->hmac)
                gcry_md_close(f->hmac);
#endif

        return mfree(f);
}
287
288 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
289 Header h = {};
290 ssize_t k;
291 int r;
292
293 assert(f);
294
295 memcpy(h.signature, HEADER_SIGNATURE, 8);
296 h.header_size = htole64(ALIGN64(sizeof(h)));
297
298 h.incompatible_flags |= htole32(
299 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
300 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4 |
301 f->compress_zstd * HEADER_INCOMPATIBLE_COMPRESSED_ZSTD |
302 f->keyed_hash * HEADER_INCOMPATIBLE_KEYED_HASH);
303
304 h.compatible_flags = htole32(
305 f->seal * HEADER_COMPATIBLE_SEALED);
306
307 r = sd_id128_randomize(&h.file_id);
308 if (r < 0)
309 return r;
310
311 if (template) {
312 h.seqnum_id = template->header->seqnum_id;
313 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
314 } else
315 h.seqnum_id = h.file_id;
316
317 k = pwrite(f->fd, &h, sizeof(h), 0);
318 if (k < 0)
319 return -errno;
320
321 if (k != sizeof(h))
322 return -EIO;
323
324 return 0;
325 }
326
327 static int journal_file_refresh_header(JournalFile *f) {
328 int r;
329
330 assert(f);
331 assert(f->header);
332
333 r = sd_id128_get_machine(&f->header->machine_id);
334 if (IN_SET(r, -ENOENT, -ENOMEDIUM))
335 /* We don't have a machine-id, let's continue without */
336 zero(f->header->machine_id);
337 else if (r < 0)
338 return r;
339
340 r = sd_id128_get_boot(&f->header->boot_id);
341 if (r < 0)
342 return r;
343
344 r = journal_file_set_online(f);
345
346 /* Sync the online state to disk; likely just created a new file, also sync the directory this file
347 * is located in. */
348 (void) fsync_full(f->fd);
349
350 return r;
351 }
352
353 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
354 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
355 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
356 const char *type = compatible ? "compatible" : "incompatible";
357 uint32_t flags;
358
359 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
360
361 if (flags & ~supported) {
362 if (flags & ~any)
363 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
364 f->path, type, flags & ~any);
365 flags = (flags & any) & ~supported;
366 if (flags) {
367 const char* strv[5];
368 size_t n = 0;
369 _cleanup_free_ char *t = NULL;
370
371 if (compatible) {
372 if (flags & HEADER_COMPATIBLE_SEALED)
373 strv[n++] = "sealed";
374 } else {
375 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ)
376 strv[n++] = "xz-compressed";
377 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4)
378 strv[n++] = "lz4-compressed";
379 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_ZSTD)
380 strv[n++] = "zstd-compressed";
381 if (flags & HEADER_INCOMPATIBLE_KEYED_HASH)
382 strv[n++] = "keyed-hash";
383 }
384 strv[n] = NULL;
385 assert(n < ELEMENTSOF(strv));
386
387 t = strv_join((char**) strv, ", ");
388 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
389 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
390 }
391 return true;
392 }
393
394 return false;
395 }
396
/* Validates the header of a freshly opened journal file: signature, feature flags, state, size
 * bookkeeping and offset alignment. In writable mode additionally enforces that the file belongs
 * to this machine, is cleanly offline, and does not carry timestamps from the future. On success
 * caches the file's feature flags in 'f'. Returns 0 or a negative errno-style error. */
static int journal_file_verify_header(JournalFile *f) {
        uint64_t arena_size, header_size;

        assert(f);
        assert(f->header);

        if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
                return -EBADMSG;

        /* In both read and write mode we refuse to open files with incompatible
         * flags we don't know. */
        if (warn_wrong_flags(f, false))
                return -EPROTONOSUPPORT;

        /* When open for writing we refuse to open files with compatible flags, too. */
        if (f->writable && warn_wrong_flags(f, true))
                return -EPROTONOSUPPORT;

        if (f->header->state >= _STATE_MAX)
                return -EBADMSG;

        header_size = le64toh(READ_NOW(f->header->header_size));

        /* The first addition was n_data, so check that we are at least this large */
        if (header_size < HEADER_SIZE_MIN)
                return -EBADMSG;

        /* Sealing requires the n_entry_arrays counter field to be present in the header. */
        if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                return -EBADMSG;

        arena_size = le64toh(READ_NOW(f->header->arena_size));

        /* header + arena must not overflow, and must fit in the file as we last stat()ed it. */
        if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
                return -ENODATA;

        if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
                return -ENODATA;

        /* All object offsets stored in the header must be 64-bit aligned. */
        if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
            !VALID64(le64toh(f->header->field_hash_table_offset)) ||
            !VALID64(le64toh(f->header->tail_object_offset)) ||
            !VALID64(le64toh(f->header->entry_array_offset)))
                return -ENODATA;

        if (f->writable) {
                sd_id128_t machine_id;
                uint8_t state;
                int r;

                /* We only ever append to files created on the local machine. */
                r = sd_id128_get_machine(&machine_id);
                if (r < 0)
                        return r;

                if (!sd_id128_equal(machine_id, f->header->machine_id))
                        return -EHOSTDOWN;

                state = f->header->state;

                if (state == STATE_ARCHIVED)
                        return -ESHUTDOWN; /* Already archived */
                else if (state == STATE_ONLINE)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
                                               "Journal file %s is already online. Assuming unclean closing.",
                                               f->path);
                else if (state != STATE_OFFLINE)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
                                               "Journal file %s has unknown state %i.",
                                               f->path, state);

                if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
                        return -EBADMSG;

                /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
                 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
                 * bisection. */
                if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
                        return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
                                               "Journal file %s is from the future, refusing to append new data to it that'd be older.",
                                               f->path);
        }

        /* Cache the feature flags the file actually uses for quick access. */
        f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
        f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
        f->compress_zstd = JOURNAL_HEADER_COMPRESSED_ZSTD(f->header);

        f->seal = JOURNAL_HEADER_SEALED(f->header);

        f->keyed_hash = JOURNAL_HEADER_KEYED_HASH(f->header);

        return 0;
}
488
489 int journal_file_fstat(JournalFile *f) {
490 int r;
491
492 assert(f);
493 assert(f->fd >= 0);
494
495 if (fstat(f->fd, &f->last_stat) < 0)
496 return -errno;
497
498 f->last_stat_usec = now(CLOCK_MONOTONIC);
499
500 /* Refuse dealing with files that aren't regular */
501 r = stat_verify_regular(&f->last_stat);
502 if (r < 0)
503 return r;
504
505 /* Refuse appending to files that are already deleted */
506 if (f->last_stat.st_nlink <= 0)
507 return -EIDRM;
508
509 return 0;
510 }
511
/* Ensures the file is large enough to hold 'size' bytes at 'offset', growing the arena with
 * posix_fallocate() in FILE_SIZE_INCREASE steps while honoring the max_size and keep_free metrics.
 * Updates the header's arena_size and refreshes the cached stat data. */
static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
        uint64_t old_size, new_size, old_header_size, old_arena_size;
        int r;

        assert(f);
        assert(f->header);

        /* We assume that this file is not sparse, and we know that for sure, since we always call
         * posix_fallocate() ourselves */

        if (size > PAGE_ALIGN_DOWN(UINT64_MAX) - offset)
                return -EINVAL;

        if (mmap_cache_fd_got_sigbus(f->cache_fd))
                return -EIO;

        old_header_size = le64toh(READ_NOW(f->header->header_size));
        old_arena_size = le64toh(READ_NOW(f->header->arena_size));
        if (old_arena_size > PAGE_ALIGN_DOWN(UINT64_MAX) - old_header_size)
                return -EBADMSG;

        old_size = old_header_size + old_arena_size;

        /* Never shrink below the header, and round up to whole pages. */
        new_size = MAX(PAGE_ALIGN(offset + size), old_header_size);

        if (new_size <= old_size) {

                /* We already pre-allocated enough space, but before
                 * we write to it, let's check with fstat() if the
                 * file got deleted, in order make sure we don't throw
                 * away the data immediately. Don't check fstat() for
                 * all writes though, but only once ever 10s. */

                if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
                        return 0;

                return journal_file_fstat(f);
        }

        /* Allocate more space. */

        if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
                return -E2BIG;

        /* Beyond the guaranteed minimum size, respect the configured keep_free headroom on the
         * backing file system (best effort: a failing fstatvfs() is simply ignored). */
        if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
                struct statvfs svfs;

                if (fstatvfs(f->fd, &svfs) >= 0) {
                        uint64_t available;

                        available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);

                        if (new_size - old_size > available)
                                return -E2BIG;
                }
        }

        /* Increase by larger blocks at once */
        new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
        if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
                new_size = f->metrics.max_size;

        /* Note that the glibc fallocate() fallback is very
           inefficient, hence we try to minimize the allocation area
           as we can. */
        r = posix_fallocate_loop(f->fd, old_size, new_size - old_size);
        if (r < 0)
                return r;

        /* Publish the new arena size in the header only after the space actually exists on disk. */
        f->header->arena_size = htole64(new_size - old_header_size);

        return journal_file_fstat(f);
}
585
586 static unsigned type_to_context(ObjectType type) {
587 /* One context for each type, plus one catch-all for the rest */
588 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
589 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
590 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
591 }
592
593 static int journal_file_move_to(
594 JournalFile *f,
595 ObjectType type,
596 bool keep_always,
597 uint64_t offset,
598 uint64_t size,
599 void **ret) {
600
601 int r;
602
603 assert(f);
604 assert(ret);
605
606 if (size <= 0)
607 return -EINVAL;
608
609 if (size > UINT64_MAX - offset)
610 return -EBADMSG;
611
612 /* Avoid SIGBUS on invalid accesses */
613 if (offset + size > (uint64_t) f->last_stat.st_size) {
614 /* Hmm, out of range? Let's refresh the fstat() data
615 * first, before we trust that check. */
616
617 r = journal_file_fstat(f);
618 if (r < 0)
619 return r;
620
621 if (offset + size > (uint64_t) f->last_stat.st_size)
622 return -EADDRNOTAVAIL;
623 }
624
625 return mmap_cache_fd_get(f->cache_fd, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
626 }
627
628 static uint64_t minimum_header_size(Object *o) {
629
630 static const uint64_t table[] = {
631 [OBJECT_DATA] = sizeof(DataObject),
632 [OBJECT_FIELD] = sizeof(FieldObject),
633 [OBJECT_ENTRY] = sizeof(EntryObject),
634 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
635 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
636 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
637 [OBJECT_TAG] = sizeof(TagObject),
638 };
639
640 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
641 return sizeof(ObjectHeader);
642
643 return table[o->object.type];
644 }
645
/* Lightweight object checks. We want this to be fast, so that we won't
 * slowdown every journal_file_move_to_object() call too much. */
static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
        assert(f);
        assert(o);

        /* Per-type sanity checks on an object already read from the file. 'offset' is only used in
         * log messages. Returns 0 if plausible, -EBADMSG (with a debug log) otherwise. Unknown
         * types pass through unchecked. */

        switch (o->object.type) {

        case OBJECT_DATA:
                /* entry_offset and n_entries must be zero or non-zero together. */
                if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Bad n_entries: %" PRIu64 ": %" PRIu64,
                                               le64toh(o->data.n_entries),
                                               offset);

                /* A data object must carry at least one byte of payload. */
                if (le64toh(o->object.size) <= offsetof(Object, data.payload))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
                                               offsetof(Object, data.payload),
                                               le64toh(o->object.size),
                                               offset);

                if (!VALID64(le64toh(o->data.next_hash_offset)) ||
                    !VALID64(le64toh(o->data.next_field_offset)) ||
                    !VALID64(le64toh(o->data.entry_offset)) ||
                    !VALID64(le64toh(o->data.entry_array_offset)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
                                               le64toh(o->data.next_hash_offset),
                                               le64toh(o->data.next_field_offset),
                                               le64toh(o->data.entry_offset),
                                               le64toh(o->data.entry_array_offset),
                                               offset);

                break;

        case OBJECT_FIELD:
                /* A field object must carry at least one byte of payload. */
                if (le64toh(o->object.size) <= offsetof(Object, field.payload))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
                                               offsetof(Object, field.payload),
                                               le64toh(o->object.size),
                                               offset);

                if (!VALID64(le64toh(o->field.next_hash_offset)) ||
                    !VALID64(le64toh(o->field.head_data_offset)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
                                               le64toh(o->field.next_hash_offset),
                                               le64toh(o->field.head_data_offset),
                                               offset);
                break;

        case OBJECT_ENTRY: {
                uint64_t sz;

                /* Size must cover the fixed part, and the remainder must be whole EntryItems. */
                sz = le64toh(READ_NOW(o->object.size));
                if (sz < offsetof(Object, entry.items) ||
                    (sz - offsetof(Object, entry.items)) % sizeof(EntryItem) != 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
                                               offsetof(Object, entry.items),
                                               sz,
                                               offset);

                if ((sz - offsetof(Object, entry.items)) / sizeof(EntryItem) <= 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid number items in entry: %" PRIu64 ": %" PRIu64,
                                               (sz - offsetof(Object, entry.items)) / sizeof(EntryItem),
                                               offset);

                if (le64toh(o->entry.seqnum) <= 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
                                               le64toh(o->entry.seqnum),
                                               offset);

                if (!VALID_REALTIME(le64toh(o->entry.realtime)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
                                               le64toh(o->entry.realtime),
                                               offset);

                if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
                                               le64toh(o->entry.monotonic),
                                               offset);

                break;
        }

        case OBJECT_DATA_HASH_TABLE:
        case OBJECT_FIELD_HASH_TABLE: {
                uint64_t sz;

                /* Size must cover the fixed part plus a non-zero whole number of HashItems. */
                sz = le64toh(READ_NOW(o->object.size));
                if (sz < offsetof(Object, hash_table.items) ||
                    (sz - offsetof(Object, hash_table.items)) % sizeof(HashItem) != 0 ||
                    (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem) <= 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
                                               o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
                                               sz,
                                               offset);

                break;
        }

        case OBJECT_ENTRY_ARRAY: {
                uint64_t sz;

                /* Size must cover the fixed part plus a non-zero whole number of le64_t items. */
                sz = le64toh(READ_NOW(o->object.size));
                if (sz < offsetof(Object, entry_array.items) ||
                    (sz - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||
                    (sz - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
                                               sz,
                                               offset);

                if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
                                               le64toh(o->entry_array.next_entry_array_offset),
                                               offset);

                break;
        }

        case OBJECT_TAG:
                /* Tag objects are fixed-size. */
                if (le64toh(o->object.size) != sizeof(TagObject))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid object tag size: %" PRIu64 ": %" PRIu64,
                                               le64toh(o->object.size),
                                               offset);

                if (!VALID_EPOCH(le64toh(o->tag.epoch)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
                                               le64toh(o->tag.epoch), offset);

                break;
        }

        return 0;
}
793
/* Maps the object at 'offset' and returns a pointer to it in *ret (if non-NULL). First maps just
 * the object header to learn the size, validates it, then maps the full object and runs the
 * lightweight per-type checks. 'type' may be OBJECT_UNUSED to accept any type. */
int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
        int r;
        void *t;
        Object *o;
        uint64_t s;

        assert(f);

        /* Objects may only be located at multiple of 64 bit */
        if (!VALID64(offset))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object at non-64bit boundary: %" PRIu64,
                                       offset);

        /* Object may not be located in the file header */
        if (offset < le64toh(f->header->header_size))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object located in file header: %" PRIu64,
                                       offset);

        /* First pass: map only the common header so we can read the object's size. */
        r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
        if (r < 0)
                return r;

        o = (Object*) t;
        s = le64toh(READ_NOW(o->object.size));

        if (s == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to uninitialized object: %" PRIu64,
                                       offset);
        if (s < sizeof(ObjectHeader))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to overly short object: %" PRIu64,
                                       offset);

        if (o->object.type <= OBJECT_UNUSED)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object with invalid type: %" PRIu64,
                                       offset);

        if (s < minimum_header_size(o))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to truncated object: %" PRIu64,
                                       offset);

        if (type > OBJECT_UNUSED && o->object.type != type)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object of unexpected type: %" PRIu64,
                                       offset);

        /* Second pass: map the full object now that the size is known and plausible. */
        r = journal_file_move_to(f, type, false, offset, s, &t);
        if (r < 0)
                return r;

        o = (Object*) t;

        r = journal_file_check_object(f, offset, o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        return 0;
}
860
/* Reads the object header at 'offset' with pread() — i.e. without touching the mmap cache, hence
 * usable from any thread — validates it, and copies it into *ret (if non-NULL). Only the header
 * portion is guaranteed to be filled in, not any payload. 'type' may be OBJECT_UNUSED to accept
 * any type. */
int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret) {
        uint64_t s;
        ssize_t n;
        Object o;
        int r;

        assert(f);

        /* Objects may only be located at multiple of 64 bit */
        if (!VALID64(offset))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object at non-64bit boundary: %" PRIu64,
                                       offset);

        /* Object may not be located in the file header */
        if (offset < le64toh(f->header->header_size))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object located in file header: %" PRIu64,
                                       offset);

        /* This will likely read too much data but it avoids having to call pread() twice. */
        n = pread(f->fd, &o, sizeof(o), offset);
        if (n < 0)
                return log_debug_errno(errno, "Failed to read journal file at offset: %" PRIu64,
                                       offset);

        /* We need at least the common object header. */
        if ((size_t) n < sizeof(o.object))
                return log_debug_errno(SYNTHETIC_ERRNO(EIO),
                                       "Failed to read short object at offset: %" PRIu64,
                                       offset);

        s = le64toh(o.object.size);
        if (s == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read uninitialized object: %" PRIu64,
                                       offset);
        if (s < sizeof(o.object))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read overly short object: %" PRIu64,
                                       offset);

        if (o.object.type <= OBJECT_UNUSED)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object with invalid type: %" PRIu64,
                                       offset);

        if (s < minimum_header_size(&o))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read truncated object: %" PRIu64,
                                       offset);

        /* The object claims a large enough header, but the pread() didn't deliver it all. */
        if ((size_t) n < minimum_header_size(&o))
                return log_debug_errno(SYNTHETIC_ERRNO(EIO),
                                       "Short read while reading object: %" PRIu64,
                                       offset);

        if (type > OBJECT_UNUSED && o.object.type != type)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object of unexpected type: %" PRIu64,
                                       offset);

        r = journal_file_check_object(f, offset, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        return 0;
}
931
932 static uint64_t journal_file_entry_seqnum(
933 JournalFile *f,
934 uint64_t *seqnum) {
935
936 uint64_t ret;
937
938 assert(f);
939 assert(f->header);
940
941 /* Picks a new sequence number for the entry we are about to add and returns it. */
942
943 ret = le64toh(f->header->tail_entry_seqnum) + 1;
944
945 if (seqnum) {
946 /* If an external seqnum counter was passed, we update both the local and the external one,
947 * and set it to the maximum of both */
948
949 if (*seqnum + 1 > ret)
950 ret = *seqnum + 1;
951
952 *seqnum = ret;
953 }
954
955 f->header->tail_entry_seqnum = htole64(ret);
956
957 if (f->header->head_entry_seqnum == 0)
958 f->header->head_entry_seqnum = htole64(ret);
959
960 return ret;
961 }
962
/* Appends a new, zero-initialized-header object of the given type and size at the end of the file:
 * brings the file online, finds the current tail, grows the file as needed, maps the new region,
 * and updates the header's tail offset and object counter. Returns the mapped object in *ret and
 * its offset in *ret_offset (each if non-NULL). */
int journal_file_append_object(
                JournalFile *f,
                ObjectType type,
                uint64_t size,
                Object **ret,
                uint64_t *ret_offset) {

        int r;
        uint64_t p;
        Object *o;
        void *t;

        assert(f);
        assert(f->header);
        assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
        assert(size >= sizeof(ObjectHeader));

        /* Appending implies writing, hence the file must be online first. */
        r = journal_file_set_online(f);
        if (r < 0)
                return r;

        /* The new object starts where the last one ends. */
        r = journal_file_tail_end_by_mmap(f, &p);
        if (r < 0)
                return r;

        /* Make sure there is enough allocated space for it. */
        r = journal_file_allocate(f, p, size);
        if (r < 0)
                return r;

        r = journal_file_move_to(f, type, false, p, size, &t);
        if (r < 0)
                return r;

        o = (Object*) t;
        o->object = (ObjectHeader) {
                .type = type,
                .size = htole64(size),
        };

        /* Only after the object header is written do we publish it via the file header. */
        f->header->tail_object_offset = htole64(p);
        f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);

        if (ret)
                *ret = o;

        if (ret_offset)
                *ret_offset = p;

        return 0;
}
1013
1014 static int journal_file_setup_data_hash_table(JournalFile *f) {
1015 uint64_t s, p;
1016 Object *o;
1017 int r;
1018
1019 assert(f);
1020 assert(f->header);
1021
1022 /* We estimate that we need 1 hash table entry per 768 bytes
1023 of journal file and we want to make sure we never get
1024 beyond 75% fill level. Calculate the hash table size for
1025 the maximum file size based on these metrics. */
1026
1027 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
1028 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1029 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1030
1031 log_debug("Reserving %"PRIu64" entries in data hash table.", s / sizeof(HashItem));
1032
1033 r = journal_file_append_object(f,
1034 OBJECT_DATA_HASH_TABLE,
1035 offsetof(Object, hash_table.items) + s,
1036 &o, &p);
1037 if (r < 0)
1038 return r;
1039
1040 memzero(o->hash_table.items, s);
1041
1042 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1043 f->header->data_hash_table_size = htole64(s);
1044
1045 return 0;
1046 }
1047
1048 static int journal_file_setup_field_hash_table(JournalFile *f) {
1049 uint64_t s, p;
1050 Object *o;
1051 int r;
1052
1053 assert(f);
1054 assert(f->header);
1055
1056 /* We use a fixed size hash table for the fields as this
1057 * number should grow very slowly only */
1058
1059 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1060 log_debug("Reserving %"PRIu64" entries in field hash table.", s / sizeof(HashItem));
1061
1062 r = journal_file_append_object(f,
1063 OBJECT_FIELD_HASH_TABLE,
1064 offsetof(Object, hash_table.items) + s,
1065 &o, &p);
1066 if (r < 0)
1067 return r;
1068
1069 memzero(o->hash_table.items, s);
1070
1071 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1072 f->header->field_hash_table_size = htole64(s);
1073
1074 return 0;
1075 }
1076
1077 int journal_file_map_data_hash_table(JournalFile *f) {
1078 uint64_t s, p;
1079 void *t;
1080 int r;
1081
1082 assert(f);
1083 assert(f->header);
1084
1085 if (f->data_hash_table)
1086 return 0;
1087
1088 p = le64toh(f->header->data_hash_table_offset);
1089 s = le64toh(f->header->data_hash_table_size);
1090
1091 r = journal_file_move_to(f,
1092 OBJECT_DATA_HASH_TABLE,
1093 true,
1094 p, s,
1095 &t);
1096 if (r < 0)
1097 return r;
1098
1099 f->data_hash_table = t;
1100 return 0;
1101 }
1102
1103 int journal_file_map_field_hash_table(JournalFile *f) {
1104 uint64_t s, p;
1105 void *t;
1106 int r;
1107
1108 assert(f);
1109 assert(f->header);
1110
1111 if (f->field_hash_table)
1112 return 0;
1113
1114 p = le64toh(f->header->field_hash_table_offset);
1115 s = le64toh(f->header->field_hash_table_size);
1116
1117 r = journal_file_move_to(f,
1118 OBJECT_FIELD_HASH_TABLE,
1119 true,
1120 p, s,
1121 &t);
1122 if (r < 0)
1123 return r;
1124
1125 f->field_hash_table = t;
1126 return 0;
1127 }
1128
/* Append a freshly written FIELD object at @offset to the tail of its hash
 * bucket chain and bump the header's field counter. Returns 0 on success,
 * negative errno on failure. */
static int journal_file_link_field(
                JournalFile *f,
                Object *o,
                uint64_t offset,
                uint64_t hash) {

        uint64_t p, h, m;
        int r;

        assert(f);
        assert(f->header);
        assert(f->field_hash_table);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_FIELD)
                return -EINVAL;

        /* m = number of buckets; a zero-sized table would make the modulo below invalid. */
        m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        /* This might alter the window we are looking at */
        o->field.next_hash_offset = o->field.head_data_offset = 0;

        h = hash % m;
        p = le64toh(f->field_hash_table[h].tail_hash_offset);
        if (p == 0)
                /* Empty bucket: this object becomes the chain head. */
                f->field_hash_table[h].head_hash_offset = htole64(offset);
        else {
                /* Non-empty bucket: patch the current tail to point at us.
                 * Note that this re-targets 'o' onto the old tail object. */
                r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
                if (r < 0)
                        return r;

                o->field.next_hash_offset = htole64(offset);
        }

        f->field_hash_table[h].tail_hash_offset = htole64(offset);

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);

        return 0;
}
1173
/* Append a freshly written DATA object at @offset to the tail of its hash
 * bucket chain and bump the header's data counter. Returns 0 on success,
 * negative errno on failure. */
static int journal_file_link_data(
                JournalFile *f,
                Object *o,
                uint64_t offset,
                uint64_t hash) {

        uint64_t p, h, m;
        int r;

        assert(f);
        assert(f->header);
        assert(f->data_hash_table);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_DATA)
                return -EINVAL;

        /* m = number of buckets; a zero-sized table would make the modulo below invalid. */
        m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        /* This might alter the window we are looking at */
        o->data.next_hash_offset = o->data.next_field_offset = 0;
        o->data.entry_offset = o->data.entry_array_offset = 0;
        o->data.n_entries = 0;

        h = hash % m;
        p = le64toh(f->data_hash_table[h].tail_hash_offset);
        if (p == 0)
                /* Only entry in the hash table is easy */
                f->data_hash_table[h].head_hash_offset = htole64(offset);
        else {
                /* Move back to the previous data object, to patch in
                 * pointer. Note that this re-targets 'o' onto the old tail. */

                r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
                if (r < 0)
                        return r;

                o->data.next_hash_offset = htole64(offset);
        }

        f->data_hash_table[h].tail_hash_offset = htole64(offset);

        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                f->header->n_data = htole64(le64toh(f->header->n_data) + 1);

        return 0;
}
1224
1225 static int next_hash_offset(
1226 JournalFile *f,
1227 uint64_t *p,
1228 le64_t *next_hash_offset,
1229 uint64_t *depth,
1230 le64_t *header_max_depth) {
1231
1232 uint64_t nextp;
1233
1234 nextp = le64toh(READ_NOW(*next_hash_offset));
1235 if (nextp > 0) {
1236 if (nextp <= *p) /* Refuse going in loops */
1237 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1238 "Detected hash item loop in %s, refusing.", f->path);
1239
1240 (*depth)++;
1241
1242 /* If the depth of this hash chain is larger than all others we have seen so far, record it */
1243 if (header_max_depth && f->writable)
1244 *header_max_depth = htole64(MAX(*depth, le64toh(*header_max_depth)));
1245 }
1246
1247 *p = nextp;
1248 return 0;
1249 }
1250
/* Look up a FIELD object by payload and precomputed hash. Returns 1 and fills
 * in *ret/*ret_offset when found, 0 when not present, negative errno on
 * corruption or I/O failure. */
int journal_file_find_field_object_with_hash(
                JournalFile *f,
                const void *field, uint64_t size, uint64_t hash,
                Object **ret, uint64_t *ret_offset) {

        uint64_t p, osize, h, m, depth = 0;
        int r;

        assert(f);
        assert(f->header);
        assert(field && size > 0);

        /* If the field hash table is empty, we can't find anything */
        if (le64toh(f->header->field_hash_table_size) <= 0)
                return 0;

        /* Map the field hash table, if it isn't mapped yet. */
        r = journal_file_map_field_hash_table(f);
        if (r < 0)
                return r;

        /* Full on-disk object size of a matching FIELD object. */
        osize = offsetof(Object, field.payload) + size;

        m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        /* Walk the bucket's chain, comparing hash, size and payload. */
        h = hash % m;
        p = le64toh(f->field_hash_table[h].head_hash_offset);
        while (p > 0) {
                Object *o;

                r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
                if (r < 0)
                        return r;

                if (le64toh(o->field.hash) == hash &&
                    le64toh(o->object.size) == osize &&
                    memcmp(o->field.payload, field, size) == 0) {

                        if (ret)
                                *ret = o;
                        if (ret_offset)
                                *ret_offset = p;

                        return 1;
                }

                /* Advance along the chain; this also detects chain loops. */
                r = next_hash_offset(
                                f,
                                &p,
                                &o->field.next_hash_offset,
                                &depth,
                                JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) ? &f->header->field_hash_chain_depth : NULL);
                if (r < 0)
                        return r;
        }

        return 0;
}
1311
1312 uint64_t journal_file_hash_data(
1313 JournalFile *f,
1314 const void *data,
1315 size_t sz) {
1316
1317 assert(f);
1318 assert(data || sz == 0);
1319
1320 /* We try to unify our codebase on siphash, hence new-styled journal files utilizing the keyed hash
1321 * function use siphash. Old journal files use the Jenkins hash. */
1322
1323 if (JOURNAL_HEADER_KEYED_HASH(f->header))
1324 return siphash24(data, sz, f->header->file_id.bytes);
1325
1326 return jenkins_hash64(data, sz);
1327 }
1328
1329 int journal_file_find_field_object(
1330 JournalFile *f,
1331 const void *field, uint64_t size,
1332 Object **ret, uint64_t *ret_offset) {
1333
1334 assert(f);
1335 assert(field && size > 0);
1336
1337 return journal_file_find_field_object_with_hash(
1338 f,
1339 field, size,
1340 journal_file_hash_data(f, field, size),
1341 ret, ret_offset);
1342 }
1343
/* Look up a DATA object by payload and precomputed hash, transparently
 * decompressing stored objects for the comparison. Returns 1 and fills in
 * *ret/*ret_offset when found, 0 when not present, negative errno on error. */
int journal_file_find_data_object_with_hash(
                JournalFile *f,
                const void *data, uint64_t size, uint64_t hash,
                Object **ret, uint64_t *ret_offset) {

        uint64_t p, osize, h, m, depth = 0;
        int r;

        assert(f);
        assert(f->header);
        assert(data || size == 0);

        /* If there's no data hash table, then there's no entry. */
        if (le64toh(f->header->data_hash_table_size) <= 0)
                return 0;

        /* Map the data hash table, if it isn't mapped yet. */
        r = journal_file_map_data_hash_table(f);
        if (r < 0)
                return r;

        /* Full on-disk object size of a matching uncompressed DATA object. */
        osize = offsetof(Object, data.payload) + size;

        m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        h = hash % m;
        p = le64toh(f->data_hash_table[h].head_hash_offset);

        while (p > 0) {
                Object *o;

                r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
                if (r < 0)
                        return r;

                /* Cheap check first: the stored hash must match. */
                if (le64toh(o->data.hash) != hash)
                        goto next;

                if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if HAVE_COMPRESSION
                        uint64_t l;
                        size_t rsize = 0;

                        l = le64toh(READ_NOW(o->object.size));
                        if (l <= offsetof(Object, data.payload))
                                return -EBADMSG;

                        l -= offsetof(Object, data.payload);

                        /* Decompress into the file's scratch buffer to compare payloads. */
                        r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
                                            o->data.payload, l, &f->compress_buffer, &rsize, 0);
                        if (r < 0)
                                return r;

                        if (rsize == size &&
                            memcmp(f->compress_buffer, data, size) == 0) {

                                if (ret)
                                        *ret = o;

                                if (ret_offset)
                                        *ret_offset = p;

                                return 1;
                        }
#else
                        return -EPROTONOSUPPORT;
#endif
                } else if (le64toh(o->object.size) == osize &&
                           memcmp(o->data.payload, data, size) == 0) {

                        if (ret)
                                *ret = o;

                        if (ret_offset)
                                *ret_offset = p;

                        return 1;
                }

        next:
                /* Advance along the chain; this also detects chain loops. */
                r = next_hash_offset(
                                f,
                                &p,
                                &o->data.next_hash_offset,
                                &depth,
                                JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) ? &f->header->data_hash_chain_depth : NULL);
                if (r < 0)
                        return r;
        }

        return 0;
}
1439
1440 int journal_file_find_data_object(
1441 JournalFile *f,
1442 const void *data, uint64_t size,
1443 Object **ret, uint64_t *ret_offset) {
1444
1445 assert(f);
1446 assert(data || size == 0);
1447
1448 return journal_file_find_data_object_with_hash(
1449 f,
1450 data, size,
1451 journal_file_hash_data(f, data, size),
1452 ret, ret_offset);
1453 }
1454
bool journal_field_valid(const char *p, size_t l, bool allow_protected) {
        /* We kinda enforce POSIX syntax recommendations for environment
         * variables here, but make a couple of additional requirements.
         *
         * http://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html */

        if (l == SIZE_MAX)
                l = strlen(p);

        /* Reject empty names and names longer than 64 characters. */
        if (l == 0 || l > 64)
                return false;

        /* A leading underscore marks a protected (trusted) field. */
        if (p[0] == '_' && !allow_protected)
                return false;

        /* The first character must not be a digit. */
        if (p[0] >= '0' && p[0] <= '9')
                return false;

        /* Every character must come from [A-Z0-9_]. */
        for (size_t i = 0; i < l; i++) {
                char c = p[i];

                if (c == '_')
                        continue;
                if (c >= 'A' && c <= 'Z')
                        continue;
                if (c >= '0' && c <= '9')
                        continue;

                return false;
        }

        return true;
}
1490
/* Return the FIELD object for the given field name, creating and linking it
 * if it doesn't exist yet. Returns 0 on success (with *ret/*ret_offset filled
 * in if requested), negative errno on failure. */
static int journal_file_append_field(
                JournalFile *f,
                const void *field, uint64_t size,
                Object **ret, uint64_t *ret_offset) {

        uint64_t hash, p;
        uint64_t osize;
        Object *o;
        int r;

        assert(f);
        assert(field && size > 0);

        /* Refuse syntactically invalid field names outright. */
        if (!journal_field_valid(field, size, true))
                return -EBADMSG;

        hash = journal_file_hash_data(f, field, size);

        /* Reuse an existing object if the field was seen before. */
        r = journal_file_find_field_object_with_hash(f, field, size, hash, ret, ret_offset);
        if (r < 0)
                return r;
        if (r > 0)
                return 0;

        osize = offsetof(Object, field.payload) + size;
        r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
        if (r < 0)
                return r;

        o->field.hash = htole64(hash);
        memcpy(o->field.payload, field, size);

        r = journal_file_link_field(f, o, p, hash);
        if (r < 0)
                return r;

        /* The linking might have altered the window, so let's only pass the offset to hmac which will
         * move to the object again if needed. */

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_FIELD, NULL, p);
        if (r < 0)
                return r;
#endif

        if (ret) {
                /* Re-resolve the object, since 'o' may have been invalidated above. */
                r = journal_file_move_to_object(f, OBJECT_FIELD, p, ret);
                if (r < 0)
                        return r;
        }

        if (ret_offset)
                *ret_offset = p;

        return 0;
}
1547
/* Return the DATA object for the given "FIELD=value" payload, creating it
 * (optionally compressed), hashing it, and linking it both into the data hash
 * table and into its field's data list if it doesn't exist yet. */
static int journal_file_append_data(
                JournalFile *f,
                const void *data, uint64_t size,
                Object **ret, uint64_t *ret_offset) {

        uint64_t hash, p, fp, osize;
        Object *o, *fo;
        int r, compression = 0;
        const void *eq;

        assert(f);

        if (!data || size == 0)
                return -EINVAL;

        hash = journal_file_hash_data(f, data, size);

        /* Deduplicate: reuse an existing object with the same payload. */
        r = journal_file_find_data_object_with_hash(f, data, size, hash, ret, ret_offset);
        if (r < 0)
                return r;
        if (r > 0)
                return 0;

        /* The payload must be of the form "FIELD=value". */
        eq = memchr(data, '=', size);
        if (!eq)
                return -EINVAL;

        osize = offsetof(Object, data.payload) + size;
        r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
        if (r < 0)
                return r;

        o->data.hash = htole64(hash);

#if HAVE_COMPRESSION
        if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
                size_t rsize = 0;

                /* Compress in place into the payload area; only worth it if the
                 * result is strictly smaller than the input (hence size - 1). */
                compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);

                if (compression >= 0) {
                        o->object.size = htole64(offsetof(Object, data.payload) + rsize);
                        o->object.flags |= compression;

                        log_debug("Compressed data object %"PRIu64" -> %zu using %s",
                                  size, rsize, object_compressed_to_string(compression));
                } else
                        /* Compression didn't work, we don't really care why, let's continue without compression */
                        compression = 0;
        }
#endif

        if (compression == 0)
                memcpy_safe(o->data.payload, data, size);

        r = journal_file_link_data(f, o, p, hash);
        if (r < 0)
                return r;

        /* The linking might have altered the window, so let's refresh our pointer. */
        r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
        if (r < 0)
                return r;

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
        if (r < 0)
                return r;
#endif

        /* Create field object ... */
        r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
        if (r < 0)
                return r;

        /* ... and link it in. */
        o->data.next_field_offset = fo->field.head_data_offset;
        fo->field.head_data_offset = le64toh(p);

        if (ret)
                *ret = o;

        if (ret_offset)
                *ret_offset = p;

        return 0;
}
1635
1636 uint64_t journal_file_entry_n_items(Object *o) {
1637 uint64_t sz;
1638 assert(o);
1639
1640 if (o->object.type != OBJECT_ENTRY)
1641 return 0;
1642
1643 sz = le64toh(READ_NOW(o->object.size));
1644 if (sz < offsetof(Object, entry.items))
1645 return 0;
1646
1647 return (sz - offsetof(Object, entry.items)) / sizeof(EntryItem);
1648 }
1649
1650 uint64_t journal_file_entry_array_n_items(Object *o) {
1651 uint64_t sz;
1652
1653 assert(o);
1654
1655 if (o->object.type != OBJECT_ENTRY_ARRAY)
1656 return 0;
1657
1658 sz = le64toh(READ_NOW(o->object.size));
1659 if (sz < offsetof(Object, entry_array.items))
1660 return 0;
1661
1662 return (sz - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1663 }
1664
1665 uint64_t journal_file_hash_table_n_items(Object *o) {
1666 uint64_t sz;
1667
1668 assert(o);
1669
1670 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1671 return 0;
1672
1673 sz = le64toh(READ_NOW(o->object.size));
1674 if (sz < offsetof(Object, hash_table.items))
1675 return 0;
1676
1677 return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1678 }
1679
/* Store entry offset @p at position *idx of the entry array chain starting at
 * *first, appending a new (doubled-size) array object to the chain if all
 * existing arrays are full. On success *idx is advanced by one. */
static int link_entry_into_array(JournalFile *f,
                                 le64_t *first,
                                 le64_t *idx,
                                 uint64_t p) {
        int r;
        uint64_t n = 0, ap = 0, q, i, a, hidx;
        Object *o;

        assert(f);
        assert(f->header);
        assert(first);
        assert(idx);
        assert(p > 0);

        /* Walk the chain looking for the array that contains index *idx.
         * 'i' counts down the remaining index within the current array,
         * 'ap' remembers the previous array so we can patch its next pointer. */
        a = le64toh(*first);
        i = hidx = le64toh(READ_NOW(*idx));
        while (a > 0) {

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                if (r < 0)
                        return r;

                n = journal_file_entry_array_n_items(o);
                if (i < n) {
                        /* Slot found in an existing array, store and we're done. */
                        o->entry_array.items[i] = htole64(p);
                        *idx = htole64(hidx + 1);
                        return 0;
                }

                i -= n;
                ap = a;
                a = le64toh(o->entry_array.next_entry_array_offset);
        }

        /* All arrays full (or chain empty): size the new array to double the
         * last one (n holds the last array's item count here), with a minimum
         * of 4 items. */
        if (hidx > n)
                n = (hidx+1) * 2;
        else
                n = n * 2;

        if (n < 4)
                n = 4;

        r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
                                       offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
                                       &o, &q);
        if (r < 0)
                return r;

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
        if (r < 0)
                return r;
#endif

        o->entry_array.items[i] = htole64(p);

        if (ap == 0)
                /* Chain was empty: the new array becomes the head. */
                *first = htole64(q);
        else {
                /* Hook the new array onto the end of the chain. */
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
                if (r < 0)
                        return r;

                o->entry_array.next_entry_array_offset = htole64(q);
        }

        if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);

        *idx = htole64(hidx + 1);

        return 0;
}
1753
1754 static int link_entry_into_array_plus_one(JournalFile *f,
1755 le64_t *extra,
1756 le64_t *first,
1757 le64_t *idx,
1758 uint64_t p) {
1759
1760 uint64_t hidx;
1761 int r;
1762
1763 assert(f);
1764 assert(extra);
1765 assert(first);
1766 assert(idx);
1767 assert(p > 0);
1768
1769 hidx = le64toh(READ_NOW(*idx));
1770 if (hidx == UINT64_MAX)
1771 return -EBADMSG;
1772 if (hidx == 0)
1773 *extra = htole64(p);
1774 else {
1775 le64_t i;
1776
1777 i = htole64(hidx - 1);
1778 r = link_entry_into_array(f, first, &i, p);
1779 if (r < 0)
1780 return r;
1781 }
1782
1783 *idx = htole64(hidx + 1);
1784 return 0;
1785 }
1786
1787 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1788 uint64_t p;
1789 int r;
1790
1791 assert(f);
1792 assert(o);
1793 assert(offset > 0);
1794
1795 p = le64toh(o->entry.items[i].object_offset);
1796 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1797 if (r < 0)
1798 return r;
1799
1800 return link_entry_into_array_plus_one(f,
1801 &o->data.entry_offset,
1802 &o->data.entry_array_offset,
1803 &o->data.n_entries,
1804 offset);
1805 }
1806
/* Link a freshly written ENTRY object into the global entry array, update the
 * header's head/tail timestamps, and register the entry with each data object
 * it references. Returns 0 on success, -E2BIG if some items could not be
 * linked for space reasons, other negative errno on hard failure. */
static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
        uint64_t n;
        int r;

        assert(f);
        assert(f->header);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_ENTRY)
                return -EINVAL;

        /* Make sure the entry payload is fully visible before it becomes reachable. */
        __sync_synchronize();

        /* Link up the entry itself */
        r = link_entry_into_array(f,
                                  &f->header->entry_array_offset,
                                  &f->header->n_entries,
                                  offset);
        if (r < 0)
                return r;

        /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */

        if (f->header->head_entry_realtime == 0)
                f->header->head_entry_realtime = o->entry.realtime;

        f->header->tail_entry_realtime = o->entry.realtime;
        f->header->tail_entry_monotonic = o->entry.monotonic;

        /* Link up the items */
        n = journal_file_entry_n_items(o);
        for (uint64_t i = 0; i < n; i++) {
                int k;

                /* If we fail to link an entry item because we can't allocate a new entry array, don't fail
                 * immediately but try to link the other entry items since it might still be possible to link
                 * those if they don't require a new entry array to be allocated. */

                k = journal_file_link_entry_item(f, o, offset, i);
                if (k == -E2BIG)
                        r = k; /* remember, but keep going */
                else if (k < 0)
                        return k;
        }

        /* 0, or -E2BIG if any item failed that way. */
        return r;
}
1855
/* Write a fully prepared ENTRY object (items already resolved to data-object
 * offsets) to the file and link it in. Timestamps must already be validated
 * by the caller. */
static int journal_file_append_entry_internal(
                JournalFile *f,
                const dual_timestamp *ts,
                const sd_id128_t *boot_id,
                uint64_t xor_hash,
                const EntryItem items[], unsigned n_items,
                uint64_t *seqnum,
                Object **ret, uint64_t *ret_offset) {
        uint64_t np;
        uint64_t osize;
        Object *o;
        int r;

        assert(f);
        assert(f->header);
        assert(items || n_items == 0);
        assert(ts);

        osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));

        r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
        if (r < 0)
                return r;

        o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
        memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
        o->entry.realtime = htole64(ts->realtime);
        o->entry.monotonic = htole64(ts->monotonic);
        o->entry.xor_hash = htole64(xor_hash);
        /* An explicitly passed boot ID updates the header's notion of the current boot. */
        if (boot_id)
                f->header->boot_id = *boot_id;
        o->entry.boot_id = f->header->boot_id;

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
        if (r < 0)
                return r;
#endif

        r = journal_file_link_entry(f, o, np);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (ret_offset)
                *ret_offset = np;

        /* Propagates journal_file_link_entry()'s result (0 or -E2BIG). */
        return r;
}
1907
/* Notify inotify watchers that the journal file changed, by truncating the
 * file to its own current size. */
void journal_file_post_change(JournalFile *f) {
        assert(f);

        if (f->fd < 0)
                return;

        /* inotify() does not receive IN_MODIFY events from file
         * accesses done via mmap(). After each access we hence
         * trigger IN_MODIFY by truncating the journal file to its
         * current size which triggers IN_MODIFY. */

        __sync_synchronize();

        if (ftruncate(f->fd, f->last_stat.st_size) < 0)
                log_debug_errno(errno, "Failed to truncate file to its own size: %m");
}
1924
1925 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1926 assert(userdata);
1927
1928 journal_file_post_change(userdata);
1929
1930 return 1;
1931 }
1932
1933 static void schedule_post_change(JournalFile *f) {
1934 int r;
1935
1936 assert(f);
1937 assert(f->post_change_timer);
1938
1939 r = sd_event_source_get_enabled(f->post_change_timer, NULL);
1940 if (r < 0) {
1941 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1942 goto fail;
1943 }
1944 if (r > 0)
1945 return;
1946
1947 r = sd_event_source_set_time_relative(f->post_change_timer, f->post_change_timer_period);
1948 if (r < 0) {
1949 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1950 goto fail;
1951 }
1952
1953 r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
1954 if (r < 0) {
1955 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1956 goto fail;
1957 }
1958
1959 return;
1960
1961 fail:
1962 /* On failure, let's simply post the change immediately. */
1963 journal_file_post_change(f);
1964 }
1965
1966 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1967 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1968 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1969 int r;
1970
1971 assert(f);
1972 assert_return(!f->post_change_timer, -EINVAL);
1973 assert(e);
1974 assert(t);
1975
1976 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1977 if (r < 0)
1978 return r;
1979
1980 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1981 if (r < 0)
1982 return r;
1983
1984 f->post_change_timer = TAKE_PTR(timer);
1985 f->post_change_timer_period = t;
1986
1987 return r;
1988 }
1989
1990 static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
1991 return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
1992 }
1993
1994 static size_t remove_duplicate_entry_items(EntryItem items[], size_t n) {
1995
1996 /* This function relies on the items array being sorted. */
1997 size_t j = 1;
1998
1999 if (n <= 1)
2000 return n;
2001
2002 for (size_t i = 1; i < n; i++)
2003 if (items[i].object_offset != items[j - 1].object_offset)
2004 items[j++] = items[i];
2005
2006 return j;
2007 }
2008
2009 int journal_file_append_entry(
2010 JournalFile *f,
2011 const dual_timestamp *ts,
2012 const sd_id128_t *boot_id,
2013 const struct iovec iovec[], unsigned n_iovec,
2014 uint64_t *seqnum,
2015 Object **ret, uint64_t *ret_offset) {
2016
2017 EntryItem *items;
2018 int r;
2019 uint64_t xor_hash = 0;
2020 struct dual_timestamp _ts;
2021
2022 assert(f);
2023 assert(f->header);
2024 assert(iovec && n_iovec > 0);
2025
2026 if (ts) {
2027 if (!VALID_REALTIME(ts->realtime))
2028 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2029 "Invalid realtime timestamp %" PRIu64 ", refusing entry.",
2030 ts->realtime);
2031 if (!VALID_MONOTONIC(ts->monotonic))
2032 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2033 "Invalid monotomic timestamp %" PRIu64 ", refusing entry.",
2034 ts->monotonic);
2035 } else {
2036 dual_timestamp_get(&_ts);
2037 ts = &_ts;
2038 }
2039
2040 #if HAVE_GCRYPT
2041 r = journal_file_maybe_append_tag(f, ts->realtime);
2042 if (r < 0)
2043 return r;
2044 #endif
2045
2046 items = newa(EntryItem, n_iovec);
2047
2048 for (size_t i = 0; i < n_iovec; i++) {
2049 uint64_t p;
2050 Object *o;
2051
2052 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
2053 if (r < 0)
2054 return r;
2055
2056 /* When calculating the XOR hash field, we need to take special care if the "keyed-hash"
2057 * journal file flag is on. We use the XOR hash field to quickly determine the identity of a
2058 * specific record, and give records with otherwise identical position (i.e. match in seqno,
2059 * timestamp, …) a stable ordering. But for that we can't have it that the hash of the
2060 * objects in each file is different since they are keyed. Hence let's calculate the Jenkins
2061 * hash here for that. This also has the benefit that cursors for old and new journal files
2062 * are completely identical (they include the XOR hash after all). For classic Jenkins-hash
2063 * files things are easier, we can just take the value from the stored record directly. */
2064
2065 if (JOURNAL_HEADER_KEYED_HASH(f->header))
2066 xor_hash ^= jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len);
2067 else
2068 xor_hash ^= le64toh(o->data.hash);
2069
2070 items[i] = (EntryItem) {
2071 .object_offset = htole64(p),
2072 .hash = o->data.hash,
2073 };
2074 }
2075
2076 /* Order by the position on disk, in order to improve seek
2077 * times for rotating media. */
2078 typesafe_qsort(items, n_iovec, entry_item_cmp);
2079 n_iovec = remove_duplicate_entry_items(items, n_iovec);
2080
2081 r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, ret_offset);
2082
2083 /* If the memory mapping triggered a SIGBUS then we return an
2084 * IO error and ignore the error code passed down to us, since
2085 * it is very likely just an effect of a nullified replacement
2086 * mapping page */
2087
2088 if (mmap_cache_fd_got_sigbus(f->cache_fd))
2089 r = -EIO;
2090
2091 if (f->post_change_timer)
2092 schedule_post_change(f);
2093 else
2094 journal_file_post_change(f);
2095
2096 return r;
2097 }
2098
/* Cached position within an entry array chain, so that repeated lookups in
 * the same chain don't have to re-walk it from the head every time. Keyed by
 * 'first' in f->chain_cache. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
2106
/* Record (or refresh) the cached position for the chain starting at @first.
 * @ci is the existing cache item for this chain, or NULL if none exists yet.
 * Best-effort: allocation or insertion failures are silently ignored. */
static void chain_cache_put(
                OrderedHashmap *h,
                ChainCacheItem *ci,
                uint64_t first,
                uint64_t array,
                uint64_t begin,
                uint64_t total,
                uint64_t last_index) {

        if (!ci) {
                /* If the chain item to cache for this chain is the
                 * first one it's not worth caching anything */
                if (array == first)
                        return;

                /* At capacity, recycle the oldest item rather than allocating. */
                if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
                        ci = ordered_hashmap_steal_first(h);
                        assert(ci);
                } else {
                        ci = new(ChainCacheItem, 1);
                        if (!ci)
                                return;
                }

                /* The hashmap key is &ci->first, so it must be set before insertion. */
                ci->first = first;

                if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
                        free(ci);
                        return;
                }
        } else
                assert(ci->first == first);

        ci->array = array;
        ci->begin = begin;
        ci->total = total;
        ci->last_index = last_index;
}
2145
2146 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2147 assert(i);
2148
2149 /* Increase or decrease the specified index, in the right direction. */
2150
2151 if (direction == DIRECTION_DOWN) {
2152 if (*i >= n - 1)
2153 return 0;
2154
2155 (*i)++;
2156 } else {
2157 if (*i <= 0)
2158 return 0;
2159
2160 (*i)--;
2161 }
2162
2163 return 1;
2164 }
2165
2166 static int bump_entry_array(JournalFile *f, Object *o, uint64_t offset, uint64_t first, direction_t direction, uint64_t *ret) {
2167 uint64_t p, q = 0;
2168 int r;
2169
2170 assert(f);
2171 assert(offset);
2172 assert(ret);
2173
2174 if (direction == DIRECTION_DOWN)
2175 return le64toh(o->entry_array.next_entry_array_offset);
2176
2177 /* Entry array chains are a singly linked list, so to find the previous array in the chain, we have
2178 * to start iterating from the top. */
2179
2180 p = first;
2181
2182 while (p > 0 && p != offset) {
2183 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, p, &o);
2184 if (r < 0)
2185 return r;
2186
2187 q = p;
2188 p = le64toh(o->entry_array.next_entry_array_offset);
2189 }
2190
2191 /* If we can't find the previous entry array in the entry array chain, we're likely dealing with a
2192 * corrupted journal file. */
2193 if (p == 0)
2194 return -EBADMSG;
2195
2196 *ret = q;
2197
2198 return 0;
2199 }
2200
/* Fetch the i-th entry from the entry array chain starting at @first,
 * skipping over corrupt arrays/entries in the given direction. Returns 1 and
 * fills in *ret/*ret_offset on success, 0 when no (further) entry exists,
 * negative errno on hard failure. */
static int generic_array_get(
                JournalFile *f,
                uint64_t first,
                uint64_t i,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        Object *o;
        uint64_t p = 0, a, t = 0, k;
        int r;
        ChainCacheItem *ci;

        assert(f);

        a = first;

        /* Try the chain cache first */
        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && i > ci->total) {
                /* Skip straight to the cached array; adjust i to be relative to it. */
                a = ci->array;
                i -= ci->total;
                t = ci->total;
        }

        /* Phase 1: locate the array that contains index i. */
        while (a > 0) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) {
                        /* If there's corruption and we're going downwards, let's pretend we reached the
                         * final entry in the entry array chain. */

                        if (direction == DIRECTION_DOWN)
                                return 0;

                        /* If there's corruption and we're going upwards, move back to the previous entry
                         * array and start iterating entries from there. */

                        r = bump_entry_array(f, NULL, a, first, DIRECTION_UP, &a);
                        if (r < 0)
                                return r;

                        /* UINT64_MAX signals "re-resolve i within array a" below. */
                        i = UINT64_MAX;

                        break;
                }
                if (r < 0)
                        return r;

                k = journal_file_entry_array_n_items(o);
                if (i < k)
                        break;

                i -= k;
                t += k;
                a = le64toh(o->entry_array.next_entry_array_offset);
        }

        /* If we've found the right location, now look for the first non-corrupt entry object (in the right
         * direction). */

        /* Phase 2: scan from that position for a readable entry. */
        while (a > 0) {
                /* In the first iteration of the while loop, we reuse i, k and o from the previous while
                 * loop. */
                if (i == UINT64_MAX) {
                        r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                        if (r < 0)
                                return r;

                        k = journal_file_entry_array_n_items(o);
                        if (k == 0)
                                break;

                        i = direction == DIRECTION_DOWN ? 0 : k - 1;
                }

                do {
                        p = le64toh(o->entry_array.items[i]);

                        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
                        if (r >= 0) {
                                /* Let's cache this item for the next invocation */
                                chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);

                                if (ret_offset)
                                        *ret_offset = p;

                                return 1;
                        }
                        if (!IN_SET(r, -EADDRNOTAVAIL, -EBADMSG))
                                return r;

                        /* OK, so this entry is borked. Most likely some entry didn't get synced to
                         * disk properly, let's see if the next one might work for us instead. */
                        log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
                } while (bump_array_index(&i, direction, k) > 0);

                /* Exhausted this array; move to the adjacent one. */
                r = bump_entry_array(f, o, a, first, direction, &a);
                if (r < 0)
                        return r;

                t += k;
                i = UINT64_MAX;
        }

        return 0;
}
2306
2307 static int generic_array_get_plus_one(
2308 JournalFile *f,
2309 uint64_t extra,
2310 uint64_t first,
2311 uint64_t i,
2312 direction_t direction,
2313 Object **ret, uint64_t *ret_offset) {
2314
2315 int r;
2316
2317 assert(f);
2318
2319 if (i == 0) {
2320 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret);
2321 if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG))
2322 return generic_array_get(f, first, 0, direction, ret, ret_offset);
2323 if (r < 0)
2324 return r;
2325
2326 if (ret_offset)
2327 *ret_offset = extra;
2328
2329 return 1;
2330 }
2331
2332 return generic_array_get(f, first, i - 1, direction, ret, ret_offset);
2333 }
2334
/* Result of a test_object() callback during bisection: the needle matched the
 * tested position, or lies to its left/right. */
enum {
        TEST_FOUND,
        TEST_LEFT,
        TEST_RIGHT
};
2340
/* Bisect a chain of entry arrays for the entry matching 'needle'.
 *
 * 'first' is the file offset of the first OBJECT_ENTRY_ARRAY in the chain, 'n' the total number
 * of entries stored in the chain. 'test_object' compares the entry at a given file offset against
 * 'needle', returning TEST_FOUND/TEST_LEFT/TEST_RIGHT (or a negative errno). With DIRECTION_DOWN
 * the first matching entry is selected, with DIRECTION_UP the last one.
 *
 * Returns 1 and fills in ret/ret_offset/ret_idx (each optional; ret_idx is the global index across
 * the whole chain) on success, 0 if no suitable entry exists, negative errno on error. A per-file
 * chain cache is consulted and updated to avoid re-walking the chain on repeated lookups. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset,
                uint64_t *ret_idx) {

        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = UINT64_MAX;
        bool subtract_one = false;
        Object *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total && ci->begin != 0) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                /* Number of items actually usable in this array object, clamped to what's left. */
                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Probe the last usable item first, to decide whether the needle is in this array
                 * at all or in a later array of the chain. */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        r = -EBADMSG;
                else
                        r = test_object(f, p, needle);
                if (r == -EBADMSG) {
                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
                        n = i;
                        continue;
                }
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        /* The needle is within this array; bisect the index range [left, right]. */
                        left = 0;
                        right -= 1;

                        if (last_index != UINT64_MAX) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        /* Classic binary search over [left, right] within this one array. */
                        for (;;) {
                                if (left == right) {
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        r = -EBADMSG;
                                else
                                        r = test_object(f, p, needle);
                                if (r == -EBADMSG) {
                                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
                                        right = n = i;
                                        continue;
                                }
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                /* Ran past the end going up: select the last entry of the chain. */
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Advance to the next array in the chain, remembering the last item of this one
                 * in case DIRECTION_UP needs to step back across the array boundary. */
                last_p = lp;

                n -= k;
                t += k;
                last_index = UINT64_MAX;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);

        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        if (ret) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
                if (r < 0)
                        return r;
        }

        if (ret_offset)
                *ret_offset = p;

        if (ret_idx)
                *ret_idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
2556
/* Like generic_array_bisect(), but for entry lists with one "extra" entry stored in the referencing
 * data object before the chained arrays (cf. generic_array_get_plus_one()). The extra entry is
 * tested first; the remaining n-1 entries are bisected in the chain. ret_idx counts the extra
 * entry as index 0, shifting chain indices up by one. */
static int generic_array_bisect_plus_one(
                JournalFile *f,
                uint64_t extra,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset,
                uint64_t *ret_idx) {

        int r;
        bool step_back = false;

        assert(f);
        assert(test_object);

        if (n <= 0)
                return 0;

        /* This bisects the array in object 'first', but first checks
         * an extra */
        r = test_object(f, extra, needle);
        if (r < 0)
                return r;

        if (r == TEST_FOUND)
                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

        /* if we are looking with DIRECTION_UP then we need to first
           see if in the actual array there is a matching entry, and
           return the last one of that. But if there isn't any we need
           to return this one. Hence remember this, and return it
           below. */
        if (r == TEST_LEFT)
                step_back = direction == DIRECTION_UP;

        if (r == TEST_RIGHT) {
                /* The needle sorts before the extra entry: going down it's our answer,
                 * going up there is nothing before it to return. */
                if (direction == DIRECTION_DOWN)
                        goto found;
                else
                        return 0;
        }

        r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, ret_offset, ret_idx);

        if (r == 0 && step_back)
                goto found;

        if (r > 0 && ret_idx)
                (*ret_idx)++;

        return r;

found:
        if (ret) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret);
                if (r < 0)
                        return r;
        }

        if (ret_offset)
                *ret_offset = extra;

        if (ret_idx)
                *ret_idx = 0;

        return 1;
}
2627
2628 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2629 assert(f);
2630 assert(p > 0);
2631
2632 if (p == needle)
2633 return TEST_FOUND;
2634 else if (p < needle)
2635 return TEST_LEFT;
2636 else
2637 return TEST_RIGHT;
2638 }
2639
2640 int journal_file_move_to_entry_by_offset(
2641 JournalFile *f,
2642 uint64_t p,
2643 direction_t direction,
2644 Object **ret,
2645 uint64_t *ret_offset) {
2646
2647 assert(f);
2648 assert(f->header);
2649
2650 return generic_array_bisect(
2651 f,
2652 le64toh(f->header->entry_array_offset),
2653 le64toh(f->header->n_entries),
2654 p,
2655 test_object_offset,
2656 direction,
2657 ret, ret_offset, NULL);
2658 }
2659
2660 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2661 uint64_t sq;
2662 Object *o;
2663 int r;
2664
2665 assert(f);
2666 assert(p > 0);
2667
2668 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2669 if (r < 0)
2670 return r;
2671
2672 sq = le64toh(READ_NOW(o->entry.seqnum));
2673 if (sq == needle)
2674 return TEST_FOUND;
2675 else if (sq < needle)
2676 return TEST_LEFT;
2677 else
2678 return TEST_RIGHT;
2679 }
2680
2681 int journal_file_move_to_entry_by_seqnum(
2682 JournalFile *f,
2683 uint64_t seqnum,
2684 direction_t direction,
2685 Object **ret,
2686 uint64_t *ret_offset) {
2687 assert(f);
2688 assert(f->header);
2689
2690 return generic_array_bisect(
2691 f,
2692 le64toh(f->header->entry_array_offset),
2693 le64toh(f->header->n_entries),
2694 seqnum,
2695 test_object_seqnum,
2696 direction,
2697 ret, ret_offset, NULL);
2698 }
2699
2700 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2701 Object *o;
2702 uint64_t rt;
2703 int r;
2704
2705 assert(f);
2706 assert(p > 0);
2707
2708 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2709 if (r < 0)
2710 return r;
2711
2712 rt = le64toh(READ_NOW(o->entry.realtime));
2713 if (rt == needle)
2714 return TEST_FOUND;
2715 else if (rt < needle)
2716 return TEST_LEFT;
2717 else
2718 return TEST_RIGHT;
2719 }
2720
2721 int journal_file_move_to_entry_by_realtime(
2722 JournalFile *f,
2723 uint64_t realtime,
2724 direction_t direction,
2725 Object **ret,
2726 uint64_t *ret_offset) {
2727 assert(f);
2728 assert(f->header);
2729
2730 return generic_array_bisect(
2731 f,
2732 le64toh(f->header->entry_array_offset),
2733 le64toh(f->header->n_entries),
2734 realtime,
2735 test_object_realtime,
2736 direction,
2737 ret, ret_offset, NULL);
2738 }
2739
2740 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2741 Object *o;
2742 uint64_t m;
2743 int r;
2744
2745 assert(f);
2746 assert(p > 0);
2747
2748 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2749 if (r < 0)
2750 return r;
2751
2752 m = le64toh(READ_NOW(o->entry.monotonic));
2753 if (m == needle)
2754 return TEST_FOUND;
2755 else if (m < needle)
2756 return TEST_LEFT;
2757 else
2758 return TEST_RIGHT;
2759 }
2760
2761 static int find_data_object_by_boot_id(
2762 JournalFile *f,
2763 sd_id128_t boot_id,
2764 Object **o,
2765 uint64_t *b) {
2766
2767 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2768
2769 sd_id128_to_string(boot_id, t + 9);
2770 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2771 }
2772
/* Seek to the entry with the given monotonic timestamp within the given boot. Monotonic timestamps
 * are only meaningful relative to one boot, hence the entry list hanging off the "_BOOT_ID=" data
 * object for that boot is bisected, rather than the global entry array. Returns -ENOENT if this
 * file contains no entries of that boot. */
int journal_file_move_to_entry_by_monotonic(
                JournalFile *f,
                sd_id128_t boot_id,
                uint64_t monotonic,
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset) {

        Object *o;
        int r;

        assert(f);

        r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
        if (r < 0)
                return r;
        if (r == 0)
                return -ENOENT;

        return generic_array_bisect_plus_one(
                        f,
                        le64toh(o->data.entry_offset),
                        le64toh(o->data.entry_array_offset),
                        le64toh(o->data.n_entries),
                        monotonic,
                        test_object_monotonic,
                        direction,
                        ret, ret_offset, NULL);
}
2802
2803 void journal_file_reset_location(JournalFile *f) {
2804 f->location_type = LOCATION_HEAD;
2805 f->current_offset = 0;
2806 f->current_seqnum = 0;
2807 f->current_realtime = 0;
2808 f->current_monotonic = 0;
2809 zero(f->current_boot_id);
2810 f->current_xor_hash = 0;
2811 }
2812
2813 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2814 f->location_type = LOCATION_SEEK;
2815 f->current_offset = offset;
2816 f->current_seqnum = le64toh(o->entry.seqnum);
2817 f->current_realtime = le64toh(o->entry.realtime);
2818 f->current_monotonic = le64toh(o->entry.monotonic);
2819 f->current_boot_id = o->entry.boot_id;
2820 f->current_xor_hash = le64toh(o->entry.xor_hash);
2821 }
2822
/* Establish a global ordering between the current entries of two journal files, for interleaved
 * iteration: returns < 0 / 0 / > 0 if af's entry sorts before / equals / sorts after bf's.
 * Both files must have a saved seek location (journal_file_save_location()). The comparison
 * falls through a chain of progressively weaker criteria. */
int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
        int r;

        assert(af);
        assert(af->header);
        assert(bf);
        assert(bf->header);
        assert(af->location_type == LOCATION_SEEK);
        assert(bf->location_type == LOCATION_SEEK);

        /* If contents, timestamps and seqnum match, these entries are
         * identical. */
        if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
            af->current_monotonic == bf->current_monotonic &&
            af->current_realtime == bf->current_realtime &&
            af->current_xor_hash == bf->current_xor_hash &&
            sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id) &&
            af->current_seqnum == bf->current_seqnum)
                return 0;

        if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {

                /* If this is from the same seqnum source, compare
                 * seqnums */
                r = CMP(af->current_seqnum, bf->current_seqnum);
                if (r != 0)
                        return r;

                /* Wow! This is weird, different data but the same
                 * seqnums? Something is borked, but let's make the
                 * best of it and compare by time. */
        }

        if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {

                /* If the boot id matches, compare monotonic time */
                r = CMP(af->current_monotonic, bf->current_monotonic);
                if (r != 0)
                        return r;
        }

        /* Otherwise, compare UTC time */
        r = CMP(af->current_realtime, bf->current_realtime);
        if (r != 0)
                return r;

        /* Finally, compare by contents */
        return CMP(af->current_xor_hash, bf->current_xor_hash);
}
2872
2873 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2874
2875 /* Consider it an error if any of the two offsets is uninitialized */
2876 if (old_offset == 0 || new_offset == 0)
2877 return false;
2878
2879 /* If we go down, the new offset must be larger than the old one. */
2880 return direction == DIRECTION_DOWN ?
2881 new_offset > old_offset :
2882 new_offset < old_offset;
2883 }
2884
/* Return the entry following (DIRECTION_DOWN) or preceding (DIRECTION_UP) the entry at file
 * offset 'p', or the very first/last entry if p == 0. Returns 1 on success, 0 when iteration is
 * exhausted, negative errno on error (including -EBADMSG if the entry array turns out not to be
 * monotonically ordered by offset). */
int journal_file_next_entry(
                JournalFile *f,
                uint64_t p,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t i, n, ofs;
        int r;

        assert(f);
        assert(f->header);

        n = le64toh(READ_NOW(f->header->n_entries));
        if (n <= 0)
                return 0;

        if (p == 0)
                /* No reference position: start at the head or tail, depending on direction. */
                i = direction == DIRECTION_DOWN ? 0 : n - 1;
        else {
                /* Find the global index of the entry at offset p ... */
                r = generic_array_bisect(f,
                                         le64toh(f->header->entry_array_offset),
                                         le64toh(f->header->n_entries),
                                         p,
                                         test_object_offset,
                                         DIRECTION_DOWN,
                                         NULL, NULL,
                                         &i);
                if (r <= 0)
                        return r;

                /* ... and step one index in the requested direction. */
                r = bump_array_index(&i, direction, n);
                if (r <= 0)
                        return r;
        }

        /* And jump to it */
        r = generic_array_get(f, le64toh(f->header->entry_array_offset), i, direction, ret, &ofs);
        if (r <= 0)
                return r;

        /* Ensure our array is properly ordered. */
        if (p > 0 && !check_properly_ordered(ofs, p, direction))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "%s: entry array not properly ordered at entry %" PRIu64,
                                       f->path, i);

        if (ret_offset)
                *ret_offset = ofs;

        return 1;
}
2936
2937 int journal_file_next_entry_for_data(
2938 JournalFile *f,
2939 Object *d,
2940 direction_t direction,
2941 Object **ret, uint64_t *ret_offset) {
2942
2943 uint64_t i, n, ofs;
2944 int r;
2945
2946 assert(f);
2947 assert(d);
2948 assert(d->object.type == OBJECT_DATA);
2949
2950 n = le64toh(READ_NOW(d->data.n_entries));
2951 if (n <= 0)
2952 return n;
2953
2954 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2955
2956 r = generic_array_get_plus_one(f,
2957 le64toh(d->data.entry_offset),
2958 le64toh(d->data.entry_array_offset),
2959 i,
2960 direction,
2961 ret, &ofs);
2962 if (r <= 0)
2963 return r;
2964
2965 if (ret_offset)
2966 *ret_offset = ofs;
2967
2968 return 1;
2969 }
2970
2971 int journal_file_move_to_entry_by_offset_for_data(
2972 JournalFile *f,
2973 Object *d,
2974 uint64_t p,
2975 direction_t direction,
2976 Object **ret, uint64_t *ret_offset) {
2977
2978 assert(f);
2979 assert(d);
2980 assert(d->object.type == OBJECT_DATA);
2981
2982 return generic_array_bisect_plus_one(
2983 f,
2984 le64toh(d->data.entry_offset),
2985 le64toh(d->data.entry_array_offset),
2986 le64toh(d->data.n_entries),
2987 p,
2988 test_object_offset,
2989 direction,
2990 ret, ret_offset, NULL);
2991 }
2992
/* Seek to the entry that both references data object 'd' AND lies at (or nearest to, per
 * 'direction') the given monotonic timestamp of the given boot. Implemented by alternately
 * bisecting the entry list of 'd' and the entry list of the "_BOOT_ID=" data object until both
 * agree on the same entry offset. Returns -ENOENT if this file has no entries of that boot. */
int journal_file_move_to_entry_by_monotonic_for_data(
                JournalFile *f,
                Object *d,
                sd_id128_t boot_id,
                uint64_t monotonic,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        Object *o;
        int r;
        uint64_t b, z, entry_offset, entry_array_offset, n_entries;

        assert(f);
        assert(d);
        assert(d->object.type == OBJECT_DATA);

        /* Save all the required data before the data object gets invalidated. */
        entry_offset = le64toh(READ_NOW(d->data.entry_offset));
        entry_array_offset = le64toh(READ_NOW(d->data.entry_array_offset));
        n_entries = le64toh(READ_NOW(d->data.n_entries));

        /* First, seek by time */
        r = find_data_object_by_boot_id(f, boot_id, &o, &b);
        if (r < 0)
                return r;
        if (r == 0)
                return -ENOENT;

        r = generic_array_bisect_plus_one(f,
                                          le64toh(o->data.entry_offset),
                                          le64toh(o->data.entry_array_offset),
                                          le64toh(o->data.n_entries),
                                          monotonic,
                                          test_object_monotonic,
                                          direction,
                                          NULL, &z, NULL);
        if (r <= 0)
                return r;

        /* And now, continue seeking until we find an entry that
         * exists in both bisection arrays */

        /* Re-fetch the boot-id data object: the earlier bisection may have invalidated 'o'. */
        r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
        if (r < 0)
                return r;

        for (;;) {
                uint64_t p, q;

                /* Nearest entry of 'd' at or beyond z ... */
                r = generic_array_bisect_plus_one(f,
                                                  entry_offset,
                                                  entry_array_offset,
                                                  n_entries,
                                                  z,
                                                  test_object_offset,
                                                  direction,
                                                  NULL, &p, NULL);
                if (r <= 0)
                        return r;

                /* ... and nearest entry of the boot at or beyond p. */
                r = generic_array_bisect_plus_one(f,
                                                  le64toh(o->data.entry_offset),
                                                  le64toh(o->data.entry_array_offset),
                                                  le64toh(o->data.n_entries),
                                                  p,
                                                  test_object_offset,
                                                  direction,
                                                  NULL, &q, NULL);

                if (r <= 0)
                        return r;

                /* Both lists converged on the same entry: that's our result. */
                if (p == q) {
                        if (ret) {
                                r = journal_file_move_to_object(f, OBJECT_ENTRY, q, ret);
                                if (r < 0)
                                        return r;
                        }

                        if (ret_offset)
                                *ret_offset = q;

                        return 1;
                }

                z = q;
        }
}
3081
3082 int journal_file_move_to_entry_by_seqnum_for_data(
3083 JournalFile *f,
3084 Object *d,
3085 uint64_t seqnum,
3086 direction_t direction,
3087 Object **ret, uint64_t *ret_offset) {
3088
3089 assert(f);
3090 assert(d);
3091 assert(d->object.type == OBJECT_DATA);
3092
3093 return generic_array_bisect_plus_one(
3094 f,
3095 le64toh(d->data.entry_offset),
3096 le64toh(d->data.entry_array_offset),
3097 le64toh(d->data.n_entries),
3098 seqnum,
3099 test_object_seqnum,
3100 direction,
3101 ret, ret_offset, NULL);
3102 }
3103
3104 int journal_file_move_to_entry_by_realtime_for_data(
3105 JournalFile *f,
3106 Object *d,
3107 uint64_t realtime,
3108 direction_t direction,
3109 Object **ret, uint64_t *ret_offset) {
3110
3111 assert(f);
3112 assert(d);
3113 assert(d->object.type == OBJECT_DATA);
3114
3115 return generic_array_bisect_plus_one(
3116 f,
3117 le64toh(d->data.entry_offset),
3118 le64toh(d->data.entry_array_offset),
3119 le64toh(d->data.n_entries),
3120 realtime,
3121 test_object_realtime,
3122 direction,
3123 ret, ret_offset, NULL);
3124 }
3125
3126 void journal_file_dump(JournalFile *f) {
3127 Object *o;
3128 int r;
3129 uint64_t p;
3130
3131 assert(f);
3132 assert(f->header);
3133
3134 journal_file_print_header(f);
3135
3136 p = le64toh(READ_NOW(f->header->header_size));
3137 while (p != 0) {
3138 const char *s;
3139
3140 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
3141 if (r < 0)
3142 goto fail;
3143
3144 s = journal_object_type_to_string(o->object.type);
3145
3146 switch (o->object.type) {
3147
3148 case OBJECT_ENTRY:
3149 assert(s);
3150
3151 printf("Type: %s seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3152 s,
3153 le64toh(o->entry.seqnum),
3154 le64toh(o->entry.monotonic),
3155 le64toh(o->entry.realtime));
3156 break;
3157
3158 case OBJECT_TAG:
3159 assert(s);
3160
3161 printf("Type: %s seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3162 s,
3163 le64toh(o->tag.seqnum),
3164 le64toh(o->tag.epoch));
3165 break;
3166
3167 default:
3168 if (s)
3169 printf("Type: %s \n", s);
3170 else
3171 printf("Type: unknown (%i)", o->object.type);
3172
3173 break;
3174 }
3175
3176 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3177 printf("Flags: %s\n",
3178 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3179
3180 if (p == le64toh(f->header->tail_object_offset))
3181 p = 0;
3182 else
3183 p += ALIGN64(le64toh(o->object.size));
3184 }
3185
3186 return;
3187 fail:
3188 log_error("File corrupt");
3189 }
3190
/* Format a realtime timestamp for display, falling back to " --- " when FORMAT_TIMESTAMP()
 * yields NULL (i.e. the timestamp cannot be formatted).
 * Note: the lifetime of the compound literal is the immediately surrounding block. */
#define FORMAT_TIMESTAMP_SAFE(t) (FORMAT_TIMESTAMP(t) ?: " --- ")
3193
3194 void journal_file_print_header(JournalFile *f) {
3195 struct stat st;
3196
3197 assert(f);
3198 assert(f->header);
3199
3200 printf("File path: %s\n"
3201 "File ID: %s\n"
3202 "Machine ID: %s\n"
3203 "Boot ID: %s\n"
3204 "Sequential number ID: %s\n"
3205 "State: %s\n"
3206 "Compatible flags:%s%s\n"
3207 "Incompatible flags:%s%s%s%s%s\n"
3208 "Header size: %"PRIu64"\n"
3209 "Arena size: %"PRIu64"\n"
3210 "Data hash table size: %"PRIu64"\n"
3211 "Field hash table size: %"PRIu64"\n"
3212 "Rotate suggested: %s\n"
3213 "Head sequential number: %"PRIu64" (%"PRIx64")\n"
3214 "Tail sequential number: %"PRIu64" (%"PRIx64")\n"
3215 "Head realtime timestamp: %s (%"PRIx64")\n"
3216 "Tail realtime timestamp: %s (%"PRIx64")\n"
3217 "Tail monotonic timestamp: %s (%"PRIx64")\n"
3218 "Objects: %"PRIu64"\n"
3219 "Entry objects: %"PRIu64"\n",
3220 f->path,
3221 SD_ID128_TO_STRING(f->header->file_id),
3222 SD_ID128_TO_STRING(f->header->machine_id),
3223 SD_ID128_TO_STRING(f->header->boot_id),
3224 SD_ID128_TO_STRING(f->header->seqnum_id),
3225 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3226 f->header->state == STATE_ONLINE ? "ONLINE" :
3227 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3228 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3229 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3230 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3231 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3232 JOURNAL_HEADER_COMPRESSED_ZSTD(f->header) ? " COMPRESSED-ZSTD" : "",
3233 JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "",
3234 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3235 le64toh(f->header->header_size),
3236 le64toh(f->header->arena_size),
3237 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3238 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3239 yes_no(journal_file_rotate_suggested(f, 0, LOG_DEBUG)),
3240 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3241 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3242 FORMAT_TIMESTAMP_SAFE(le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3243 FORMAT_TIMESTAMP_SAFE(le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3244 FORMAT_TIMESPAN(le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3245 le64toh(f->header->n_objects),
3246 le64toh(f->header->n_entries));
3247
3248 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3249 printf("Data objects: %"PRIu64"\n"
3250 "Data hash table fill: %.1f%%\n",
3251 le64toh(f->header->n_data),
3252 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3253
3254 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3255 printf("Field objects: %"PRIu64"\n"
3256 "Field hash table fill: %.1f%%\n",
3257 le64toh(f->header->n_fields),
3258 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3259
3260 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3261 printf("Tag objects: %"PRIu64"\n",
3262 le64toh(f->header->n_tags));
3263 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3264 printf("Entry array objects: %"PRIu64"\n",
3265 le64toh(f->header->n_entry_arrays));
3266
3267 if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth))
3268 printf("Deepest field hash chain: %" PRIu64"\n",
3269 f->header->field_hash_chain_depth);
3270
3271 if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth))
3272 printf("Deepest data hash chain: %" PRIu64"\n",
3273 f->header->data_hash_chain_depth);
3274
3275 if (fstat(f->fd, &st) >= 0)
3276 printf("Disk usage: %s\n", FORMAT_BYTES((uint64_t) st.st_blocks * 512ULL));
3277 }
3278
3279 static int journal_file_warn_btrfs(JournalFile *f) {
3280 unsigned attrs;
3281 int r;
3282
3283 assert(f);
3284
3285 /* Before we write anything, check if the COW logic is turned
3286 * off on btrfs. Given our write pattern that is quite
3287 * unfriendly to COW file systems this should greatly improve
3288 * performance on COW file systems, such as btrfs, at the
3289 * expense of data integrity features (which shouldn't be too
3290 * bad, given that we do our own checksumming). */
3291
3292 r = fd_is_fs_type(f->fd, BTRFS_SUPER_MAGIC);
3293 if (r < 0)
3294 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3295 if (!r)
3296 return 0;
3297
3298 r = read_attr_fd(f->fd, &attrs);
3299 if (r < 0)
3300 return log_warning_errno(r, "Failed to read file attributes: %m");
3301
3302 if (attrs & FS_NOCOW_FL) {
3303 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3304 return 0;
3305 }
3306
3307 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3308 "This is likely to slow down journal access substantially, please consider turning "
3309 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3310
3311 return 1;
3312 }
3313
/* Open (or create, if O_CREAT is in 'flags') a journal file and map its header.
 *
 * Either 'fd' (>= 0) or 'fname' must be given; if both are given, 'fd' is used and 'fname' only
 * provides the path. 'compress'/'compress_threshold_bytes'/'seal' configure writing; 'metrics'
 * (optional) supplies size limits; 'template' (optional) donates metrics and the post-change
 * timer of an existing file. On success returns 0 and stores the new JournalFile in *ret, taking
 * ownership of a passed-in fd; on failure returns a negative errno and closes everything again
 * via the fail: path. */
int journal_file_open(
                int fd,
                const char *fname,
                int flags,
                mode_t mode,
                bool compress,
                uint64_t compress_threshold_bytes,
                bool seal,
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                JournalFile *template,
                JournalFile **ret) {

        bool newly_created = false;
        JournalFile *f;
        void *h;
        int r;

        assert(ret);
        assert(fd >= 0 || fname);
        assert(mmap_cache);

        /* Validate the open flags: read-only or read-write, no read-only creation, and new
         * files must carry the .journal suffix. */
        if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
                return -EINVAL;

        if ((flags & O_ACCMODE) == O_RDONLY && FLAGS_SET(flags, O_CREAT))
                return -EINVAL;

        if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
                return -EINVAL;

        f = new(JournalFile, 1);
        if (!f)
                return -ENOMEM;

        *f = (JournalFile) {
                .fd = fd,
                .mode = mode,

                .flags = flags,
                .writable = (flags & O_ACCMODE) != O_RDONLY,

/* Pick the best compression algorithm compiled in; only one is used per file. */
#if HAVE_ZSTD
                .compress_zstd = compress,
#elif HAVE_LZ4
                .compress_lz4 = compress,
#elif HAVE_XZ
                .compress_xz = compress,
#endif
                .compress_threshold_bytes = compress_threshold_bytes == UINT64_MAX ?
                                            DEFAULT_COMPRESS_THRESHOLD :
                                            MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
#if HAVE_GCRYPT
                .seal = seal,
#endif
        };

        /* We turn on keyed hashes by default, but provide an environment variable to turn them off, if
         * people really want that */
        r = getenv_bool("SYSTEMD_JOURNAL_KEYED_HASH");
        if (r < 0) {
                if (r != -ENXIO)
                        log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_KEYED_HASH environment variable, ignoring: %m");
                f->keyed_hash = true;
        } else
                f->keyed_hash = r;

        if (DEBUG_LOGGING) {
                /* Log the effective settings, but only when they change, to avoid log spam. */
                static int last_seal = -1, last_compress = -1, last_keyed_hash = -1;
                static uint64_t last_bytes = UINT64_MAX;

                if (last_seal != f->seal ||
                    last_keyed_hash != f->keyed_hash ||
                    last_compress != JOURNAL_FILE_COMPRESS(f) ||
                    last_bytes != f->compress_threshold_bytes) {

                        log_debug("Journal effective settings seal=%s keyed_hash=%s compress=%s compress_threshold_bytes=%s",
                                  yes_no(f->seal), yes_no(f->keyed_hash), yes_no(JOURNAL_FILE_COMPRESS(f)),
                                  FORMAT_BYTES(f->compress_threshold_bytes));
                        last_seal = f->seal;
                        last_keyed_hash = f->keyed_hash;
                        last_compress = JOURNAL_FILE_COMPRESS(f);
                        last_bytes = f->compress_threshold_bytes;
                }
        }

        if (fname) {
                f->path = strdup(fname);
                if (!f->path) {
                        r = -ENOMEM;
                        goto fail;
                }
        } else {
                assert(fd >= 0);

                /* If we don't know the path, fill in something explanatory and vaguely useful */
                if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
                        r = -ENOMEM;
                        goto fail;
                }
        }

        f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
        if (!f->chain_cache) {
                r = -ENOMEM;
                goto fail;
        }

        if (f->fd < 0) {
                /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
                 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
                 * it doesn't hurt in that case. */

                f->fd = openat_report_new(AT_FDCWD, f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode, &newly_created);
                if (f->fd < 0) {
                        r = f->fd;
                        goto fail;
                }

                /* fds we opened here by us should also be closed by us. */
                f->close_fd = true;

                r = fd_nonblock(f->fd, false);
                if (r < 0)
                        goto fail;

                if (!newly_created) {
                        r = journal_file_fstat(f);
                        if (r < 0)
                                goto fail;
                }
        } else {
                r = journal_file_fstat(f);
                if (r < 0)
                        goto fail;

                /* If we just got the fd passed in, we don't really know if we created the file anew */
                newly_created = f->last_stat.st_size == 0 && f->writable;
        }

        f->cache_fd = mmap_cache_add_fd(mmap_cache, f->fd, prot_from_flags(flags));
        if (!f->cache_fd) {
                r = -ENOMEM;
                goto fail;
        }

        if (newly_created) {
                (void) journal_file_warn_btrfs(f);

                /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
                 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
                 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
                 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
                 * solely on mtime/atime/ctime of the file. */
                (void) fd_setcrtime(f->fd, 0);

#if HAVE_GCRYPT
                /* Try to load the FSPRG state, and if we can't, then
                 * just don't do sealing */
                if (f->seal) {
                        r = journal_file_fss_load(f);
                        if (r < 0)
                                f->seal = false;
                }
#endif

                r = journal_file_init_header(f, template);
                if (r < 0)
                        goto fail;

                /* Refresh the stat data: the header write changed the file size. */
                r = journal_file_fstat(f);
                if (r < 0)
                        goto fail;
        }

        if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
                r = -ENODATA;
                goto fail;
        }

        /* Map the header portion of the file. */
        r = mmap_cache_fd_get(f->cache_fd, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
        if (r == -EINVAL) {
                /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
                 * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error
                 * code. */
                r = -EAFNOSUPPORT;
                goto fail;
        }
        if (r < 0)
                goto fail;

        f->header = h;

        if (!newly_created) {
                r = journal_file_verify_header(f);
                if (r < 0)
                        goto fail;
        }

#if HAVE_GCRYPT
        if (!newly_created && f->writable) {
                r = journal_file_fss_load(f);
                if (r < 0)
                        goto fail;
        }
#endif

        if (f->writable) {
                if (metrics) {
                        journal_default_metrics(metrics, f->fd);
                        f->metrics = *metrics;
                } else if (template)
                        f->metrics = template->metrics;

                r = journal_file_refresh_header(f);
                if (r < 0)
                        goto fail;
        }

#if HAVE_GCRYPT
        r = journal_file_hmac_setup(f);
        if (r < 0)
                goto fail;
#endif

        if (newly_created) {
                /* Set up the empty hash tables (and, when sealing, the first tag) for a fresh file. */
                r = journal_file_setup_field_hash_table(f);
                if (r < 0)
                        goto fail;

                r = journal_file_setup_data_hash_table(f);
                if (r < 0)
                        goto fail;

#if HAVE_GCRYPT
                r = journal_file_append_first_tag(f);
                if (r < 0)
                        goto fail;
#endif
        }

        if (mmap_cache_fd_got_sigbus(f->cache_fd)) {
                /* The file was truncated under us while we were mapping it. */
                r = -EIO;
                goto fail;
        }

        if (template && template->post_change_timer) {
                r = journal_file_enable_post_change_timer(
                                f,
                                sd_event_source_get_event(template->post_change_timer),
                                template->post_change_timer_period);

                if (r < 0)
                        goto fail;
        }

        /* The file is opened now successfully, thus we take possession of any passed in fd. */
        f->close_fd = true;

        *ret = f;
        return 0;

fail:
        if (f->cache_fd && mmap_cache_fd_got_sigbus(f->cache_fd))
                r = -EIO;

        (void) journal_file_close(f);

        return r;
}
3584
3585 int journal_file_archive(JournalFile *f, char **ret_previous_path) {
3586 _cleanup_free_ char *p = NULL;
3587
3588 assert(f);
3589
3590 if (!f->writable)
3591 return -EINVAL;
3592
3593 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
3594 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
3595 if (path_startswith(f->path, "/proc/self/fd"))
3596 return -EINVAL;
3597
3598 if (!endswith(f->path, ".journal"))
3599 return -EINVAL;
3600
3601 if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3602 (int) strlen(f->path) - 8, f->path,
3603 SD_ID128_FORMAT_VAL(f->header->seqnum_id),
3604 le64toh(f->header->head_entry_seqnum),
3605 le64toh(f->header->head_entry_realtime)) < 0)
3606 return -ENOMEM;
3607
3608 /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
3609 * ignore that case. */
3610 if (rename(f->path, p) < 0 && errno != ENOENT)
3611 return -errno;
3612
3613 /* Sync the rename to disk */
3614 (void) fsync_directory_of_file(f->fd);
3615
3616 if (ret_previous_path)
3617 *ret_previous_path = f->path;
3618 else
3619 free(f->path);
3620
3621 f->path = TAKE_PTR(p);
3622
3623 /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
3624 * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
3625 * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue
3626 * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
3627 * occurs. */
3628 f->archive = true;
3629
3630 return 0;
3631 }
3632
3633 int journal_file_dispose(int dir_fd, const char *fname) {
3634 _cleanup_free_ char *p = NULL;
3635
3636 assert(fname);
3637
3638 /* Renames a journal file to *.journal~, i.e. to mark it as corrupted or otherwise uncleanly shutdown. Note that
3639 * this is done without looking into the file or changing any of its contents. The idea is that this is called
3640 * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
3641 * for writing anymore. */
3642
3643 if (!endswith(fname, ".journal"))
3644 return -EINVAL;
3645
3646 if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
3647 (int) strlen(fname) - 8, fname,
3648 now(CLOCK_REALTIME),
3649 random_u64()) < 0)
3650 return -ENOMEM;
3651
3652 if (renameat(dir_fd, fname, dir_fd, p) < 0)
3653 return -errno;
3654
3655 return 0;
3656 }
3657
3658 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
3659 uint64_t q, n, xor_hash = 0;
3660 const sd_id128_t *boot_id;
3661 dual_timestamp ts;
3662 EntryItem *items;
3663 int r;
3664
3665 assert(from);
3666 assert(to);
3667 assert(o);
3668 assert(p);
3669
3670 if (!to->writable)
3671 return -EPERM;
3672
3673 ts = (dual_timestamp) {
3674 .monotonic = le64toh(o->entry.monotonic),
3675 .realtime = le64toh(o->entry.realtime),
3676 };
3677 boot_id = &o->entry.boot_id;
3678
3679 n = journal_file_entry_n_items(o);
3680 items = newa(EntryItem, n);
3681
3682 for (uint64_t i = 0; i < n; i++) {
3683 uint64_t l, h;
3684 size_t t;
3685 void *data;
3686 Object *u;
3687
3688 q = le64toh(o->entry.items[i].object_offset);
3689
3690 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3691 if (r < 0)
3692 return r;
3693
3694 l = le64toh(READ_NOW(o->object.size));
3695 if (l < offsetof(Object, data.payload))
3696 return -EBADMSG;
3697
3698 l -= offsetof(Object, data.payload);
3699 t = (size_t) l;
3700
3701 /* We hit the limit on 32bit machines */
3702 if ((uint64_t) t != l)
3703 return -E2BIG;
3704
3705 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3706 #if HAVE_COMPRESSION
3707 size_t rsize = 0;
3708
3709 r = decompress_blob(
3710 o->object.flags & OBJECT_COMPRESSION_MASK,
3711 o->data.payload, l,
3712 &from->compress_buffer, &rsize,
3713 0);
3714 if (r < 0)
3715 return r;
3716
3717 data = from->compress_buffer;
3718 l = rsize;
3719 #else
3720 return -EPROTONOSUPPORT;
3721 #endif
3722 } else
3723 data = o->data.payload;
3724
3725 if (l == 0)
3726 return -EBADMSG;
3727
3728 r = journal_file_append_data(to, data, l, &u, &h);
3729 if (r < 0)
3730 return r;
3731
3732 if (JOURNAL_HEADER_KEYED_HASH(to->header))
3733 xor_hash ^= jenkins_hash64(data, l);
3734 else
3735 xor_hash ^= le64toh(u->data.hash);
3736
3737 items[i] = (EntryItem) {
3738 .object_offset = htole64(h),
3739 .hash = u->data.hash,
3740 };
3741
3742 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3743 if (r < 0)
3744 return r;
3745 }
3746
3747 r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n, NULL, NULL, NULL);
3748
3749 if (mmap_cache_fd_got_sigbus(to->cache_fd))
3750 return -EIO;
3751
3752 return r;
3753 }
3754
/* Resets all metrics in *m to UINT64_MAX, the sentinel meaning "not explicitly configured";
 * journal_default_metrics() later resolves these placeholders to concrete values. */
void journal_reset_metrics(JournalMetrics *m) {
        assert(m);

        /* Set everything to "pick automatic values". */

        *m = (JournalMetrics) {
                .min_use = UINT64_MAX,
                .max_use = UINT64_MAX,
                .min_size = UINT64_MAX,
                .max_size = UINT64_MAX,
                .keep_free = UINT64_MAX,
                .n_max_files = UINT64_MAX,
        };
}
3769
/* Resolves any UINT64_MAX ("pick automatically") fields in *m to concrete values scaled to the size
 * of the file system backing fd, and clamps explicitly configured values into sane ranges. Note the
 * ordering matters: max_use is fixed first, min_use/max_size/min_size are derived from it. */
void journal_default_metrics(JournalMetrics *m, int fd) {
        struct statvfs ss;
        uint64_t fs_size = 0;

        assert(m);
        assert(fd >= 0);

        /* Determine the total size of the backing file system; on failure the fixed fallback
         * constants below are used instead (fs_size stays 0). */
        if (fstatvfs(fd, &ss) >= 0)
                fs_size = ss.f_frsize * ss.f_blocks;
        else
                log_debug_errno(errno, "Failed to determine disk size: %m");

        if (m->max_use == UINT64_MAX) {

                if (fs_size > 0)
                        m->max_use = CLAMP(PAGE_ALIGN(fs_size / 10), /* 10% of file system size */
                                           MAX_USE_LOWER, MAX_USE_UPPER);
                else
                        m->max_use = MAX_USE_LOWER;
        } else {
                m->max_use = PAGE_ALIGN(m->max_use);

                /* Ensure room for at least two minimum-sized journal files; 0 means "no limit". */
                if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
                        m->max_use = JOURNAL_FILE_SIZE_MIN*2;
        }

        if (m->min_use == UINT64_MAX) {
                if (fs_size > 0)
                        m->min_use = CLAMP(PAGE_ALIGN(fs_size / 50), /* 2% of file system size */
                                           MIN_USE_LOW, MIN_USE_HIGH);
                else
                        m->min_use = MIN_USE_LOW;
        }

        /* min_use must never exceed the total budget. */
        if (m->min_use > m->max_use)
                m->min_use = m->max_use;

        /* Per-file size cap: by default 1/8 of the budget, so rotation keeps several files around. */
        if (m->max_size == UINT64_MAX)
                m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */
                                  MAX_SIZE_UPPER);
        else
                m->max_size = PAGE_ALIGN(m->max_size);

        if (m->max_size != 0) {
                if (m->max_size < JOURNAL_FILE_SIZE_MIN)
                        m->max_size = JOURNAL_FILE_SIZE_MIN;

                /* Keep the invariant max_use >= 2 * max_size (only when max_use is limited). */
                if (m->max_use != 0 && m->max_size*2 > m->max_use)
                        m->max_use = m->max_size*2;
        }

        if (m->min_size == UINT64_MAX)
                m->min_size = JOURNAL_FILE_SIZE_MIN;
        else
                /* max_size == 0 means "unlimited", hence clamp min_size only from below then. */
                m->min_size = CLAMP(PAGE_ALIGN(m->min_size),
                                    JOURNAL_FILE_SIZE_MIN,
                                    m->max_size ?: UINT64_MAX);

        if (m->keep_free == UINT64_MAX) {
                if (fs_size > 0)
                        m->keep_free = MIN(PAGE_ALIGN(fs_size / 20), /* 5% of file system size */
                                           KEEP_FREE_UPPER);
                else
                        m->keep_free = DEFAULT_KEEP_FREE;
        }

        if (m->n_max_files == UINT64_MAX)
                m->n_max_files = DEFAULT_N_MAX_FILES;

        log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
                  FORMAT_BYTES(m->min_use),
                  FORMAT_BYTES(m->max_use),
                  FORMAT_BYTES(m->max_size),
                  FORMAT_BYTES(m->min_size),
                  FORMAT_BYTES(m->keep_free),
                  m->n_max_files);
}
3847
3848 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3849 assert(f);
3850 assert(f->header);
3851 assert(from || to);
3852
3853 if (from) {
3854 if (f->header->head_entry_realtime == 0)
3855 return -ENOENT;
3856
3857 *from = le64toh(f->header->head_entry_realtime);
3858 }
3859
3860 if (to) {
3861 if (f->header->tail_entry_realtime == 0)
3862 return -ENOENT;
3863
3864 *to = le64toh(f->header->tail_entry_realtime);
3865 }
3866
3867 return 1;
3868 }
3869
3870 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3871 Object *o;
3872 uint64_t p;
3873 int r;
3874
3875 assert(f);
3876 assert(from || to);
3877
3878 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3879 if (r <= 0)
3880 return r;
3881
3882 if (le64toh(o->data.n_entries) <= 0)
3883 return 0;
3884
3885 if (from) {
3886 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3887 if (r < 0)
3888 return r;
3889
3890 *from = le64toh(o->entry.monotonic);
3891 }
3892
3893 if (to) {
3894 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3895 if (r < 0)
3896 return r;
3897
3898 r = generic_array_get_plus_one(f,
3899 le64toh(o->data.entry_offset),
3900 le64toh(o->data.entry_array_offset),
3901 le64toh(o->data.n_entries) - 1,
3902 DIRECTION_UP,
3903 &o, NULL);
3904 if (r <= 0)
3905 return r;
3906
3907 *to = le64toh(o->entry.monotonic);
3908 }
3909
3910 return 1;
3911 }
3912
3913 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec, int log_level) {
3914 assert(f);
3915 assert(f->header);
3916
3917 /* If we gained new header fields we gained new features,
3918 * hence suggest a rotation */
3919 if (le64toh(f->header->header_size) < sizeof(Header)) {
3920 log_full(log_level, "%s uses an outdated header, suggesting rotation.", f->path);
3921 return true;
3922 }
3923
3924 /* Let's check if the hash tables grew over a certain fill level (75%, borrowing this value from
3925 * Java's hash table implementation), and if so suggest a rotation. To calculate the fill level we
3926 * need the n_data field, which only exists in newer versions. */
3927
3928 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3929 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3930 log_full(log_level,
3931 "Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3932 f->path,
3933 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3934 le64toh(f->header->n_data),
3935 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3936 (unsigned long long) f->last_stat.st_size,
3937 f->last_stat.st_size / le64toh(f->header->n_data));
3938 return true;
3939 }
3940
3941 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3942 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3943 log_full(log_level,
3944 "Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3945 f->path,
3946 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3947 le64toh(f->header->n_fields),
3948 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3949 return true;
3950 }
3951
3952 /* If there are too many hash collisions somebody is most likely playing games with us. Hence, if our
3953 * longest chain is longer than some threshold, let's suggest rotation. */
3954 if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) &&
3955 le64toh(f->header->data_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
3956 log_full(log_level,
3957 "Data hash table of %s has deepest hash chain of length %" PRIu64 ", suggesting rotation.",
3958 f->path, le64toh(f->header->data_hash_chain_depth));
3959 return true;
3960 }
3961
3962 if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) &&
3963 le64toh(f->header->field_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
3964 log_full(log_level,
3965 "Field hash table of %s has deepest hash chain of length at %" PRIu64 ", suggesting rotation.",
3966 f->path, le64toh(f->header->field_hash_chain_depth));
3967 return true;
3968 }
3969
3970 /* Are the data objects properly indexed by field objects? */
3971 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3972 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3973 le64toh(f->header->n_data) > 0 &&
3974 le64toh(f->header->n_fields) == 0) {
3975 log_full(log_level,
3976 "Data objects of %s are not indexed by field objects, suggesting rotation.",
3977 f->path);
3978 return true;
3979 }
3980
3981 if (max_file_usec > 0) {
3982 usec_t t, h;
3983
3984 h = le64toh(f->header->head_entry_realtime);
3985 t = now(CLOCK_REALTIME);
3986
3987 if (h > 0 && t > h + max_file_usec) {
3988 log_full(log_level,
3989 "Oldest entry in %s is older than the configured file retention duration (%s), suggesting rotation.",
3990 f->path, FORMAT_TIMESPAN(max_file_usec, USEC_PER_SEC));
3991 return true;
3992 }
3993 }
3994
3995 return false;
3996 }
3997
/* Human-readable names for the ObjectType enum, used by the generated
 * journal_object_type_to_string() helper (e.g. for log/debug output). */
static const char * const journal_object_type_table[] = {
        [OBJECT_UNUSED] = "unused",
        [OBJECT_DATA] = "data",
        [OBJECT_FIELD] = "field",
        [OBJECT_ENTRY] = "entry",
        [OBJECT_DATA_HASH_TABLE] = "data hash table",
        [OBJECT_FIELD_HASH_TABLE] = "field hash table",
        [OBJECT_ENTRY_ARRAY] = "entry array",
        [OBJECT_TAG] = "tag",
};

DEFINE_STRING_TABLE_LOOKUP_TO_STRING(journal_object_type, ObjectType);