]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/libsystemd/sd-journal/journal-file.c
bc80a51524d8f0d4a769002094a86c345e866349
[thirdparty/systemd.git] / src / libsystemd / sd-journal / journal-file.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/fs.h>
6 #include <linux/magic.h>
7 #include <pthread.h>
8 #include <stddef.h>
9 #include <sys/mman.h>
10 #include <sys/statvfs.h>
11 #include <sys/uio.h>
12 #include <unistd.h>
13
14 #include "sd-event.h"
15
16 #include "alloc-util.h"
17 #include "chattr-util.h"
18 #include "compress.h"
19 #include "env-util.h"
20 #include "fd-util.h"
21 #include "format-util.h"
22 #include "fs-util.h"
23 #include "journal-authenticate.h"
24 #include "journal-def.h"
25 #include "journal-file.h"
26 #include "lookup3.h"
27 #include "memory-util.h"
28 #include "path-util.h"
29 #include "random-util.h"
30 #include "set.h"
31 #include "sort-util.h"
32 #include "stat-util.h"
33 #include "string-table.h"
34 #include "string-util.h"
35 #include "strv.h"
36 #include "sync-util.h"
37 #include "xattr-util.h"
38
39 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
41
42 #define DEFAULT_COMPRESS_THRESHOLD (512ULL)
43 #define MIN_COMPRESS_THRESHOLD (8ULL)
44
45 /* This is the minimum journal file size */
46 #define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL) /* 512 KiB */
47
48 /* These are the lower and upper bounds if we deduce the max_use value
49 * from the file system size */
50 #define MAX_USE_LOWER (1 * 1024 * 1024ULL) /* 1 MiB */
51 #define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
52
53 /* Those are the lower and upper bounds for the minimal use limit,
54 * i.e. how much we'll use even if keep_free suggests otherwise. */
55 #define MIN_USE_LOW (1 * 1024 * 1024ULL) /* 1 MiB */
56 #define MIN_USE_HIGH (16 * 1024 * 1024ULL) /* 16 MiB */
57
58 /* This is the upper bound if we deduce max_size from max_use */
59 #define MAX_SIZE_UPPER (128 * 1024 * 1024ULL) /* 128 MiB */
60
61 /* This is the upper bound if we deduce the keep_free value from the
62 * file system size */
63 #define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
64
65 /* This is the keep_free value when we can't determine the system
66 * size */
67 #define DEFAULT_KEEP_FREE (1024 * 1024ULL) /* 1 MB */
68
69 /* This is the default maximum number of journal files to keep around. */
70 #define DEFAULT_N_MAX_FILES 100
71
72 /* n_data was the first entry we added after the initial file format design */
73 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
74
75 /* How many entries to keep in the entry array chain cache at max */
76 #define CHAIN_CACHE_MAX 20
77
78 /* How much to increase the journal file size at once each time we allocate something new. */
79 #define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL) /* 8MB */
80
81 /* Reread fstat() of the file for detecting deletions at least this often */
82 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
83
84 /* The mmap context to use for the header we pick as one above the last defined typed */
85 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
86
87 /* Longest hash chain to rotate after */
88 #define HASH_CHAIN_DEPTH_MAX 100
89
90 #ifdef __clang__
91 # pragma GCC diagnostic ignored "-Waddress-of-packed-member"
92 #endif
93
94 int journal_file_tail_end(JournalFile *f, uint64_t *ret_offset) {
95 Object tail;
96 uint64_t p;
97 int r;
98
99 assert(f);
100 assert(f->header);
101 assert(ret_offset);
102
103 p = le64toh(f->header->tail_object_offset);
104 if (p == 0)
105 p = le64toh(f->header->header_size);
106 else {
107 uint64_t sz;
108
109 r = journal_file_read_object(f, OBJECT_UNUSED, p, &tail);
110 if (r < 0)
111 return r;
112
113 sz = le64toh(tail.object.size);
114 if (sz > UINT64_MAX - sizeof(uint64_t) + 1)
115 return -EBADMSG;
116
117 sz = ALIGN64(sz);
118 if (p > UINT64_MAX - sz)
119 return -EBADMSG;
120
121 p += sz;
122 }
123
124 *ret_offset = p;
125
126 return 0;
127 }
128
129 int journal_file_set_offline_thread_join(JournalFile *f) {
130 int r;
131
132 assert(f);
133
134 if (f->offline_state == OFFLINE_JOINED)
135 return 0;
136
137 r = pthread_join(f->offline_thread, NULL);
138 if (r)
139 return -r;
140
141 f->offline_state = OFFLINE_JOINED;
142
143 if (mmap_cache_fd_got_sigbus(f->cache_fd))
144 return -EIO;
145
146 return 0;
147 }
148
/* Transitions the journal file into STATE_ONLINE so that we may append to it.
 *
 * First synchronizes with any concurrent offlining thread: depending on how far the
 * offlining has progressed we either cancel it via a compare-and-swap on
 * f->offline_state, or join the thread and let it finish. Returns -EPERM if the file
 * is not writable, -EINVAL if fd/header are missing or the header state is
 * unexpected, -EIO if the mapping took a SIGBUS. */
static int journal_file_set_online(JournalFile *f) {
        bool wait = true;

        assert(f);

        if (!f->writable)
                return -EPERM;

        if (f->fd < 0 || !f->header)
                return -EINVAL;

        while (wait) {
                switch (f->offline_state) {
                case OFFLINE_JOINED:
                        /* No offline thread, no need to wait. */
                        wait = false;
                        break;

                case OFFLINE_SYNCING:
                        /* The CAS fails if the offline thread advanced the state concurrently;
                         * loop and re-examine the new state in that case. */
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
                                continue;
                        /* Canceled syncing prior to offlining, no need to wait. */
                        wait = false;
                        break;

                case OFFLINE_AGAIN_FROM_SYNCING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
                                continue;
                        /* Canceled restart from syncing, no need to wait. */
                        wait = false;
                        break;

                case OFFLINE_AGAIN_FROM_OFFLINING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
                                continue;
                        /* Canceled restart from offlining, must wait for offlining to complete however. */
                        _fallthrough_;
                default: {
                        int r;

                        /* Offlining is already past the point of no return: wait for it. */
                        r = journal_file_set_offline_thread_join(f);
                        if (r < 0)
                                return r;

                        wait = false;
                        break;
                }
                }
        }

        if (mmap_cache_fd_got_sigbus(f->cache_fd))
                return -EIO;

        switch (f->header->state) {
        case STATE_ONLINE:
                return 0;

        case STATE_OFFLINE:
                /* Mark online and flush the state change to disk right away, so that an
                 * unclean shutdown is detectable later. */
                f->header->state = STATE_ONLINE;
                (void) fsync(f->fd);
                return 0;

        default:
                return -EINVAL;
        }
}
215
/* Frees a JournalFile and all resources hanging off it. Safe to call with NULL.
 * Always returns NULL so callers can write `f = journal_file_close(f);`. */
JournalFile* journal_file_close(JournalFile *f) {
        if (!f)
                return NULL;

        /* Release the mmap cache window handle before closing the fd below. */
        if (f->mmap && f->cache_fd)
                mmap_cache_fd_free(f->cache_fd);

        if (f->fd >= 0 && f->defrag_on_close) {

                /* Be friendly to btrfs: turn COW back on again now,
                 * and defragment the file. We won't write to the file
                 * ever again, hence remove all fragmentation, and
                 * reenable all the good bits COW usually provides
                 * (such as data checksumming). */

                (void) chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL);
                (void) btrfs_defrag_fd(f->fd);
        }

        /* Only close the fd if we own it; it may have been passed in by the caller. */
        if (f->close_fd)
                safe_close(f->fd);
        free(f->path);

        mmap_cache_unref(f->mmap);

        ordered_hashmap_free_free(f->chain_cache);

#if HAVE_COMPRESSION
        free(f->compress_buffer);
#endif

#if HAVE_GCRYPT
        /* fss_file is an mmap()ed region; fsprg_state is only heap-allocated when
         * fss_file is not mapped, hence the either/or below. */
        if (f->fss_file)
                munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
        else
                free(f->fsprg_state);

        free(f->fsprg_seed);

        if (f->hmac)
                gcry_md_close(f->hmac);
#endif

        return mfree(f);
}
261
262 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
263 Header h = {};
264 ssize_t k;
265 int r;
266
267 assert(f);
268
269 memcpy(h.signature, HEADER_SIGNATURE, 8);
270 h.header_size = htole64(ALIGN64(sizeof(h)));
271
272 h.incompatible_flags |= htole32(
273 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
274 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4 |
275 f->compress_zstd * HEADER_INCOMPATIBLE_COMPRESSED_ZSTD |
276 f->keyed_hash * HEADER_INCOMPATIBLE_KEYED_HASH);
277
278 h.compatible_flags = htole32(
279 f->seal * HEADER_COMPATIBLE_SEALED);
280
281 r = sd_id128_randomize(&h.file_id);
282 if (r < 0)
283 return r;
284
285 if (template) {
286 h.seqnum_id = template->header->seqnum_id;
287 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
288 } else
289 h.seqnum_id = h.file_id;
290
291 k = pwrite(f->fd, &h, sizeof(h), 0);
292 if (k < 0)
293 return -errno;
294
295 if (k != sizeof(h))
296 return -EIO;
297
298 return 0;
299 }
300
301 static int journal_file_refresh_header(JournalFile *f) {
302 int r;
303
304 assert(f);
305 assert(f->header);
306
307 r = sd_id128_get_machine(&f->header->machine_id);
308 if (IN_SET(r, -ENOENT, -ENOMEDIUM))
309 /* We don't have a machine-id, let's continue without */
310 zero(f->header->machine_id);
311 else if (r < 0)
312 return r;
313
314 r = sd_id128_get_boot(&f->header->boot_id);
315 if (r < 0)
316 return r;
317
318 r = journal_file_set_online(f);
319
320 /* Sync the online state to disk; likely just created a new file, also sync the directory this file
321 * is located in. */
322 (void) fsync_full(f->fd);
323
324 return r;
325 }
326
/* Checks the header's compatible (or incompatible) flags against what this build
 * supports. Returns true if any unsupported flag is set (i.e. the caller should
 * refuse the file), logging a human-readable list of the known-but-disabled
 * features; returns false if all set flags are supported. */
static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
        const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
                supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
        const char *type = compatible ? "compatible" : "incompatible";
        uint32_t flags;

        flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);

        if (flags & ~supported) {
                /* Flags outside "any" are completely unknown to this version. */
                if (flags & ~any)
                        log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
                                  f->path, type, flags & ~any);
                /* Flags we know about but that were disabled at compile time. */
                flags = (flags & any) & ~supported;
                if (flags) {
                        const char* strv[5];
                        size_t n = 0;
                        _cleanup_free_ char *t = NULL;

                        if (compatible) {
                                if (flags & HEADER_COMPATIBLE_SEALED)
                                        strv[n++] = "sealed";
                        } else {
                                if (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ)
                                        strv[n++] = "xz-compressed";
                                if (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4)
                                        strv[n++] = "lz4-compressed";
                                if (flags & HEADER_INCOMPATIBLE_COMPRESSED_ZSTD)
                                        strv[n++] = "zstd-compressed";
                                if (flags & HEADER_INCOMPATIBLE_KEYED_HASH)
                                        strv[n++] = "keyed-hash";
                        }
                        strv[n] = NULL;
                        assert(n < ELEMENTSOF(strv)); /* strv[] must hold all names + NULL */

                        t = strv_join((char**) strv, ", ");
                        log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
                                  f->path, type, n > 1 ? "flags" : "flag", strnull(t));
                }
                return true;
        }

        return false;
}
370
/* Validates the on-disk header after mapping it. Checks the signature, feature
 * flags, state, size fields and key offsets for internal consistency; when the file
 * is opened writable, additionally verifies it belongs to this machine and is in a
 * state we may append to. On success, caches the compression/sealing/hashing
 * feature bits in the JournalFile. Returns 0 or a negative errno-style code; the
 * specific codes (-EBADMSG, -ENODATA, -EPROTONOSUPPORT, -EHOSTDOWN, -ESHUTDOWN,
 * -EBUSY, -ETXTBSY) let callers distinguish corruption from policy refusals. */
static int journal_file_verify_header(JournalFile *f) {
        uint64_t arena_size, header_size;

        assert(f);
        assert(f->header);

        if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
                return -EBADMSG;

        /* In both read and write mode we refuse to open files with incompatible
         * flags we don't know. */
        if (warn_wrong_flags(f, false))
                return -EPROTONOSUPPORT;

        /* When open for writing we refuse to open files with compatible flags, too. */
        if (f->writable && warn_wrong_flags(f, true))
                return -EPROTONOSUPPORT;

        if (f->header->state >= _STATE_MAX)
                return -EBADMSG;

        /* READ_NOW() forces a single read of the mmap'ed field so later checks use a
         * consistent value even if the file changes underneath us. */
        header_size = le64toh(READ_NOW(f->header->header_size));

        /* The first addition was n_data, so check that we are at least this large */
        if (header_size < HEADER_SIZE_MIN)
                return -EBADMSG;

        /* Sealed files must carry the newer header fields sealing relies on. */
        if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                return -EBADMSG;

        arena_size = le64toh(READ_NOW(f->header->arena_size));

        /* header + arena must not overflow and must fit in the actual file. */
        if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
                return -ENODATA;

        if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
                return -ENODATA;

        /* All object offsets must be 64-bit aligned. */
        if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
            !VALID64(le64toh(f->header->field_hash_table_offset)) ||
            !VALID64(le64toh(f->header->tail_object_offset)) ||
            !VALID64(le64toh(f->header->entry_array_offset)))
                return -ENODATA;

        if (f->writable) {
                sd_id128_t machine_id;
                uint8_t state;
                int r;

                /* Only append to files created on this machine. */
                r = sd_id128_get_machine(&machine_id);
                if (r < 0)
                        return r;

                if (!sd_id128_equal(machine_id, f->header->machine_id))
                        return -EHOSTDOWN;

                state = f->header->state;

                if (state == STATE_ARCHIVED)
                        return -ESHUTDOWN; /* Already archived */
                else if (state == STATE_ONLINE)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
                                               "Journal file %s is already online. Assuming unclean closing.",
                                               f->path);
                else if (state != STATE_OFFLINE)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
                                               "Journal file %s has unknown state %i.",
                                               f->path, state);

                if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
                        return -EBADMSG;

                /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
                 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
                 * bisection. */
                if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
                        return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
                                               "Journal file %s is from the future, refusing to append new data to it that'd be older.",
                                               f->path);
        }

        /* Cache the feature bits so we don't have to decode header flags on every access. */
        f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
        f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
        f->compress_zstd = JOURNAL_HEADER_COMPRESSED_ZSTD(f->header);

        f->seal = JOURNAL_HEADER_SEALED(f->header);

        f->keyed_hash = JOURNAL_HEADER_KEYED_HASH(f->header);

        return 0;
}
462
463 int journal_file_fstat(JournalFile *f) {
464 int r;
465
466 assert(f);
467 assert(f->fd >= 0);
468
469 if (fstat(f->fd, &f->last_stat) < 0)
470 return -errno;
471
472 f->last_stat_usec = now(CLOCK_MONOTONIC);
473
474 /* Refuse dealing with files that aren't regular */
475 r = stat_verify_regular(&f->last_stat);
476 if (r < 0)
477 return r;
478
479 /* Refuse appending to files that are already deleted */
480 if (f->last_stat.st_nlink <= 0)
481 return -EIDRM;
482
483 return 0;
484 }
485
/* Ensures that the file has allocated (non-sparse) space covering [offset, offset+size).
 * Grows the file in FILE_SIZE_INCREASE chunks via posix_fallocate(), honoring
 * metrics.max_size and metrics.keep_free, and updates header->arena_size to match.
 * Returns 0 (after a possibly rate-limited fstat refresh), -E2BIG if limits would be
 * exceeded, or another negative errno-style code. */
static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
        uint64_t old_size, new_size, old_header_size, old_arena_size;
        int r;

        assert(f);
        assert(f->header);

        /* We assume that this file is not sparse, and we know that for sure, since we always call
         * posix_fallocate() ourselves */

        /* Reject ranges whose page-aligned end would overflow uint64_t. */
        if (size > PAGE_ALIGN_DOWN(UINT64_MAX) - offset)
                return -EINVAL;

        if (mmap_cache_fd_got_sigbus(f->cache_fd))
                return -EIO;

        old_header_size = le64toh(READ_NOW(f->header->header_size));
        old_arena_size = le64toh(READ_NOW(f->header->arena_size));
        if (old_arena_size > PAGE_ALIGN_DOWN(UINT64_MAX) - old_header_size)
                return -EBADMSG;

        old_size = old_header_size + old_arena_size;

        new_size = MAX(PAGE_ALIGN(offset + size), old_header_size);

        if (new_size <= old_size) {

                /* We already pre-allocated enough space, but before
                 * we write to it, let's check with fstat() if the
                 * file got deleted, in order make sure we don't throw
                 * away the data immediately. Don't check fstat() for
                 * all writes though, but only once ever 10s. */

                if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
                        return 0;

                return journal_file_fstat(f);
        }

        /* Allocate more space. */

        if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
                return -E2BIG;

        /* Respect the keep_free limit: don't grow beyond what the file system can
         * spare (only enforced once we exceed the guaranteed minimum size). */
        if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
                struct statvfs svfs;

                if (fstatvfs(f->fd, &svfs) >= 0) {
                        uint64_t available;

                        available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);

                        if (new_size - old_size > available)
                                return -E2BIG;
                }
        }

        /* Increase by larger blocks at once */
        new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
        if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
                new_size = f->metrics.max_size;

        /* Note that the glibc fallocate() fallback is very
           inefficient, hence we try to minimize the allocation area
           as we can. */
        r = posix_fallocate_loop(f->fd, old_size, new_size - old_size);
        if (r < 0)
                return r;

        /* Record the grown arena in the header; header size itself never changes. */
        f->header->arena_size = htole64(new_size - old_header_size);

        return journal_file_fstat(f);
}
559
560 static unsigned type_to_context(ObjectType type) {
561 /* One context for each type, plus one catch-all for the rest */
562 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
563 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
564 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
565 }
566
567 static int journal_file_move_to(
568 JournalFile *f,
569 ObjectType type,
570 bool keep_always,
571 uint64_t offset,
572 uint64_t size,
573 void **ret) {
574
575 int r;
576
577 assert(f);
578 assert(ret);
579
580 if (size <= 0)
581 return -EINVAL;
582
583 if (size > UINT64_MAX - offset)
584 return -EBADMSG;
585
586 /* Avoid SIGBUS on invalid accesses */
587 if (offset + size > (uint64_t) f->last_stat.st_size) {
588 /* Hmm, out of range? Let's refresh the fstat() data
589 * first, before we trust that check. */
590
591 r = journal_file_fstat(f);
592 if (r < 0)
593 return r;
594
595 if (offset + size > (uint64_t) f->last_stat.st_size)
596 return -EADDRNOTAVAIL;
597 }
598
599 return mmap_cache_fd_get(f->cache_fd, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
600 }
601
602 static uint64_t minimum_header_size(Object *o) {
603
604 static const uint64_t table[] = {
605 [OBJECT_DATA] = sizeof(DataObject),
606 [OBJECT_FIELD] = sizeof(FieldObject),
607 [OBJECT_ENTRY] = sizeof(EntryObject),
608 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
609 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
610 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
611 [OBJECT_TAG] = sizeof(TagObject),
612 };
613
614 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
615 return sizeof(ObjectHeader);
616
617 return table[o->object.type];
618 }
619
/* Lightweight object checks. We want this to be fast, so that we won't
 * slowdown every journal_file_move_to_object() call too much.
 *
 * Validates per-type invariants of an object already read/mapped at the given
 * offset: declared sizes, 64-bit alignment of embedded offsets, timestamp ranges
 * and item-count consistency. Returns 0 if the object looks sane, -EBADMSG (with a
 * debug log) otherwise. Unknown object types pass through unchecked. */
static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
        assert(f);
        assert(o);

        switch (o->object.type) {

        case OBJECT_DATA:
                /* entry_offset and n_entries must be zero/non-zero together. */
                if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Bad n_entries: %" PRIu64 ": %" PRIu64,
                                               le64toh(o->data.n_entries),
                                               offset);

                /* A data object must carry at least one payload byte. */
                if (le64toh(o->object.size) <= offsetof(DataObject, payload))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
                                               offsetof(DataObject, payload),
                                               le64toh(o->object.size),
                                               offset);

                if (!VALID64(le64toh(o->data.next_hash_offset)) ||
                    !VALID64(le64toh(o->data.next_field_offset)) ||
                    !VALID64(le64toh(o->data.entry_offset)) ||
                    !VALID64(le64toh(o->data.entry_array_offset)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
                                               le64toh(o->data.next_hash_offset),
                                               le64toh(o->data.next_field_offset),
                                               le64toh(o->data.entry_offset),
                                               le64toh(o->data.entry_array_offset),
                                               offset);

                break;

        case OBJECT_FIELD:
                /* A field object must carry at least one payload byte. */
                if (le64toh(o->object.size) <= offsetof(FieldObject, payload))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
                                               offsetof(FieldObject, payload),
                                               le64toh(o->object.size),
                                               offset);

                if (!VALID64(le64toh(o->field.next_hash_offset)) ||
                    !VALID64(le64toh(o->field.head_data_offset)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
                                               le64toh(o->field.next_hash_offset),
                                               le64toh(o->field.head_data_offset),
                                               offset);
                break;

        case OBJECT_ENTRY: {
                uint64_t sz;

                sz = le64toh(READ_NOW(o->object.size));
                /* Size must cover the fixed part and a whole number of EntryItems. */
                if (sz < offsetof(EntryObject, items) ||
                    (sz - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
                                               offsetof(EntryObject, items),
                                               sz,
                                               offset);

                /* An entry must reference at least one item. */
                if ((sz - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid number items in entry: %" PRIu64 ": %" PRIu64,
                                               (sz - offsetof(EntryObject, items)) / sizeof(EntryItem),
                                               offset);

                if (le64toh(o->entry.seqnum) <= 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
                                               le64toh(o->entry.seqnum),
                                               offset);

                if (!VALID_REALTIME(le64toh(o->entry.realtime)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
                                               le64toh(o->entry.realtime),
                                               offset);

                if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
                                               le64toh(o->entry.monotonic),
                                               offset);

                break;
        }

        case OBJECT_DATA_HASH_TABLE:
        case OBJECT_FIELD_HASH_TABLE: {
                uint64_t sz;

                /* Size must cover the fixed part and a whole, positive number of HashItems. */
                sz = le64toh(READ_NOW(o->object.size));
                if (sz < offsetof(HashTableObject, items) ||
                    (sz - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
                    (sz - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
                                               o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
                                               sz,
                                               offset);

                break;
        }

        case OBJECT_ENTRY_ARRAY: {
                uint64_t sz;

                /* Size must cover the fixed part and a whole, positive number of 64-bit slots. */
                sz = le64toh(READ_NOW(o->object.size));
                if (sz < offsetof(EntryArrayObject, items) ||
                    (sz - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
                    (sz - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
                                               sz,
                                               offset);

                if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
                                               le64toh(o->entry_array.next_entry_array_offset),
                                               offset);

                break;
        }

        case OBJECT_TAG:
                /* Tag objects are fixed-size. */
                if (le64toh(o->object.size) != sizeof(TagObject))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid object tag size: %" PRIu64 ": %" PRIu64,
                                               le64toh(o->object.size),
                                               offset);

                if (!VALID_EPOCH(le64toh(o->tag.epoch)))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
                                               le64toh(o->tag.epoch), offset);

                break;
        }

        return 0;
}
767
/* Maps the object at the given offset into memory and returns a pointer to it in
 * *ret. Validates alignment, minimum sizes, and (if type > OBJECT_UNUSED) that the
 * object has the expected type; runs journal_file_check_object() on the result.
 * Maps the object header first to learn the full size, then remaps the whole
 * object. Returns 0 or a negative errno-style code (typically -EBADMSG). */
int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
        int r;
        void *t;
        Object *o;
        uint64_t s;

        assert(f);
        assert(ret);

        /* Objects may only be located at multiple of 64 bit */
        if (!VALID64(offset))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object at non-64bit boundary: %" PRIu64,
                                       offset);

        /* Object may not be located in the file header */
        if (offset < le64toh(f->header->header_size))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object located in file header: %" PRIu64,
                                       offset);

        /* First map just the object header, to learn the object's full size. */
        r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
        if (r < 0)
                return r;

        o = (Object*) t;
        s = le64toh(READ_NOW(o->object.size));

        if (s == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to uninitialized object: %" PRIu64,
                                       offset);
        if (s < sizeof(ObjectHeader))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to overly short object: %" PRIu64,
                                       offset);

        if (o->object.type <= OBJECT_UNUSED)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object with invalid type: %" PRIu64,
                                       offset);

        if (s < minimum_header_size(o))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to truncated object: %" PRIu64,
                                       offset);

        if (type > OBJECT_UNUSED && o->object.type != type)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object of unexpected type: %" PRIu64,
                                       offset);

        /* Now that we know the size, map the whole object. This may move the mmap
         * window, so re-derive o from the new pointer. */
        r = journal_file_move_to(f, type, false, offset, s, &t);
        if (r < 0)
                return r;

        o = (Object*) t;

        r = journal_file_check_object(f, offset, o);
        if (r < 0)
                return r;

        *ret = o;
        return 0;
}
833
834 int journal_file_read_object(JournalFile *f, ObjectType type, uint64_t offset, Object *ret) {
835 int r;
836 Object o;
837 uint64_t s;
838
839 assert(f);
840 assert(ret);
841
842 /* Objects may only be located at multiple of 64 bit */
843 if (!VALID64(offset))
844 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
845 "Attempt to read object at non-64bit boundary: %" PRIu64,
846 offset);
847
848 /* Object may not be located in the file header */
849 if (offset < le64toh(f->header->header_size))
850 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
851 "Attempt to read object located in file header: %" PRIu64,
852 offset);
853
854 /* This will likely read too much data but it avoids having to call pread() twice. */
855 r = pread(f->fd, &o, sizeof(Object), offset);
856 if (r < 0)
857 return r;
858
859 s = le64toh(o.object.size);
860
861 if (s == 0)
862 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
863 "Attempt to read uninitialized object: %" PRIu64,
864 offset);
865 if (s < sizeof(ObjectHeader))
866 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
867 "Attempt to read overly short object: %" PRIu64,
868 offset);
869
870 if (o.object.type <= OBJECT_UNUSED)
871 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
872 "Attempt to read object with invalid type: %" PRIu64,
873 offset);
874
875 if (s < minimum_header_size(&o))
876 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
877 "Attempt to read truncated object: %" PRIu64,
878 offset);
879
880 if (type > OBJECT_UNUSED && o.object.type != type)
881 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
882 "Attempt to read object of unexpected type: %" PRIu64,
883 offset);
884
885 r = journal_file_check_object(f, offset, &o);
886 if (r < 0)
887 return r;
888
889 *ret = o;
890 return 0;
891 }
892
893 static uint64_t journal_file_entry_seqnum(
894 JournalFile *f,
895 uint64_t *seqnum) {
896
897 uint64_t ret;
898
899 assert(f);
900 assert(f->header);
901
902 /* Picks a new sequence number for the entry we are about to add and returns it. */
903
904 ret = le64toh(f->header->tail_entry_seqnum) + 1;
905
906 if (seqnum) {
907 /* If an external seqnum counter was passed, we update both the local and the external one,
908 * and set it to the maximum of both */
909
910 if (*seqnum + 1 > ret)
911 ret = *seqnum + 1;
912
913 *seqnum = ret;
914 }
915
916 f->header->tail_entry_seqnum = htole64(ret);
917
918 if (f->header->head_entry_seqnum == 0)
919 f->header->head_entry_seqnum = htole64(ret);
920
921 return ret;
922 }
923
/* Appends a fresh object of the given type and size at the tail of the file,
 * bringing the file online first and growing the allocation as needed. The object
 * header is initialized; the remainder of the object is left for the caller to
 * fill in. Optionally returns the mapped object and/or its offset. */
int journal_file_append_object(
                JournalFile *f,
                ObjectType type,
                uint64_t size,
                Object **ret,
                uint64_t *ret_offset) {

        int r;
        uint64_t p;
        Object *o;
        void *t;

        assert(f);
        assert(f->header);
        assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
        assert(size >= sizeof(ObjectHeader));

        r = journal_file_set_online(f);
        if (r < 0)
                return r;

        /* Find where the new object goes... */
        r = journal_file_tail_end(f, &p);
        if (r < 0)
                return r;

        /* ...make sure the file has space for it... */
        r = journal_file_allocate(f, p, size);
        if (r < 0)
                return r;

        /* ...and map it. */
        r = journal_file_move_to(f, type, false, p, size, &t);
        if (r < 0)
                return r;

        o = (Object*) t;
        o->object = (ObjectHeader) {
                .type = type,
                .size = htole64(size),
        };

        /* Account for the new object in the header. */
        f->header->tail_object_offset = htole64(p);
        f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);

        if (ret)
                *ret = o;

        if (ret_offset)
                *ret_offset = p;

        return 0;
}
974
975 static int journal_file_setup_data_hash_table(JournalFile *f) {
976 uint64_t s, p;
977 Object *o;
978 int r;
979
980 assert(f);
981 assert(f->header);
982
983 /* We estimate that we need 1 hash table entry per 768 bytes
984 of journal file and we want to make sure we never get
985 beyond 75% fill level. Calculate the hash table size for
986 the maximum file size based on these metrics. */
987
988 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
989 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
990 s = DEFAULT_DATA_HASH_TABLE_SIZE;
991
992 log_debug("Reserving %"PRIu64" entries in data hash table.", s / sizeof(HashItem));
993
994 r = journal_file_append_object(f,
995 OBJECT_DATA_HASH_TABLE,
996 offsetof(Object, hash_table.items) + s,
997 &o, &p);
998 if (r < 0)
999 return r;
1000
1001 memzero(o->hash_table.items, s);
1002
1003 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1004 f->header->data_hash_table_size = htole64(s);
1005
1006 return 0;
1007 }
1008
1009 static int journal_file_setup_field_hash_table(JournalFile *f) {
1010 uint64_t s, p;
1011 Object *o;
1012 int r;
1013
1014 assert(f);
1015 assert(f->header);
1016
1017 /* We use a fixed size hash table for the fields as this
1018 * number should grow very slowly only */
1019
1020 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1021 log_debug("Reserving %"PRIu64" entries in field hash table.", s / sizeof(HashItem));
1022
1023 r = journal_file_append_object(f,
1024 OBJECT_FIELD_HASH_TABLE,
1025 offsetof(Object, hash_table.items) + s,
1026 &o, &p);
1027 if (r < 0)
1028 return r;
1029
1030 memzero(o->hash_table.items, s);
1031
1032 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1033 f->header->field_hash_table_size = htole64(s);
1034
1035 return 0;
1036 }
1037
1038 int journal_file_map_data_hash_table(JournalFile *f) {
1039 uint64_t s, p;
1040 void *t;
1041 int r;
1042
1043 assert(f);
1044 assert(f->header);
1045
1046 if (f->data_hash_table)
1047 return 0;
1048
1049 p = le64toh(f->header->data_hash_table_offset);
1050 s = le64toh(f->header->data_hash_table_size);
1051
1052 r = journal_file_move_to(f,
1053 OBJECT_DATA_HASH_TABLE,
1054 true,
1055 p, s,
1056 &t);
1057 if (r < 0)
1058 return r;
1059
1060 f->data_hash_table = t;
1061 return 0;
1062 }
1063
1064 int journal_file_map_field_hash_table(JournalFile *f) {
1065 uint64_t s, p;
1066 void *t;
1067 int r;
1068
1069 assert(f);
1070 assert(f->header);
1071
1072 if (f->field_hash_table)
1073 return 0;
1074
1075 p = le64toh(f->header->field_hash_table_offset);
1076 s = le64toh(f->header->field_hash_table_size);
1077
1078 r = journal_file_move_to(f,
1079 OBJECT_FIELD_HASH_TABLE,
1080 true,
1081 p, s,
1082 &t);
1083 if (r < 0)
1084 return r;
1085
1086 f->field_hash_table = t;
1087 return 0;
1088 }
1089
/* Links a freshly appended field object (at `offset`, with field hash `hash`) into
 * the field hash table: appends it to the tail of its hash bucket's chain and bumps
 * the header's field counter. `o` must point to the mapped object; note that it may
 * be repointed to a different object while patching the chain. */
static int journal_file_link_field(
                JournalFile *f,
                Object *o,
                uint64_t offset,
                uint64_t hash) {

        uint64_t p, h, m;
        int r;

        assert(f);
        assert(f->header);
        assert(f->field_hash_table);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_FIELD)
                return -EINVAL;

        /* Number of buckets; zero means a corrupt/unset hash table. */
        m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        /* This might alter the window we are looking at */
        o->field.next_hash_offset = o->field.head_data_offset = 0;

        h = hash % m;
        p = le64toh(f->field_hash_table[h].tail_hash_offset);
        if (p == 0)
                /* Empty bucket: the new object is also the chain head. */
                f->field_hash_table[h].head_hash_offset = htole64(offset);
        else {
                /* Patch the next pointer of the current chain tail. Note this remaps
                 * `o` onto the previous object. */
                r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
                if (r < 0)
                        return r;

                o->field.next_hash_offset = htole64(offset);
        }

        f->field_hash_table[h].tail_hash_offset = htole64(offset);

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);

        return 0;
}
1134
/* Link a freshly appended OBJECT_DATA object located at 'offset' (payload
 * hash 'hash') into the on-disk data hash table, appending it to the tail
 * of its hash bucket's chain. Returns 0 on success, negative errno-style
 * error on failure. NOTE: 'o' may be repointed while patching the previous
 * chain tail; callers must re-resolve their pointer afterwards. */
static int journal_file_link_data(
                JournalFile *f,
                Object *o,
                uint64_t offset,
                uint64_t hash) {

        uint64_t p, h, m;
        int r;

        assert(f);
        assert(f->header);
        assert(f->data_hash_table);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_DATA)
                return -EINVAL;

        /* Number of buckets in the table; an empty table means the file is corrupt. */
        m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        /* Initialize the chain/entry bookkeeping fields of the new object.
         * This might alter the window we are looking at. */
        o->data.next_hash_offset = o->data.next_field_offset = 0;
        o->data.entry_offset = o->data.entry_array_offset = 0;
        o->data.n_entries = 0;

        h = hash % m;
        p = le64toh(f->data_hash_table[h].tail_hash_offset);
        if (p == 0)
                /* Only entry in the hash table is easy */
                f->data_hash_table[h].head_hash_offset = htole64(offset);
        else {
                /* Move back to the previous data object, to patch in
                 * pointer */

                r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
                if (r < 0)
                        return r;

                o->data.next_hash_offset = htole64(offset);
        }

        f->data_hash_table[h].tail_hash_offset = htole64(offset);

        /* Keep the header's data counter up-to-date, if the header is new enough to have one. */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                f->header->n_data = htole64(le64toh(f->header->n_data) + 1);

        return 0;
}
1185
1186 static int next_hash_offset(
1187 JournalFile *f,
1188 uint64_t *p,
1189 le64_t *next_hash_offset,
1190 uint64_t *depth,
1191 le64_t *header_max_depth) {
1192
1193 uint64_t nextp;
1194
1195 nextp = le64toh(READ_NOW(*next_hash_offset));
1196 if (nextp > 0) {
1197 if (nextp <= *p) /* Refuse going in loops */
1198 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1199 "Detected hash item loop in %s, refusing.", f->path);
1200
1201 (*depth)++;
1202
1203 /* If the depth of this hash chain is larger than all others we have seen so far, record it */
1204 if (header_max_depth && f->writable)
1205 *header_max_depth = htole64(MAX(*depth, le64toh(*header_max_depth)));
1206 }
1207
1208 *p = nextp;
1209 return 0;
1210 }
1211
/* Look up the OBJECT_FIELD object whose payload equals 'field' (of 'size'
 * bytes) and whose precomputed hash is 'hash'. On a match, optionally
 * returns the mapped object in *ret and its file offset in *ret_offset.
 * Returns 1 if found, 0 if not found, negative errno-style error on
 * failure. */
int journal_file_find_field_object_with_hash(
                JournalFile *f,
                const void *field, uint64_t size, uint64_t hash,
                Object **ret, uint64_t *ret_offset) {

        uint64_t p, osize, h, m, depth = 0;
        int r;

        assert(f);
        assert(f->header);
        assert(field && size > 0);

        /* If the field hash table is empty, we can't find anything */
        if (le64toh(f->header->field_hash_table_size) <= 0)
                return 0;

        /* Map the field hash table, if it isn't mapped yet. */
        r = journal_file_map_field_hash_table(f);
        if (r < 0)
                return r;

        /* Full on-disk object size for an exact-size comparison below. */
        osize = offsetof(Object, field.payload) + size;

        m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        /* Walk the hash bucket's chain, comparing hash, size and payload. */
        h = hash % m;
        p = le64toh(f->field_hash_table[h].head_hash_offset);
        while (p > 0) {
                Object *o;

                r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
                if (r < 0)
                        return r;

                if (le64toh(o->field.hash) == hash &&
                    le64toh(o->object.size) == osize &&
                    memcmp(o->field.payload, field, size) == 0) {

                        if (ret)
                                *ret = o;
                        if (ret_offset)
                                *ret_offset = p;

                        return 1;
                }

                /* Advance along the chain; this also detects chain loops and
                 * records the maximum chain depth in the header, if supported. */
                r = next_hash_offset(
                                f,
                                &p,
                                &o->field.next_hash_offset,
                                &depth,
                                JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) ? &f->header->field_hash_chain_depth : NULL);
                if (r < 0)
                        return r;
        }

        return 0;
}
1272
1273 uint64_t journal_file_hash_data(
1274 JournalFile *f,
1275 const void *data,
1276 size_t sz) {
1277
1278 assert(f);
1279 assert(data || sz == 0);
1280
1281 /* We try to unify our codebase on siphash, hence new-styled journal files utilizing the keyed hash
1282 * function use siphash. Old journal files use the Jenkins hash. */
1283
1284 if (JOURNAL_HEADER_KEYED_HASH(f->header))
1285 return siphash24(data, sz, f->header->file_id.bytes);
1286
1287 return jenkins_hash64(data, sz);
1288 }
1289
1290 int journal_file_find_field_object(
1291 JournalFile *f,
1292 const void *field, uint64_t size,
1293 Object **ret, uint64_t *ret_offset) {
1294
1295 assert(f);
1296 assert(field && size > 0);
1297
1298 return journal_file_find_field_object_with_hash(
1299 f,
1300 field, size,
1301 journal_file_hash_data(f, field, size),
1302 ret, ret_offset);
1303 }
1304
/* Look up the OBJECT_DATA object whose (possibly compressed) payload equals
 * 'data' (of 'size' bytes) and whose precomputed hash is 'hash'. Compressed
 * candidates are decompressed into f->compress_buffer before comparing.
 * On a match, optionally returns the mapped object in *ret and its offset in
 * *ret_offset. Returns 1 if found, 0 if not found, negative errno-style
 * error on failure (-EPROTONOSUPPORT if a compressed candidate is hit but
 * compression support was compiled out). */
int journal_file_find_data_object_with_hash(
                JournalFile *f,
                const void *data, uint64_t size, uint64_t hash,
                Object **ret, uint64_t *ret_offset) {

        uint64_t p, osize, h, m, depth = 0;
        int r;

        assert(f);
        assert(f->header);
        assert(data || size == 0);

        /* If there's no data hash table, then there's no entry. */
        if (le64toh(f->header->data_hash_table_size) <= 0)
                return 0;

        /* Map the data hash table, if it isn't mapped yet. */
        r = journal_file_map_data_hash_table(f);
        if (r < 0)
                return r;

        /* Full on-disk object size for the uncompressed exact-size comparison below. */
        osize = offsetof(Object, data.payload) + size;

        m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        h = hash % m;
        p = le64toh(f->data_hash_table[h].head_hash_offset);

        /* Walk the hash bucket's chain. */
        while (p > 0) {
                Object *o;

                r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
                if (r < 0)
                        return r;

                /* Cheap hash comparison first; payload comparison only on hash match. */
                if (le64toh(o->data.hash) != hash)
                        goto next;

                if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if HAVE_COMPRESSION
                        uint64_t l;
                        size_t rsize = 0;

                        /* Sanity-check the stored size before computing the payload length. */
                        l = le64toh(READ_NOW(o->object.size));
                        if (l <= offsetof(Object, data.payload))
                                return -EBADMSG;

                        l -= offsetof(Object, data.payload);

                        /* Decompress into the file's scratch buffer, then compare. */
                        r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
                                            o->data.payload, l, &f->compress_buffer, &rsize, 0);
                        if (r < 0)
                                return r;

                        if (rsize == size &&
                            memcmp(f->compress_buffer, data, size) == 0) {

                                if (ret)
                                        *ret = o;

                                if (ret_offset)
                                        *ret_offset = p;

                                return 1;
                        }
#else
                        return -EPROTONOSUPPORT;
#endif
                } else if (le64toh(o->object.size) == osize &&
                           memcmp(o->data.payload, data, size) == 0) {

                        if (ret)
                                *ret = o;

                        if (ret_offset)
                                *ret_offset = p;

                        return 1;
                }

        next:
                /* Advance along the chain; also detects loops and records the
                 * maximum chain depth in the header, if supported. */
                r = next_hash_offset(
                                f,
                                &p,
                                &o->data.next_hash_offset,
                                &depth,
                                JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) ? &f->header->data_hash_chain_depth : NULL);
                if (r < 0)
                        return r;
        }

        return 0;
}
1400
1401 int journal_file_find_data_object(
1402 JournalFile *f,
1403 const void *data, uint64_t size,
1404 Object **ret, uint64_t *ret_offset) {
1405
1406 assert(f);
1407 assert(data || size == 0);
1408
1409 return journal_file_find_data_object_with_hash(
1410 f,
1411 data, size,
1412 journal_file_hash_data(f, data, size),
1413 ret, ret_offset);
1414 }
1415
/* Check whether 'p' (of length 'l', or NUL-terminated if l == SIZE_MAX) is
 * a valid journal field name. We kinda enforce POSIX syntax recommendations
 * for environment variables here, but make a couple of additional
 * requirements:
 * http://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html */
bool journal_field_valid(const char *p, size_t l, bool allow_protected) {

        if (l == SIZE_MAX)
                l = strlen(p);

        /* Reject the empty name, and anything longer than 64 characters. */
        if (l == 0 || l > 64)
                return false;

        /* A leading underscore marks a protected (trusted) field. */
        if (p[0] == '_' && !allow_protected)
                return false;

        /* The first character must not be a digit. */
        if (p[0] >= '0' && p[0] <= '9')
                return false;

        /* Everything must be drawn from A-Z, 0-9 and '_'. */
        for (size_t i = 0; i < l; i++) {
                char c = p[i];

                if (!((c >= 'A' && c <= 'Z') ||
                      (c >= '0' && c <= '9') ||
                      c == '_'))
                        return false;
        }

        return true;
}
1451
/* Return the OBJECT_FIELD object for the field name 'field' (of 'size'
 * bytes), appending and linking a new one if none exists yet. Optionally
 * returns the mapped object in *ret and its offset in *ret_offset.
 * Returns 0 on success, negative errno-style error on failure. */
static int journal_file_append_field(
                JournalFile *f,
                const void *field, uint64_t size,
                Object **ret, uint64_t *ret_offset) {

        uint64_t hash, p;
        uint64_t osize;
        Object *o;
        int r;

        assert(f);
        assert(field && size > 0);

        /* Refuse syntactically invalid field names outright. */
        if (!journal_field_valid(field, size, true))
                return -EBADMSG;

        hash = journal_file_hash_data(f, field, size);

        /* Reuse an existing field object, if we have one. */
        r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
        if (r < 0)
                return r;
        if (r > 0) {

                if (ret)
                        *ret = o;

                if (ret_offset)
                        *ret_offset = p;

                return 0;
        }

        /* Not there yet: append a new field object and fill it in. */
        osize = offsetof(Object, field.payload) + size;
        r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
        if (r < 0)
                return r;

        o->field.hash = htole64(hash);
        memcpy(o->field.payload, field, size);

        r = journal_file_link_field(f, o, p, hash);
        if (r < 0)
                return r;

        /* The linking might have altered the window, so let's
         * refresh our pointer */
        r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
        if (r < 0)
                return r;

#if HAVE_GCRYPT
        /* Include the new object in the sealing HMAC, if sealing is enabled. */
        r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
        if (r < 0)
                return r;
#endif

        if (ret)
                *ret = o;

        if (ret_offset)
                *ret_offset = p;

        return 0;
}
1516
1517 static int journal_file_append_data(
1518 JournalFile *f,
1519 const void *data, uint64_t size,
1520 Object **ret, uint64_t *ret_offset) {
1521
1522 uint64_t hash, p, fp, osize;
1523 Object *o, *fo;
1524 int r, compression = 0;
1525 const void *eq;
1526
1527 assert(f);
1528
1529 if (!data || size == 0)
1530 return -EINVAL;
1531
1532 hash = journal_file_hash_data(f, data, size);
1533
1534 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1535 if (r < 0)
1536 return r;
1537 if (r > 0) {
1538
1539 if (ret)
1540 *ret = o;
1541
1542 if (ret_offset)
1543 *ret_offset = p;
1544
1545 return 0;
1546 }
1547
1548 eq = memchr(data, '=', size);
1549 if (!eq)
1550 return -EINVAL;
1551
1552 osize = offsetof(Object, data.payload) + size;
1553 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1554 if (r < 0)
1555 return r;
1556
1557 o->data.hash = htole64(hash);
1558
1559 #if HAVE_COMPRESSION
1560 if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
1561 size_t rsize = 0;
1562
1563 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
1564
1565 if (compression >= 0) {
1566 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1567 o->object.flags |= compression;
1568
1569 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1570 size, rsize, object_compressed_to_string(compression));
1571 } else
1572 /* Compression didn't work, we don't really care why, let's continue without compression */
1573 compression = 0;
1574 }
1575 #endif
1576
1577 if (compression == 0)
1578 memcpy_safe(o->data.payload, data, size);
1579
1580 r = journal_file_link_data(f, o, p, hash);
1581 if (r < 0)
1582 return r;
1583
1584 #if HAVE_GCRYPT
1585 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1586 if (r < 0)
1587 return r;
1588 #endif
1589
1590 /* The linking might have altered the window, so let's
1591 * refresh our pointer */
1592 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1593 if (r < 0)
1594 return r;
1595
1596 /* Create field object ... */
1597 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1598 if (r < 0)
1599 return r;
1600
1601 /* ... and link it in. */
1602 o->data.next_field_offset = fo->field.head_data_offset;
1603 fo->field.head_data_offset = le64toh(p);
1604
1605 if (ret)
1606 *ret = o;
1607
1608 if (ret_offset)
1609 *ret_offset = p;
1610
1611 return 0;
1612 }
1613
1614 uint64_t journal_file_entry_n_items(Object *o) {
1615 uint64_t sz;
1616 assert(o);
1617
1618 if (o->object.type != OBJECT_ENTRY)
1619 return 0;
1620
1621 sz = le64toh(READ_NOW(o->object.size));
1622 if (sz < offsetof(Object, entry.items))
1623 return 0;
1624
1625 return (sz - offsetof(Object, entry.items)) / sizeof(EntryItem);
1626 }
1627
1628 uint64_t journal_file_entry_array_n_items(Object *o) {
1629 uint64_t sz;
1630
1631 assert(o);
1632
1633 if (o->object.type != OBJECT_ENTRY_ARRAY)
1634 return 0;
1635
1636 sz = le64toh(READ_NOW(o->object.size));
1637 if (sz < offsetof(Object, entry_array.items))
1638 return 0;
1639
1640 return (sz - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1641 }
1642
1643 uint64_t journal_file_hash_table_n_items(Object *o) {
1644 uint64_t sz;
1645
1646 assert(o);
1647
1648 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
1649 return 0;
1650
1651 sz = le64toh(READ_NOW(o->object.size));
1652 if (sz < offsetof(Object, hash_table.items))
1653 return 0;
1654
1655 return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1656 }
1657
/* Store entry offset 'p' as item number *idx of the entry array chain that
 * starts at *first, appending a new (doubled-size) array object to the
 * chain if the existing arrays are full. *idx is incremented on success.
 * Both *first and *idx are little-endian on-disk fields. Returns 0 on
 * success, negative errno-style error on failure. */
static int link_entry_into_array(JournalFile *f,
                                 le64_t *first,
                                 le64_t *idx,
                                 uint64_t p) {
        int r;
        uint64_t n = 0, ap = 0, q, i, a, hidx;
        Object *o;

        assert(f);
        assert(f->header);
        assert(first);
        assert(idx);
        assert(p > 0);

        /* Walk the chain looking for the array that holds index *idx.
         * 'i' is decremented to the index relative to the current array;
         * 'ap' remembers the last array visited so we can chain a new one. */
        a = le64toh(*first);
        i = hidx = le64toh(READ_NOW(*idx));
        while (a > 0) {

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                if (r < 0)
                        return r;

                n = journal_file_entry_array_n_items(o);
                if (i < n) {
                        /* There's still room in this array: store and bump the index. */
                        o->entry_array.items[i] = htole64(p);
                        *idx = htole64(hidx + 1);
                        return 0;
                }

                i -= n;
                ap = a;
                a = le64toh(o->entry_array.next_entry_array_offset);
        }

        /* No room anywhere: size the new array. 'n' still holds the size of
         * the last array in the chain (or 0 if the chain was empty). */
        if (hidx > n)
                n = (hidx+1) * 2;
        else
                n = n * 2;

        if (n < 4)
                n = 4;

        r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
                                       offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
                                       &o, &q);
        if (r < 0)
                return r;

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
        if (r < 0)
                return r;
#endif

        /* 'i' is now the index relative to the fresh array. */
        o->entry_array.items[i] = htole64(p);

        /* Hook the new array into the chain: as its head, or behind 'ap'. */
        if (ap == 0)
                *first = htole64(q);
        else {
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
                if (r < 0)
                        return r;

                o->entry_array.next_entry_array_offset = htole64(q);
        }

        /* Keep the header's entry array counter up-to-date, if present. */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);

        *idx = htole64(hidx + 1);

        return 0;
}
1731
1732 static int link_entry_into_array_plus_one(JournalFile *f,
1733 le64_t *extra,
1734 le64_t *first,
1735 le64_t *idx,
1736 uint64_t p) {
1737
1738 uint64_t hidx;
1739 int r;
1740
1741 assert(f);
1742 assert(extra);
1743 assert(first);
1744 assert(idx);
1745 assert(p > 0);
1746
1747 hidx = le64toh(READ_NOW(*idx));
1748 if (hidx == UINT64_MAX)
1749 return -EBADMSG;
1750 if (hidx == 0)
1751 *extra = htole64(p);
1752 else {
1753 le64_t i;
1754
1755 i = htole64(hidx - 1);
1756 r = link_entry_into_array(f, first, &i, p);
1757 if (r < 0)
1758 return r;
1759 }
1760
1761 *idx = htole64(hidx + 1);
1762 return 0;
1763 }
1764
1765 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1766 uint64_t p;
1767 int r;
1768
1769 assert(f);
1770 assert(o);
1771 assert(offset > 0);
1772
1773 p = le64toh(o->entry.items[i].object_offset);
1774 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1775 if (r < 0)
1776 return r;
1777
1778 return link_entry_into_array_plus_one(f,
1779 &o->data.entry_offset,
1780 &o->data.entry_array_offset,
1781 &o->data.n_entries,
1782 offset);
1783 }
1784
/* Link a freshly written OBJECT_ENTRY at 'offset' into the file: into the
 * global entry array, into the header timestamps, and into the entry lists
 * of every data object it references. Returns 0 on success, negative
 * errno-style error on failure. */
static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
        uint64_t n;
        int r;

        assert(f);
        assert(f->header);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_ENTRY)
                return -EINVAL;

        /* Full memory barrier: make sure the entry's payload is visible
         * before any pointer to it is published. */
        __sync_synchronize();

        /* Link up the entry itself */
        r = link_entry_into_array(f,
                                  &f->header->entry_array_offset,
                                  &f->header->n_entries,
                                  offset);
        if (r < 0)
                return r;

        /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */

        /* Update the header's first/last timestamps. */
        if (f->header->head_entry_realtime == 0)
                f->header->head_entry_realtime = o->entry.realtime;

        f->header->tail_entry_realtime = o->entry.realtime;
        f->header->tail_entry_monotonic = o->entry.monotonic;

        /* Link up the items */
        n = journal_file_entry_n_items(o);
        for (uint64_t i = 0; i < n; i++) {
                r = journal_file_link_entry_item(f, o, offset, i);
                if (r < 0)
                        return r;
        }

        return 0;
}
1825
/* Append a new OBJECT_ENTRY with the given timestamps, boot ID, XOR hash
 * and item list, seal it (if enabled) and link it into the file. Optionally
 * returns the mapped object in *ret and its offset in *ret_offset.
 * Returns 0 on success, negative errno-style error on failure. */
static int journal_file_append_entry_internal(
                JournalFile *f,
                const dual_timestamp *ts,
                const sd_id128_t *boot_id,
                uint64_t xor_hash,
                const EntryItem items[], unsigned n_items,
                uint64_t *seqnum,
                Object **ret, uint64_t *ret_offset) {
        uint64_t np;
        uint64_t osize;
        Object *o;
        int r;

        assert(f);
        assert(f->header);
        assert(items || n_items == 0);
        assert(ts);

        osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));

        r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
        if (r < 0)
                return r;

        /* Fill in the entry payload. */
        o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
        memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
        o->entry.realtime = htole64(ts->realtime);
        o->entry.monotonic = htole64(ts->monotonic);
        o->entry.xor_hash = htole64(xor_hash);
        /* If the caller supplied a boot ID it becomes the file's current one;
         * either way the entry records the header's boot ID. */
        if (boot_id)
                f->header->boot_id = *boot_id;
        o->entry.boot_id = f->header->boot_id;

#if HAVE_GCRYPT
        /* Include the new object in the sealing HMAC, if sealing is enabled. */
        r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
        if (r < 0)
                return r;
#endif

        r = journal_file_link_entry(f, o, np);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (ret_offset)
                *ret_offset = np;

        return r;
}
1877
/* Notify inotify watchers that the journal file changed, by truncating it
 * to its own size. Safe to call on a closed file (no-op). */
void journal_file_post_change(JournalFile *f) {
        assert(f);

        if (f->fd < 0)
                return;

        /* inotify() does not receive IN_MODIFY events from file
         * accesses done via mmap(). After each access we hence
         * trigger IN_MODIFY by truncating the journal file to its
         * current size which triggers IN_MODIFY. */

        /* Full barrier first, so that all mmap'ed writes are visible before
         * watchers are woken up by the truncation below. */
        __sync_synchronize();

        if (ftruncate(f->fd, f->last_stat.st_size) < 0)
                log_debug_errno(errno, "Failed to truncate file to its own size: %m");
}
1894
1895 static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1896 assert(userdata);
1897
1898 journal_file_post_change(userdata);
1899
1900 return 1;
1901 }
1902
1903 static void schedule_post_change(JournalFile *f) {
1904 int r;
1905
1906 assert(f);
1907 assert(f->post_change_timer);
1908
1909 r = sd_event_source_get_enabled(f->post_change_timer, NULL);
1910 if (r < 0) {
1911 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1912 goto fail;
1913 }
1914 if (r > 0)
1915 return;
1916
1917 r = sd_event_source_set_time_relative(f->post_change_timer, f->post_change_timer_period);
1918 if (r < 0) {
1919 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1920 goto fail;
1921 }
1922
1923 r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
1924 if (r < 0) {
1925 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1926 goto fail;
1927 }
1928
1929 return;
1930
1931 fail:
1932 /* On failure, let's simply post the change immediately. */
1933 journal_file_post_change(f);
1934 }
1935
1936 /* Enable coalesced change posting in a timer on the provided sd_event instance */
1937 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1938 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1939 int r;
1940
1941 assert(f);
1942 assert_return(!f->post_change_timer, -EINVAL);
1943 assert(e);
1944 assert(t);
1945
1946 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1947 if (r < 0)
1948 return r;
1949
1950 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1951 if (r < 0)
1952 return r;
1953
1954 f->post_change_timer = TAKE_PTR(timer);
1955 f->post_change_timer_period = t;
1956
1957 return r;
1958 }
1959
1960 static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
1961 return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
1962 }
1963
1964 static size_t remove_duplicate_entry_items(EntryItem items[], size_t n) {
1965
1966 /* This function relies on the items array being sorted. */
1967 size_t j = 1;
1968
1969 if (n <= 1)
1970 return n;
1971
1972 for (size_t i = 1; i < n; i++)
1973 if (items[i].object_offset != items[j - 1].object_offset)
1974 items[j++] = items[i];
1975
1976 return j;
1977 }
1978
1979 int journal_file_append_entry(
1980 JournalFile *f,
1981 const dual_timestamp *ts,
1982 const sd_id128_t *boot_id,
1983 const struct iovec iovec[], unsigned n_iovec,
1984 uint64_t *seqnum,
1985 Object **ret, uint64_t *ret_offset) {
1986
1987 EntryItem *items;
1988 int r;
1989 uint64_t xor_hash = 0;
1990 struct dual_timestamp _ts;
1991
1992 assert(f);
1993 assert(f->header);
1994 assert(iovec && n_iovec > 0);
1995
1996 if (ts) {
1997 if (!VALID_REALTIME(ts->realtime))
1998 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1999 "Invalid realtime timestamp %" PRIu64 ", refusing entry.",
2000 ts->realtime);
2001 if (!VALID_MONOTONIC(ts->monotonic))
2002 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2003 "Invalid monotomic timestamp %" PRIu64 ", refusing entry.",
2004 ts->monotonic);
2005 } else {
2006 dual_timestamp_get(&_ts);
2007 ts = &_ts;
2008 }
2009
2010 #if HAVE_GCRYPT
2011 r = journal_file_maybe_append_tag(f, ts->realtime);
2012 if (r < 0)
2013 return r;
2014 #endif
2015
2016 items = newa(EntryItem, n_iovec);
2017
2018 for (size_t i = 0; i < n_iovec; i++) {
2019 uint64_t p;
2020 Object *o;
2021
2022 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
2023 if (r < 0)
2024 return r;
2025
2026 /* When calculating the XOR hash field, we need to take special care if the "keyed-hash"
2027 * journal file flag is on. We use the XOR hash field to quickly determine the identity of a
2028 * specific record, and give records with otherwise identical position (i.e. match in seqno,
2029 * timestamp, …) a stable ordering. But for that we can't have it that the hash of the
2030 * objects in each file is different since they are keyed. Hence let's calculate the Jenkins
2031 * hash here for that. This also has the benefit that cursors for old and new journal files
2032 * are completely identical (they include the XOR hash after all). For classic Jenkins-hash
2033 * files things are easier, we can just take the value from the stored record directly. */
2034
2035 if (JOURNAL_HEADER_KEYED_HASH(f->header))
2036 xor_hash ^= jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len);
2037 else
2038 xor_hash ^= le64toh(o->data.hash);
2039
2040 items[i] = (EntryItem) {
2041 .object_offset = htole64(p),
2042 .hash = o->data.hash,
2043 };
2044 }
2045
2046 /* Order by the position on disk, in order to improve seek
2047 * times for rotating media. */
2048 typesafe_qsort(items, n_iovec, entry_item_cmp);
2049 n_iovec = remove_duplicate_entry_items(items, n_iovec);
2050
2051 r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, ret_offset);
2052
2053 /* If the memory mapping triggered a SIGBUS then we return an
2054 * IO error and ignore the error code passed down to us, since
2055 * it is very likely just an effect of a nullified replacement
2056 * mapping page */
2057
2058 if (mmap_cache_fd_got_sigbus(f->cache_fd))
2059 r = -EIO;
2060
2061 if (f->post_change_timer)
2062 schedule_post_change(f);
2063 else
2064 journal_file_post_change(f);
2065
2066 return r;
2067 }
2068
/* Cache record remembering how far we previously walked along one entry
 * array chain, so later lookups on the same chain can skip ahead instead of
 * rescanning from its head. Keyed by 'first' in f->chain_cache. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
2076
2077 static void chain_cache_put(
2078 OrderedHashmap *h,
2079 ChainCacheItem *ci,
2080 uint64_t first,
2081 uint64_t array,
2082 uint64_t begin,
2083 uint64_t total,
2084 uint64_t last_index) {
2085
2086 if (!ci) {
2087 /* If the chain item to cache for this chain is the
2088 * first one it's not worth caching anything */
2089 if (array == first)
2090 return;
2091
2092 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
2093 ci = ordered_hashmap_steal_first(h);
2094 assert(ci);
2095 } else {
2096 ci = new(ChainCacheItem, 1);
2097 if (!ci)
2098 return;
2099 }
2100
2101 ci->first = first;
2102
2103 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
2104 free(ci);
2105 return;
2106 }
2107 } else
2108 assert(ci->first == first);
2109
2110 ci->array = array;
2111 ci->begin = begin;
2112 ci->total = total;
2113 ci->last_index = last_index;
2114 }
2115
/* Resolve the i-th entry referenced by the entry array chain starting at
 * 'first'. Consults the chain cache to skip ahead where possible, and
 * refreshes it on success. Optionally returns the mapped OBJECT_ENTRY in
 * *ret and its offset in *ret_offset. Returns 1 if found, 0 if the index is
 * out of range, negative errno-style error on failure. */
static int generic_array_get(
                JournalFile *f,
                uint64_t first,
                uint64_t i,
                Object **ret, uint64_t *ret_offset) {

        Object *o;
        uint64_t p = 0, a, t = 0;
        int r;
        ChainCacheItem *ci;

        assert(f);

        a = first;

        /* Try the chain cache first: if we previously got at least as far as
         * index i on this chain, jump straight to the cached array. 't'
         * tracks how many items precede the current array. */
        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && i > ci->total) {
                a = ci->array;
                i -= ci->total;
                t = ci->total;
        }

        /* Walk the chain, decrementing i by the size of each array we skip. */
        while (a > 0) {
                uint64_t k;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                if (r < 0)
                        return r;

                k = journal_file_entry_array_n_items(o);
                if (i < k) {
                        p = le64toh(o->entry_array.items[i]);
                        goto found;
                }

                i -= k;
                t += k;
                a = le64toh(o->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (ret_offset)
                *ret_offset = p;

        return 1;
}
2175
2176 static int generic_array_get_plus_one(
2177 JournalFile *f,
2178 uint64_t extra,
2179 uint64_t first,
2180 uint64_t i,
2181 Object **ret, uint64_t *ret_offset) {
2182
2183 Object *o;
2184
2185 assert(f);
2186
2187 if (i == 0) {
2188 int r;
2189
2190 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2191 if (r < 0)
2192 return r;
2193
2194 if (ret)
2195 *ret = o;
2196
2197 if (ret_offset)
2198 *ret_offset = extra;
2199
2200 return 1;
2201 }
2202
2203 return generic_array_get(f, first, i-1, ret, ret_offset);
2204 }
2205
/* Return values of the test_object() callbacks used by the bisection
 * helpers below: the probed entry matches the needle, or the needle lies to
 * the left (earlier) or right (later) of it. */
enum {
        TEST_FOUND,
        TEST_LEFT,
        TEST_RIGHT
};
2211
/* Binary-search the entry array chain starting at 'first' (containing 'n'
 * items) for the entry matching 'needle' per 'test_object', in the given
 * direction (DIRECTION_DOWN: first match or next to the right;
 * DIRECTION_UP: last match or next to the left). Uses and refreshes the
 * chain cache. Optionally returns the mapped OBJECT_ENTRY in *ret, its
 * offset in *ret_offset and its logical index in *ret_idx. Returns 1 if an
 * entry was found, 0 if not, negative errno-style error on failure. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset,
                uint64_t *ret_idx) {

        /* t: items before the current array; i: index within it;
         * last_p: last item of the previous array, needed when stepping back
         * across an array boundary; subtract_one: result is one before 'i'. */
        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = UINT64_MAX;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total && ci->begin != 0) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Probe the last item of this array first, to decide whether
                 * the needle is inside this array or in a later one. */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        r = -EBADMSG;
                else
                        r = test_object(f, p, needle);
                if (r == -EBADMSG) {
                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
                        n = i;
                        continue;
                }
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        /* The needle is within this array: bisect [left, right]. */
                        left = 0;
                        right -= 1;

                        if (last_index != UINT64_MAX) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        /* Classic bisection over [left, right] within this array. */
                        for (;;) {
                                if (left == right) {
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        r = -EBADMSG;
                                else
                                        r = test_object(f, p, needle);
                                if (r == -EBADMSG) {
                                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
                                        right = n = i;
                                        continue;
                                }
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* The needle is beyond this array. If this was the last array
                 * (all remaining items fit in it), the answer depends on the
                 * direction. */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Move on to the next array in the chain. */
                last_p = lp;

                n -= k;
                t += k;
                last_index = UINT64_MAX;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* Stepping back from the very first item means there is no match. */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);

        /* Pick the result item, stepping one back (possibly into the
         * previous array, via last_p) if requested. */
        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (ret_offset)
                *ret_offset = p;

        if (ret_idx)
                *ret_idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
2428
/* Like generic_array_bisect(), but additionally considers one "extra" entry that
 * logically precedes the entry array: data objects store their first entry offset
 * inline (data.entry_offset) and only subsequent entries in the chained array
 * (data.entry_array_offset). Hence 'n' counts the extra entry plus the array, and
 * the array itself contains n-1 items. Returns > 0 and fills ret/ret_offset/ret_idx
 * on success, 0 if no matching entry exists, negative errno on error. An index of
 * 0 in *ret_idx refers to the extra entry. */
static int generic_array_bisect_plus_one(
                JournalFile *f,
                uint64_t extra,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset,
                uint64_t *ret_idx) {

        int r;
        bool step_back = false;
        Object *o;

        assert(f);
        assert(test_object);

        if (n <= 0)
                return 0;

        /* This bisects the array in object 'first', but first checks
         * an extra */
        r = test_object(f, extra, needle);
        if (r < 0)
                return r;

        if (r == TEST_FOUND)
                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

        /* if we are looking with DIRECTION_UP then we need to first
           see if in the actual array there is a matching entry, and
           return the last one of that. But if there isn't any we need
           to return this one. Hence remember this, and return it
           below. */
        if (r == TEST_LEFT)
                step_back = direction == DIRECTION_UP;

        if (r == TEST_RIGHT) {
                /* The needle sorts before the extra entry: when going down the
                 * extra entry is the answer, when going up there is no match. */
                if (direction == DIRECTION_DOWN)
                        goto found;
                else
                        return 0;
        }

        /* Bisect the actual array; note n-1, since the extra entry is not in it. */
        r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, ret_offset, ret_idx);

        if (r == 0 && step_back)
                goto found;

        /* Shift array indices by one so they are relative to the extra entry. */
        if (r > 0 && ret_idx)
                (*ret_idx)++;

        return r;

found:
        r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (ret_offset)
                *ret_offset = extra;

        if (ret_idx)
                *ret_idx = 0;

        return 1;
}
2501
2502 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2503 assert(f);
2504 assert(p > 0);
2505
2506 if (p == needle)
2507 return TEST_FOUND;
2508 else if (p < needle)
2509 return TEST_LEFT;
2510 else
2511 return TEST_RIGHT;
2512 }
2513
2514 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2515 uint64_t sq;
2516 Object *o;
2517 int r;
2518
2519 assert(f);
2520 assert(p > 0);
2521
2522 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2523 if (r < 0)
2524 return r;
2525
2526 sq = le64toh(READ_NOW(o->entry.seqnum));
2527 if (sq == needle)
2528 return TEST_FOUND;
2529 else if (sq < needle)
2530 return TEST_LEFT;
2531 else
2532 return TEST_RIGHT;
2533 }
2534
2535 int journal_file_move_to_entry_by_seqnum(
2536 JournalFile *f,
2537 uint64_t seqnum,
2538 direction_t direction,
2539 Object **ret,
2540 uint64_t *ret_offset) {
2541 assert(f);
2542 assert(f->header);
2543
2544 return generic_array_bisect(
2545 f,
2546 le64toh(f->header->entry_array_offset),
2547 le64toh(f->header->n_entries),
2548 seqnum,
2549 test_object_seqnum,
2550 direction,
2551 ret, ret_offset, NULL);
2552 }
2553
2554 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2555 Object *o;
2556 uint64_t rt;
2557 int r;
2558
2559 assert(f);
2560 assert(p > 0);
2561
2562 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2563 if (r < 0)
2564 return r;
2565
2566 rt = le64toh(READ_NOW(o->entry.realtime));
2567 if (rt == needle)
2568 return TEST_FOUND;
2569 else if (rt < needle)
2570 return TEST_LEFT;
2571 else
2572 return TEST_RIGHT;
2573 }
2574
2575 int journal_file_move_to_entry_by_realtime(
2576 JournalFile *f,
2577 uint64_t realtime,
2578 direction_t direction,
2579 Object **ret,
2580 uint64_t *ret_offset) {
2581 assert(f);
2582 assert(f->header);
2583
2584 return generic_array_bisect(
2585 f,
2586 le64toh(f->header->entry_array_offset),
2587 le64toh(f->header->n_entries),
2588 realtime,
2589 test_object_realtime,
2590 direction,
2591 ret, ret_offset, NULL);
2592 }
2593
2594 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2595 Object *o;
2596 uint64_t m;
2597 int r;
2598
2599 assert(f);
2600 assert(p > 0);
2601
2602 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2603 if (r < 0)
2604 return r;
2605
2606 m = le64toh(READ_NOW(o->entry.monotonic));
2607 if (m == needle)
2608 return TEST_FOUND;
2609 else if (m < needle)
2610 return TEST_LEFT;
2611 else
2612 return TEST_RIGHT;
2613 }
2614
2615 static int find_data_object_by_boot_id(
2616 JournalFile *f,
2617 sd_id128_t boot_id,
2618 Object **o,
2619 uint64_t *b) {
2620
2621 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
2622
2623 sd_id128_to_string(boot_id, t + 9);
2624 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2625 }
2626
2627 int journal_file_move_to_entry_by_monotonic(
2628 JournalFile *f,
2629 sd_id128_t boot_id,
2630 uint64_t monotonic,
2631 direction_t direction,
2632 Object **ret,
2633 uint64_t *ret_offset) {
2634
2635 Object *o;
2636 int r;
2637
2638 assert(f);
2639
2640 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2641 if (r < 0)
2642 return r;
2643 if (r == 0)
2644 return -ENOENT;
2645
2646 return generic_array_bisect_plus_one(
2647 f,
2648 le64toh(o->data.entry_offset),
2649 le64toh(o->data.entry_array_offset),
2650 le64toh(o->data.n_entries),
2651 monotonic,
2652 test_object_monotonic,
2653 direction,
2654 ret, ret_offset, NULL);
2655 }
2656
2657 void journal_file_reset_location(JournalFile *f) {
2658 f->location_type = LOCATION_HEAD;
2659 f->current_offset = 0;
2660 f->current_seqnum = 0;
2661 f->current_realtime = 0;
2662 f->current_monotonic = 0;
2663 zero(f->current_boot_id);
2664 f->current_xor_hash = 0;
2665 }
2666
2667 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2668 f->location_type = LOCATION_SEEK;
2669 f->current_offset = offset;
2670 f->current_seqnum = le64toh(o->entry.seqnum);
2671 f->current_realtime = le64toh(o->entry.realtime);
2672 f->current_monotonic = le64toh(o->entry.monotonic);
2673 f->current_boot_id = o->entry.boot_id;
2674 f->current_xor_hash = le64toh(o->entry.xor_hash);
2675 }
2676
/* Defines a total order between the current locations of two journal files, used
 * when interleaving multiple files. Both files must have a valid (LOCATION_SEEK)
 * position. Returns < 0, 0, > 0 in the usual comparison convention. The comparison
 * criteria are applied in order of decreasing reliability: seqnum (same source),
 * monotonic time (same boot), realtime, and finally the entry's XOR hash. */
int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
        int r;

        assert(af);
        assert(af->header);
        assert(bf);
        assert(bf->header);
        assert(af->location_type == LOCATION_SEEK);
        assert(bf->location_type == LOCATION_SEEK);

        /* If contents, timestamps and seqnum match, these entries are
         * identical. */
        if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
            af->current_monotonic == bf->current_monotonic &&
            af->current_realtime == bf->current_realtime &&
            af->current_xor_hash == bf->current_xor_hash &&
            sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id) &&
            af->current_seqnum == bf->current_seqnum)
                return 0;

        if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {

                /* If this is from the same seqnum source, compare
                 * seqnums */
                r = CMP(af->current_seqnum, bf->current_seqnum);
                if (r != 0)
                        return r;

                /* Wow! This is weird, different data but the same
                 * seqnums? Something is borked, but let's make the
                 * best of it and compare by time. */
        }

        if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {

                /* If the boot id matches, compare monotonic time */
                r = CMP(af->current_monotonic, bf->current_monotonic);
                if (r != 0)
                        return r;
        }

        /* Otherwise, compare UTC time */
        r = CMP(af->current_realtime, bf->current_realtime);
        if (r != 0)
                return r;

        /* Finally, compare by contents */
        return CMP(af->current_xor_hash, bf->current_xor_hash);
}
2726
2727 static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2728
2729 /* Increase or decrease the specified index, in the right direction. */
2730
2731 if (direction == DIRECTION_DOWN) {
2732 if (*i >= n - 1)
2733 return 0;
2734
2735 (*i) ++;
2736 } else {
2737 if (*i <= 0)
2738 return 0;
2739
2740 (*i) --;
2741 }
2742
2743 return 1;
2744 }
2745
2746 static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2747
2748 /* Consider it an error if any of the two offsets is uninitialized */
2749 if (old_offset == 0 || new_offset == 0)
2750 return false;
2751
2752 /* If we go down, the new offset must be larger than the old one. */
2753 return direction == DIRECTION_DOWN ?
2754 new_offset > old_offset :
2755 new_offset < old_offset;
2756 }
2757
/* Moves to the entry following (or preceding, for DIRECTION_UP) the entry at offset
 * 'p' in the global entry array. If p == 0, starts at the first/last entry instead.
 * Skips over individual corrupted entries. Returns 1 and fills ret/ret_offset on
 * success, 0 when the end of the array is reached, negative errno on error. */
int journal_file_next_entry(
                JournalFile *f,
                uint64_t p,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t i, n, ofs;
        int r;

        assert(f);
        assert(f->header);

        n = le64toh(READ_NOW(f->header->n_entries));
        if (n <= 0)
                return 0;

        if (p == 0)
                /* No reference position: start at the head or tail of the array. */
                i = direction == DIRECTION_DOWN ? 0 : n - 1;
        else {
                /* Find the array index of the entry at offset p, then step one over. */
                r = generic_array_bisect(f,
                                         le64toh(f->header->entry_array_offset),
                                         le64toh(f->header->n_entries),
                                         p,
                                         test_object_offset,
                                         DIRECTION_DOWN,
                                         NULL, NULL,
                                         &i);
                if (r <= 0)
                        return r;

                r = bump_array_index(&i, direction, n);
                if (r <= 0)
                        return r;
        }

        /* And jump to it */
        for (;;) {
                r = generic_array_get(f,
                                      le64toh(f->header->entry_array_offset),
                                      i,
                                      ret, &ofs);
                if (r > 0)
                        break;
                if (r != -EBADMSG)
                        return r;

                /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
                 * the next one might work for us instead. */
                log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);

                r = bump_array_index(&i, direction, n);
                if (r <= 0)
                        return r;
        }

        /* Ensure our array is properly ordered. */
        if (p > 0 && !check_properly_ordered(ofs, p, direction))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "%s: entry array not properly ordered at entry %" PRIu64,
                                       f->path, i);

        if (ret_offset)
                *ret_offset = ofs;

        return 1;
}
2824
2825 int journal_file_next_entry_for_data(
2826 JournalFile *f,
2827 Object *o, uint64_t p,
2828 uint64_t data_offset,
2829 direction_t direction,
2830 Object **ret, uint64_t *ret_offset) {
2831
2832 uint64_t i, n, ofs;
2833 Object *d;
2834 int r;
2835
2836 assert(f);
2837 assert(p > 0 || !o);
2838
2839 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2840 if (r < 0)
2841 return r;
2842
2843 n = le64toh(READ_NOW(d->data.n_entries));
2844 if (n <= 0)
2845 return n;
2846
2847 if (!o)
2848 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2849 else {
2850 if (o->object.type != OBJECT_ENTRY)
2851 return -EINVAL;
2852
2853 r = generic_array_bisect_plus_one(f,
2854 le64toh(d->data.entry_offset),
2855 le64toh(d->data.entry_array_offset),
2856 le64toh(d->data.n_entries),
2857 p,
2858 test_object_offset,
2859 DIRECTION_DOWN,
2860 NULL, NULL,
2861 &i);
2862
2863 if (r <= 0)
2864 return r;
2865
2866 r = bump_array_index(&i, direction, n);
2867 if (r <= 0)
2868 return r;
2869 }
2870
2871 for (;;) {
2872 r = generic_array_get_plus_one(f,
2873 le64toh(d->data.entry_offset),
2874 le64toh(d->data.entry_array_offset),
2875 i,
2876 ret, &ofs);
2877 if (r > 0)
2878 break;
2879 if (r != -EBADMSG)
2880 return r;
2881
2882 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2883
2884 r = bump_array_index(&i, direction, n);
2885 if (r <= 0)
2886 return r;
2887 }
2888
2889 /* Ensure our array is properly ordered. */
2890 if (p > 0 && check_properly_ordered(ofs, p, direction))
2891 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2892 "%s data entry array not properly ordered at entry %" PRIu64,
2893 f->path, i);
2894
2895 if (ret_offset)
2896 *ret_offset = ofs;
2897
2898 return 1;
2899 }
2900
2901 int journal_file_move_to_entry_by_offset_for_data(
2902 JournalFile *f,
2903 uint64_t data_offset,
2904 uint64_t p,
2905 direction_t direction,
2906 Object **ret, uint64_t *ret_offset) {
2907
2908 int r;
2909 Object *d;
2910
2911 assert(f);
2912
2913 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2914 if (r < 0)
2915 return r;
2916
2917 return generic_array_bisect_plus_one(
2918 f,
2919 le64toh(d->data.entry_offset),
2920 le64toh(d->data.entry_array_offset),
2921 le64toh(d->data.n_entries),
2922 p,
2923 test_object_offset,
2924 direction,
2925 ret, ret_offset, NULL);
2926 }
2927
/* Seeks to an entry that both references the data object at 'data_offset' AND
 * belongs to the given boot, at (or near, per 'direction') the given monotonic
 * timestamp. Works by first bisecting the boot's entry list by time, then
 * alternating bisections between the boot's entry list and the data object's entry
 * list by offset until both converge on the same entry. Returns 1 on success, 0 if
 * no such entry exists, -ENOENT if the boot is unknown, other negative errno on
 * error. */
int journal_file_move_to_entry_by_monotonic_for_data(
                JournalFile *f,
                uint64_t data_offset,
                sd_id128_t boot_id,
                uint64_t monotonic,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        Object *o, *d;
        int r;
        uint64_t b, z;

        assert(f);

        /* First, seek by time */
        r = find_data_object_by_boot_id(f, boot_id, &o, &b);
        if (r < 0)
                return r;
        if (r == 0)
                return -ENOENT;

        r = generic_array_bisect_plus_one(f,
                                          le64toh(o->data.entry_offset),
                                          le64toh(o->data.entry_array_offset),
                                          le64toh(o->data.n_entries),
                                          monotonic,
                                          test_object_monotonic,
                                          direction,
                                          NULL, &z, NULL);
        if (r <= 0)
                return r;

        /* And now, continue seeking until we find an entry that
         * exists in both bisection arrays */

        for (;;) {
                Object *qo;
                uint64_t p, q;

                /* Re-resolve the data object each iteration: the previous bisect may have
                 * invalidated the mmap window it lived in. */
                r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
                if (r < 0)
                        return r;

                /* Find the entry at/after offset z in the data object's list ... */
                r = generic_array_bisect_plus_one(f,
                                                  le64toh(d->data.entry_offset),
                                                  le64toh(d->data.entry_array_offset),
                                                  le64toh(d->data.n_entries),
                                                  z,
                                                  test_object_offset,
                                                  direction,
                                                  NULL, &p, NULL);
                if (r <= 0)
                        return r;

                r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
                if (r < 0)
                        return r;

                /* ... and the entry at/after offset p in the boot's list. */
                r = generic_array_bisect_plus_one(f,
                                                  le64toh(o->data.entry_offset),
                                                  le64toh(o->data.entry_array_offset),
                                                  le64toh(o->data.n_entries),
                                                  p,
                                                  test_object_offset,
                                                  direction,
                                                  &qo, &q, NULL);

                if (r <= 0)
                        return r;

                /* Both lists agree on the same entry: we are done. */
                if (p == q) {
                        if (ret)
                                *ret = qo;
                        if (ret_offset)
                                *ret_offset = q;

                        return 1;
                }

                z = q;
        }
}
3010
3011 int journal_file_move_to_entry_by_seqnum_for_data(
3012 JournalFile *f,
3013 uint64_t data_offset,
3014 uint64_t seqnum,
3015 direction_t direction,
3016 Object **ret, uint64_t *ret_offset) {
3017
3018 Object *d;
3019 int r;
3020
3021 assert(f);
3022
3023 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
3024 if (r < 0)
3025 return r;
3026
3027 return generic_array_bisect_plus_one(
3028 f,
3029 le64toh(d->data.entry_offset),
3030 le64toh(d->data.entry_array_offset),
3031 le64toh(d->data.n_entries),
3032 seqnum,
3033 test_object_seqnum,
3034 direction,
3035 ret, ret_offset, NULL);
3036 }
3037
3038 int journal_file_move_to_entry_by_realtime_for_data(
3039 JournalFile *f,
3040 uint64_t data_offset,
3041 uint64_t realtime,
3042 direction_t direction,
3043 Object **ret, uint64_t *ret_offset) {
3044
3045 Object *d;
3046 int r;
3047
3048 assert(f);
3049
3050 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
3051 if (r < 0)
3052 return r;
3053
3054 return generic_array_bisect_plus_one(
3055 f,
3056 le64toh(d->data.entry_offset),
3057 le64toh(d->data.entry_array_offset),
3058 le64toh(d->data.n_entries),
3059 realtime,
3060 test_object_realtime,
3061 direction,
3062 ret, ret_offset, NULL);
3063 }
3064
3065 void journal_file_dump(JournalFile *f) {
3066 Object *o;
3067 int r;
3068 uint64_t p;
3069
3070 assert(f);
3071 assert(f->header);
3072
3073 journal_file_print_header(f);
3074
3075 p = le64toh(READ_NOW(f->header->header_size));
3076 while (p != 0) {
3077 const char *s;
3078
3079 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
3080 if (r < 0)
3081 goto fail;
3082
3083 s = journal_object_type_to_string(o->object.type);
3084
3085 switch (o->object.type) {
3086
3087 case OBJECT_ENTRY:
3088 assert(s);
3089
3090 printf("Type: %s seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3091 s,
3092 le64toh(o->entry.seqnum),
3093 le64toh(o->entry.monotonic),
3094 le64toh(o->entry.realtime));
3095 break;
3096
3097 case OBJECT_TAG:
3098 assert(s);
3099
3100 printf("Type: %s seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3101 s,
3102 le64toh(o->tag.seqnum),
3103 le64toh(o->tag.epoch));
3104 break;
3105
3106 default:
3107 if (s)
3108 printf("Type: %s \n", s);
3109 else
3110 printf("Type: unknown (%i)", o->object.type);
3111
3112 break;
3113 }
3114
3115 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3116 printf("Flags: %s\n",
3117 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
3118
3119 if (p == le64toh(f->header->tail_object_offset))
3120 p = 0;
3121 else
3122 p += ALIGN64(le64toh(o->object.size));
3123 }
3124
3125 return;
3126 fail:
3127 log_error("File corrupt");
3128 }
3129
3130 /* Note: the lifetime of the compound literal is the immediately surrounding block. */
3131 #define FORMAT_TIMESTAMP_SAFE(t) (FORMAT_TIMESTAMP(t) ?: " --- ")
3132
3133 void journal_file_print_header(JournalFile *f) {
3134 struct stat st;
3135
3136 assert(f);
3137 assert(f->header);
3138
3139 printf("File path: %s\n"
3140 "File ID: %s\n"
3141 "Machine ID: %s\n"
3142 "Boot ID: %s\n"
3143 "Sequential number ID: %s\n"
3144 "State: %s\n"
3145 "Compatible flags:%s%s\n"
3146 "Incompatible flags:%s%s%s%s%s\n"
3147 "Header size: %"PRIu64"\n"
3148 "Arena size: %"PRIu64"\n"
3149 "Data hash table size: %"PRIu64"\n"
3150 "Field hash table size: %"PRIu64"\n"
3151 "Rotate suggested: %s\n"
3152 "Head sequential number: %"PRIu64" (%"PRIx64")\n"
3153 "Tail sequential number: %"PRIu64" (%"PRIx64")\n"
3154 "Head realtime timestamp: %s (%"PRIx64")\n"
3155 "Tail realtime timestamp: %s (%"PRIx64")\n"
3156 "Tail monotonic timestamp: %s (%"PRIx64")\n"
3157 "Objects: %"PRIu64"\n"
3158 "Entry objects: %"PRIu64"\n",
3159 f->path,
3160 SD_ID128_TO_STRING(f->header->file_id),
3161 SD_ID128_TO_STRING(f->header->machine_id),
3162 SD_ID128_TO_STRING(f->header->boot_id),
3163 SD_ID128_TO_STRING(f->header->seqnum_id),
3164 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3165 f->header->state == STATE_ONLINE ? "ONLINE" :
3166 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
3167 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
3168 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3169 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3170 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3171 JOURNAL_HEADER_COMPRESSED_ZSTD(f->header) ? " COMPRESSED-ZSTD" : "",
3172 JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "",
3173 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
3174 le64toh(f->header->header_size),
3175 le64toh(f->header->arena_size),
3176 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3177 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
3178 yes_no(journal_file_rotate_suggested(f, 0, LOG_DEBUG)),
3179 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3180 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3181 FORMAT_TIMESTAMP_SAFE(le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3182 FORMAT_TIMESTAMP_SAFE(le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3183 FORMAT_TIMESPAN(le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
3184 le64toh(f->header->n_objects),
3185 le64toh(f->header->n_entries));
3186
3187 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3188 printf("Data objects: %"PRIu64"\n"
3189 "Data hash table fill: %.1f%%\n",
3190 le64toh(f->header->n_data),
3191 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
3192
3193 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3194 printf("Field objects: %"PRIu64"\n"
3195 "Field hash table fill: %.1f%%\n",
3196 le64toh(f->header->n_fields),
3197 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3198
3199 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
3200 printf("Tag objects: %"PRIu64"\n",
3201 le64toh(f->header->n_tags));
3202 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
3203 printf("Entry array objects: %"PRIu64"\n",
3204 le64toh(f->header->n_entry_arrays));
3205
3206 if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth))
3207 printf("Deepest field hash chain: %" PRIu64"\n",
3208 f->header->field_hash_chain_depth);
3209
3210 if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth))
3211 printf("Deepest data hash chain: %" PRIu64"\n",
3212 f->header->data_hash_chain_depth);
3213
3214 if (fstat(f->fd, &st) >= 0)
3215 printf("Disk usage: %s\n", FORMAT_BYTES((uint64_t) st.st_blocks * 512ULL));
3216 }
3217
3218 static int journal_file_warn_btrfs(JournalFile *f) {
3219 unsigned attrs;
3220 int r;
3221
3222 assert(f);
3223
3224 /* Before we write anything, check if the COW logic is turned
3225 * off on btrfs. Given our write pattern that is quite
3226 * unfriendly to COW file systems this should greatly improve
3227 * performance on COW file systems, such as btrfs, at the
3228 * expense of data integrity features (which shouldn't be too
3229 * bad, given that we do our own checksumming). */
3230
3231 r = fd_is_fs_type(f->fd, BTRFS_SUPER_MAGIC);
3232 if (r < 0)
3233 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3234 if (!r)
3235 return 0;
3236
3237 r = read_attr_fd(f->fd, &attrs);
3238 if (r < 0)
3239 return log_warning_errno(r, "Failed to read file attributes: %m");
3240
3241 if (attrs & FS_NOCOW_FL) {
3242 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3243 return 0;
3244 }
3245
3246 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3247 "This is likely to slow down journal access substantially, please consider turning "
3248 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3249
3250 return 1;
3251 }
3252
/* Opens an existing journal file, or creates a new one if O_CREAT is set and the
 * file is empty/missing. Either an open 'fd' or a path 'fname' must be given (with
 * fd >= 0 taking precedence); on success the returned JournalFile takes ownership
 * of the fd. 'template' optionally supplies metrics and the post-change timer of a
 * previously rotated file. On failure everything is cleaned up and a negative
 * errno is returned; -EAFNOSUPPORT specifically indicates a file system without
 * proper mmap() support. */
int journal_file_open(
                int fd,
                const char *fname,
                int flags,
                mode_t mode,
                bool compress,
                uint64_t compress_threshold_bytes,
                bool seal,
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                JournalFile *template,
                JournalFile **ret) {

        bool newly_created = false;
        JournalFile *f;
        void *h;
        int r;

        assert(ret);
        assert(fd >= 0 || fname);

        if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
                return -EINVAL;

        if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
                return -EINVAL;

        f = new(JournalFile, 1);
        if (!f)
                return -ENOMEM;

        *f = (JournalFile) {
                .fd = fd,
                .mode = mode,

                .flags = flags,
                .writable = (flags & O_ACCMODE) != O_RDONLY,

/* Pick exactly one compression backend, in order of preference. */
#if HAVE_ZSTD
                .compress_zstd = compress,
#elif HAVE_LZ4
                .compress_lz4 = compress,
#elif HAVE_XZ
                .compress_xz = compress,
#endif
                .compress_threshold_bytes = compress_threshold_bytes == UINT64_MAX ?
                                            DEFAULT_COMPRESS_THRESHOLD :
                                            MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
#if HAVE_GCRYPT
                .seal = seal,
#endif
        };

        /* We turn on keyed hashes by default, but provide an environment variable to turn them off, if
         * people really want that */
        r = getenv_bool("SYSTEMD_JOURNAL_KEYED_HASH");
        if (r < 0) {
                if (r != -ENXIO)
                        log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_KEYED_HASH environment variable, ignoring.");
                f->keyed_hash = true;
        } else
                f->keyed_hash = r;

        if (DEBUG_LOGGING) {
                /* Log the effective settings once per change, not on every open. */
                static int last_seal = -1, last_compress = -1, last_keyed_hash = -1;
                static uint64_t last_bytes = UINT64_MAX;

                if (last_seal != f->seal ||
                    last_keyed_hash != f->keyed_hash ||
                    last_compress != JOURNAL_FILE_COMPRESS(f) ||
                    last_bytes != f->compress_threshold_bytes) {

                        log_debug("Journal effective settings seal=%s keyed_hash=%s compress=%s compress_threshold_bytes=%s",
                                  yes_no(f->seal), yes_no(f->keyed_hash), yes_no(JOURNAL_FILE_COMPRESS(f)),
                                  FORMAT_BYTES(f->compress_threshold_bytes));
                        last_seal = f->seal;
                        last_keyed_hash = f->keyed_hash;
                        last_compress = JOURNAL_FILE_COMPRESS(f);
                        last_bytes = f->compress_threshold_bytes;
                }
        }

        if (mmap_cache)
                f->mmap = mmap_cache_ref(mmap_cache);
        else {
                f->mmap = mmap_cache_new();
                if (!f->mmap) {
                        r = -ENOMEM;
                        goto fail;
                }
        }

        if (fname) {
                f->path = strdup(fname);
                if (!f->path) {
                        r = -ENOMEM;
                        goto fail;
                }
        } else {
                assert(fd >= 0);

                /* If we don't know the path, fill in something explanatory and vaguely useful */
                if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
                        r = -ENOMEM;
                        goto fail;
                }
        }

        f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
        if (!f->chain_cache) {
                r = -ENOMEM;
                goto fail;
        }

        if (f->fd < 0) {
                /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
                 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
                 * it doesn't hurt in that case. */

                f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
                if (f->fd < 0) {
                        r = -errno;
                        goto fail;
                }

                /* fds we opened here by us should also be closed by us. */
                f->close_fd = true;

                r = fd_nonblock(f->fd, false);
                if (r < 0)
                        goto fail;
        }

        f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd, prot_from_flags(flags));
        if (!f->cache_fd) {
                r = -ENOMEM;
                goto fail;
        }

        r = journal_file_fstat(f);
        if (r < 0)
                goto fail;

        /* An empty writable file means we are creating it fresh: write the header now. */
        if (f->last_stat.st_size == 0 && f->writable) {

                (void) journal_file_warn_btrfs(f);

                /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
                 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
                 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
                 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
                 * solely on mtime/atime/ctime of the file. */
                (void) fd_setcrtime(f->fd, 0);

#if HAVE_GCRYPT
                /* Try to load the FSPRG state, and if we can't, then
                 * just don't do sealing */
                if (f->seal) {
                        r = journal_file_fss_load(f);
                        if (r < 0)
                                f->seal = false;
                }
#endif

                r = journal_file_init_header(f, template);
                if (r < 0)
                        goto fail;

                /* Re-stat so last_stat reflects the freshly written header. */
                r = journal_file_fstat(f);
                if (r < 0)
                        goto fail;

                newly_created = true;
        }

        if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
                r = -ENODATA;
                goto fail;
        }

        r = mmap_cache_fd_get(f->cache_fd, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
        if (r == -EINVAL) {
                /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
                 * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error
                 * code. */
                r = -EAFNOSUPPORT;
                goto fail;
        }
        if (r < 0)
                goto fail;

        f->header = h;

        if (!newly_created) {
                r = journal_file_verify_header(f);
                if (r < 0)
                        goto fail;
        }

#if HAVE_GCRYPT
        if (!newly_created && f->writable) {
                r = journal_file_fss_load(f);
                if (r < 0)
                        goto fail;
        }
#endif

        if (f->writable) {
                if (metrics) {
                        journal_default_metrics(metrics, f->fd);
                        f->metrics = *metrics;
                } else if (template)
                        f->metrics = template->metrics;

                r = journal_file_refresh_header(f);
                if (r < 0)
                        goto fail;
        }

#if HAVE_GCRYPT
        r = journal_file_hmac_setup(f);
        if (r < 0)
                goto fail;
#endif

        if (newly_created) {
                r = journal_file_setup_field_hash_table(f);
                if (r < 0)
                        goto fail;

                r = journal_file_setup_data_hash_table(f);
                if (r < 0)
                        goto fail;

#if HAVE_GCRYPT
                r = journal_file_append_first_tag(f);
                if (r < 0)
                        goto fail;
#endif
        }

        if (mmap_cache_fd_got_sigbus(f->cache_fd)) {
                /* The file was truncated under us while we mapped it — treat as I/O error. */
                r = -EIO;
                goto fail;
        }

        if (template && template->post_change_timer) {
                r = journal_file_enable_post_change_timer(
                                f,
                                sd_event_source_get_event(template->post_change_timer),
                                template->post_change_timer_period);

                if (r < 0)
                        goto fail;
        }

        /* The file is opened now successfully, thus we take possession of any passed in fd. */
        f->close_fd = true;

        *ret = f;
        return 0;

fail:
        if (f->cache_fd && mmap_cache_fd_got_sigbus(f->cache_fd))
                r = -EIO;

        (void) journal_file_close(f);

        return r;
}
3523
/* Renames a writable journal file to its archived name
 * ("<name>@<seqnum_id>-<head_seqnum>-<head_realtime>.journal") and marks it for
 * offlining with STATE_ARCHIVED. If 'ret_previous_path' is non-NULL the old path
 * is handed over to the caller (who then owns the string). Returns 0 on success,
 * negative errno on failure. */
int journal_file_archive(JournalFile *f, char **ret_previous_path) {
        _cleanup_free_ char *p = NULL;

        assert(f);

        if (!f->writable)
                return -EINVAL;

        /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
         * rotation, since we don't know the actual path, and couldn't rename the file hence. */
        if (path_startswith(f->path, "/proc/self/fd"))
                return -EINVAL;

        if (!endswith(f->path, ".journal"))
                return -EINVAL;

        /* "%.*s" with strlen-8 strips the ".journal" suffix before appending the archive id. */
        if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
                     (int) strlen(f->path) - 8, f->path,
                     SD_ID128_FORMAT_VAL(f->header->seqnum_id),
                     le64toh(f->header->head_entry_seqnum),
                     le64toh(f->header->head_entry_realtime)) < 0)
                return -ENOMEM;

        /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
         * ignore that case. */
        if (rename(f->path, p) < 0 && errno != ENOENT)
                return -errno;

        /* Sync the rename to disk */
        (void) fsync_directory_of_file(f->fd);

        if (ret_previous_path)
                *ret_previous_path = f->path;
        else
                free(f->path);

        f->path = TAKE_PTR(p);

        /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
         * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
         * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue
         * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
         * occurs. */
        f->archive = true;

        /* Currently, btrfs is not very good with our write patterns and fragments heavily. Let's defrag our journal
         * files when we archive them */
        f->defrag_on_close = true;

        return 0;
}
3575
/* Moves a (presumably corrupted) journal file out of the way by renaming it to
 * "<name>@<realtime>-<random>.journal~", relative to 'dir_fd'. The file contents
 * are never inspected or modified. Returns 0 on success, negative errno on
 * failure; the best-effort defrag afterwards is never a failure reason. */
int journal_file_dispose(int dir_fd, const char *fname) {
        _cleanup_free_ char *p = NULL;
        _cleanup_close_ int fd = -1;

        assert(fname);

        /* Renames a journal file to *.journal~, i.e. to mark it as corrupted or otherwise uncleanly shutdown. Note that
         * this is done without looking into the file or changing any of its contents. The idea is that this is called
         * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
         * for writing anymore. */

        if (!endswith(fname, ".journal"))
                return -EINVAL;

        /* "%.*s" with strlen-8 strips the ".journal" suffix; the random suffix avoids
         * collisions if several corrupted files are disposed within the same microsecond. */
        if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
                     (int) strlen(fname) - 8, fname,
                     now(CLOCK_REALTIME),
                     random_u64()) < 0)
                return -ENOMEM;

        if (renameat(dir_fd, fname, dir_fd, p) < 0)
                return -errno;

        /* btrfs doesn't cope well with our write pattern and fragments heavily. Let's defrag all files we rotate */
        fd = openat(dir_fd, p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
        if (fd < 0)
                log_debug_errno(errno, "Failed to open file for defragmentation/FS_NOCOW_FL, ignoring: %m");
        else {
                (void) chattr_fd(fd, 0, FS_NOCOW_FL, NULL);
                (void) btrfs_defrag_fd(fd);
        }

        return 0;
}
3610
/* Copies the entry object 'o', located at offset 'p' in journal file 'from', into journal file 'to':
 * every data object the entry references is read (and decompressed if needed) from the source and
 * re-appended to the destination, then a new entry object linking them is appended there. Returns
 * >= 0 on success, negative errno on failure. */
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
        uint64_t q, n, xor_hash = 0;
        const sd_id128_t *boot_id;
        dual_timestamp ts;
        EntryItem *items;
        int r;

        assert(from);
        assert(to);
        assert(o);
        assert(p);

        if (!to->writable)
                return -EPERM;

        /* Preserve the source entry's timestamps and boot ID in the copy */
        ts = (dual_timestamp) {
                .monotonic = le64toh(o->entry.monotonic),
                .realtime = le64toh(o->entry.realtime),
        };
        boot_id = &o->entry.boot_id;

        n = journal_file_entry_n_items(o);
        items = newa(EntryItem, n);

        for (uint64_t i = 0; i < n; i++) {
                uint64_t l, h;
                le64_t le_hash;
                size_t t;
                void *data;
                Object *u;

                /* Read the item's target offset and hash before 'o' is repointed below */
                q = le64toh(o->entry.items[i].object_offset);
                le_hash = o->entry.items[i].hash;

                r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
                if (r < 0)
                        return r;

                /* The hash stored in the entry item must match the referenced data object's hash */
                if (le_hash != o->data.hash)
                        return -EBADMSG;

                l = le64toh(READ_NOW(o->object.size));
                if (l < offsetof(Object, data.payload))
                        return -EBADMSG;

                l -= offsetof(Object, data.payload);
                t = (size_t) l;

                /* We hit the limit on 32bit machines */
                if ((uint64_t) t != l)
                        return -E2BIG;

                if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if HAVE_COMPRESSION
                        size_t rsize = 0;

                        r = decompress_blob(
                                        o->object.flags & OBJECT_COMPRESSION_MASK,
                                        o->data.payload, l,
                                        &from->compress_buffer, &rsize,
                                        0);
                        if (r < 0)
                                return r;

                        data = from->compress_buffer;
                        l = rsize;
#else
                        return -EPROTONOSUPPORT;
#endif
                } else
                        data = o->data.payload;

                /* Refuse empty payloads */
                if (l == 0)
                        return -EBADMSG;

                r = journal_file_append_data(to, data, l, &u, &h);
                if (r < 0)
                        return r;

                /* Accumulate the entry's XOR hash using the destination file's hash scheme */
                if (JOURNAL_HEADER_KEYED_HASH(to->header))
                        xor_hash ^= jenkins_hash64(data, l);
                else
                        xor_hash ^= le64toh(u->data.hash);

                items[i] = (EntryItem) {
                        .object_offset = htole64(h),
                        .hash = u->data.hash,
                };

                /* Re-resolve the source entry object: the append above may have invalidated our mapping
                 * of 'o', and we also repointed it at a data object earlier in this iteration */
                r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
                if (r < 0)
                        return r;
        }

        r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n, NULL, NULL, NULL);

        if (mmap_cache_fd_got_sigbus(to->cache_fd))
                return -EIO;

        return r;
}
3712
3713 void journal_reset_metrics(JournalMetrics *m) {
3714 assert(m);
3715
3716 /* Set everything to "pick automatic values". */
3717
3718 *m = (JournalMetrics) {
3719 .min_use = UINT64_MAX,
3720 .max_use = UINT64_MAX,
3721 .min_size = UINT64_MAX,
3722 .max_size = UINT64_MAX,
3723 .keep_free = UINT64_MAX,
3724 .n_max_files = UINT64_MAX,
3725 };
3726 }
3727
/* Replaces every "automatic" (UINT64_MAX) field in *m with a concrete value deduced from the size of
 * the file system backing 'fd', and clamps explicitly configured values into sane ranges. Note that the
 * fixups are order-dependent: max_use is settled first, min_use and max_size are derived from it, and
 * min_size is clamped against max_size. */
void journal_default_metrics(JournalMetrics *m, int fd) {
        struct statvfs ss;
        uint64_t fs_size = 0;

        assert(m);
        assert(fd >= 0);

        /* fs_size stays 0 if the file system size cannot be determined; the fallbacks below apply then */
        if (fstatvfs(fd, &ss) >= 0)
                fs_size = ss.f_frsize * ss.f_blocks;
        else
                log_debug_errno(errno, "Failed to determine disk size: %m");

        if (m->max_use == UINT64_MAX) {

                if (fs_size > 0)
                        m->max_use = CLAMP(PAGE_ALIGN(fs_size / 10), /* 10% of file system size */
                                           MAX_USE_LOWER, MAX_USE_UPPER);
                else
                        m->max_use = MAX_USE_LOWER;
        } else {
                m->max_use = PAGE_ALIGN(m->max_use);

                /* A non-zero max_use must leave room for at least two minimally-sized journal files */
                if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
                        m->max_use = JOURNAL_FILE_SIZE_MIN*2;
        }

        if (m->min_use == UINT64_MAX) {
                if (fs_size > 0)
                        m->min_use = CLAMP(PAGE_ALIGN(fs_size / 50), /* 2% of file system size */
                                           MIN_USE_LOW, MIN_USE_HIGH);
                else
                        m->min_use = MIN_USE_LOW;
        }

        if (m->min_use > m->max_use)
                m->min_use = m->max_use;

        if (m->max_size == UINT64_MAX)
                m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */
                                  MAX_SIZE_UPPER);
        else
                m->max_size = PAGE_ALIGN(m->max_size);

        if (m->max_size != 0) {
                if (m->max_size < JOURNAL_FILE_SIZE_MIN)
                        m->max_size = JOURNAL_FILE_SIZE_MIN;

                /* Conversely, grow max_use so that at least two files of max_size fit */
                if (m->max_use != 0 && m->max_size*2 > m->max_use)
                        m->max_use = m->max_size*2;
        }

        if (m->min_size == UINT64_MAX)
                m->min_size = JOURNAL_FILE_SIZE_MIN;
        else
                m->min_size = CLAMP(PAGE_ALIGN(m->min_size),
                                    JOURNAL_FILE_SIZE_MIN,
                                    m->max_size ?: UINT64_MAX);

        if (m->keep_free == UINT64_MAX) {
                if (fs_size > 0)
                        m->keep_free = MIN(PAGE_ALIGN(fs_size / 20), /* 5% of file system size */
                                           KEEP_FREE_UPPER);
                else
                        m->keep_free = DEFAULT_KEEP_FREE;
        }

        if (m->n_max_files == UINT64_MAX)
                m->n_max_files = DEFAULT_N_MAX_FILES;

        log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
                  FORMAT_BYTES(m->min_use),
                  FORMAT_BYTES(m->max_use),
                  FORMAT_BYTES(m->max_size),
                  FORMAT_BYTES(m->min_size),
                  FORMAT_BYTES(m->keep_free),
                  m->n_max_files);
}
3805
3806 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3807 assert(f);
3808 assert(f->header);
3809 assert(from || to);
3810
3811 if (from) {
3812 if (f->header->head_entry_realtime == 0)
3813 return -ENOENT;
3814
3815 *from = le64toh(f->header->head_entry_realtime);
3816 }
3817
3818 if (to) {
3819 if (f->header->tail_entry_realtime == 0)
3820 return -ENOENT;
3821
3822 *to = le64toh(f->header->tail_entry_realtime);
3823 }
3824
3825 return 1;
3826 }
3827
/* Determines the monotonic timestamp of the first (*from) and/or last (*to) entry recorded for the
 * given boot ID, by looking up the matching _BOOT_ID= data object and following its entry links.
 * Returns 1 on success, 0 if the file contains no entries for this boot, negative errno on error. */
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
        Object *o;
        uint64_t p;
        int r;

        assert(f);
        assert(from || to);

        r = find_data_object_by_boot_id(f, boot_id, &o, &p);
        if (r <= 0)
                return r;

        if (le64toh(o->data.n_entries) <= 0)
                return 0;

        if (from) {
                /* The first entry referencing this data object is linked directly from it */
                r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
                if (r < 0)
                        return r;

                *from = le64toh(o->entry.monotonic);
        }

        if (to) {
                /* (Re-)resolve the data object at offset p, since 'o' was repointed at an entry object
                 * above if 'from' was requested */
                r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
                if (r < 0)
                        return r;

                /* Fetch the last, i.e. (n_entries-1)-th, entry referencing this data object */
                r = generic_array_get_plus_one(f,
                                               le64toh(o->data.entry_offset),
                                               le64toh(o->data.entry_array_offset),
                                               le64toh(o->data.n_entries)-1,
                                               &o, NULL);
                if (r <= 0)
                        return r;

                *to = le64toh(o->entry.monotonic);
        }

        return 1;
}
3869
/* Returns true (and logs the reason at 'log_level') if the journal file should be rotated: outdated
 * header format, overfull or suspiciously deep hash tables, missing field indexing, or an oldest entry
 * exceeding 'max_file_usec' in age (0 disables the age check). */
bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec, int log_level) {
        assert(f);
        assert(f->header);

        /* If we gained new header fields we gained new features,
         * hence suggest a rotation */
        if (le64toh(f->header->header_size) < sizeof(Header)) {
                log_full(log_level, "%s uses an outdated header, suggesting rotation.", f->path);
                return true;
        }

        /* Let's check if the hash tables grew over a certain fill level (75%, borrowing this value from
         * Java's hash table implementation), and if so suggest a rotation. To calculate the fill level we
         * need the n_data field, which only exists in newer versions. */

        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
                        /* Note: the division by n_data below is safe, since this branch implies n_data > 0 */
                        log_full(log_level,
                                 "Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
                                 f->path,
                                 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
                                 le64toh(f->header->n_data),
                                 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
                                 (unsigned long long) f->last_stat.st_size,
                                 f->last_stat.st_size / le64toh(f->header->n_data));
                        return true;
                }

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
                        log_full(log_level,
                                 "Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
                                 f->path,
                                 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
                                 le64toh(f->header->n_fields),
                                 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
                        return true;
                }

        /* If there are too many hash collisions somebody is most likely playing games with us. Hence, if our
         * longest chain is longer than some threshold, let's suggest rotation. */
        if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) &&
            le64toh(f->header->data_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
                log_full(log_level,
                         "Data hash table of %s has deepest hash chain of length %" PRIu64 ", suggesting rotation.",
                         f->path, le64toh(f->header->data_hash_chain_depth));
                return true;
        }

        if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) &&
            le64toh(f->header->field_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
                log_full(log_level,
                         "Field hash table of %s has deepest hash chain of length at %" PRIu64 ", suggesting rotation.",
                         f->path, le64toh(f->header->field_hash_chain_depth));
                return true;
        }

        /* Are the data objects properly indexed by field objects? */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
            JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
            le64toh(f->header->n_data) > 0 &&
            le64toh(f->header->n_fields) == 0) {
                log_full(log_level,
                         "Data objects of %s are not indexed by field objects, suggesting rotation.",
                         f->path);
                return true;
        }

        /* Finally, suggest rotation if the oldest entry exceeds the configured retention duration */
        if (max_file_usec > 0) {
                usec_t t, h;

                h = le64toh(f->header->head_entry_realtime);
                t = now(CLOCK_REALTIME);

                if (h > 0 && t > h + max_file_usec) {
                        log_full(log_level,
                                 "Oldest entry in %s is older than the configured file retention duration (%s), suggesting rotation.",
                                 f->path, FORMAT_TIMESPAN(max_file_usec, USEC_PER_SEC));
                        return true;
                }
        }

        return false;
}
3954
/* Human-readable names for ObjectType values, consumed by the string-table lookup generated below. */
static const char * const journal_object_type_table[] = {
        [OBJECT_UNUSED] = "unused",
        [OBJECT_DATA] = "data",
        [OBJECT_FIELD] = "field",
        [OBJECT_ENTRY] = "entry",
        [OBJECT_DATA_HASH_TABLE] = "data hash table",
        [OBJECT_FIELD_HASH_TABLE] = "field hash table",
        [OBJECT_ENTRY_ARRAY] = "entry array",
        [OBJECT_TAG] = "tag",
};

DEFINE_STRING_TABLE_LOOKUP_TO_STRING(journal_object_type, ObjectType);