]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
Merge pull request #2436 from grawity/fix/tasks-max
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
cec736d2 22#include <errno.h>
cec736d2 23#include <fcntl.h>
11689d2a 24#include <linux/fs.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
b5efdb8a 31#include "alloc-util.h"
f27a3864 32#include "btrfs-util.h"
c8b3094d 33#include "chattr-util.h"
07630cea 34#include "compress.h"
3ffd4af2 35#include "fd-util.h"
0284adc6 36#include "journal-authenticate.h"
cec736d2
LP
37#include "journal-def.h"
38#include "journal-file.h"
39#include "lookup3.h"
6bedfcbb 40#include "parse-util.h"
3df3e884 41#include "random-util.h"
7a24f3bf 42#include "sd-event.h"
07630cea 43#include "string-util.h"
89a5a90c 44#include "xattr-util.h"
cec736d2 45
4a92baf3
LP
46#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
47#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 48
be19b7df 49#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 50
babfc091 51/* This is the minimum journal file size */
16098e93 52#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
babfc091
LP
53
54/* These are the lower and upper bounds if we deduce the max_use value
55 * from the file system size */
56#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
57#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58
8580d1f7
LP
59/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
60#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
61
babfc091 62/* This is the upper bound if we deduce max_size from max_use */
71100051 63#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
64
65/* This is the upper bound if we deduce the keep_free value from the
66 * file system size */
67#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
68
69/* This is the keep_free value when we can't determine the system
70 * size */
71#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
72
8580d1f7
LP
73/* This is the default maximum number of journal files to keep around. */
74#define DEFAULT_N_MAX_FILES (100)
75
dca6219e
LP
76/* n_data was the first entry we added after the initial file format design */
77#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 78
a4bcff5b
LP
79/* How many entries to keep in the entry array chain cache at max */
80#define CHAIN_CACHE_MAX 20
81
a676e665
LP
82/* How much to increase the journal file size at once each time we allocate something new. */
83#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
84
2678031a
LP
85/* Reread fstat() of the file for detecting deletions at least this often */
86#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
87
fa6ac760
LP
/* The mmap context to use for the header: we pick the one right above the last defined object type */
89#define CONTEXT_HEADER _OBJECT_TYPE_MAX
90
9588bc32 91static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
92 assert(f);
93
94 if (!f->writable)
95 return -EPERM;
96
97 if (!(f->fd >= 0 && f->header))
98 return -EINVAL;
99
fa6ac760
LP
100 if (mmap_cache_got_sigbus(f->mmap, f->fd))
101 return -EIO;
102
26687bf8
OS
103 switch(f->header->state) {
104 case STATE_ONLINE:
105 return 0;
106
107 case STATE_OFFLINE:
108 f->header->state = STATE_ONLINE;
109 fsync(f->fd);
110 return 0;
111
112 default:
113 return -EINVAL;
114 }
115}
116
117int journal_file_set_offline(JournalFile *f) {
118 assert(f);
119
120 if (!f->writable)
121 return -EPERM;
122
123 if (!(f->fd >= 0 && f->header))
124 return -EINVAL;
125
126 if (f->header->state != STATE_ONLINE)
127 return 0;
128
129 fsync(f->fd);
130
fa6ac760
LP
131 if (mmap_cache_got_sigbus(f->mmap, f->fd))
132 return -EIO;
133
26687bf8
OS
134 f->header->state = STATE_OFFLINE;
135
fa6ac760
LP
136 if (mmap_cache_got_sigbus(f->mmap, f->fd))
137 return -EIO;
138
26687bf8
OS
139 fsync(f->fd);
140
141 return 0;
142}
143
804ae586 144JournalFile* journal_file_close(JournalFile *f) {
de190aef 145 assert(f);
cec736d2 146
feb12d3e 147#ifdef HAVE_GCRYPT
b0af6f41 148 /* Write the final tag */
c586dbf1 149 if (f->seal && f->writable)
b0af6f41 150 journal_file_append_tag(f);
feb12d3e 151#endif
b0af6f41 152
7a24f3bf
VC
153 if (f->post_change_timer) {
154 int enabled;
155
156 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
157 if (enabled == SD_EVENT_ONESHOT)
158 journal_file_post_change(f);
159
160 sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
161 sd_event_source_unref(f->post_change_timer);
162 }
163
26687bf8 164 journal_file_set_offline(f);
cec736d2 165
fa6ac760
LP
166 if (f->mmap && f->fd >= 0)
167 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 168
11689d2a
LP
169 if (f->fd >= 0 && f->defrag_on_close) {
170
171 /* Be friendly to btrfs: turn COW back on again now,
172 * and defragment the file. We won't write to the file
173 * ever again, hence remove all fragmentation, and
174 * reenable all the good bits COW usually provides
175 * (such as data checksumming). */
176
1ed8f8c1 177 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
178 (void) btrfs_defrag_fd(f->fd);
179 }
f27a3864 180
03e334a1 181 safe_close(f->fd);
cec736d2 182 free(f->path);
807e17f0 183
f649045c 184 mmap_cache_unref(f->mmap);
16e9f408 185
4743015d 186 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 187
d89c8fdf 188#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
189 free(f->compress_buffer);
190#endif
191
7560fffc 192#ifdef HAVE_GCRYPT
baed47c3
LP
193 if (f->fss_file)
194 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 195 else
b7c9ae91
LP
196 free(f->fsprg_state);
197
198 free(f->fsprg_seed);
7560fffc
LP
199
200 if (f->hmac)
201 gcry_md_close(f->hmac);
202#endif
203
cec736d2 204 free(f);
804ae586 205 return NULL;
cec736d2
LP
206}
207
0ac38b70 208static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 209 Header h = {};
cec736d2
LP
210 ssize_t k;
211 int r;
212
213 assert(f);
214
7560fffc 215 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 216 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 217
d89c8fdf
ZJS
218 h.incompatible_flags |= htole32(
219 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
220 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 221
d89c8fdf
ZJS
222 h.compatible_flags = htole32(
223 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 224
cec736d2
LP
225 r = sd_id128_randomize(&h.file_id);
226 if (r < 0)
227 return r;
228
0ac38b70
LP
229 if (template) {
230 h.seqnum_id = template->header->seqnum_id;
beec0085 231 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
232 } else
233 h.seqnum_id = h.file_id;
cec736d2
LP
234
235 k = pwrite(f->fd, &h, sizeof(h), 0);
236 if (k < 0)
237 return -errno;
238
239 if (k != sizeof(h))
240 return -EIO;
241
242 return 0;
243}
244
245static int journal_file_refresh_header(JournalFile *f) {
de190aef 246 sd_id128_t boot_id;
fa6ac760 247 int r;
cec736d2
LP
248
249 assert(f);
250
251 r = sd_id128_get_machine(&f->header->machine_id);
252 if (r < 0)
253 return r;
254
de190aef 255 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
256 if (r < 0)
257 return r;
258
de190aef
LP
259 if (sd_id128_equal(boot_id, f->header->boot_id))
260 f->tail_entry_monotonic_valid = true;
261
262 f->header->boot_id = boot_id;
263
fa6ac760 264 r = journal_file_set_online(f);
b788cc23 265
7560fffc 266 /* Sync the online state to disk */
a676e665 267 fsync(f->fd);
b788cc23 268
fa6ac760 269 return r;
cec736d2
LP
270}
271
272static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
273 uint32_t flags;
274
cec736d2
LP
275 assert(f);
276
7560fffc 277 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
278 return -EBADMSG;
279
7560fffc
LP
280 /* In both read and write mode we refuse to open files with
281 * incompatible flags we don't know */
d89c8fdf
ZJS
282 flags = le32toh(f->header->incompatible_flags);
283 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
284 if (flags & ~HEADER_INCOMPATIBLE_ANY)
285 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
286 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
287 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
288 if (flags)
289 log_debug("Journal file %s uses incompatible flags %"PRIx32
290 " disabled at compilation time.", f->path, flags);
cec736d2 291 return -EPROTONOSUPPORT;
d89c8fdf 292 }
cec736d2 293
7560fffc
LP
294 /* When open for writing we refuse to open files with
295 * compatible flags, too */
d89c8fdf
ZJS
296 flags = le32toh(f->header->compatible_flags);
297 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
298 if (flags & ~HEADER_COMPATIBLE_ANY)
299 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
300 f->path, flags & ~HEADER_COMPATIBLE_ANY);
301 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
302 if (flags)
303 log_debug("Journal file %s uses compatible flags %"PRIx32
304 " disabled at compilation time.", f->path, flags);
305 return -EPROTONOSUPPORT;
7560fffc
LP
306 }
307
db11ac1a
LP
308 if (f->header->state >= _STATE_MAX)
309 return -EBADMSG;
310
dca6219e
LP
311 /* The first addition was n_data, so check that we are at least this large */
312 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
313 return -EBADMSG;
314
8088cbd3 315 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
316 return -EBADMSG;
317
db11ac1a
LP
318 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
319 return -ENODATA;
320
321 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
322 return -ENODATA;
323
7762e02b
LP
324 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
325 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
326 !VALID64(le64toh(f->header->tail_object_offset)) ||
327 !VALID64(le64toh(f->header->entry_array_offset)))
328 return -ENODATA;
329
cec736d2 330 if (f->writable) {
ccdbaf91 331 uint8_t state;
cec736d2
LP
332 sd_id128_t machine_id;
333 int r;
334
335 r = sd_id128_get_machine(&machine_id);
336 if (r < 0)
337 return r;
338
339 if (!sd_id128_equal(machine_id, f->header->machine_id))
340 return -EHOSTDOWN;
341
de190aef 342 state = f->header->state;
cec736d2 343
71fa6f00
LP
344 if (state == STATE_ONLINE) {
345 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
346 return -EBUSY;
347 } else if (state == STATE_ARCHIVED)
cec736d2 348 return -ESHUTDOWN;
71fa6f00 349 else if (state != STATE_OFFLINE) {
8facc349 350 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
351 return -EBUSY;
352 }
cec736d2
LP
353 }
354
d89c8fdf
ZJS
355 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
356 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 357
f1889c91 358 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 359
cec736d2
LP
360 return 0;
361}
362
2678031a
LP
363static int journal_file_fstat(JournalFile *f) {
364 assert(f);
365 assert(f->fd >= 0);
366
367 if (fstat(f->fd, &f->last_stat) < 0)
368 return -errno;
369
370 f->last_stat_usec = now(CLOCK_MONOTONIC);
371
372 /* Refuse appending to files that are already deleted */
373 if (f->last_stat.st_nlink <= 0)
374 return -EIDRM;
375
376 return 0;
377}
378
cec736d2 379static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 380 uint64_t old_size, new_size;
fec2aa2f 381 int r;
cec736d2
LP
382
383 assert(f);
384
cec736d2 385 /* We assume that this file is not sparse, and we know that
38ac38b2 386 * for sure, since we always call posix_fallocate()
cec736d2
LP
387 * ourselves */
388
fa6ac760
LP
389 if (mmap_cache_got_sigbus(f->mmap, f->fd))
390 return -EIO;
391
cec736d2 392 old_size =
23b0b2b2 393 le64toh(f->header->header_size) +
cec736d2
LP
394 le64toh(f->header->arena_size);
395
bc85bfee 396 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
397 if (new_size < le64toh(f->header->header_size))
398 new_size = le64toh(f->header->header_size);
bc85bfee 399
2678031a
LP
400 if (new_size <= old_size) {
401
402 /* We already pre-allocated enough space, but before
403 * we write to it, let's check with fstat() if the
404 * file got deleted, in order make sure we don't throw
405 * away the data immediately. Don't check fstat() for
406 * all writes though, but only once ever 10s. */
407
408 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
409 return 0;
410
411 return journal_file_fstat(f);
412 }
413
414 /* Allocate more space. */
cec736d2 415
a676e665 416 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 417 return -E2BIG;
cec736d2 418
a676e665 419 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
420 struct statvfs svfs;
421
422 if (fstatvfs(f->fd, &svfs) >= 0) {
423 uint64_t available;
424
070052ab 425 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
426
427 if (new_size - old_size > available)
428 return -E2BIG;
429 }
430 }
431
eda4b58b
LP
432 /* Increase by larger blocks at once */
433 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
434 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
435 new_size = f->metrics.max_size;
436
bc85bfee
LP
437 /* Note that the glibc fallocate() fallback is very
438 inefficient, hence we try to minimize the allocation area
439 as we can. */
fec2aa2f
GV
440 r = posix_fallocate(f->fd, old_size, new_size - old_size);
441 if (r != 0)
442 return -r;
cec736d2 443
23b0b2b2 444 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 445
2678031a 446 return journal_file_fstat(f);
cec736d2
LP
447}
448
78519831 449static unsigned type_to_context(ObjectType type) {
d3d3208f 450 /* One context for each type, plus one catch-all for the rest */
69adae51 451 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 452 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 453 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
454}
455
7a9dabea 456static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
457 int r;
458
cec736d2 459 assert(f);
cec736d2
LP
460 assert(ret);
461
7762e02b
LP
462 if (size <= 0)
463 return -EINVAL;
464
2a59ea54 465 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
466 if (offset + size > (uint64_t) f->last_stat.st_size) {
467 /* Hmm, out of range? Let's refresh the fstat() data
468 * first, before we trust that check. */
469
2678031a
LP
470 r = journal_file_fstat(f);
471 if (r < 0)
472 return r;
473
474 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
475 return -EADDRNOTAVAIL;
476 }
477
7a9dabea 478 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
479}
480
16e9f408
LP
481static uint64_t minimum_header_size(Object *o) {
482
b8e891e6 483 static const uint64_t table[] = {
16e9f408
LP
484 [OBJECT_DATA] = sizeof(DataObject),
485 [OBJECT_FIELD] = sizeof(FieldObject),
486 [OBJECT_ENTRY] = sizeof(EntryObject),
487 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
488 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
489 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
490 [OBJECT_TAG] = sizeof(TagObject),
491 };
492
493 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
494 return sizeof(ObjectHeader);
495
496 return table[o->object.type];
497}
498
78519831 499int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
500 int r;
501 void *t;
502 Object *o;
503 uint64_t s;
504
505 assert(f);
506 assert(ret);
507
db11ac1a
LP
508 /* Objects may only be located at multiple of 64 bit */
509 if (!VALID64(offset))
510 return -EFAULT;
511
7a9dabea 512 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
513 if (r < 0)
514 return r;
515
516 o = (Object*) t;
517 s = le64toh(o->object.size);
518
519 if (s < sizeof(ObjectHeader))
520 return -EBADMSG;
521
16e9f408
LP
522 if (o->object.type <= OBJECT_UNUSED)
523 return -EBADMSG;
524
525 if (s < minimum_header_size(o))
526 return -EBADMSG;
527
d05089d8 528 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
529 return -EBADMSG;
530
531 if (s > sizeof(ObjectHeader)) {
7a9dabea 532 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
533 if (r < 0)
534 return r;
535
536 o = (Object*) t;
537 }
538
cec736d2
LP
539 *ret = o;
540 return 0;
541}
542
d98cc1f2 543static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
544 uint64_t r;
545
546 assert(f);
547
beec0085 548 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
549
550 if (seqnum) {
de190aef 551 /* If an external seqnum counter was passed, we update
c2373f84
LP
552 * both the local and the external one, and set it to
553 * the maximum of both */
554
555 if (*seqnum + 1 > r)
556 r = *seqnum + 1;
557
558 *seqnum = r;
559 }
560
beec0085 561 f->header->tail_entry_seqnum = htole64(r);
cec736d2 562
beec0085
LP
563 if (f->header->head_entry_seqnum == 0)
564 f->header->head_entry_seqnum = htole64(r);
de190aef 565
cec736d2
LP
566 return r;
567}
568
78519831 569int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
570 int r;
571 uint64_t p;
572 Object *tail, *o;
573 void *t;
574
575 assert(f);
d05089d8 576 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
577 assert(size >= sizeof(ObjectHeader));
578 assert(offset);
579 assert(ret);
580
26687bf8
OS
581 r = journal_file_set_online(f);
582 if (r < 0)
583 return r;
584
cec736d2 585 p = le64toh(f->header->tail_object_offset);
cec736d2 586 if (p == 0)
23b0b2b2 587 p = le64toh(f->header->header_size);
cec736d2 588 else {
d05089d8 589 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
590 if (r < 0)
591 return r;
592
593 p += ALIGN64(le64toh(tail->object.size));
594 }
595
596 r = journal_file_allocate(f, p, size);
597 if (r < 0)
598 return r;
599
fcde2389 600 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
601 if (r < 0)
602 return r;
603
604 o = (Object*) t;
605
606 zero(o->object);
de190aef 607 o->object.type = type;
cec736d2
LP
608 o->object.size = htole64(size);
609
610 f->header->tail_object_offset = htole64(p);
cec736d2
LP
611 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
612
613 *ret = o;
614 *offset = p;
615
616 return 0;
617}
618
de190aef 619static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
620 uint64_t s, p;
621 Object *o;
622 int r;
623
624 assert(f);
625
070052ab
LP
626 /* We estimate that we need 1 hash table entry per 768 bytes
627 of journal file and we want to make sure we never get
628 beyond 75% fill level. Calculate the hash table size for
629 the maximum file size based on these metrics. */
4a92baf3 630
dfabe643 631 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
632 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
633 s = DEFAULT_DATA_HASH_TABLE_SIZE;
634
507f22bd 635 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 636
de190aef
LP
637 r = journal_file_append_object(f,
638 OBJECT_DATA_HASH_TABLE,
639 offsetof(Object, hash_table.items) + s,
640 &o, &p);
cec736d2
LP
641 if (r < 0)
642 return r;
643
29804cc1 644 memzero(o->hash_table.items, s);
cec736d2 645
de190aef
LP
646 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
647 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
648
649 return 0;
650}
651
de190aef 652static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
653 uint64_t s, p;
654 Object *o;
655 int r;
656
657 assert(f);
658
3c1668da
LP
659 /* We use a fixed size hash table for the fields as this
660 * number should grow very slowly only */
661
de190aef
LP
662 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
663 r = journal_file_append_object(f,
664 OBJECT_FIELD_HASH_TABLE,
665 offsetof(Object, hash_table.items) + s,
666 &o, &p);
cec736d2
LP
667 if (r < 0)
668 return r;
669
29804cc1 670 memzero(o->hash_table.items, s);
cec736d2 671
de190aef
LP
672 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
673 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
674
675 return 0;
676}
677
dade37d4 678int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
679 uint64_t s, p;
680 void *t;
681 int r;
682
683 assert(f);
684
dade37d4
LP
685 if (f->data_hash_table)
686 return 0;
687
de190aef
LP
688 p = le64toh(f->header->data_hash_table_offset);
689 s = le64toh(f->header->data_hash_table_size);
cec736d2 690
de190aef 691 r = journal_file_move_to(f,
16e9f408 692 OBJECT_DATA_HASH_TABLE,
fcde2389 693 true,
de190aef
LP
694 p, s,
695 &t);
cec736d2
LP
696 if (r < 0)
697 return r;
698
de190aef 699 f->data_hash_table = t;
cec736d2
LP
700 return 0;
701}
702
dade37d4 703int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
704 uint64_t s, p;
705 void *t;
706 int r;
707
708 assert(f);
709
dade37d4
LP
710 if (f->field_hash_table)
711 return 0;
712
de190aef
LP
713 p = le64toh(f->header->field_hash_table_offset);
714 s = le64toh(f->header->field_hash_table_size);
cec736d2 715
de190aef 716 r = journal_file_move_to(f,
16e9f408 717 OBJECT_FIELD_HASH_TABLE,
fcde2389 718 true,
de190aef
LP
719 p, s,
720 &t);
cec736d2
LP
721 if (r < 0)
722 return r;
723
de190aef 724 f->field_hash_table = t;
cec736d2
LP
725 return 0;
726}
727
3c1668da
LP
728static int journal_file_link_field(
729 JournalFile *f,
730 Object *o,
731 uint64_t offset,
732 uint64_t hash) {
733
805d1486 734 uint64_t p, h, m;
3c1668da
LP
735 int r;
736
737 assert(f);
738 assert(o);
739 assert(offset > 0);
740
741 if (o->object.type != OBJECT_FIELD)
742 return -EINVAL;
743
805d1486
LP
744 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
745 if (m <= 0)
746 return -EBADMSG;
3c1668da 747
805d1486 748 /* This might alter the window we are looking at */
3c1668da
LP
749 o->field.next_hash_offset = o->field.head_data_offset = 0;
750
805d1486 751 h = hash % m;
3c1668da
LP
752 p = le64toh(f->field_hash_table[h].tail_hash_offset);
753 if (p == 0)
754 f->field_hash_table[h].head_hash_offset = htole64(offset);
755 else {
756 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
757 if (r < 0)
758 return r;
759
760 o->field.next_hash_offset = htole64(offset);
761 }
762
763 f->field_hash_table[h].tail_hash_offset = htole64(offset);
764
765 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
766 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
767
768 return 0;
769}
770
771static int journal_file_link_data(
772 JournalFile *f,
773 Object *o,
774 uint64_t offset,
775 uint64_t hash) {
776
805d1486 777 uint64_t p, h, m;
cec736d2
LP
778 int r;
779
780 assert(f);
781 assert(o);
782 assert(offset > 0);
b588975f
LP
783
784 if (o->object.type != OBJECT_DATA)
785 return -EINVAL;
cec736d2 786
805d1486
LP
787 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
788 if (m <= 0)
789 return -EBADMSG;
48496df6 790
805d1486 791 /* This might alter the window we are looking at */
de190aef
LP
792 o->data.next_hash_offset = o->data.next_field_offset = 0;
793 o->data.entry_offset = o->data.entry_array_offset = 0;
794 o->data.n_entries = 0;
cec736d2 795
805d1486 796 h = hash % m;
8db4213e 797 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 798 if (p == 0)
cec736d2 799 /* Only entry in the hash table is easy */
de190aef 800 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 801 else {
48496df6
LP
802 /* Move back to the previous data object, to patch in
803 * pointer */
cec736d2 804
de190aef 805 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
806 if (r < 0)
807 return r;
808
de190aef 809 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
810 }
811
de190aef 812 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 813
dca6219e
LP
814 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
815 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
816
cec736d2
LP
817 return 0;
818}
819
3c1668da
LP
820int journal_file_find_field_object_with_hash(
821 JournalFile *f,
822 const void *field, uint64_t size, uint64_t hash,
823 Object **ret, uint64_t *offset) {
824
805d1486 825 uint64_t p, osize, h, m;
3c1668da
LP
826 int r;
827
828 assert(f);
829 assert(field && size > 0);
830
dade37d4
LP
831 /* If the field hash table is empty, we can't find anything */
832 if (le64toh(f->header->field_hash_table_size) <= 0)
833 return 0;
834
835 /* Map the field hash table, if it isn't mapped yet. */
836 r = journal_file_map_field_hash_table(f);
837 if (r < 0)
838 return r;
839
3c1668da
LP
840 osize = offsetof(Object, field.payload) + size;
841
805d1486 842 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 843 if (m <= 0)
3c1668da
LP
844 return -EBADMSG;
845
805d1486 846 h = hash % m;
3c1668da
LP
847 p = le64toh(f->field_hash_table[h].head_hash_offset);
848
849 while (p > 0) {
850 Object *o;
851
852 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
853 if (r < 0)
854 return r;
855
856 if (le64toh(o->field.hash) == hash &&
857 le64toh(o->object.size) == osize &&
858 memcmp(o->field.payload, field, size) == 0) {
859
860 if (ret)
861 *ret = o;
862 if (offset)
863 *offset = p;
864
865 return 1;
866 }
867
868 p = le64toh(o->field.next_hash_offset);
869 }
870
871 return 0;
872}
873
874int journal_file_find_field_object(
875 JournalFile *f,
876 const void *field, uint64_t size,
877 Object **ret, uint64_t *offset) {
878
879 uint64_t hash;
880
881 assert(f);
882 assert(field && size > 0);
883
884 hash = hash64(field, size);
885
886 return journal_file_find_field_object_with_hash(f,
887 field, size, hash,
888 ret, offset);
889}
890
de190aef
LP
891int journal_file_find_data_object_with_hash(
892 JournalFile *f,
893 const void *data, uint64_t size, uint64_t hash,
894 Object **ret, uint64_t *offset) {
48496df6 895
805d1486 896 uint64_t p, osize, h, m;
cec736d2
LP
897 int r;
898
899 assert(f);
900 assert(data || size == 0);
901
dade37d4
LP
902 /* If there's no data hash table, then there's no entry. */
903 if (le64toh(f->header->data_hash_table_size) <= 0)
904 return 0;
905
906 /* Map the data hash table, if it isn't mapped yet. */
907 r = journal_file_map_data_hash_table(f);
908 if (r < 0)
909 return r;
910
cec736d2
LP
911 osize = offsetof(Object, data.payload) + size;
912
805d1486
LP
913 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
914 if (m <= 0)
bc85bfee
LP
915 return -EBADMSG;
916
805d1486 917 h = hash % m;
de190aef 918 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 919
de190aef
LP
920 while (p > 0) {
921 Object *o;
cec736d2 922
de190aef 923 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
924 if (r < 0)
925 return r;
926
807e17f0 927 if (le64toh(o->data.hash) != hash)
85a131e8 928 goto next;
807e17f0 929
d89c8fdf 930 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 931#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 932 uint64_t l;
a7f7d1bd 933 size_t rsize = 0;
cec736d2 934
807e17f0
LP
935 l = le64toh(o->object.size);
936 if (l <= offsetof(Object, data.payload))
cec736d2
LP
937 return -EBADMSG;
938
807e17f0
LP
939 l -= offsetof(Object, data.payload);
940
d89c8fdf
ZJS
941 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
942 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
943 if (r < 0)
944 return r;
807e17f0 945
b785c858 946 if (rsize == size &&
807e17f0
LP
947 memcmp(f->compress_buffer, data, size) == 0) {
948
949 if (ret)
950 *ret = o;
951
952 if (offset)
953 *offset = p;
954
955 return 1;
956 }
3b1a55e1
ZJS
957#else
958 return -EPROTONOSUPPORT;
959#endif
807e17f0
LP
960 } else if (le64toh(o->object.size) == osize &&
961 memcmp(o->data.payload, data, size) == 0) {
962
cec736d2
LP
963 if (ret)
964 *ret = o;
965
966 if (offset)
967 *offset = p;
968
de190aef 969 return 1;
cec736d2
LP
970 }
971
85a131e8 972 next:
cec736d2
LP
973 p = le64toh(o->data.next_hash_offset);
974 }
975
de190aef
LP
976 return 0;
977}
978
979int journal_file_find_data_object(
980 JournalFile *f,
981 const void *data, uint64_t size,
982 Object **ret, uint64_t *offset) {
983
984 uint64_t hash;
985
986 assert(f);
987 assert(data || size == 0);
988
989 hash = hash64(data, size);
990
991 return journal_file_find_data_object_with_hash(f,
992 data, size, hash,
993 ret, offset);
994}
995
3c1668da
LP
996static int journal_file_append_field(
997 JournalFile *f,
998 const void *field, uint64_t size,
999 Object **ret, uint64_t *offset) {
1000
1001 uint64_t hash, p;
1002 uint64_t osize;
1003 Object *o;
1004 int r;
1005
1006 assert(f);
1007 assert(field && size > 0);
1008
1009 hash = hash64(field, size);
1010
1011 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1012 if (r < 0)
1013 return r;
1014 else if (r > 0) {
1015
1016 if (ret)
1017 *ret = o;
1018
1019 if (offset)
1020 *offset = p;
1021
1022 return 0;
1023 }
1024
1025 osize = offsetof(Object, field.payload) + size;
1026 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1027 if (r < 0)
1028 return r;
3c1668da
LP
1029
1030 o->field.hash = htole64(hash);
1031 memcpy(o->field.payload, field, size);
1032
1033 r = journal_file_link_field(f, o, p, hash);
1034 if (r < 0)
1035 return r;
1036
1037 /* The linking might have altered the window, so let's
1038 * refresh our pointer */
1039 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1040 if (r < 0)
1041 return r;
1042
1043#ifdef HAVE_GCRYPT
1044 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1045 if (r < 0)
1046 return r;
1047#endif
1048
1049 if (ret)
1050 *ret = o;
1051
1052 if (offset)
1053 *offset = p;
1054
1055 return 0;
1056}
1057
48496df6
LP
1058static int journal_file_append_data(
1059 JournalFile *f,
1060 const void *data, uint64_t size,
1061 Object **ret, uint64_t *offset) {
1062
de190aef
LP
1063 uint64_t hash, p;
1064 uint64_t osize;
1065 Object *o;
d89c8fdf 1066 int r, compression = 0;
3c1668da 1067 const void *eq;
de190aef
LP
1068
1069 assert(f);
1070 assert(data || size == 0);
1071
1072 hash = hash64(data, size);
1073
1074 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1075 if (r < 0)
1076 return r;
0240c603 1077 if (r > 0) {
de190aef
LP
1078
1079 if (ret)
1080 *ret = o;
1081
1082 if (offset)
1083 *offset = p;
1084
1085 return 0;
1086 }
1087
1088 osize = offsetof(Object, data.payload) + size;
1089 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1090 if (r < 0)
1091 return r;
1092
cec736d2 1093 o->data.hash = htole64(hash);
807e17f0 1094
d89c8fdf 1095#if defined(HAVE_XZ) || defined(HAVE_LZ4)
d1afbcd2 1096 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1097 size_t rsize = 0;
807e17f0 1098
5d6f46b6 1099 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1100
d1afbcd2 1101 if (compression >= 0) {
807e17f0 1102 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1103 o->object.flags |= compression;
807e17f0 1104
fa1c4b51 1105 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1106 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1107 } else
1108 /* Compression didn't work, we don't really care why, let's continue without compression */
1109 compression = 0;
807e17f0
LP
1110 }
1111#endif
1112
d1afbcd2 1113 if (compression == 0 && size > 0)
807e17f0 1114 memcpy(o->data.payload, data, size);
cec736d2 1115
de190aef 1116 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1117 if (r < 0)
1118 return r;
1119
48496df6
LP
1120 /* The linking might have altered the window, so let's
1121 * refresh our pointer */
1122 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1123 if (r < 0)
1124 return r;
1125
08c6f819
SL
1126 if (!data)
1127 eq = NULL;
1128 else
1129 eq = memchr(data, '=', size);
3c1668da 1130 if (eq && eq > data) {
748db592 1131 Object *fo = NULL;
3c1668da 1132 uint64_t fp;
3c1668da
LP
1133
1134 /* Create field object ... */
1135 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1136 if (r < 0)
1137 return r;
1138
1139 /* ... and link it in. */
1140 o->data.next_field_offset = fo->field.head_data_offset;
1141 fo->field.head_data_offset = le64toh(p);
1142 }
1143
5996c7c2
LP
1144#ifdef HAVE_GCRYPT
1145 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1146 if (r < 0)
1147 return r;
1148#endif
1149
cec736d2
LP
1150 if (ret)
1151 *ret = o;
1152
1153 if (offset)
de190aef 1154 *offset = p;
cec736d2
LP
1155
1156 return 0;
1157}
1158
1159uint64_t journal_file_entry_n_items(Object *o) {
1160 assert(o);
b588975f
LP
1161
1162 if (o->object.type != OBJECT_ENTRY)
1163 return 0;
cec736d2
LP
1164
1165 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1166}
1167
0284adc6 1168uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1169 assert(o);
b588975f
LP
1170
1171 if (o->object.type != OBJECT_ENTRY_ARRAY)
1172 return 0;
de190aef
LP
1173
1174 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1175}
1176
fb9a24b6
LP
1177uint64_t journal_file_hash_table_n_items(Object *o) {
1178 assert(o);
b588975f
LP
1179
1180 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1181 o->object.type != OBJECT_FIELD_HASH_TABLE)
1182 return 0;
fb9a24b6
LP
1183
1184 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1185}
1186
de190aef 1187static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1188 le64_t *first,
1189 le64_t *idx,
de190aef 1190 uint64_t p) {
cec736d2 1191 int r;
de190aef
LP
1192 uint64_t n = 0, ap = 0, q, i, a, hidx;
1193 Object *o;
1194
cec736d2 1195 assert(f);
de190aef
LP
1196 assert(first);
1197 assert(idx);
1198 assert(p > 0);
cec736d2 1199
de190aef
LP
1200 a = le64toh(*first);
1201 i = hidx = le64toh(*idx);
1202 while (a > 0) {
1203
1204 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1205 if (r < 0)
1206 return r;
cec736d2 1207
de190aef
LP
1208 n = journal_file_entry_array_n_items(o);
1209 if (i < n) {
1210 o->entry_array.items[i] = htole64(p);
1211 *idx = htole64(hidx + 1);
1212 return 0;
1213 }
cec736d2 1214
de190aef
LP
1215 i -= n;
1216 ap = a;
1217 a = le64toh(o->entry_array.next_entry_array_offset);
1218 }
1219
1220 if (hidx > n)
1221 n = (hidx+1) * 2;
1222 else
1223 n = n * 2;
1224
1225 if (n < 4)
1226 n = 4;
1227
1228 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1229 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1230 &o, &q);
cec736d2
LP
1231 if (r < 0)
1232 return r;
1233
feb12d3e 1234#ifdef HAVE_GCRYPT
5996c7c2 1235 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1236 if (r < 0)
1237 return r;
feb12d3e 1238#endif
b0af6f41 1239
de190aef 1240 o->entry_array.items[i] = htole64(p);
cec736d2 1241
de190aef 1242 if (ap == 0)
7be3aa17 1243 *first = htole64(q);
cec736d2 1244 else {
de190aef 1245 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1246 if (r < 0)
1247 return r;
1248
de190aef
LP
1249 o->entry_array.next_entry_array_offset = htole64(q);
1250 }
cec736d2 1251
2dee23eb
LP
1252 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1253 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1254
de190aef
LP
1255 *idx = htole64(hidx + 1);
1256
1257 return 0;
1258}
cec736d2 1259
de190aef 1260static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1261 le64_t *extra,
1262 le64_t *first,
1263 le64_t *idx,
de190aef
LP
1264 uint64_t p) {
1265
1266 int r;
1267
1268 assert(f);
1269 assert(extra);
1270 assert(first);
1271 assert(idx);
1272 assert(p > 0);
1273
1274 if (*idx == 0)
1275 *extra = htole64(p);
1276 else {
4fd052ae 1277 le64_t i;
de190aef 1278
7be3aa17 1279 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1280 r = link_entry_into_array(f, first, &i, p);
1281 if (r < 0)
1282 return r;
cec736d2
LP
1283 }
1284
de190aef
LP
1285 *idx = htole64(le64toh(*idx) + 1);
1286 return 0;
1287}
1288
1289static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1290 uint64_t p;
1291 int r;
1292 assert(f);
1293 assert(o);
1294 assert(offset > 0);
1295
1296 p = le64toh(o->entry.items[i].object_offset);
1297 if (p == 0)
1298 return -EINVAL;
1299
1300 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1301 if (r < 0)
1302 return r;
1303
de190aef
LP
1304 return link_entry_into_array_plus_one(f,
1305 &o->data.entry_offset,
1306 &o->data.entry_array_offset,
1307 &o->data.n_entries,
1308 offset);
cec736d2
LP
1309}
1310
1311static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1312 uint64_t n, i;
cec736d2
LP
1313 int r;
1314
1315 assert(f);
1316 assert(o);
1317 assert(offset > 0);
b588975f
LP
1318
1319 if (o->object.type != OBJECT_ENTRY)
1320 return -EINVAL;
cec736d2 1321
b788cc23
LP
1322 __sync_synchronize();
1323
cec736d2 1324 /* Link up the entry itself */
de190aef
LP
1325 r = link_entry_into_array(f,
1326 &f->header->entry_array_offset,
1327 &f->header->n_entries,
1328 offset);
1329 if (r < 0)
1330 return r;
cec736d2 1331
507f22bd 1332 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1333
de190aef 1334 if (f->header->head_entry_realtime == 0)
0ac38b70 1335 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1336
0ac38b70 1337 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1338 f->header->tail_entry_monotonic = o->entry.monotonic;
1339
1340 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1341
1342 /* Link up the items */
1343 n = journal_file_entry_n_items(o);
1344 for (i = 0; i < n; i++) {
1345 r = journal_file_link_entry_item(f, o, offset, i);
1346 if (r < 0)
1347 return r;
1348 }
1349
cec736d2
LP
1350 return 0;
1351}
1352
1353static int journal_file_append_entry_internal(
1354 JournalFile *f,
1355 const dual_timestamp *ts,
1356 uint64_t xor_hash,
1357 const EntryItem items[], unsigned n_items,
de190aef 1358 uint64_t *seqnum,
cec736d2
LP
1359 Object **ret, uint64_t *offset) {
1360 uint64_t np;
1361 uint64_t osize;
1362 Object *o;
1363 int r;
1364
1365 assert(f);
1366 assert(items || n_items == 0);
de190aef 1367 assert(ts);
cec736d2
LP
1368
1369 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1370
de190aef 1371 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1372 if (r < 0)
1373 return r;
1374
d98cc1f2 1375 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1376 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1377 o->entry.realtime = htole64(ts->realtime);
1378 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1379 o->entry.xor_hash = htole64(xor_hash);
1380 o->entry.boot_id = f->header->boot_id;
1381
feb12d3e 1382#ifdef HAVE_GCRYPT
5996c7c2 1383 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1384 if (r < 0)
1385 return r;
feb12d3e 1386#endif
b0af6f41 1387
cec736d2
LP
1388 r = journal_file_link_entry(f, o, np);
1389 if (r < 0)
1390 return r;
1391
1392 if (ret)
1393 *ret = o;
1394
1395 if (offset)
1396 *offset = np;
1397
1398 return 0;
1399}
1400
cf244689 1401void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1402 assert(f);
1403
1404 /* inotify() does not receive IN_MODIFY events from file
1405 * accesses done via mmap(). After each access we hence
1406 * trigger IN_MODIFY by truncating the journal file to its
1407 * current size which triggers IN_MODIFY. */
1408
bc85bfee
LP
1409 __sync_synchronize();
1410
50f20cfd 1411 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1412 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1413}
1414
7a24f3bf
VC
1415static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1416 assert(userdata);
1417
1418 journal_file_post_change(userdata);
1419
1420 return 1;
1421}
1422
1423static void schedule_post_change(JournalFile *f) {
1424 sd_event_source *timer;
1425 int enabled, r;
1426 uint64_t now;
1427
1428 assert(f);
1429 assert(f->post_change_timer);
1430
1431 timer = f->post_change_timer;
1432
1433 r = sd_event_source_get_enabled(timer, &enabled);
1434 if (r < 0) {
1435 log_error_errno(-r, "Failed to get ftruncate timer state: %m");
1436 return;
1437 }
1438
1439 if (enabled == SD_EVENT_ONESHOT)
1440 return;
1441
1442 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1443 if (r < 0) {
1444 log_error_errno(-r, "Failed to get clock's now for scheduling ftruncate: %m");
1445 return;
1446 }
1447
1448 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1449 if (r < 0) {
1450 log_error_errno(-r, "Failed to set time for scheduling ftruncate: %m");
1451 return;
1452 }
1453
1454 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1455 if (r < 0) {
1456 log_error_errno(-r, "Failed to enable scheduled ftruncate: %m");
1457 return;
1458 }
1459}
1460
1461/* Enable coalesced change posting in a timer on the provided sd_event instance */
1462int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1463 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1464 int r;
1465
1466 assert(f);
1467 assert_return(!f->post_change_timer, -EINVAL);
1468 assert(e);
1469 assert(t);
1470
1471 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1472 if (r < 0)
1473 return r;
1474
1475 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1476 if (r < 0)
1477 return r;
1478
1479 f->post_change_timer = timer;
1480 timer = NULL;
1481 f->post_change_timer_period = t;
1482
1483 return r;
1484}
1485
1f2da9ec
LP
1486static int entry_item_cmp(const void *_a, const void *_b) {
1487 const EntryItem *a = _a, *b = _b;
1488
1489 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1490 return -1;
1491 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1492 return 1;
1493 return 0;
1494}
1495
de190aef 1496int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1497 unsigned i;
1498 EntryItem *items;
1499 int r;
1500 uint64_t xor_hash = 0;
de190aef 1501 struct dual_timestamp _ts;
cec736d2
LP
1502
1503 assert(f);
1504 assert(iovec || n_iovec == 0);
1505
de190aef
LP
1506 if (!ts) {
1507 dual_timestamp_get(&_ts);
1508 ts = &_ts;
1509 }
1510
1511 if (f->tail_entry_monotonic_valid &&
1512 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1513 return -EINVAL;
1514
feb12d3e 1515#ifdef HAVE_GCRYPT
7560fffc
LP
1516 r = journal_file_maybe_append_tag(f, ts->realtime);
1517 if (r < 0)
1518 return r;
feb12d3e 1519#endif
7560fffc 1520
64825d3c 1521 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1522 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1523
1524 for (i = 0; i < n_iovec; i++) {
1525 uint64_t p;
1526 Object *o;
1527
1528 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1529 if (r < 0)
cf244689 1530 return r;
cec736d2
LP
1531
1532 xor_hash ^= le64toh(o->data.hash);
1533 items[i].object_offset = htole64(p);
de7b95cd 1534 items[i].hash = o->data.hash;
cec736d2
LP
1535 }
1536
1f2da9ec
LP
1537 /* Order by the position on disk, in order to improve seek
1538 * times for rotating media. */
7ff7394d 1539 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1540
de190aef 1541 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1542
fa6ac760
LP
1543 /* If the memory mapping triggered a SIGBUS then we return an
1544 * IO error and ignore the error code passed down to us, since
1545 * it is very likely just an effect of a nullified replacement
1546 * mapping page */
1547
1548 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1549 r = -EIO;
1550
7a24f3bf
VC
1551 if (f->post_change_timer)
1552 schedule_post_change(f);
1553 else
1554 journal_file_post_change(f);
50f20cfd 1555
cec736d2
LP
1556 return r;
1557}
1558
/* One cached position inside a chain of entry arrays, keyed by the offset of
 * the chain's first array. */
typedef struct ChainCacheItem {
        uint64_t first;      /* offset of the array the chain starts with */
        uint64_t array;      /* offset of the array that is cached */
        uint64_t begin;      /* first item stored in the cached array */
        uint64_t total;      /* number of items in all arrays of the chain preceding the cached one */
        uint64_t last_index; /* index looked at last, used to improve locality when bisecting */
} ChainCacheItem;
1566
1567static void chain_cache_put(
4743015d 1568 OrderedHashmap *h,
a4bcff5b
LP
1569 ChainCacheItem *ci,
1570 uint64_t first,
1571 uint64_t array,
1572 uint64_t begin,
f268980d
LP
1573 uint64_t total,
1574 uint64_t last_index) {
a4bcff5b
LP
1575
1576 if (!ci) {
34741aa3
LP
1577 /* If the chain item to cache for this chain is the
1578 * first one it's not worth caching anything */
1579 if (array == first)
1580 return;
1581
29433089 1582 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1583 ci = ordered_hashmap_steal_first(h);
29433089
LP
1584 assert(ci);
1585 } else {
a4bcff5b
LP
1586 ci = new(ChainCacheItem, 1);
1587 if (!ci)
1588 return;
1589 }
1590
1591 ci->first = first;
1592
4743015d 1593 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1594 free(ci);
1595 return;
1596 }
1597 } else
1598 assert(ci->first == first);
1599
1600 ci->array = array;
1601 ci->begin = begin;
1602 ci->total = total;
f268980d 1603 ci->last_index = last_index;
a4bcff5b
LP
1604}
1605
f268980d
LP
1606static int generic_array_get(
1607 JournalFile *f,
1608 uint64_t first,
1609 uint64_t i,
1610 Object **ret, uint64_t *offset) {
de190aef 1611
cec736d2 1612 Object *o;
a4bcff5b 1613 uint64_t p = 0, a, t = 0;
cec736d2 1614 int r;
a4bcff5b 1615 ChainCacheItem *ci;
cec736d2
LP
1616
1617 assert(f);
1618
de190aef 1619 a = first;
a4bcff5b
LP
1620
1621 /* Try the chain cache first */
4743015d 1622 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1623 if (ci && i > ci->total) {
1624 a = ci->array;
1625 i -= ci->total;
1626 t = ci->total;
1627 }
1628
de190aef 1629 while (a > 0) {
a4bcff5b 1630 uint64_t k;
cec736d2 1631
de190aef
LP
1632 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1633 if (r < 0)
1634 return r;
cec736d2 1635
a4bcff5b
LP
1636 k = journal_file_entry_array_n_items(o);
1637 if (i < k) {
de190aef 1638 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1639 goto found;
cec736d2
LP
1640 }
1641
a4bcff5b
LP
1642 i -= k;
1643 t += k;
de190aef
LP
1644 a = le64toh(o->entry_array.next_entry_array_offset);
1645 }
1646
a4bcff5b
LP
1647 return 0;
1648
1649found:
1650 /* Let's cache this item for the next invocation */
af13a6b0 1651 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1652
1653 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1654 if (r < 0)
1655 return r;
1656
1657 if (ret)
1658 *ret = o;
1659
1660 if (offset)
1661 *offset = p;
1662
1663 return 1;
1664}
1665
f268980d
LP
1666static int generic_array_get_plus_one(
1667 JournalFile *f,
1668 uint64_t extra,
1669 uint64_t first,
1670 uint64_t i,
1671 Object **ret, uint64_t *offset) {
de190aef
LP
1672
1673 Object *o;
1674
1675 assert(f);
1676
1677 if (i == 0) {
1678 int r;
1679
1680 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1681 if (r < 0)
1682 return r;
1683
de190aef
LP
1684 if (ret)
1685 *ret = o;
cec736d2 1686
de190aef
LP
1687 if (offset)
1688 *offset = extra;
cec736d2 1689
de190aef 1690 return 1;
cec736d2
LP
1691 }
1692
de190aef
LP
1693 return generic_array_get(f, first, i-1, ret, offset);
1694}
cec736d2 1695
de190aef
LP
/* Results returned by the test_object callbacks used for bisection */
enum {
        TEST_FOUND,     /* the tested value equals the needle */
        TEST_LEFT,      /* the tested value is smaller than the needle */
        TEST_RIGHT      /* the tested value is larger than the needle */
};
cec736d2 1701
f268980d
LP
1702static int generic_array_bisect(
1703 JournalFile *f,
1704 uint64_t first,
1705 uint64_t n,
1706 uint64_t needle,
1707 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1708 direction_t direction,
1709 Object **ret,
1710 uint64_t *offset,
1711 uint64_t *idx) {
1712
1713 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1714 bool subtract_one = false;
1715 Object *o, *array = NULL;
1716 int r;
a4bcff5b 1717 ChainCacheItem *ci;
cec736d2 1718
de190aef
LP
1719 assert(f);
1720 assert(test_object);
cec736d2 1721
a4bcff5b 1722 /* Start with the first array in the chain */
de190aef 1723 a = first;
a4bcff5b 1724
4743015d 1725 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1726 if (ci && n > ci->total) {
1727 /* Ah, we have iterated this bisection array chain
1728 * previously! Let's see if we can skip ahead in the
1729 * chain, as far as the last time. But we can't jump
1730 * backwards in the chain, so let's check that
1731 * first. */
1732
1733 r = test_object(f, ci->begin, needle);
1734 if (r < 0)
1735 return r;
1736
1737 if (r == TEST_LEFT) {
f268980d 1738 /* OK, what we are looking for is right of the
a4bcff5b
LP
1739 * begin of this EntryArray, so let's jump
1740 * straight to previously cached array in the
1741 * chain */
1742
1743 a = ci->array;
1744 n -= ci->total;
1745 t = ci->total;
f268980d 1746 last_index = ci->last_index;
a4bcff5b
LP
1747 }
1748 }
1749
de190aef
LP
1750 while (a > 0) {
1751 uint64_t left, right, k, lp;
1752
1753 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1754 if (r < 0)
1755 return r;
1756
de190aef
LP
1757 k = journal_file_entry_array_n_items(array);
1758 right = MIN(k, n);
1759 if (right <= 0)
1760 return 0;
cec736d2 1761
de190aef
LP
1762 i = right - 1;
1763 lp = p = le64toh(array->entry_array.items[i]);
1764 if (p <= 0)
1765 return -EBADMSG;
cec736d2 1766
de190aef
LP
1767 r = test_object(f, p, needle);
1768 if (r < 0)
1769 return r;
cec736d2 1770
de190aef
LP
1771 if (r == TEST_FOUND)
1772 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1773
1774 if (r == TEST_RIGHT) {
1775 left = 0;
1776 right -= 1;
f268980d
LP
1777
1778 if (last_index != (uint64_t) -1) {
1779 assert(last_index <= right);
1780
1781 /* If we cached the last index we
1782 * looked at, let's try to not to jump
1783 * too wildly around and see if we can
1784 * limit the range to look at early to
1785 * the immediate neighbors of the last
1786 * index we looked at. */
1787
1788 if (last_index > 0) {
1789 uint64_t x = last_index - 1;
1790
1791 p = le64toh(array->entry_array.items[x]);
1792 if (p <= 0)
1793 return -EBADMSG;
1794
1795 r = test_object(f, p, needle);
1796 if (r < 0)
1797 return r;
1798
1799 if (r == TEST_FOUND)
1800 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1801
1802 if (r == TEST_RIGHT)
1803 right = x;
1804 else
1805 left = x + 1;
1806 }
1807
1808 if (last_index < right) {
1809 uint64_t y = last_index + 1;
1810
1811 p = le64toh(array->entry_array.items[y]);
1812 if (p <= 0)
1813 return -EBADMSG;
1814
1815 r = test_object(f, p, needle);
1816 if (r < 0)
1817 return r;
1818
1819 if (r == TEST_FOUND)
1820 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1821
1822 if (r == TEST_RIGHT)
1823 right = y;
1824 else
1825 left = y + 1;
1826 }
f268980d
LP
1827 }
1828
de190aef
LP
1829 for (;;) {
1830 if (left == right) {
1831 if (direction == DIRECTION_UP)
1832 subtract_one = true;
1833
1834 i = left;
1835 goto found;
1836 }
1837
1838 assert(left < right);
de190aef 1839 i = (left + right) / 2;
f268980d 1840
de190aef
LP
1841 p = le64toh(array->entry_array.items[i]);
1842 if (p <= 0)
1843 return -EBADMSG;
1844
1845 r = test_object(f, p, needle);
1846 if (r < 0)
1847 return r;
cec736d2 1848
de190aef
LP
1849 if (r == TEST_FOUND)
1850 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1851
1852 if (r == TEST_RIGHT)
1853 right = i;
1854 else
1855 left = i + 1;
1856 }
1857 }
1858
2173cbf8 1859 if (k >= n) {
cbdca852
LP
1860 if (direction == DIRECTION_UP) {
1861 i = n;
1862 subtract_one = true;
1863 goto found;
1864 }
1865
cec736d2 1866 return 0;
cbdca852 1867 }
cec736d2 1868
de190aef
LP
1869 last_p = lp;
1870
1871 n -= k;
1872 t += k;
f268980d 1873 last_index = (uint64_t) -1;
de190aef 1874 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1875 }
1876
1877 return 0;
de190aef
LP
1878
1879found:
1880 if (subtract_one && t == 0 && i == 0)
1881 return 0;
1882
a4bcff5b 1883 /* Let's cache this item for the next invocation */
af13a6b0 1884 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1885
de190aef
LP
1886 if (subtract_one && i == 0)
1887 p = last_p;
1888 else if (subtract_one)
1889 p = le64toh(array->entry_array.items[i-1]);
1890 else
1891 p = le64toh(array->entry_array.items[i]);
1892
1893 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1894 if (r < 0)
1895 return r;
1896
1897 if (ret)
1898 *ret = o;
1899
1900 if (offset)
1901 *offset = p;
1902
1903 if (idx)
cbdca852 1904 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1905
1906 return 1;
cec736d2
LP
1907}
1908
f268980d
LP
1909static int generic_array_bisect_plus_one(
1910 JournalFile *f,
1911 uint64_t extra,
1912 uint64_t first,
1913 uint64_t n,
1914 uint64_t needle,
1915 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1916 direction_t direction,
1917 Object **ret,
1918 uint64_t *offset,
1919 uint64_t *idx) {
de190aef 1920
cec736d2 1921 int r;
cbdca852
LP
1922 bool step_back = false;
1923 Object *o;
cec736d2
LP
1924
1925 assert(f);
de190aef 1926 assert(test_object);
cec736d2 1927
de190aef
LP
1928 if (n <= 0)
1929 return 0;
cec736d2 1930
de190aef
LP
1931 /* This bisects the array in object 'first', but first checks
1932 * an extra */
de190aef
LP
1933 r = test_object(f, extra, needle);
1934 if (r < 0)
1935 return r;
a536e261
LP
1936
1937 if (r == TEST_FOUND)
1938 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1939
cbdca852
LP
1940 /* if we are looking with DIRECTION_UP then we need to first
1941 see if in the actual array there is a matching entry, and
1942 return the last one of that. But if there isn't any we need
1943 to return this one. Hence remember this, and return it
1944 below. */
1945 if (r == TEST_LEFT)
1946 step_back = direction == DIRECTION_UP;
de190aef 1947
cbdca852
LP
1948 if (r == TEST_RIGHT) {
1949 if (direction == DIRECTION_DOWN)
1950 goto found;
1951 else
1952 return 0;
a536e261 1953 }
cec736d2 1954
de190aef
LP
1955 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1956
cbdca852
LP
1957 if (r == 0 && step_back)
1958 goto found;
1959
ecf68b1d 1960 if (r > 0 && idx)
de190aef
LP
1961 (*idx) ++;
1962
1963 return r;
cbdca852
LP
1964
1965found:
1966 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1967 if (r < 0)
1968 return r;
1969
1970 if (ret)
1971 *ret = o;
1972
1973 if (offset)
1974 *offset = extra;
1975
1976 if (idx)
1977 *idx = 0;
1978
1979 return 1;
1980}
1981
44a6b1b6 1982_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1983 assert(f);
1984 assert(p > 0);
1985
1986 if (p == needle)
1987 return TEST_FOUND;
1988 else if (p < needle)
1989 return TEST_LEFT;
1990 else
1991 return TEST_RIGHT;
1992}
1993
de190aef
LP
1994static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1995 Object *o;
1996 int r;
1997
1998 assert(f);
1999 assert(p > 0);
2000
2001 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2002 if (r < 0)
2003 return r;
2004
de190aef
LP
2005 if (le64toh(o->entry.seqnum) == needle)
2006 return TEST_FOUND;
2007 else if (le64toh(o->entry.seqnum) < needle)
2008 return TEST_LEFT;
2009 else
2010 return TEST_RIGHT;
2011}
cec736d2 2012
de190aef
LP
2013int journal_file_move_to_entry_by_seqnum(
2014 JournalFile *f,
2015 uint64_t seqnum,
2016 direction_t direction,
2017 Object **ret,
2018 uint64_t *offset) {
2019
2020 return generic_array_bisect(f,
2021 le64toh(f->header->entry_array_offset),
2022 le64toh(f->header->n_entries),
2023 seqnum,
2024 test_object_seqnum,
2025 direction,
2026 ret, offset, NULL);
2027}
cec736d2 2028
de190aef
LP
2029static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2030 Object *o;
2031 int r;
2032
2033 assert(f);
2034 assert(p > 0);
2035
2036 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2037 if (r < 0)
2038 return r;
2039
2040 if (le64toh(o->entry.realtime) == needle)
2041 return TEST_FOUND;
2042 else if (le64toh(o->entry.realtime) < needle)
2043 return TEST_LEFT;
2044 else
2045 return TEST_RIGHT;
cec736d2
LP
2046}
2047
de190aef
LP
2048int journal_file_move_to_entry_by_realtime(
2049 JournalFile *f,
2050 uint64_t realtime,
2051 direction_t direction,
2052 Object **ret,
2053 uint64_t *offset) {
2054
2055 return generic_array_bisect(f,
2056 le64toh(f->header->entry_array_offset),
2057 le64toh(f->header->n_entries),
2058 realtime,
2059 test_object_realtime,
2060 direction,
2061 ret, offset, NULL);
2062}
2063
2064static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2065 Object *o;
2066 int r;
2067
2068 assert(f);
2069 assert(p > 0);
2070
2071 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2072 if (r < 0)
2073 return r;
2074
2075 if (le64toh(o->entry.monotonic) == needle)
2076 return TEST_FOUND;
2077 else if (le64toh(o->entry.monotonic) < needle)
2078 return TEST_LEFT;
2079 else
2080 return TEST_RIGHT;
2081}
2082
2a560338 2083static int find_data_object_by_boot_id(
47838ab3
ZJS
2084 JournalFile *f,
2085 sd_id128_t boot_id,
2086 Object **o,
2087 uint64_t *b) {
2a560338 2088
47838ab3
ZJS
2089 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2090
2091 sd_id128_to_string(boot_id, t + 9);
2092 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2093}
2094
de190aef
LP
2095int journal_file_move_to_entry_by_monotonic(
2096 JournalFile *f,
2097 sd_id128_t boot_id,
2098 uint64_t monotonic,
2099 direction_t direction,
2100 Object **ret,
2101 uint64_t *offset) {
2102
de190aef
LP
2103 Object *o;
2104 int r;
2105
cbdca852 2106 assert(f);
de190aef 2107
47838ab3 2108 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2109 if (r < 0)
2110 return r;
cbdca852 2111 if (r == 0)
de190aef
LP
2112 return -ENOENT;
2113
2114 return generic_array_bisect_plus_one(f,
2115 le64toh(o->data.entry_offset),
2116 le64toh(o->data.entry_array_offset),
2117 le64toh(o->data.n_entries),
2118 monotonic,
2119 test_object_monotonic,
2120 direction,
2121 ret, offset, NULL);
2122}
2123
1fc605b0 2124void journal_file_reset_location(JournalFile *f) {
6573ef05 2125 f->location_type = LOCATION_HEAD;
1fc605b0 2126 f->current_offset = 0;
6573ef05
MS
2127 f->current_seqnum = 0;
2128 f->current_realtime = 0;
2129 f->current_monotonic = 0;
2130 zero(f->current_boot_id);
2131 f->current_xor_hash = 0;
2132}
2133
950c07d4 2134void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2135 f->location_type = LOCATION_SEEK;
2136 f->current_offset = offset;
2137 f->current_seqnum = le64toh(o->entry.seqnum);
2138 f->current_realtime = le64toh(o->entry.realtime);
2139 f->current_monotonic = le64toh(o->entry.monotonic);
2140 f->current_boot_id = o->entry.boot_id;
2141 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2142}
2143
d8ae66d7
MS
2144int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2145 assert(af);
2146 assert(bf);
2147 assert(af->location_type == LOCATION_SEEK);
2148 assert(bf->location_type == LOCATION_SEEK);
2149
2150 /* If contents and timestamps match, these entries are
2151 * identical, even if the seqnum does not match */
2152 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2153 af->current_monotonic == bf->current_monotonic &&
2154 af->current_realtime == bf->current_realtime &&
2155 af->current_xor_hash == bf->current_xor_hash)
2156 return 0;
2157
2158 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2159
2160 /* If this is from the same seqnum source, compare
2161 * seqnums */
2162 if (af->current_seqnum < bf->current_seqnum)
2163 return -1;
2164 if (af->current_seqnum > bf->current_seqnum)
2165 return 1;
2166
2167 /* Wow! This is weird, different data but the same
2168 * seqnums? Something is borked, but let's make the
2169 * best of it and compare by time. */
2170 }
2171
2172 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2173
2174 /* If the boot id matches, compare monotonic time */
2175 if (af->current_monotonic < bf->current_monotonic)
2176 return -1;
2177 if (af->current_monotonic > bf->current_monotonic)
2178 return 1;
2179 }
2180
2181 /* Otherwise, compare UTC time */
2182 if (af->current_realtime < bf->current_realtime)
2183 return -1;
2184 if (af->current_realtime > bf->current_realtime)
2185 return 1;
2186
2187 /* Finally, compare by contents */
2188 if (af->current_xor_hash < bf->current_xor_hash)
2189 return -1;
2190 if (af->current_xor_hash > bf->current_xor_hash)
2191 return 1;
2192
2193 return 0;
2194}
2195
de190aef
LP
2196int journal_file_next_entry(
2197 JournalFile *f,
f534928a 2198 uint64_t p,
de190aef
LP
2199 direction_t direction,
2200 Object **ret, uint64_t *offset) {
2201
fb099c8d 2202 uint64_t i, n, ofs;
cec736d2
LP
2203 int r;
2204
2205 assert(f);
de190aef
LP
2206
2207 n = le64toh(f->header->n_entries);
2208 if (n <= 0)
2209 return 0;
cec736d2 2210
f534928a 2211 if (p == 0)
de190aef 2212 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2213 else {
de190aef
LP
2214 r = generic_array_bisect(f,
2215 le64toh(f->header->entry_array_offset),
2216 le64toh(f->header->n_entries),
2217 p,
2218 test_object_offset,
2219 DIRECTION_DOWN,
2220 NULL, NULL,
2221 &i);
2222 if (r <= 0)
2223 return r;
2224
2225 if (direction == DIRECTION_DOWN) {
2226 if (i >= n - 1)
2227 return 0;
2228
2229 i++;
2230 } else {
2231 if (i <= 0)
2232 return 0;
2233
2234 i--;
2235 }
cec736d2
LP
2236 }
2237
de190aef 2238 /* And jump to it */
fb099c8d
ZJS
2239 r = generic_array_get(f,
2240 le64toh(f->header->entry_array_offset),
2241 i,
2242 ret, &ofs);
2243 if (r <= 0)
2244 return r;
2245
2246 if (p > 0 &&
2247 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2248 log_debug("%s: entry array corrupted at entry %"PRIu64,
2249 f->path, i);
2250 return -EBADMSG;
2251 }
2252
2253 if (offset)
2254 *offset = ofs;
2255
2256 return 1;
de190aef 2257}
cec736d2 2258
de190aef
LP
2259int journal_file_next_entry_for_data(
2260 JournalFile *f,
2261 Object *o, uint64_t p,
2262 uint64_t data_offset,
2263 direction_t direction,
2264 Object **ret, uint64_t *offset) {
2265
2266 uint64_t n, i;
cec736d2 2267 int r;
de190aef 2268 Object *d;
cec736d2
LP
2269
2270 assert(f);
de190aef 2271 assert(p > 0 || !o);
cec736d2 2272
de190aef 2273 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2274 if (r < 0)
de190aef 2275 return r;
cec736d2 2276
de190aef
LP
2277 n = le64toh(d->data.n_entries);
2278 if (n <= 0)
2279 return n;
cec736d2 2280
de190aef
LP
2281 if (!o)
2282 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2283 else {
2284 if (o->object.type != OBJECT_ENTRY)
2285 return -EINVAL;
cec736d2 2286
de190aef
LP
2287 r = generic_array_bisect_plus_one(f,
2288 le64toh(d->data.entry_offset),
2289 le64toh(d->data.entry_array_offset),
2290 le64toh(d->data.n_entries),
2291 p,
2292 test_object_offset,
2293 DIRECTION_DOWN,
2294 NULL, NULL,
2295 &i);
2296
2297 if (r <= 0)
cec736d2
LP
2298 return r;
2299
de190aef
LP
2300 if (direction == DIRECTION_DOWN) {
2301 if (i >= n - 1)
2302 return 0;
cec736d2 2303
de190aef
LP
2304 i++;
2305 } else {
2306 if (i <= 0)
2307 return 0;
cec736d2 2308
de190aef
LP
2309 i--;
2310 }
cec736d2 2311
de190aef 2312 }
cec736d2 2313
de190aef
LP
2314 return generic_array_get_plus_one(f,
2315 le64toh(d->data.entry_offset),
2316 le64toh(d->data.entry_array_offset),
2317 i,
2318 ret, offset);
2319}
cec736d2 2320
cbdca852
LP
2321int journal_file_move_to_entry_by_offset_for_data(
2322 JournalFile *f,
2323 uint64_t data_offset,
2324 uint64_t p,
2325 direction_t direction,
2326 Object **ret, uint64_t *offset) {
2327
2328 int r;
2329 Object *d;
2330
2331 assert(f);
2332
2333 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2334 if (r < 0)
2335 return r;
2336
2337 return generic_array_bisect_plus_one(f,
2338 le64toh(d->data.entry_offset),
2339 le64toh(d->data.entry_array_offset),
2340 le64toh(d->data.n_entries),
2341 p,
2342 test_object_offset,
2343 direction,
2344 ret, offset, NULL);
2345}
2346
2347int journal_file_move_to_entry_by_monotonic_for_data(
2348 JournalFile *f,
2349 uint64_t data_offset,
2350 sd_id128_t boot_id,
2351 uint64_t monotonic,
2352 direction_t direction,
2353 Object **ret, uint64_t *offset) {
2354
cbdca852
LP
2355 Object *o, *d;
2356 int r;
2357 uint64_t b, z;
2358
2359 assert(f);
2360
2361 /* First, seek by time */
47838ab3 2362 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2363 if (r < 0)
2364 return r;
2365 if (r == 0)
2366 return -ENOENT;
2367
2368 r = generic_array_bisect_plus_one(f,
2369 le64toh(o->data.entry_offset),
2370 le64toh(o->data.entry_array_offset),
2371 le64toh(o->data.n_entries),
2372 monotonic,
2373 test_object_monotonic,
2374 direction,
2375 NULL, &z, NULL);
2376 if (r <= 0)
2377 return r;
2378
2379 /* And now, continue seeking until we find an entry that
2380 * exists in both bisection arrays */
2381
2382 for (;;) {
2383 Object *qo;
2384 uint64_t p, q;
2385
2386 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2387 if (r < 0)
2388 return r;
2389
2390 r = generic_array_bisect_plus_one(f,
2391 le64toh(d->data.entry_offset),
2392 le64toh(d->data.entry_array_offset),
2393 le64toh(d->data.n_entries),
2394 z,
2395 test_object_offset,
2396 direction,
2397 NULL, &p, NULL);
2398 if (r <= 0)
2399 return r;
2400
2401 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2402 if (r < 0)
2403 return r;
2404
2405 r = generic_array_bisect_plus_one(f,
2406 le64toh(o->data.entry_offset),
2407 le64toh(o->data.entry_array_offset),
2408 le64toh(o->data.n_entries),
2409 p,
2410 test_object_offset,
2411 direction,
2412 &qo, &q, NULL);
2413
2414 if (r <= 0)
2415 return r;
2416
2417 if (p == q) {
2418 if (ret)
2419 *ret = qo;
2420 if (offset)
2421 *offset = q;
2422
2423 return 1;
2424 }
2425
2426 z = q;
2427 }
cbdca852
LP
2428}
2429
de190aef
LP
2430int journal_file_move_to_entry_by_seqnum_for_data(
2431 JournalFile *f,
2432 uint64_t data_offset,
2433 uint64_t seqnum,
2434 direction_t direction,
2435 Object **ret, uint64_t *offset) {
cec736d2 2436
de190aef
LP
2437 Object *d;
2438 int r;
cec736d2 2439
91a31dde
LP
2440 assert(f);
2441
de190aef 2442 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2443 if (r < 0)
de190aef 2444 return r;
cec736d2 2445
de190aef
LP
2446 return generic_array_bisect_plus_one(f,
2447 le64toh(d->data.entry_offset),
2448 le64toh(d->data.entry_array_offset),
2449 le64toh(d->data.n_entries),
2450 seqnum,
2451 test_object_seqnum,
2452 direction,
2453 ret, offset, NULL);
2454}
cec736d2 2455
de190aef
LP
2456int journal_file_move_to_entry_by_realtime_for_data(
2457 JournalFile *f,
2458 uint64_t data_offset,
2459 uint64_t realtime,
2460 direction_t direction,
2461 Object **ret, uint64_t *offset) {
2462
2463 Object *d;
2464 int r;
2465
91a31dde
LP
2466 assert(f);
2467
de190aef 2468 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2469 if (r < 0)
de190aef
LP
2470 return r;
2471
2472 return generic_array_bisect_plus_one(f,
2473 le64toh(d->data.entry_offset),
2474 le64toh(d->data.entry_array_offset),
2475 le64toh(d->data.n_entries),
2476 realtime,
2477 test_object_realtime,
2478 direction,
2479 ret, offset, NULL);
cec736d2
LP
2480}
2481
0284adc6 2482void journal_file_dump(JournalFile *f) {
7560fffc 2483 Object *o;
7560fffc 2484 int r;
0284adc6 2485 uint64_t p;
7560fffc
LP
2486
2487 assert(f);
2488
0284adc6 2489 journal_file_print_header(f);
7560fffc 2490
0284adc6
LP
2491 p = le64toh(f->header->header_size);
2492 while (p != 0) {
d05089d8 2493 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2494 if (r < 0)
2495 goto fail;
7560fffc 2496
0284adc6 2497 switch (o->object.type) {
d98cc1f2 2498
0284adc6
LP
2499 case OBJECT_UNUSED:
2500 printf("Type: OBJECT_UNUSED\n");
2501 break;
d98cc1f2 2502
0284adc6
LP
2503 case OBJECT_DATA:
2504 printf("Type: OBJECT_DATA\n");
2505 break;
7560fffc 2506
3c1668da
LP
2507 case OBJECT_FIELD:
2508 printf("Type: OBJECT_FIELD\n");
2509 break;
2510
0284adc6 2511 case OBJECT_ENTRY:
507f22bd
ZJS
2512 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2513 le64toh(o->entry.seqnum),
2514 le64toh(o->entry.monotonic),
2515 le64toh(o->entry.realtime));
0284adc6 2516 break;
7560fffc 2517
0284adc6
LP
2518 case OBJECT_FIELD_HASH_TABLE:
2519 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2520 break;
7560fffc 2521
0284adc6
LP
2522 case OBJECT_DATA_HASH_TABLE:
2523 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2524 break;
7560fffc 2525
0284adc6
LP
2526 case OBJECT_ENTRY_ARRAY:
2527 printf("Type: OBJECT_ENTRY_ARRAY\n");
2528 break;
7560fffc 2529
0284adc6 2530 case OBJECT_TAG:
507f22bd
ZJS
2531 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2532 le64toh(o->tag.seqnum),
2533 le64toh(o->tag.epoch));
0284adc6 2534 break;
3c1668da
LP
2535
2536 default:
8facc349 2537 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2538 break;
0284adc6 2539 }
7560fffc 2540
d89c8fdf
ZJS
2541 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2542 printf("Flags: %s\n",
2543 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2544
0284adc6
LP
2545 if (p == le64toh(f->header->tail_object_offset))
2546 p = 0;
2547 else
2548 p = p + ALIGN64(le64toh(o->object.size));
2549 }
7560fffc 2550
0284adc6
LP
2551 return;
2552fail:
2553 log_error("File corrupt");
7560fffc
LP
2554}
2555
718fe4b1
ZJS
2556static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2557 const char *x;
2558
2559 x = format_timestamp(buf, l, t);
2560 if (x)
2561 return x;
2562 return " --- ";
2563}
2564
0284adc6 2565void journal_file_print_header(JournalFile *f) {
2765b7bb 2566 char a[33], b[33], c[33], d[33];
ed375beb 2567 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2568 struct stat st;
2569 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2570
2571 assert(f);
7560fffc 2572
0284adc6
LP
2573 printf("File Path: %s\n"
2574 "File ID: %s\n"
2575 "Machine ID: %s\n"
2576 "Boot ID: %s\n"
2577 "Sequential Number ID: %s\n"
2578 "State: %s\n"
2579 "Compatible Flags:%s%s\n"
d89c8fdf 2580 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2581 "Header size: %"PRIu64"\n"
2582 "Arena size: %"PRIu64"\n"
2583 "Data Hash Table Size: %"PRIu64"\n"
2584 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2585 "Rotate Suggested: %s\n"
507f22bd
ZJS
2586 "Head Sequential Number: %"PRIu64"\n"
2587 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2588 "Head Realtime Timestamp: %s\n"
3223f44f 2589 "Tail Realtime Timestamp: %s\n"
ed375beb 2590 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2591 "Objects: %"PRIu64"\n"
2592 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2593 f->path,
2594 sd_id128_to_string(f->header->file_id, a),
2595 sd_id128_to_string(f->header->machine_id, b),
2596 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2597 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2598 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2599 f->header->state == STATE_ONLINE ? "ONLINE" :
2600 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2601 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2602 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2603 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2604 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2605 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2606 le64toh(f->header->header_size),
2607 le64toh(f->header->arena_size),
2608 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2609 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2610 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2611 le64toh(f->header->head_entry_seqnum),
2612 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2613 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2614 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2615 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2616 le64toh(f->header->n_objects),
2617 le64toh(f->header->n_entries));
7560fffc 2618
0284adc6 2619 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2620 printf("Data Objects: %"PRIu64"\n"
0284adc6 2621 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2622 le64toh(f->header->n_data),
0284adc6 2623 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2624
0284adc6 2625 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2626 printf("Field Objects: %"PRIu64"\n"
0284adc6 2627 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2628 le64toh(f->header->n_fields),
0284adc6 2629 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2630
2631 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2632 printf("Tag Objects: %"PRIu64"\n",
2633 le64toh(f->header->n_tags));
3223f44f 2634 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2635 printf("Entry Array Objects: %"PRIu64"\n",
2636 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2637
2638 if (fstat(f->fd, &st) >= 0)
59f448cf 2639 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
2640}
2641
fc68c929
LP
2642static int journal_file_warn_btrfs(JournalFile *f) {
2643 unsigned attrs;
2644 int r;
2645
2646 assert(f);
2647
2648 /* Before we write anything, check if the COW logic is turned
2649 * off on btrfs. Given our write pattern that is quite
2650 * unfriendly to COW file systems this should greatly improve
2651 * performance on COW file systems, such as btrfs, at the
2652 * expense of data integrity features (which shouldn't be too
2653 * bad, given that we do our own checksumming). */
2654
2655 r = btrfs_is_filesystem(f->fd);
2656 if (r < 0)
2657 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2658 if (!r)
2659 return 0;
2660
2661 r = read_attr_fd(f->fd, &attrs);
2662 if (r < 0)
2663 return log_warning_errno(r, "Failed to read file attributes: %m");
2664
2665 if (attrs & FS_NOCOW_FL) {
2666 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2667 return 0;
2668 }
2669
2670 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2671 "This is likely to slow down journal access substantially, please consider turning "
2672 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2673
2674 return 1;
2675}
2676
0284adc6
LP
2677int journal_file_open(
2678 const char *fname,
2679 int flags,
2680 mode_t mode,
2681 bool compress,
baed47c3 2682 bool seal,
0284adc6
LP
2683 JournalMetrics *metrics,
2684 MMapCache *mmap_cache,
2685 JournalFile *template,
2686 JournalFile **ret) {
7560fffc 2687
fa6ac760 2688 bool newly_created = false;
0284adc6 2689 JournalFile *f;
fa6ac760 2690 void *h;
0284adc6 2691 int r;
7560fffc 2692
0284adc6 2693 assert(fname);
0559d3a5 2694 assert(ret);
7560fffc 2695
0284adc6
LP
2696 if ((flags & O_ACCMODE) != O_RDONLY &&
2697 (flags & O_ACCMODE) != O_RDWR)
2698 return -EINVAL;
7560fffc 2699
a0108012
LP
2700 if (!endswith(fname, ".journal") &&
2701 !endswith(fname, ".journal~"))
0284adc6 2702 return -EINVAL;
7560fffc 2703
0284adc6
LP
2704 f = new0(JournalFile, 1);
2705 if (!f)
2706 return -ENOMEM;
7560fffc 2707
0284adc6
LP
2708 f->fd = -1;
2709 f->mode = mode;
7560fffc 2710
0284adc6
LP
2711 f->flags = flags;
2712 f->prot = prot_from_flags(flags);
2713 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2714#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2715 f->compress_lz4 = compress;
2716#elif defined(HAVE_XZ)
2717 f->compress_xz = compress;
48b61739 2718#endif
49a32d43 2719#ifdef HAVE_GCRYPT
baed47c3 2720 f->seal = seal;
49a32d43 2721#endif
7560fffc 2722
0284adc6
LP
2723 if (mmap_cache)
2724 f->mmap = mmap_cache_ref(mmap_cache);
2725 else {
84168d80 2726 f->mmap = mmap_cache_new();
0284adc6
LP
2727 if (!f->mmap) {
2728 r = -ENOMEM;
2729 goto fail;
2730 }
2731 }
7560fffc 2732
0284adc6
LP
2733 f->path = strdup(fname);
2734 if (!f->path) {
2735 r = -ENOMEM;
2736 goto fail;
2737 }
7560fffc 2738
4743015d 2739 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2740 if (!f->chain_cache) {
2741 r = -ENOMEM;
2742 goto fail;
2743 }
2744
0284adc6
LP
2745 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2746 if (f->fd < 0) {
2747 r = -errno;
2748 goto fail;
7560fffc 2749 }
7560fffc 2750
2678031a
LP
2751 r = journal_file_fstat(f);
2752 if (r < 0)
0284adc6 2753 goto fail;
7560fffc 2754
0284adc6 2755 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2756
fc68c929 2757 (void) journal_file_warn_btrfs(f);
11689d2a 2758
fb0951b0
LP
2759 /* Let's attach the creation time to the journal file,
2760 * so that the vacuuming code knows the age of this
2761 * file even if the file might end up corrupted one
2762 * day... Ideally we'd just use the creation time many
2763 * file systems maintain for each file, but there is
2764 * currently no usable API to query this, hence let's
2765 * emulate this via extended attributes. If extended
2766 * attributes are not supported we'll just skip this,
7517e174 2767 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2768
d61b600d 2769 fd_setcrtime(f->fd, 0);
7560fffc 2770
feb12d3e 2771#ifdef HAVE_GCRYPT
0284adc6 2772 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2773 * just don't do sealing */
49a32d43
LP
2774 if (f->seal) {
2775 r = journal_file_fss_load(f);
2776 if (r < 0)
2777 f->seal = false;
2778 }
feb12d3e 2779#endif
7560fffc 2780
0284adc6
LP
2781 r = journal_file_init_header(f, template);
2782 if (r < 0)
2783 goto fail;
7560fffc 2784
2678031a
LP
2785 r = journal_file_fstat(f);
2786 if (r < 0)
0284adc6 2787 goto fail;
fb0951b0
LP
2788
2789 newly_created = true;
0284adc6 2790 }
7560fffc 2791
0284adc6 2792 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 2793 r = -ENODATA;
0284adc6
LP
2794 goto fail;
2795 }
7560fffc 2796
fa6ac760 2797 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2798 if (r < 0)
0284adc6 2799 goto fail;
7560fffc 2800
fa6ac760
LP
2801 f->header = h;
2802
0284adc6
LP
2803 if (!newly_created) {
2804 r = journal_file_verify_header(f);
2805 if (r < 0)
2806 goto fail;
2807 }
7560fffc 2808
feb12d3e 2809#ifdef HAVE_GCRYPT
0284adc6 2810 if (!newly_created && f->writable) {
baed47c3 2811 r = journal_file_fss_load(f);
0284adc6
LP
2812 if (r < 0)
2813 goto fail;
2814 }
feb12d3e 2815#endif
cec736d2
LP
2816
2817 if (f->writable) {
4a92baf3
LP
2818 if (metrics) {
2819 journal_default_metrics(metrics, f->fd);
2820 f->metrics = *metrics;
2821 } else if (template)
2822 f->metrics = template->metrics;
2823
cec736d2
LP
2824 r = journal_file_refresh_header(f);
2825 if (r < 0)
2826 goto fail;
2827 }
2828
feb12d3e 2829#ifdef HAVE_GCRYPT
baed47c3 2830 r = journal_file_hmac_setup(f);
14d10188
LP
2831 if (r < 0)
2832 goto fail;
feb12d3e 2833#endif
14d10188 2834
cec736d2 2835 if (newly_created) {
de190aef 2836 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2837 if (r < 0)
2838 goto fail;
2839
de190aef 2840 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2841 if (r < 0)
2842 goto fail;
7560fffc 2843
feb12d3e 2844#ifdef HAVE_GCRYPT
7560fffc
LP
2845 r = journal_file_append_first_tag(f);
2846 if (r < 0)
2847 goto fail;
feb12d3e 2848#endif
cec736d2
LP
2849 }
2850
fa6ac760
LP
2851 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2852 r = -EIO;
2853 goto fail;
2854 }
2855
7a24f3bf
VC
2856 if (template && template->post_change_timer) {
2857 sd_event *e = sd_event_source_get_event(template->post_change_timer);
2858
2859 r = journal_file_enable_post_change_timer(f, e, template->post_change_timer_period);
2860 if (r < 0)
2861 goto fail;
2862 }
2863
0559d3a5 2864 *ret = f;
cec736d2
LP
2865 return 0;
2866
2867fail:
fa6ac760
LP
2868 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2869 r = -EIO;
2870
cec736d2
LP
2871 journal_file_close(f);
2872
2873 return r;
2874}
0ac38b70 2875
baed47c3 2876int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2877 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2878 size_t l;
2879 JournalFile *old_file, *new_file = NULL;
2880 int r;
2881
2882 assert(f);
2883 assert(*f);
2884
2885 old_file = *f;
2886
2887 if (!old_file->writable)
2888 return -EINVAL;
2889
2890 if (!endswith(old_file->path, ".journal"))
2891 return -EINVAL;
2892
2893 l = strlen(old_file->path);
57535f47
ZJS
2894 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2895 (int) l - 8, old_file->path,
2896 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2897 le64toh((*f)->header->head_entry_seqnum),
2898 le64toh((*f)->header->head_entry_realtime));
2899 if (r < 0)
0ac38b70
LP
2900 return -ENOMEM;
2901
2678031a
LP
2902 /* Try to rename the file to the archived version. If the file
2903 * already was deleted, we'll get ENOENT, let's ignore that
2904 * case. */
0ac38b70 2905 r = rename(old_file->path, p);
2678031a 2906 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2907 return -errno;
2908
ccdbaf91 2909 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2910
f27a3864
LP
2911 /* Currently, btrfs is not very good with out write patterns
2912 * and fragments heavily. Let's defrag our journal files when
2913 * we archive them */
2914 old_file->defrag_on_close = true;
2915
baed47c3 2916 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2917 journal_file_close(old_file);
2918
2919 *f = new_file;
2920 return r;
2921}
2922
9447a7f1
LP
2923int journal_file_open_reliably(
2924 const char *fname,
2925 int flags,
2926 mode_t mode,
7560fffc 2927 bool compress,
baed47c3 2928 bool seal,
4a92baf3 2929 JournalMetrics *metrics,
27370278 2930 MMapCache *mmap_cache,
9447a7f1
LP
2931 JournalFile *template,
2932 JournalFile **ret) {
2933
2934 int r;
2935 size_t l;
ed375beb 2936 _cleanup_free_ char *p = NULL;
9447a7f1 2937
070052ab 2938 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
288359db
ZJS
2939 if (!IN_SET(r,
2940 -EBADMSG, /* corrupted */
2941 -ENODATA, /* truncated */
2942 -EHOSTDOWN, /* other machine */
2943 -EPROTONOSUPPORT, /* incompatible feature */
2944 -EBUSY, /* unclean shutdown */
2945 -ESHUTDOWN, /* already archived */
2946 -EIO, /* IO error, including SIGBUS on mmap */
2947 -EIDRM /* File has been deleted */))
9447a7f1
LP
2948 return r;
2949
2950 if ((flags & O_ACCMODE) == O_RDONLY)
2951 return r;
2952
2953 if (!(flags & O_CREAT))
2954 return r;
2955
7560fffc
LP
2956 if (!endswith(fname, ".journal"))
2957 return r;
2958
5c70eab4
LP
2959 /* The file is corrupted. Rotate it away and try it again (but only once) */
2960
9447a7f1 2961 l = strlen(fname);
d587eca5 2962 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2963 (int) l - 8, fname,
d587eca5 2964 now(CLOCK_REALTIME),
9bf3b535 2965 random_u64()) < 0)
9447a7f1
LP
2966 return -ENOMEM;
2967
65089b82 2968 if (rename(fname, p) < 0)
9447a7f1
LP
2969 return -errno;
2970
f27a3864
LP
2971 /* btrfs doesn't cope well with our write pattern and
2972 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2973
2974 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2975 (void) btrfs_defrag(p);
2976
65089b82 2977 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2978
070052ab 2979 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
9447a7f1
LP
2980}
2981
cf244689
LP
2982int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2983 uint64_t i, n;
2984 uint64_t q, xor_hash = 0;
2985 int r;
2986 EntryItem *items;
2987 dual_timestamp ts;
2988
2989 assert(from);
2990 assert(to);
2991 assert(o);
2992 assert(p);
2993
2994 if (!to->writable)
2995 return -EPERM;
2996
2997 ts.monotonic = le64toh(o->entry.monotonic);
2998 ts.realtime = le64toh(o->entry.realtime);
2999
cf244689 3000 n = journal_file_entry_n_items(o);
4faa7004
TA
3001 /* alloca() can't take 0, hence let's allocate at least one */
3002 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
3003
3004 for (i = 0; i < n; i++) {
4fd052ae
FC
3005 uint64_t l, h;
3006 le64_t le_hash;
cf244689
LP
3007 size_t t;
3008 void *data;
3009 Object *u;
3010
3011 q = le64toh(o->entry.items[i].object_offset);
3012 le_hash = o->entry.items[i].hash;
3013
3014 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3015 if (r < 0)
3016 return r;
3017
3018 if (le_hash != o->data.hash)
3019 return -EBADMSG;
3020
3021 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3022 t = (size_t) l;
3023
3024 /* We hit the limit on 32bit machines */
3025 if ((uint64_t) t != l)
3026 return -E2BIG;
3027
d89c8fdf 3028 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 3029#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 3030 size_t rsize = 0;
cf244689 3031
d89c8fdf
ZJS
3032 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3033 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3034 if (r < 0)
3035 return r;
cf244689
LP
3036
3037 data = from->compress_buffer;
3038 l = rsize;
3b1a55e1
ZJS
3039#else
3040 return -EPROTONOSUPPORT;
3041#endif
cf244689
LP
3042 } else
3043 data = o->data.payload;
3044
3045 r = journal_file_append_data(to, data, l, &u, &h);
3046 if (r < 0)
3047 return r;
3048
3049 xor_hash ^= le64toh(u->data.hash);
3050 items[i].object_offset = htole64(h);
3051 items[i].hash = u->data.hash;
3052
3053 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3054 if (r < 0)
3055 return r;
3056 }
3057
fa6ac760
LP
3058 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3059
3060 if (mmap_cache_got_sigbus(to->mmap, to->fd))
3061 return -EIO;
3062
3063 return r;
cf244689 3064}
babfc091 3065
8580d1f7
LP
3066void journal_reset_metrics(JournalMetrics *m) {
3067 assert(m);
3068
3069 /* Set everything to "pick automatic values". */
3070
3071 *m = (JournalMetrics) {
3072 .min_use = (uint64_t) -1,
3073 .max_use = (uint64_t) -1,
3074 .min_size = (uint64_t) -1,
3075 .max_size = (uint64_t) -1,
3076 .keep_free = (uint64_t) -1,
3077 .n_max_files = (uint64_t) -1,
3078 };
3079}
3080
babfc091 3081void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3082 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3083 struct statvfs ss;
8580d1f7 3084 uint64_t fs_size;
babfc091
LP
3085
3086 assert(m);
3087 assert(fd >= 0);
3088
3089 if (fstatvfs(fd, &ss) >= 0)
3090 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
3091 else {
3092 log_debug_errno(errno, "Failed to detremine disk size: %m");
3093 fs_size = 0;
3094 }
babfc091
LP
3095
3096 if (m->max_use == (uint64_t) -1) {
3097
3098 if (fs_size > 0) {
3099 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3100
3101 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3102 m->max_use = DEFAULT_MAX_USE_UPPER;
3103
3104 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3105 m->max_use = DEFAULT_MAX_USE_LOWER;
3106 } else
3107 m->max_use = DEFAULT_MAX_USE_LOWER;
3108 } else {
3109 m->max_use = PAGE_ALIGN(m->max_use);
3110
8580d1f7 3111 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3112 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3113 }
3114
8580d1f7
LP
3115 if (m->min_use == (uint64_t) -1)
3116 m->min_use = DEFAULT_MIN_USE;
3117
3118 if (m->min_use > m->max_use)
3119 m->min_use = m->max_use;
3120
babfc091
LP
3121 if (m->max_size == (uint64_t) -1) {
3122 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3123
3124 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3125 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3126 } else
3127 m->max_size = PAGE_ALIGN(m->max_size);
3128
8580d1f7
LP
3129 if (m->max_size != 0) {
3130 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3131 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3132
8580d1f7
LP
3133 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3134 m->max_use = m->max_size*2;
3135 }
babfc091
LP
3136
3137 if (m->min_size == (uint64_t) -1)
3138 m->min_size = JOURNAL_FILE_SIZE_MIN;
3139 else {
3140 m->min_size = PAGE_ALIGN(m->min_size);
3141
3142 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3143 m->min_size = JOURNAL_FILE_SIZE_MIN;
3144
8580d1f7 3145 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3146 m->max_size = m->min_size;
3147 }
3148
3149 if (m->keep_free == (uint64_t) -1) {
3150
3151 if (fs_size > 0) {
8621b110 3152 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3153
3154 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3155 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3156
3157 } else
3158 m->keep_free = DEFAULT_KEEP_FREE;
3159 }
3160
8580d1f7
LP
3161 if (m->n_max_files == (uint64_t) -1)
3162 m->n_max_files = DEFAULT_N_MAX_FILES;
3163
3164 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3165 format_bytes(a, sizeof(a), m->min_use),
3166 format_bytes(b, sizeof(b), m->max_use),
3167 format_bytes(c, sizeof(c), m->max_size),
3168 format_bytes(d, sizeof(d), m->min_size),
3169 format_bytes(e, sizeof(e), m->keep_free),
3170 m->n_max_files);
babfc091 3171}
08984293
LP
3172
3173int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3174 assert(f);
3175 assert(from || to);
3176
3177 if (from) {
162566a4
LP
3178 if (f->header->head_entry_realtime == 0)
3179 return -ENOENT;
08984293 3180
162566a4 3181 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3182 }
3183
3184 if (to) {
162566a4
LP
3185 if (f->header->tail_entry_realtime == 0)
3186 return -ENOENT;
08984293 3187
162566a4 3188 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3189 }
3190
3191 return 1;
3192}
3193
3194int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3195 Object *o;
3196 uint64_t p;
3197 int r;
3198
3199 assert(f);
3200 assert(from || to);
3201
47838ab3 3202 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3203 if (r <= 0)
3204 return r;
3205
3206 if (le64toh(o->data.n_entries) <= 0)
3207 return 0;
3208
3209 if (from) {
3210 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3211 if (r < 0)
3212 return r;
3213
3214 *from = le64toh(o->entry.monotonic);
3215 }
3216
3217 if (to) {
3218 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3219 if (r < 0)
3220 return r;
3221
3222 r = generic_array_get_plus_one(f,
3223 le64toh(o->data.entry_offset),
3224 le64toh(o->data.entry_array_offset),
3225 le64toh(o->data.n_entries)-1,
3226 &o, NULL);
3227 if (r <= 0)
3228 return r;
3229
3230 *to = le64toh(o->entry.monotonic);
3231 }
3232
3233 return 1;
3234}
dca6219e 3235
fb0951b0 3236bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3237 assert(f);
3238
3239 /* If we gained new header fields we gained new features,
3240 * hence suggest a rotation */
361f9cbc
LP
3241 if (le64toh(f->header->header_size) < sizeof(Header)) {
3242 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3243 return true;
361f9cbc 3244 }
dca6219e
LP
3245
3246 /* Let's check if the hash tables grew over a certain fill
3247 * level (75%, borrowing this value from Java's hash table
3248 * implementation), and if so suggest a rotation. To calculate
3249 * the fill level we need the n_data field, which only exists
3250 * in newer versions. */
3251
3252 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3253 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3254 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3255 f->path,
3256 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3257 le64toh(f->header->n_data),
3258 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3259 (unsigned long long) f->last_stat.st_size,
3260 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3261 return true;
361f9cbc 3262 }
dca6219e
LP
3263
3264 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3265 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3266 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3267 f->path,
3268 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3269 le64toh(f->header->n_fields),
3270 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3271 return true;
361f9cbc 3272 }
dca6219e 3273
0598fd4a
LP
3274 /* Are the data objects properly indexed by field objects? */
3275 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3276 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3277 le64toh(f->header->n_data) > 0 &&
3278 le64toh(f->header->n_fields) == 0)
3279 return true;
3280
fb0951b0
LP
3281 if (max_file_usec > 0) {
3282 usec_t t, h;
3283
3284 h = le64toh(f->header->head_entry_realtime);
3285 t = now(CLOCK_REALTIME);
3286
3287 if (h > 0 && t > h + max_file_usec)
3288 return true;
3289 }
3290
dca6219e
LP
3291 return false;
3292}